/*
 * linux/kernel/el_posix.c
 *
 * Implementation of POSIX support.
 * Two variants are implemented:
 *
 * 1. There is a pobjs pointer in the task_struct structure which
 * points to el_pobjs.
 *
 * 2. There are two variants of working with a mutex (see el_posix.h):
 *
 * #define WAKEUP_MUTEX_ONE 1 // to wake up only one thread
 * #define WAKEUP_MUTEX_ONE 0 // to wake up all
 *
 * Short description of operation.
 *
 * el_pobjs has two queue heads for POSIX threads which are waiting on
 * mutexes or conditions. The user-space address of the mutex or
 * condition in question is stored in the queue item.
 * The internal function pthread_run() wakes up only the needed
 * pthreads.
 *
 * Implementation in the kernel.
 * The additional system call el_posix() was implemented for the
 * el_pthread library:
 *
 * sys_el_posix() (see kernel/el_posix.c)
 * sys_clone2() (see arch/e2k/process.c)
 *
 * To define the el_posix() syscall the following was done:
 *
 * unistd.h:
 * #define __NR_el_posix 255
 * e2k_syswork.h:
 * static inline _syscall5(int, el_posix, int, req,
 * void *, a1, void *, a2, void *, a3, void *, a4);
 *
 * systable.c:
 * SYSTEM_CALL_DEFINE(sys_el_posix);
 * SYSTEM_CALL_TBL_ENTRY(sys_el_posix),
 *
 * Note that to port our POSIX support to sparc and i386 only
 * sys_el_posix() is needed as the el_posix() syscall.
 * sys_el_posix() can be ported without changes as an additional
 * system call (as on E2K) or can be implemented as a pseudo driver.
 * Instead of sys_clone2() the native clone() system call should be
 * used (see el_pthread.c).
 *
 * Implementation of the POSIX library.
 * See el_pthread.c where the POSIX library is implemented using
 * el_posix().
 *
 * The POSIX implementation includes
 * 1. the new linux/el_posix.h
 * 2. the new kernel/el_posix.c
 * An additional member is needed in struct task_struct:
 * void *pobjs;
 *
 * == SVS ==
 */
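/*
 * Illustrative sketch (not part of this file): how a userspace wrapper
 * might invoke the el_posix() syscall directly. The syscall number 255
 * is quoted from the description above; the wrapper name
 * el_posix_call() is hypothetical.
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *
 *	#ifndef __NR_el_posix
 *	#define __NR_el_posix 255	// from unistd.h, see above
 *	#endif
 *
 *	static long el_posix_call(int req, void *a1, void *a2,
 *				  void *a3, void *a4)
 *	{
 *		// returns what syscall(2) returns: -1 with errno on error
 *		return syscall(__NR_el_posix, req, a1, a2, a3, a4);
 *	}
 */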
#define DEBUG_POSIX 0 /* DEBUG_POSIX */
#if DEBUG_POSIX
# define DEBUG
# define DbgPos(fmt, ...) \
trace_printk("%d " fmt, current->pid, ##__VA_ARGS__)
#else
# define DbgPos(...)
#endif
#include <linux/el_posix.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/interrupt.h>
#include <linux/compat.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
#ifdef CONFIG_MCST
#include <linux/hrtimer.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <uapi/linux/mcst_rt.h>
#include <uapi/linux/el_posix.h>
#endif
#include <linux/sched/rt.h>
#ifdef CONFIG_E90S
#include <asm/e90s.h>
#endif
#include <asm/delay.h>
#include <asm/processor.h>
#include <asm/uaccess.h>
#ifdef CONFIG_HAVE_EL_POSIX_SYSCALL
#include <asm/el_posix.h>
#endif /* CONFIG_HAVE_EL_POSIX_SYSCALL */
#ifdef CONFIG_SCLKR_CLOCKSOURCE
#include <asm/sclkr.h>
#endif
#define PMUTEX_UNLOCKED 1
#define PMUTEX_LOCKED_ONCE 0
//#define PMUTEX_HAS_QUEUE 2
#define PTHREAD_WAIT (PMUTEX_WAIT | PCOND_WAIT | WAKEUP_PID_WAIT | SWITCH_WAIT)
/*
* wakeup_mode
*/
#define WAKEUP_ALL 0x100
#define WAKEUP_ONE 0x101
#define MOVE_TO_MUTEX 0x102
#define WAKEUP_PID 0x103
#ifdef CONFIG_HAVE_EL_POSIX_SYSCALL
//typedef struct kmem_cache struct kmem_cache;
long redir_to_waiter = 1;
static long to_do_move_to_mutex = 1;
static long wakeup_mutex_one = 1;
static long PImutex = 0;
static DECLARE_RWSEM(posix_sem);
static struct kmem_cache *posix_objects = NULL;
#endif /* CONFIG_HAVE_EL_POSIX_SYSCALL */
int have_pps_mpv = 0;
EXPORT_SYMBOL(have_pps_mpv);
#ifdef CONFIG_MCST
static DEFINE_RAW_SPINLOCK(rts_lock);
long rts_mode = 0; // hard realtime mode: 0 - inactive, 1 - active
EXPORT_SYMBOL(rts_mode);
// mcst realtime mode mask
long rts_act_mask = 0;
EXPORT_SYMBOL(rts_act_mask);
#endif /* CONFIG_MCST */
/* cpu_freq_hz is used to convert clocks into ns in user space */
u32 cpu_freq_hz = UNSET_CPU_FREQ; /* CPU freq (Hz) */
EXPORT_SYMBOL(cpu_freq_hz);
int __init cpufreq_setup(char *str)
{
cpu_freq_hz = simple_strtoul(str, &str, 0);
return 1;
}
__setup("cpufreq=", cpufreq_setup);
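/* Usage sketch: booting with the kernel parameter "cpufreq=1000000000"
 * sets cpu_freq_hz to 1 GHz for the clocks-to-ns conversion above. */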
#if defined(CONFIG_E2K)
extern long irq_bind_to_cpu(int irq_msk, int cpu);
extern long el_set_apic_timer(void);
extern long el_unset_apic_timer(void);
#endif
#ifdef CONFIG_HAVE_EL_POSIX_SYSCALL
/*
 * pthread structs are as in glibc.
 * We use the fields as follows:
 * __spinlock for atomic operations
 * __m_count for the count of sleepers
 * __m_owner to save the pid of the owner
 */
/*=============================================*/
struct _pthread_fastlock
{
long int __status;
int __spinlock;
};
typedef struct
{
int __m_owner_org_prio; // was: int __m_reserved; in bits/pthreadtypes.h
int __m_count;
void *__m_owner;
int __m_kind;
struct _pthread_fastlock __m_lock;
} pthread_mutex_t;
typedef struct
{
struct _pthread_fastlock __c_lock;
void *__c_waiting;
} pthread_cond_t;
/*=============================================*/
#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
typedef struct {
unsigned long bitmap[BITMAP_SIZE];
struct list_head prio_list[MAX_PRIO];
} p_array_t;
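/*
 * p_array_t is a classic O(1) priority queue in the style of the old
 * O(1) scheduler: one list per priority level plus a bitmap in which
 * bit i is set whenever prio_list[i] is non-empty. A minimal lookup
 * sketch (assuming a p_array_t *a maintained by add_to_p_array() and
 * pthread_run() below):
 *
 *	int prio = sched_find_first_bit(a->bitmap);
 *	if (prio < MAX_PRIO) {
 *		// a->prio_list[prio] holds the highest-priority waiters
 *	}
 */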
typedef struct el_pobjs {
int pjobs_flag;
int adaptive_count;
raw_spinlock_t pobj_lock;
atomic_t users_number;
p_array_t pmutx_task_list;
p_array_t pcond_task_list;
} el_pobjs_t;
#define PJOBS_FLG_KERNEL_IMPL 0x00001
typedef struct pwait_q {
void *my_pobj;
struct task_struct *task;
struct list_head task_list;
pthread_mutex_t *mutex; // for cond wait
int cond_wakeuped;
int moved_to_mutex;
} pwait_q_t;
inline void pwaitq_init(pwait_q_t *wait, struct task_struct *tsk,
void *object)
{
wait->my_pobj = object;
wait->task = tsk;
wait->task_list.prev = NULL;
wait->task_list.next = NULL;
wait->mutex = NULL;
wait->cond_wakeuped = 0;
wait->moved_to_mutex = 0;
}
static inline void add_to_p_array(struct list_head *new, p_array_t *a, int prio)
{
list_add(new, a->prio_list + prio);
__set_bit(prio, a->bitmap);
}
static int
pmutex_trylock(pthread_mutex_t *mutex, struct task_struct *p)
{
int rval;
int *mutex_spin = &mutex->__m_lock.__spinlock;
DbgPos("trylock: start mutex=%p\n", mutex);
mutex->__m_count++;
rval = xchg(mutex_spin, -(mutex->__m_count));
WARN_ON(rval > PMUTEX_UNLOCKED);
if (rval == PMUTEX_UNLOCKED) {
DbgPos("trylock: rval == PMUTEX_UNLOCKED\n");
mutex->__m_count--;
mutex->__m_owner = (void *)(long)p->pid; /* the target task is the owner */
mutex->__m_owner_org_prio = p->prio;
return 0;
}
return -EBUSY;
}
static inline void move_to_mutex(pwait_q_t *curr)
{
el_pobjs_t *pobjs;
pthread_cond_t *cond;
pthread_mutex_t *mutex;
pobjs = current->pobjs;
cond = (pthread_cond_t *)curr->my_pobj;
mutex = curr->mutex;
if(!pmutex_trylock(mutex, curr->task)) {
curr->moved_to_mutex = 1;
wake_up_process(curr->task);
return;
}
list_del(&curr->task_list);
curr->my_pobj = mutex;
add_to_p_array(&curr->task_list, &pobjs->pmutx_task_list, curr->task->prio);
DbgPos("lock_continue: start %p __m_count=%d spn=%d\n",
mutex, mutex->__m_count, mutex->__m_lock.__spinlock);
xchg(&mutex->__m_lock.__spinlock, -(mutex->__m_count));
}
static int pthread_run_prio(struct list_head *head, void *obj, int up_mode, int pid)
{
struct list_head *tmp;
int wokenup = 0;
DbgPos("pthread_run: up_mode %x\n", up_mode);
if (!head->next || !head->prev)
panic("pthread_run: !head->next || !head->prev\n");
tmp = head->next;
while (tmp != head) {
pwait_q_t *curr = list_entry(tmp, pwait_q_t, task_list);
tmp = tmp->next;
DbgPos("pthread_run: obj=%p cur=%p pid=%d\n", obj, curr->my_pobj, curr->task->pid);
if (obj == curr->my_pobj) {
if (up_mode == WAKEUP_ALL) {
curr->cond_wakeuped = 1;
wake_up_process(curr->task);
wokenup++;
continue;
}
if (up_mode == WAKEUP_ONE) {
curr->cond_wakeuped = 1;
wake_up_process(curr->task);
wokenup++;
break;
}
if (up_mode == MOVE_TO_MUTEX) {
curr->cond_wakeuped = 1;
move_to_mutex(curr);
wokenup++;
continue;
}
if (up_mode == WAKEUP_PID) {
if (curr->task->pid == pid) {
curr->cond_wakeuped = 1;
wake_up_process(curr->task);
wokenup++;
break;
}
continue;
}
printk(KERN_INFO "pthread_run: bad up_mode = %d\n",
up_mode);
dump_stack();
}
}
DbgPos("pthread_run end\n");
return wokenup;
}
static int pthread_run(p_array_t *head, void *obj, int up_mode, int pid)
{
int i = 0;
int num = 0;
i = sched_find_first_bit(head->bitmap);
while (i < MAX_PRIO) {
num += pthread_run_prio(head->prio_list + i, obj, up_mode, pid);
if (list_empty(head->prio_list + i)) {
__clear_bit(i, head->bitmap);
}
if (num && ((up_mode == WAKEUP_ONE) ||
(up_mode == WAKEUP_PID))) {
break;
}
i = find_next_bit(head->bitmap, MAX_PRIO, i + 1);
}
return num;
}
static void
pmutex_unlock_continue(pthread_mutex_t *mutex)
{
el_pobjs_t *pobjs = current->pobjs;
mutex->__m_owner = (void *)0;
if (PImutex)
current->prio = mutex->__m_owner_org_prio;
DbgPos("unlock_continue: start %p __m_count=%d spn=%d\n",
mutex, mutex->__m_count, mutex->__m_lock.__spinlock);
if (wakeup_mutex_one) {
pthread_run(&pobjs->pmutx_task_list, mutex, WAKEUP_ONE, 0);
} else {
pthread_run(&pobjs->pmutx_task_list, mutex, WAKEUP_ALL, 0);
}
}
/*
 * The main synchronization algorithm between lock and unlock is based
 * on the atomic operations:
 * xchg(mutex_spin, PMUTEX_UNLOCKED); and
 * xchg(mutex_spin, -mutex->__m_count);
 *
 * In our scheme mutex_spin (mutex->__m_lock.__spinlock) can be:
 *
 * == 1 (PMUTEX_UNLOCKED)
 * == 0 (PMUTEX_LOCKED_ONCE)
 * == -__m_count (sleepers) (LOCKED too)
 *
 * When the user does
 * xchg(mutex_spin, -mutex->__m_count);
 * the user goes into the kernel if mutex_spin <= PMUTEX_LOCKED_ONCE.
 * When the user does
 * xchg(mutex_spin, PMUTEX_UNLOCKED);
 * the user goes into the kernel if mutex_spin < 0 (there are sleepers).
 *
 * Below we try to substantiate that everything is OK in our scheme
 * (time grows downwards: t1, t2, ...).
 *
 * Consider the following situations:
 *
 * 1. There are 2 threads, thread_0 & thread_1, which work as follows:
 *
 * t1: thread_0 xchg(mutex_spin, PMUTEX_LOCKED_ONCE);
 * t2: thread_1 xchg(mutex_spin, PMUTEX_LOCKED_ONCE);
 * t3: thread_0 xchg(mutex_spin, PMUTEX_UNLOCKED);
 *
 * At this time (t3) thread_1 is working or will work in the kernel
 * and will do
 *
 * xchg(mutex_spin, -mutex->__m_count);
 *
 * in any case either before or after thread_0 (the owner) does
 *
 * xchg(mutex_spin, PMUTEX_UNLOCKED);
 *
 * 1.1. thread_0 before thread_1:
 *
 * t1: thread_0 xchg(mutex_spin, PMUTEX_LOCKED_ONCE);
 * t2: thread_0 xchg(mutex_spin, PMUTEX_UNLOCKED);
 * t3: thread_1 xchg(mutex_spin, -mutex->__m_count);
 *
 * In this case thread_1 will be the owner,
 * because mutex_spin == PMUTEX_UNLOCKED.
 *
 * 1.2. thread_1 before thread_0:
 *
 * t1: thread_0 xchg(mutex_spin, PMUTEX_LOCKED_ONCE);
 * t2: thread_1 xchg(mutex_spin, -mutex->__m_count);
 * t3: thread_0 xchg(mutex_spin, PMUTEX_UNLOCKED);
 *
 * When thread_0 does
 * xchg(mutex_spin, PMUTEX_UNLOCKED);
 * __m_count == 1 and mutex_spin == -1,
 * so thread_0 goes into the kernel to do the wakeup because
 * mutex_spin (-1) < PMUTEX_LOCKED_ONCE.
 * So for two threads everything is OK in any case.
 *
 * 2. There are 3 threads: thread_0, thread_1, thread_2.
 * 2.1
 * For any thread_i (i > 0) we can say the same as in 1
 * (any thread_i can take the place of thread_1).
 * The only difference is that the value of __m_count can be > 1
 * and the value of mutex_spin < -1.
 *
 * Uh !!!
 */
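/*
 * Illustrative userspace fast path implied by the scheme above (a
 * sketch, not the actual el_pthread library code; el_posix_call() is
 * the hypothetical wrapper from the header comment, and xchg() stands
 * for a userspace atomic exchange such as __atomic_exchange_n()):
 *
 *	void user_lock(pthread_mutex_t *m)
 *	{
 *		// uncontended case: the 1 -> 0 transition takes the lock
 *		if (xchg(&m->__m_lock.__spinlock, PMUTEX_LOCKED_ONCE)
 *				!= PMUTEX_UNLOCKED)
 *			el_posix_call(PTHREAD_MUTEX_LOCK, m, 0, 0, 0);
 *	}
 *
 *	void user_unlock(pthread_mutex_t *m)
 *	{
 *		// a negative old value means there are sleepers to wake
 *		if (xchg(&m->__m_lock.__spinlock, PMUTEX_UNLOCKED) < 0)
 *			el_posix_call(PTHREAD_MUTEX_UNLOCK, m, 0, 0, 0);
 *	}
 */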
static int
pmutex_unlock(pthread_mutex_t *mutex)
{
int rval;
int *mutex_spin = &mutex->__m_lock.__spinlock;
DbgPos("unlock: start %p __m_count=%d spn=%d\n",
mutex, mutex->__m_count, *mutex_spin);
rval = xchg(mutex_spin, PMUTEX_UNLOCKED);
if (rval == PMUTEX_LOCKED_ONCE) {
return 0;
}
if (rval >= PMUTEX_UNLOCKED) {
WARN_ON_ONCE(rval > PMUTEX_UNLOCKED);
printk(KERN_INFO "%d pmutex_unlock: mutex %p ISN'T locked "
"rval=%d\n", current->pid, mutex, rval);
WARN_ON_ONCE(1);
return -EINVAL;
}
pmutex_unlock_continue(mutex);
DbgPos("unlock: end\n");
return 0;
}
static struct task_struct *__find_task_by_pid_check(pid_t pid);
static int
pmutex_lock_continue(pthread_mutex_t *mutex)
{
int rval;
el_pobjs_t *pobjs;
struct task_struct *tsk = current;
struct task_struct *owner_tsk;
int *mutex_spin = &mutex->__m_lock.__spinlock;
pwait_q_t wait;
pwaitq_init(&wait, tsk, mutex);
pobjs = current->pobjs;
add_to_p_array(&wait.task_list, &pobjs->pmutx_task_list,tsk->prio);
mutex->__m_count++;
DbgPos("lock_continue: start %p __m_count=%d spn=%d\n",
mutex, mutex->__m_count, mutex->__m_lock.__spinlock);
for (;;) {
rval = xchg(mutex_spin, -(mutex->__m_count));
if (rval == PMUTEX_UNLOCKED) {
DbgPos("lock_continue: I am owner __m_count=%d\n",
mutex->__m_count);
//WARN_ON(mutex->__m_count != 1);
break;
}
if ( PImutex && // debugging
(long)(mutex->__m_owner) &&
tsk->prio < mutex->__m_owner_org_prio) { // this test is for optimization
read_lock(&tasklist_lock);
owner_tsk = __find_task_by_pid_check((long)(mutex->__m_owner));
if (owner_tsk) {
get_task_struct(owner_tsk);
read_unlock(&tasklist_lock);
if (tsk->prio < owner_tsk->prio) {
rt_mutex_setprio(owner_tsk, tsk->prio);
}
put_task_struct(owner_tsk);
} else {
read_unlock(&tasklist_lock);
printk(KERN_INFO "PImutex owner_tsk empty "
"mutex->__m_owner=%ld\n",
(long)(mutex->__m_owner));
}
}
tsk->state = TASK_INTERRUPTIBLE;
raw_spin_unlock_irq_no_resched(&pobjs->pobj_lock);
schedule();
raw_spin_lock_irq(&pobjs->pobj_lock);
if (signal_pending(current)) {
mutex->__m_count--;
WARN_ON(mutex->__m_count < 0);
DbgPos("lock_continue: sig __m_count=%d spn=%d atomic_dec=%d\n",
mutex->__m_count, mutex->__m_lock.__spinlock, rval);
WARN_ON(wait.task_list.next == LIST_POISON1);
WARN_ON(wait.task_list.prev == LIST_POISON2);
list_del(&wait.task_list);
tsk->state = TASK_RUNNING;
return -EINTR;
}
}
WARN_ON(tsk->state != TASK_RUNNING);
DbgPos("lock_continue: end __m_count=%d spn=%d atomic_dec=%d\n",
mutex->__m_count, mutex->__m_lock.__spinlock, rval);
mutex->__m_count--;
WARN_ON(mutex->__m_count < 0);
WARN_ON(wait.task_list.next == LIST_POISON1);
WARN_ON(wait.task_list.prev == LIST_POISON2);
list_del(&wait.task_list);
mutex->__m_owner = (void *)(long)current->pid;
mutex->__m_owner_org_prio = current->prio;
return 0;
}
static int
pmutex_lock(pthread_mutex_t *mutex, el_pobjs_t *pobjs)
{
int rval;
int *mutex_spin = &mutex->__m_lock.__spinlock;
unsigned long flags;
// unsigned long t1, t2;
int i;
DbgPos("lock: start mutex=%p __m_count=%d __spinlock=%d\n",
mutex, mutex->__m_count, *mutex_spin);
/*
 * __m_count should be >= 0 at all times.
 * We do this without any synchronization and check it just in case.
 */
WARN_ON(mutex->__m_count < 0);
for (i = 0; i < pobjs->adaptive_count; i++) {
rval = xchg(mutex_spin, -(mutex->__m_count));
WARN_ON(rval > PMUTEX_UNLOCKED);
if (rval == PMUTEX_UNLOCKED) {
DbgPos("lock: rval == PMUTEX_UNLOCKED\n");
mutex->__m_owner = (void *)(long)current->pid; /* I am owner */
// PI mutex->__m_owner_org_prio = current->prio;
return 0;
}
udelay(1);
}
raw_spin_lock_irqsave(&pobjs->pobj_lock, flags);
rval = pmutex_lock_continue(mutex); // returns 0 or EINTR
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
return rval;
}
static int
pcond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
{
el_pobjs_t *pobjs;
int rval;
int rval1;
struct task_struct *tsk = current;
pwait_q_t wait;
unsigned long flags;
DbgPos("pcond_wait: start cond=%p mutex=%p\n", cond, mutex);
pobjs = current->pobjs;
raw_spin_lock_irqsave(&pobjs->pobj_lock, flags);
rval = pmutex_unlock(mutex);
if (rval) {
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
return rval;
}
pwaitq_init(&wait, tsk, (void *)cond);
wait.mutex = mutex;
add_to_p_array(&wait.task_list, &pobjs->pcond_task_list, tsk->prio);
while (!wait.cond_wakeuped && !signal_pending(current)) {
tsk->state = TASK_INTERRUPTIBLE;
raw_spin_unlock_irq_no_resched(&pobjs->pobj_lock);
schedule();
raw_spin_lock_irq(&pobjs->pobj_lock);
};
WARN_ON(tsk->state != TASK_RUNNING);
DbgPos("pcond_wait: after schedule() %p\n", cond);
WARN_ON(wait.task_list.next == LIST_POISON1);
WARN_ON(wait.task_list.prev == LIST_POISON2);
list_del(&wait.task_list);
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
if (wait.moved_to_mutex)
return 0;
do {
if (signal_pending(current) &&
!(sigismember(&current->pending.signal, SIGKILL))) {
if (test_and_clear_tsk_thread_flag(current,
TIF_SIGPENDING))
rval = -EINTR;
}
rval1 = pmutex_lock(mutex, pobjs); // returns 0 or EINTR
} while (rval1 == -EINTR && !(sigismember(&current->pending.signal, SIGKILL)));
if (rval == -EINTR)
set_tsk_thread_flag(current, TIF_SIGPENDING);
DbgPos("pcond_wait: rval=%d\n", rval);
return rval;
}
static int
pcond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
struct timespec *rqtp)
{
el_pobjs_t *pobjs;
unsigned long expire;
int rval = 0;
struct task_struct *tsk = current;
pwait_q_t wait;
unsigned long flags;
DbgPos("pcond_timedwait: start\n");
pobjs = current->pobjs;
raw_spin_lock_irqsave(&pobjs->pobj_lock, flags);
rval = pmutex_unlock(mutex);
if (rval) {
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
return rval;
}
pwaitq_init(&wait, tsk, (void *)cond);
wait.mutex = mutex;
add_to_p_array(&wait.task_list, &pobjs->pcond_task_list, tsk->prio);
expire = timespec_to_jiffies(rqtp) + (rqtp->tv_sec || rqtp->tv_nsec);
while (!wait.cond_wakeuped && !expire &&
!signal_pending(current)) {
tsk->state = TASK_INTERRUPTIBLE;
raw_spin_unlock_irq_no_resched(&pobjs->pobj_lock);
expire = schedule_timeout(expire);
raw_spin_lock_irq(&pobjs->pobj_lock);
};
WARN_ON(tsk->state != TASK_RUNNING);
WARN_ON(wait.task_list.next == LIST_POISON1);
WARN_ON(wait.task_list.prev == LIST_POISON2);
list_del(&wait.task_list);
if (expire == 0) {
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
return -ETIMEDOUT;
}
if (signal_pending(current)) {
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
return -EINTR;
}
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
do {
rval = pmutex_lock(mutex, pobjs); // returns 0 or EINTR
} while (rval == -EINTR && !(sigismember(&current->pending.signal, SIGKILL)));
return rval;
}
static int
pcond_broadcast(pthread_cond_t *cond)
{
el_pobjs_t *pobjs;
unsigned long flags;
DbgPos("pcond_broadcast: cond=%p start\n", cond);
pobjs = current->pobjs;
raw_spin_lock_irqsave(&pobjs->pobj_lock, flags);
if (to_do_move_to_mutex)
pthread_run(&pobjs->pcond_task_list, (void *)cond, MOVE_TO_MUTEX, 0);
else
pthread_run(&pobjs->pcond_task_list, (void *)cond, WAKEUP_ALL, 0);
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
DbgPos("pcond_broadcast: finish\n");
return 0;
}
static int
pcond_signal(pthread_cond_t *cond)
{
el_pobjs_t *pobjs;
unsigned long flags;
DbgPos("pcond_broadcast: start\n");
pobjs = current->pobjs;
raw_spin_lock_irqsave(&pobjs->pobj_lock, flags);
pthread_run(&pobjs->pcond_task_list, (void *)cond, WAKEUP_ONE, 0);
DbgPos("pcond_broadcast: finish\n");
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
return 0;
}
static int
pcond_unlock_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
{
el_pobjs_t *pobjs;
int rval;
unsigned long flags;
struct task_struct *tsk = current;
pwait_q_t wait;
DbgPos("pcond_unlock_wait: start\n");
pobjs = current->pobjs;
raw_spin_lock_irqsave(&pobjs->pobj_lock, flags);
rval = pmutex_unlock(mutex);
if (rval) {
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
return rval;
}
pwaitq_init(&wait, tsk, (void *)cond);
wait.mutex = mutex;
add_to_p_array(&wait.task_list, &pobjs->pcond_task_list, tsk->prio);
while (!wait.cond_wakeuped && !signal_pending(current)) {
tsk->state = TASK_INTERRUPTIBLE;
raw_spin_unlock_irq_no_resched(&pobjs->pobj_lock);
schedule();
raw_spin_lock_irq(&pobjs->pobj_lock);
};
WARN_ON(tsk->state != TASK_RUNNING);
WARN_ON(wait.task_list.next == LIST_POISON1);
WARN_ON(wait.task_list.prev == LIST_POISON2);
list_del(&wait.task_list);
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
DbgPos("pcond_unlock_wait: finish wakeuped=%d sig_pend=%d\n",
wait.cond_wakeuped, signal_pending(current));
if (signal_pending(current))
return -EINTR;
WARN_ON(!wait.cond_wakeuped);
return 0;
}
static int
el_pthread_wait(pthread_cond_t *cond)
{
el_pobjs_t *pobjs;
unsigned long flags;
struct task_struct *tsk = current;
pwait_q_t wait;
DbgPos("el_pthread_wait: start\n");
pwaitq_init(&wait, tsk, (void *)cond);
pobjs = current->pobjs;
raw_spin_lock_irqsave(&pobjs->pobj_lock, flags);
add_to_p_array(&wait.task_list, &pobjs->pcond_task_list, tsk->prio);
while (!wait.cond_wakeuped && !signal_pending(current)) {
tsk->state = TASK_INTERRUPTIBLE;
raw_spin_unlock_irq_no_resched(&pobjs->pobj_lock);
schedule();
raw_spin_lock_irq(&pobjs->pobj_lock);
};
WARN_ON(tsk->state != TASK_RUNNING);
WARN_ON(wait.task_list.next == LIST_POISON1);
WARN_ON(wait.task_list.prev == LIST_POISON2);
list_del(&wait.task_list);
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
DbgPos("el_pthread_wait: finish\n");
if (wait.cond_wakeuped)
return 0;
if (signal_pending(current))
return -EINTR;
WARN_ON(1);
return 0;
}
static int
el_wakeup_pthread(pthread_cond_t *cond, int pid)
{
el_pobjs_t *pobjs;
unsigned long flags;
DbgPos("el_wakeup_pthread: start for %d\n", pid);
pobjs = current->pobjs;
raw_spin_lock_irqsave(&pobjs->pobj_lock, flags);
pthread_run(&pobjs->pcond_task_list, (void *)cond, WAKEUP_PID, pid);
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
DbgPos("el_wakeup_pthread: finish el_wakeup_pthread\n");
return 0;
}
#endif /* CONFIG_HAVE_EL_POSIX_SYSCALL */
/*
* Set rts_mode. enable mlock, param priority, setaffinity, cpu_bind,
* irq_bind & mlock for all users.
*/
#ifdef CONFIG_MCST
static long
change_rts_mode_mask(long mode, long mask)
{
unsigned long flags;
long ret = rts_mode;
if (mode != -1 && mask != -1) {
printk("change_rts_mode_mask wrong mode = %ld, mask = %ld\n", mode, mask);
return -EINVAL;
}
raw_spin_lock_irqsave(&rts_lock, flags);
if (mode != -1) {
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto unlock;
}
mode = !!mode;
if (mode == rts_mode) {
goto unlock;
}
rts_mode = mode;
if (mode) {
mask = RTS_SOFT__RT;
} else {
mask = 0;
}
}
ret = rts_act_mask;
rts_act_mask = mask;
unlock:
raw_spin_unlock_irq(&rts_lock);
return ret;
}
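/*
 * Userspace usage sketch (el_posix_call() is the hypothetical wrapper
 * from the header comment; the request codes are handled further down
 * in do_el_posix()):
 *
 *	el_posix_call(EL_RTS_MODE, (void *)1, 0, 0, 0); // enable RT mode
 *	long mask = el_posix_call(EL_GET_RTS_ACTIVE, 0, 0, 0, 0);
 */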
#include <linux/sysctl.h>
static DEFINE_MUTEX(sysctl_lock);
static int sysctl_rts_mode;
static int sysctl_rts_mask;
static int
rts_mode_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
mutex_lock(&sysctl_lock);
sysctl_rts_mode = !!rts_mode;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write )
goto out;
ret = (int)change_rts_mode_mask((long)sysctl_rts_mode, -1);
out:
mutex_unlock(&sysctl_lock);
return ret;
}
static int
rts_mask_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
mutex_lock(&sysctl_lock);
sysctl_rts_mask = (int)rts_act_mask;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write )
goto out;
ret = (int)change_rts_mode_mask(-1, (long)sysctl_rts_mask);
out:
mutex_unlock(&sysctl_lock);
return ret;
}
struct ctl_table rt_table[] = {
{
.procname = "rts_mode",
.data = &sysctl_rts_mode,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = rts_mode_sysctl,
},
{
.procname = "rts_act_mask",
.data = &sysctl_rts_mask,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = rts_mask_sysctl,
},
{}
};
#endif
#ifdef CONFIG_HAVE_EL_POSIX_SYSCALL
#include <asm/processor.h>
extern unsigned long loops_per_jiffy;
int
pthread_main_init(void)
{
el_pobjs_t *pobjs;
int i;
down_write(&posix_sem);
if (posix_objects == NULL) {
/* it is first action in the system */
posix_objects =
(struct kmem_cache *)kmem_cache_create("el_pobjs_t",
sizeof(el_pobjs_t), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!posix_objects) {
printk(KERN_INFO "Cannot create posix_objects "
"SLAB cache\n");
up_write(&posix_sem);
return -ENOMEM;
}
}
up_write(&posix_sem);
pobjs = (el_pobjs_t *)kmem_cache_alloc(posix_objects, GFP_KERNEL);
if (!pobjs) {
printk(KERN_INFO "Cannot alloc el_pobjs_t in SLAB cache\n");
return -ENOMEM;
}
current->pobjs = (void *)pobjs;
raw_spin_lock_init(&pobjs->pobj_lock);
atomic_set(&pobjs->users_number, 0);
pobjs->adaptive_count = 5;
for (i = 0; i < MAX_PRIO; i++) {
INIT_LIST_HEAD(pobjs->pmutx_task_list.prio_list + i);
INIT_LIST_HEAD(pobjs->pcond_task_list.prio_list + i);
}
__set_bit(MAX_PRIO, pobjs->pmutx_task_list.bitmap);
__set_bit(MAX_PRIO, pobjs->pcond_task_list.bitmap);
return 0;
}
void pthread_exit(void)
{
struct task_struct *tsk = current;
el_pobjs_t *pobjs = current->pobjs;
DbgPos("pthread_exit: kmem_cache_free for posix_objects\n");
if (atomic_dec_and_test(&pobjs->users_number))
kmem_cache_free(posix_objects, tsk->pobjs);
tsk->pobjs = NULL;
}
int pmutex_init(pthread_mutex_t *mutex)
{
DbgPos("pmutex_init: start for %p\n", mutex);
mutex->__m_lock.__spinlock = PMUTEX_UNLOCKED;
mutex->__m_count = 0;
DbgPos("pmutex_init: finish\n");
return 0;
}
int pcond_init(pthread_cond_t *cond)
{
cond->__c_lock.__spinlock = PMUTEX_UNLOCKED;
cond->__c_waiting = NULL;
return 0;
}
#endif /* CONFIG_HAVE_EL_POSIX_SYSCALL */
#define BAD_USER_REGION(addr, type) \
(unlikely(!access_ok(VERIFY_WRITE, addr, sizeof(type)) \
|| (((unsigned long) addr) % __alignof__(type)) != 0 ))
/*
 * If the user calls any function with a bad address in the user area,
 * he gets SIGSEGV from the kernel's do_page_fault().
 */
/* To simplify 32-bit support the user-space library always uses
 * 64-bit values for tv_sec and tv_nsec in struct timespec. */
struct timespec_64 {
long long tv_sec;
long long tv_nsec;
};
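/* Conversion sketch for the do_*() functions below (illustrative;
 * the helper name is hypothetical and the fields are simply narrowed
 * to the native width):
 *
 *	static inline void timespec_64_to_native(
 *			const struct timespec_64 *ts64,
 *			struct timespec *ts)
 *	{
 *		ts->tv_sec = (time_t) ts64->tv_sec;
 *		ts->tv_nsec = (long) ts64->tv_nsec;
 *	}
 */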
#ifdef CONFIG_HAVE_EL_POSIX_SYSCALL
static int do_main_init(unsigned int *cs_cost, unsigned int *kernel_flags);
static int do_object_init_fini(unsigned long type,
void *op,
void *obj,
int arg);
static int do_sem_post(struct posix_sem_s *__restrict const sem,
const int __s_desc);
#if defined ARCH_HAS_ATOMIC_CMPXCHG
static int do_sem_timedwait(struct posix_sem_s *__restrict const sem,
struct timespec_64 *__restrict const abstime,
const int __s_desc);
#else
static int do_sem_timedwait(struct posix_sem_s *__restrict const sem,
struct timespec_64 *__restrict const abstime,
const int __s_desc,
const int try);
#endif
static int do_mutex_timedlock(
struct pthread_mutex_s *__restrict const mutex,
const struct timespec_64 *__restrict const abstime,
const int __m_kind,
const int __m_desc);
static int do_mutex_unlock(
struct pthread_mutex_s *__restrict const mutex,
const int __m_kind,
const int __m_desc);
static int do_cond_timedwait(
struct pthread_cond_s *const cond,
struct pthread_mutex_s *const mutex,
const struct timespec_64 *const abstime,
const int ptr_64);
static int do_cond_wake(
struct pthread_cond_s *const cond,
const int __c_desc,
const int up_mode);
static int do_barrier_wait(
struct pthread_barrier_s *const barr,
const unsigned int required,
const int restarted,
const int __b_desc);
static int do_cancel(pid_t tgid, pid_t *p, int signal);
#if !defined ARCH_HAS_ATOMIC_CMPXCHG
static int do_sem_getvalue(struct posix_sem_s *__restrict const sem,
const int __s_desc);
#endif
static int do_mutex_set_ceiling(
struct pthread_mutex_s *const mutex,
const int __m_desc,
const int __m_kind_new,
const int ptr_64);
static int do_mutex_consistent(
struct pthread_mutex_s *const mutex,
const int __m_kind,
const int __m_desc);
static int do_set_unsafe_shared(pid_t pid, int *old_unsafe, int unsafe);
static int do_get_prio_protect(void);
#endif /* CONFIG_HAVE_EL_POSIX_SYSCALL */
static DEFINE_RAW_SPINLOCK(atomic_add_lock);
/*#define EL_TIMERFD_USING */
#ifdef CONFIG_MCST_RT
#ifdef EL_TIMERFD_USING
static int el_open_timerfd(void);
static int el_timerfd_settime(int ufd, struct itimerspec __user *tmr);
#ifdef CONFIG_COMPAT
static int compat_el_timerfd_settime(int ufd, struct compat_itimerspec __user *tmr);
#endif
#endif /*EL_TIMERFD_USING */
#endif
/*#define SHOW_WOKEN_TIME*/
#ifdef SHOW_WOKEN_TIME
int show_woken_time = 0;
EXPORT_SYMBOL(show_woken_time);
int __init woken_setup(char *str)
{
show_woken_time = simple_strtoul(str, &str, 0);
return 1;
}
__setup("wokent=", woken_setup);
static ssize_t woken_write(struct file *file, const char __user *ubuf,
size_t count, loff_t *ppos)
{
char str[64];
if (count == 0)
return 0;
if (copy_from_user(str, ubuf, sizeof(str)))
return -EFAULT;
show_woken_time = simple_strtoul(str, NULL, 0);
return count;
}
int show_woken(struct seq_file *p, void *v)
{
seq_printf(p, "wokent= %d\n", show_woken_time);
return 0;
}
static int woken_open(struct inode *inode, struct file *filp)
{
return single_open(filp, show_woken, PDE_DATA(inode));
}
static const struct file_operations proc_woken_operations = {
.open = woken_open,
.read = seq_read,
.write = woken_write,
.llseek = seq_lseek,
.release = seq_release,
};
static int __init proc_woken_init(void)
{
proc_create("woken-time", 0, NULL, &proc_woken_operations);
return 0;
}
module_init(proc_woken_init);
#endif
int cpus_intcount[NR_CPUS];
#include <linux/cpuset.h>
#ifdef SHOW_WOKEN_TIME
static int pr_err_done = 0;
#endif
long do_el_posix(int req, void __user *a1, void __user *a2,
void __user *a3, int a4)
{
long rval = 0;
int cpu = 0;
#ifdef CONFIG_HAVE_EL_POSIX_SYSCALL
el_pobjs_t *pobjs;
unsigned long flags;
#endif /* CONFIG_HAVE_EL_POSIX_SYSCALL */
/* This way the compiler will use a jump table here. */
#ifdef CONFIG_HAVE_EL_POSIX_SYSCALL
switch (req - POSIX_MAIN_INIT) {
case POSIX_MAIN_INIT - POSIX_MAIN_INIT:
if (BAD_USER_REGION(a1, unsigned int)) {
rval = -EINVAL;
break;
}
if (BAD_USER_REGION(a2, unsigned int)) {
rval = -EINVAL;
break;
}
rval = do_main_init((unsigned int *) a1, (unsigned int *) a2);
break;
case POSIX_OBJECT_INIT_FINI - POSIX_MAIN_INIT:
rval = do_object_init_fini((unsigned long) a1, a2, a3, a4);
break;
case POSIX_SEM_POST - POSIX_MAIN_INIT:
if (BAD_USER_REGION(a1, struct posix_sem_s)) {
rval = -EINVAL;
break;
}
rval = do_sem_post((struct posix_sem_s *) a1,
(int) (unsigned long) a2);
break;
case POSIX_SEM_TIMEDWAIT - POSIX_MAIN_INIT:
if (BAD_USER_REGION(a1, struct posix_sem_s)) {
rval = -EINVAL;
break;
}
#if defined ARCH_HAS_ATOMIC_CMPXCHG
rval = do_sem_timedwait((struct posix_sem_s *) a1,
(struct timespec_64 *) a2,
(int) (unsigned long) a3);
#else
rval = do_sem_timedwait((struct posix_sem_s *) a1,
(struct timespec_64 *) a2,
(int) (unsigned long) a3, a4);
#endif
break;
case POSIX_MUTEX_TIMEDLOCK - POSIX_MAIN_INIT:
if (BAD_USER_REGION(a1, struct pthread_mutex_s)) {
rval = -EINVAL;
break;
}
rval = do_mutex_timedlock((struct pthread_mutex_s *) a1,
(struct timespec_64 *) a2,
(int) (unsigned long) a3, a4);
break;
case POSIX_MUTEX_UNLOCK - POSIX_MAIN_INIT:
if (BAD_USER_REGION(a1, struct pthread_mutex_s)) {
rval = -EINVAL;
break;
}
rval = do_mutex_unlock((struct pthread_mutex_s *) a1,
(int) (unsigned long) a2,
(int) (unsigned long) a3);
break;
case POSIX_COND_TIMEDWAIT - POSIX_MAIN_INIT:
if (BAD_USER_REGION(a1, struct pthread_cond_s)) {
rval = -EINVAL;
break;
}
if (BAD_USER_REGION(a2, struct pthread_mutex_s)) {
rval = -EINVAL;
break;
}
rval = do_cond_timedwait((struct pthread_cond_s *) a1,
(struct pthread_mutex_s *) a2,
(struct timespec_64 *) a3, a4);
break;
case POSIX_COND_WAKE - POSIX_MAIN_INIT:
if (BAD_USER_REGION(a1, struct pthread_cond_s)) {
rval = -EINVAL;
break;
}
rval = do_cond_wake((struct pthread_cond_s *) a1,
(int) (unsigned long) a2,
(int) (unsigned long) a3);
break;
case POSIX_BARRIER_WAIT - POSIX_MAIN_INIT:
if (BAD_USER_REGION(a1, struct pthread_barrier_s)) {
rval = -EINVAL;
break;
}
rval = do_barrier_wait((struct pthread_barrier_s *) a1,
(unsigned int) (unsigned long) a2,
(int) (unsigned long) a3, a4);
break;
case POSIX_CANCEL - POSIX_MAIN_INIT:
rval = do_cancel((pid_t) (unsigned long) a1, (pid_t *) a2,
(int) (unsigned long) a3);
break;
case POSIX_COLLECT_SHARED - POSIX_MAIN_INIT:
rval = -ENOSYS;
break;
case POSIX_SEM_GET_VALUE - POSIX_MAIN_INIT:
#ifdef ARCH_HAS_ATOMIC_CMPXCHG
rval = -ENOSYS;
#else
if (BAD_USER_REGION(a1, struct posix_sem_s)) {
rval = -EINVAL;
break;
}
rval = do_sem_getvalue((struct posix_sem_s *) a1,
(int) (unsigned long) a2);
#endif
break;
case POSIX_MUTEX_SET_CEILING - POSIX_MAIN_INIT:
if (BAD_USER_REGION(a1, struct pthread_mutex_s)) {
rval = -EINVAL;
break;
}
rval = do_mutex_set_ceiling((struct pthread_mutex_s *) a1,
(int) (unsigned long) a2,
(int) (unsigned long) a3, a4);
break;
case POSIX_MUTEX_CONSISTENT - POSIX_MAIN_INIT:
if (BAD_USER_REGION(a1, struct pthread_mutex_s)) {
rval = -EINVAL;
break;
}
rval = do_mutex_consistent((struct pthread_mutex_s *) a1,
(int) (unsigned long) a2,
(int) (unsigned long) a3);
break;
case POSIX_SET_PARAMETER - POSIX_MAIN_INIT:
switch ((unsigned long) a1) {
case POSIX_UNSAFE_SHARED:
rval = do_set_unsafe_shared((pid_t) (unsigned long) a2,
(int *) a3, a4);
break;
default:
rval = -EINVAL;
break;
}
break;
case POSIX_GET_PRIO_PROTECT - POSIX_MAIN_INIT:
rval = security_task_getscheduler(current);
if (!rval)
rval = do_get_prio_protect();
break;
}
if (req >= POSIX_MAIN_INIT)
goto out;
pobjs = current->pobjs;
if (pobjs == NULL && req > PTHREAD_MAIN_INIT)
goto BAD;
#endif /* CONFIG_HAVE_EL_POSIX_SYSCALL */
switch (req) {
#ifdef CONFIG_HAVE_EL_POSIX_SYSCALL
case PTHREAD_MAIN_INIT:
rval = pthread_main_init();
break;
case PTHREAD_SET_KERNEL_IMPL:
if (current->pobjs == NULL) {
return -EINVAL;
}
((el_pobjs_t *)current->pobjs)->pjobs_flag |= PJOBS_FLG_KERNEL_IMPL;
break;
case PTHREAD_MUTEX_INIT:
if (BAD_USER_REGION(a1, pthread_mutex_t))
return -EFAULT;
rval = pmutex_init((pthread_mutex_t *)a1);
break;
case PTHREAD_COND_INIT:
if (BAD_USER_REGION(a1, pthread_cond_t)) {
rval = -EFAULT;
break;
}
rval = pcond_init((pthread_cond_t *)a1);
break;
case PTHREAD_MUTEX_LOCK:
if (BAD_USER_REGION(a1, pthread_mutex_t))
return -EFAULT;
// DELAY_PRINT(("MutexLock Enter 0x%lx\n", a1));
raw_spin_lock_irqsave(&pobjs->pobj_lock, flags);
rval = pmutex_lock_continue((pthread_mutex_t *)a1); //rval=0 or EINTR
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
// DELAY_PRINT(("MutexLock Exit 0x%lx\n", a1));
break;
case PTHREAD_MUTEX_UNLOCK:
if (BAD_USER_REGION(a1, pthread_mutex_t))
return -EFAULT;
raw_spin_lock_irqsave(&pobjs->pobj_lock, flags);
if (((el_pobjs_t *)current->pobjs)->pjobs_flag & PJOBS_FLG_KERNEL_IMPL) {
pmutex_unlock((pthread_mutex_t *)a1);
} else {
pmutex_unlock_continue((pthread_mutex_t *)a1);
}
raw_spin_unlock_irqrestore(&pobjs->pobj_lock, flags);
rval = 0;
break;
case PTHREAD_COND_WAIT:
if (BAD_USER_REGION(a1, pthread_cond_t))
return -EFAULT;
if (BAD_USER_REGION(a2, pthread_mutex_t))
return -EFAULT;
rval = pcond_wait((pthread_cond_t *)a1, (pthread_mutex_t *)a2);
break;
case PTHREAD_COND_TIMEDWAIT:
if (BAD_USER_REGION(a1, pthread_cond_t))
return -EFAULT;
if (BAD_USER_REGION(a2, pthread_mutex_t))
return -EFAULT;
if (BAD_USER_REGION(a3, struct timespec_64))
return -EFAULT;
rval = pcond_timedwait((pthread_cond_t *)a1,
(pthread_mutex_t *)a2, (struct timespec *)a3);
break;
case PTHREAD_COND_BROADCAST:
if (BAD_USER_REGION(a1, pthread_cond_t))
return -EFAULT;
rval = pcond_broadcast((pthread_cond_t *)a1);
break;
case PTHREAD_COND_SIGNAL:
if (BAD_USER_REGION(a1, pthread_cond_t))
return -EFAULT;
// DELAY_PRINT(("CondSignal Enter 0x%lx\n", a1));
rval = pcond_signal((pthread_cond_t *)a1);
// DELAY_PRINT(("CondSignal Exit 0x%lx\n", a1));
break;
/* The calls below are not part of the POSIX standard */
case EL_PCOND_UNLOCK_WAIT:
if (BAD_USER_REGION(a1, pthread_cond_t))
return -EFAULT;
if (BAD_USER_REGION(a2, pthread_mutex_t))
return -EFAULT;
rval = pcond_unlock_wait((pthread_cond_t *)a1,
(pthread_mutex_t *)a2);
break;
case EL_PTHREAD_WAIT:
if (BAD_USER_REGION(a1, pthread_cond_t))
return -EFAULT;
rval = el_pthread_wait((pthread_cond_t *)a1);
break;
case EL_WAKEUP_PTHREAD_COND:
if (BAD_USER_REGION(a1, pthread_cond_t))
return -EFAULT;
rval = el_wakeup_pthread((pthread_cond_t *)a1,
(long)a2 & ~(int)0);
break;
#endif /* CONFIG_HAVE_EL_POSIX_SYSCALL */
case EL_ATOMIC_ADD: {
int *target = (int *)a1;
int delta = (long)a2;
int *dst = (int *)a3;
int val;
raw_spin_lock_irq(&atomic_add_lock);
rval = get_user(val, target);
rval |= put_user(val, dst);
rval |= put_user((val + delta), target);
raw_spin_unlock_irq(&atomic_add_lock);
break;
}
#ifdef CONFIG_MCST
/* For mcst_rt lib: */
case EL_GET_CPUS_NUM:
rval = 0;
for_each_online_cpu(cpu) rval++;
return rval;
case EL_GET_CPUS_MASK:
rval = 0;
for_each_online_cpu(cpu) {
if (cpu > 63)
return 0;
rval |= (1LL << cpu);
}
return rval;
case EL_MY_CPU_ID:
rval = raw_smp_processor_id();
return rval;
#ifdef CONFIG_SMP
case EL_SET_IRQ_MASK:
{
unsigned long __maybe_unused cpu_mask = (long)a2;
#if defined(CONFIG_E2K) || defined(CONFIG_E90S) || \
(defined(__i386__) && defined(CONFIG_GENERIC_PENDING_IRQ))
unsigned long irq_mask = (long)a1;
cpumask_var_t cpu_mask_bitmap;
int i;
if (!alloc_cpumask_var(&cpu_mask_bitmap, GFP_KERNEL))
return -ENOMEM;
cpumask_clear(cpu_mask_bitmap);
for_each_online_cpu(i) {
if (cpu_mask & (1 << i)) {
if (cpu_online(i)) {
cpumask_set_cpu(i, cpu_mask_bitmap);
} else {
return -EINVAL;
}
}
}
for (i = 0; i < NR_IRQS; i++) {
if ((irq_mask >= (1<<24)) || (irq_mask & 1 << i))
if (irq_to_desc(i) && irq_can_set_affinity(i))
irq_set_affinity(i, cpu_mask_bitmap);
}
free_cpumask_var(cpu_mask_bitmap);
#elif defined(CONFIG_E90)
extern int smp4m_irq_set_mask(int cpu_mask, int on);
extern int smp4m_irq_get_mask(void);
unsigned long all_cpu_mask = 0;
smp4m_irq_set_mask(cpu_mask, 1);
for_each_online_cpu(cpu) {
all_cpu_mask |= (1 << cpu);
}
rval = smp4m_irq_set_mask(all_cpu_mask & ~cpu_mask, 0);
return smp4m_irq_get_mask();
#endif
}
break;
#endif /* SMP */
#if defined(CONFIG_E90S)
case SPARC_GET_USEC:
rval = put_user(get_cycles() * 1000000 / cpu_freq_hz,
(long long *)a1);
return rval;
#endif
case EL_RTS_MODE:
rval = change_rts_mode_mask((long) a1, -1);
return rval;
case EL_SET_RTS_ACTIVE:
rval = change_rts_mode_mask(-1, (long) a1);
return rval;
case EL_GET_RTS_ACTIVE:
return rts_act_mask;
case EL_GET_CPU_FREQ:
#ifdef CONFIG_E90S
return cpu_data(raw_smp_processor_id()).clock_tick;
#elif defined(__e2k__)
return cpu_data[raw_smp_processor_id()].proc_freq;
#endif
#if 0
case EL_SET_NET_RT:
local_irq_disable();
raw_spin_lock(&current->pi_lock);
current->rt_flags |= RT_TASK_IS_NET_RT;
raw_spin_unlock(&current->pi_lock);
local_irq_enable();
break;
case EL_UNSET_NET_RT:
local_irq_disable();
raw_spin_lock(&current->pi_lock);
current->rt_flags &= ~RT_TASK_IS_NET_RT;
raw_spin_unlock(&current->pi_lock);
local_irq_enable();
break;
#endif
case EL_SET_MLOCK_CONTROL :
current->extra_flags |= RT_MLOCK_CONTROL;
break;
case EL_UNSET_MLOCK_CONTROL :
current->extra_flags &= ~RT_MLOCK_CONTROL;
break;
#ifdef SHOW_WOKEN_TIME
case EL_GET_TIMES:
{
size_t sz = ((size_t)a2) / sizeof(long long);
unsigned long long ip;
unsigned long long *m = (unsigned long long *)a1;
if (show_woken_time < 2) {
if (pr_err_done)
return -EINVAL;
pr_err_done = 1;
return -EINVAL;
}
if (sz > EL_GET_TIMES_WAKEUP) {
rval = put_user(current->wakeup_tm,
(m + EL_GET_TIMES_WAKEUP));
}
if (rval) {
break;
}
if (sz > EL_GET_TIMES_SCHED_ENTER) {
rval = put_user(current->sched_enter_tm,
(m + EL_GET_TIMES_SCHED_ENTER));
}
if (rval) {
break;
}
if (sz > EL_GET_TIMES_SCHED_LOCK) {
rval = put_user(current->sched_lock_tm,
(m + EL_GET_TIMES_SCHED_LOCK));
}
if (rval) {
break;
}
if (sz > EL_GET_TIMES_WOKEN) {
rval = put_user(current->waken_tm,
(m + EL_GET_TIMES_WOKEN));
}
if (rval) {
break;
}
if (sz > EL_GET_TIMES_LAST_PRMT_ENAB) {
ip = (unsigned long long)current->last_ipi_prmt_enable;
rval = put_user(ip, (m + EL_GET_TIMES_LAST_PRMT_ENAB));
}
if (rval) {
break;
}
if (sz > EL_GET_TIMES_INTR_W) {
rval = put_user(current->intr_w,
(m + EL_GET_TIMES_INTR_W));
}
if (rval) {
break;
}
if (sz > EL_GET_TIMES_INTR_S) {
rval = put_user(current->intr_s,
(m + EL_GET_TIMES_INTR_S));
}
if (rval) {
break;
}
if (sz > EL_GET_TIMES_CNTXB) {
rval = put_user(current->cntx_swb_tm,
(m + EL_GET_TIMES_CNTXB));
}
if (rval) {
break;
}
if (sz > EL_GET_TIMES_CNTXE) {
rval = put_user(current->cntx_swe_tm,
(m + EL_GET_TIMES_CNTXE));
}
if (rval) {
break;
}
if (sz > EL_GET_TIMES_INTR_SC) {
rval = put_user(current->intr_sc,
(m + EL_GET_TIMES_INTR_SC));
}
if (rval) {
break;
}
break;
}
#endif
case EL_USER_TICK: {
int interval_us = (int)(long long)a1;
do_postpone_tick(interval_us * 1000);
break;
}
#ifdef CONFIG_MCST_RT
#ifdef EL_TIMERFD_USING
case EL_OPEN_TIMERFD :
rval = el_open_timerfd();
break;
case EL_TIMERFD_SETTIME :
rval = el_timerfd_settime((int) (unsigned long) a1, a2);
break;
#endif /*EL_TIMERFD_USING */
#endif
#ifdef CONFIG_E90S
case EL_SYNC_CYCLS: {
int i, this_cpu;
do_sync_cpu_clocks = (unsigned long) a1;
preempt_disable();
this_cpu = smp_processor_id();
for_each_online_cpu(i) {
if (i != this_cpu)
smp_synchronize_one_tick(i);
else
delta_ticks[i] = 0;
}
preempt_enable();
return copy_to_user((void *)a2, (void *)delta_ticks,
num_possible_cpus() * sizeof(long));
}
#endif
case EL_RT_CPU: {
int set = (int)(long long)a1;
struct task_struct *p, *t;
unsigned long cpu = smp_processor_id();
cpumask_var_t new_mask;
int retval;
int restore_flag = 0;
if (num_possible_cpus() == 1)
return 0;
if (set) {
if (cpumask_test_cpu(cpu, rt_cpu_mask))
return 0;
if (!zalloc_cpumask_var(&new_mask, GFP_NOWAIT)) {
return -ENOMEM;
}
cpumask_set_cpu(cpu, rt_cpu_mask);
#if 0
pr_warning("RT_CPUset %lu rm=%5lx\n",
cpu, cpumask_bits(rt_cpu_mask)[0]);
trace_printk("RCPs rm=%5lx\n",
cpumask_bits(rt_cpu_mask)[0]);
#endif
read_lock(&tasklist_lock);
do_each_thread(t, p) {
#if 0
pr_warning("RT_CPUseB %lu %20s/%6d m=0x%5lx"
" tcpu=%d md=%x na=%d\n",
cpu, p->comm, p->pid,
cpumask_bits(&p->cpus_allowed)[0],
task_cpu(p), p->migrate_disable,
p->nr_cpus_allowed);
#endif
if (cpumask_weight(&p->cpus_mask) == 1)
continue;
get_task_struct(p);
if (p->state > TASK_UNINTERRUPTIBLE) {
put_task_struct(p);
continue;
}
cpumask_copy(new_mask, p->cpus_ptr);
cpumask_clear_cpu(cpu, new_mask);
if (cpumask_empty(new_mask)) {
put_task_struct(p);
continue;
}
restore_flag = p->flags & PF_NO_SETAFFINITY;
p->flags &= ~PF_NO_SETAFFINITY;
retval = sched_setaffinity(p->pid, new_mask);
p->flags |= restore_flag;
if (retval)
pr_err("EL_RT_CPU: Could not set affinity "
"%20s/%6d cpu_mask=0x%5lx ER=%d\n",
p->comm, p->pid,
cpumask_bits(new_mask)[0],
retval);
#if 0
cpuset_cpus_allowed(p, new_mask);
pr_warning("RT_CPUset %lu %20s/%6d m=0x%4lx "
"sm=0x%4lx tcpu=%d md=%x na=%d ret=%d\n",
cpu, p->comm, p->pid,
cpumask_bits(&p->cpus_allowed)[0],
cpumask_bits(new_mask)[0],
task_cpu(p), p->migrate_disable,
p->nr_cpus_allowed, retval);
#endif
put_task_struct(p);
} while_each_thread(t, p);
read_unlock(&tasklist_lock);
#if 0
#if defined(CONFIG_E2K) || defined(CONFIG_E90S) || \
(defined(__i386__) && defined(CONFIG_GENERIC_PENDING_IRQ))
int i;
for (i = 0; i < NR_IRQS; i++) {
struct irq_desc *desc =
irq_to_desc((long)m->private);
const struct cpumask *mask =
desc->irq_data.affinity;
if (irq_to_desc(i) && irq_can_set_affinity(i))
irq_set_affinity(i, new_mask);
}
#endif
#endif
free_cpumask_var(new_mask);
/* let RCU work complete on this cpu */
schedule_timeout_interruptible(3);
#if 0
cpu_callback(NULL, CPU_DEAD, cpu);
#endif
} else {
if (!cpumask_test_cpu(cpu, rt_cpu_mask))
return 0;
cpumask_clear_cpu(cpu, rt_cpu_mask);
if (!zalloc_cpumask_var(&new_mask, GFP_NOWAIT)) {
return -ENOMEM;
}
read_lock(&tasklist_lock);
do_each_thread(t, p) {
if (cpumask_weight(&p->cpus_mask) == 1)
continue;
get_task_struct(p);
if (p->state > TASK_UNINTERRUPTIBLE) {
put_task_struct(p);
continue;
}
cpumask_copy(new_mask, p->cpus_ptr);
cpumask_set_cpu(cpu, new_mask);
restore_flag = p->flags & PF_NO_SETAFFINITY;
p->flags &= ~PF_NO_SETAFFINITY;
retval = sched_setaffinity(p->pid, new_mask);
p->flags |= restore_flag;
if (retval)
pr_err("Could not set affinity to cpu "
"%20s/%6d m=0x%5lx ER=%d\n",
p->comm, p->pid,
cpumask_bits(new_mask)[0],
retval);
#if 0
pr_warning("RT_CPUunset %lu %20s/%6d m=0x%5lx"
"curcpu=%d md=%d na=%d ret=%d\n",
cpu, p->comm, p->pid,
cpumask_bits(&p->cpus_allowed)[0],
task_cpu(p), p->migrate_disable,
p->nr_cpus_allowed, retval);
#endif
put_task_struct(p);
} while_each_thread(t, p);
read_unlock(&tasklist_lock);
free_cpumask_var(new_mask);
#if 0
cpu_callback(NULL, CPU_UP_PREPARE, cpu);
cpu_callback(NULL, CPU_ONLINE, cpu);
#endif
}
return 0;
}
#endif
#ifdef CONFIG_SCLKR_CLOCKSOURCE
case EL_SCLKR_READ:
return clocksource_sclkr.read(NULL);
#endif
case EL_MISC_TO_DEBUG:
switch ((long) a1) {
#ifdef CONFIG_E90S
#include <asm/pcr.h>
case 6: {
int reg = (int)(long long)a2;
int val = (int)(long long)a3;
wr_pcr(E90S_PCR_SYS | (val << 11));
pr_warn("write_pcr reg=%d val=0x%x [reg]=0x%lx\n",
reg, val, E90S_PCR_SYS | (val << 11));
break;
}
#endif
case 11:
current->utime += (long)a2;
current->se.sum_exec_runtime += (long)a2 * 10000000;
printk(KERN_INFO "DBG pid=%d times are u=%lld s=%lld r=%lld"
" after adding %ld\n",
current->pid, current->utime, current->stime,
current->se.sum_exec_runtime, (long)a2);
break;
}
DbgPos("sys_el_posix: EL_MISC_TO_DEBUG\n");
break;
default:
rval = -EINVAL;
}
return rval;
#ifdef CONFIG_HAVE_EL_POSIX_SYSCALL
out:
if (rval < 0)
DbgPos("posix ret error %ld for req %ld\n", rval, req);
return rval;
BAD:
return -EINVAL;
#endif /* CONFIG_HAVE_EL_POSIX_SYSCALL */
}
SYSCALL_DEFINE5(el_posix, int, req, void __user *, a1, void __user *, a2,
void __user *, a3, int, a4)
{
return do_el_posix(req, a1, a2, a3, a4);
}
#if !defined(CONFIG_E2K) && !defined(CONFIG_E90S)
asmlinkage long sys_el_posix(int req, void __user *a1, void __user *a2,
void __user *a3, int a4)
{
return do_el_posix(req, a1, a2, a3, a4);
}
#endif
#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_el_posix(int req, void __user *a1, void __user *a2,
void __user *a3, int a4)
{
long rval;
switch (req) {
#ifdef CONFIG_MCST_RT
#ifdef EL_TIMERFD_USING
case EL_TIMERFD_SETTIME:
rval = compat_el_timerfd_settime((int) (unsigned long) a1, a2);
break;
#endif /*EL_TIMERFD_USING */
#endif
/* TODO: all el_posix users must use this interface */
default:
rval = do_el_posix(req, a1, a2, a3, a4);
break;
}
return rval;
}
#endif /* CONFIG_COMPAT */
#ifdef CONFIG_HAVE_EL_POSIX_SYSCALL
/* POSIX compliant implementation.
*
* Highlights:
* 1. Descriptors for mutexes and conditions are allocated and freed
* dynamically, thus it is necessary to call destructors.
* 2. To support static initializers, functions mutex_once(), cond_once()
* and so on check whether the descriptor has already been allocated.
* 3. Broadcasting a condition with private mutex associated wakes only
* one thread, but it is necessary to check for one special case: if a
* thread locks a recursive mutex more than one time and calls
* pthread_cond_wait(), then it remains an owner of the mutex.
* 4. Use of mlock() will not protect from minor page faults. For
* example, move_pages() still can move data around.
*/
/* Mutex types. */
enum {
PTHREAD_MUTEX_TIMED_NP,
PTHREAD_MUTEX_RECURSIVE_NP,
PTHREAD_MUTEX_ERRORCHECK_NP,
PTHREAD_MUTEX_ADAPTIVE_NP
};
/* Mutex protocols. */
enum {
PTHREAD_PRIO_NONE,
PTHREAD_PRIO_INHERIT,
PTHREAD_PRIO_PROTECT
};
/*
* wake up modes
*/
enum wake_modes {
MOVE_TO_MUTEX_ALL = 0,
MOVE_TO_MUTEX_ONE = 1
};
/**
* enum waiting_states - possible values of el_waiter.state field
* @NOT_WAITING: process is not queued anywhere.
* @WAITING_ON_CONDITION: process is queued on a condition.
* @WAITING_ON_MUTEX: process is queued on a mutex.
* @WAITING_ON_BARRIER: process is queued on a barrier.
* @WAITING_ON_SEMAPHORE: process is queued on a semaphore.
*/
enum waiting_states {
NOT_WAITING = 0,
WAITING_ON_CONDITION,
WAITING_ON_MUTEX,
WAITING_ON_BARRIER,
WAITING_ON_SEMAPHORE
};
/**
* enum robust_state - robust mutex states.
* @NOT_ROBUST: mutex was initialized without robust attribute set.
* @ROBUST: mutex was initialized with robust attribute set and
* is in consistent state.
* @OWNER_DEAD: mutex is in inconsistent state and has an owner that must
* mark it either as consistent with pthread_mutex_consistent() or
* as permanently unusable by unlocking immediately.
* @NOT_RECOVERABLE:
* mutex is permanently unusable, all waiting threads are woken.
*
*
* ARCH_HAS_ATOMIC_CMPXCHG is set:
*
* There are two ways for a mutex to arrive at OWNER_DEAD state - when owner
* dies with fast-locked mutex (mutex->__m_lock field holds owner's pid) and
* when owner dies with slow-locked mutex (mutex->__m_lock field holds -1).
*
* If the first case (the mutex was fast-locked) the next thread trying to
* lock it will discover that the owner is dead and change mutex's state to
* OWNER_DEAD. In this case mutex will have an owner.
*
* In the second case (the mutex was slow-locked) when the owner dies he will
* either unlock it and move to OWNER_DEAD state (if mutex has waiters) or
* will put an invalid pid (PID_MAX_LIMIT) into mutex->__m_lock field so that
* the next thread to lock it will put the mutex in the OWNER_DEAD state.
*
* Thus mutex in OWNER_DEAD state always has an owner.
*
* If the mutex has no owner (!m_desc->owner && !m_desc->pending_owner) then
* mutex->__m_lock field contains owner's pid and the task which called
* mutex_lock() will try to set m_desc->owner field by calling
* task_fast_locked_pi_mutex_proxy() or task_locked_pp_mutex_proxy(). If
* the call succeeds the task will just block or return -EBUSY, and if it
* does not the task will return the corresponding error code (-EOWNERDEAD).
*
* If m_desc->robust is set to NOT_RECOVERABLE, the mutex is in not
* recoverable state and mutex->__m_lock is left with value '-1' to
* make fast locking impossible. Mutex can enter this state only from
* pthread_mutex_unlock(), so if we find out in do_mutex_unlock() that the
* mutex is in OWNER_DEAD state, we change it to NOT_RECOVERABLE and wake
* all waiters. Since mutex being in NOT_RECOVERABLE state is a new condition
* indicating that the thread should wake up, the additional check
* (m_desc->robust != NOT_RECOVERABLE) is done right before schedule() in
* __do_mutex_timedlock() and do_cond_timedwait().
*
*
* ARCH_HAS_ATOMIC_CMPXCHG is not set:
*
* This case is different: we cannot use mutex->__m_lock field to store
* additional information about robust state the mutex is in. So in this
* case OWNER_DEAD state does _not_ indicate that mutex has an owner -
* only that its owner died.
*/
enum robust_state {
NOT_ROBUST = 0,
ROBUST,
OWNER_DEAD,
NOT_RECOVERABLE
};
/*
* Descriptors.
*/
/**
* struct mutex_desc - descriptor of a userspace POSIX mutex.
* @lock: the internal lock.
* @next_free: if this descriptor is free, then next_free points to the next
* free descriptor, thus forming a single-linked list of free descriptors.
* Otherwise stores '-1'.
* @private: is the mutex private?
* @desc_type: descriptor's type (MUTEX).
* @wait_list: list of all tasks waiting for the release of this mutex.
* @pending_owner: points to owner if ownership is pending.
* @owner: points to owner if ownership is not pending.
* @protocol: priority protection protocol used.
* @type: the mutex's type (errorcheck, recursive, normal).
* @robust: the mutex's robust state (see 'enum robust_state').
* @prioceiling: prioceiling for PTHREAD_PRIO_PROTECT mutexes (in kernel
* units, i.e. 0 is the highest priority).
* @mutex_list_entry: list entry for the list of all priority protected
* mutexes this thread owns (i.e. all PTHREAD_PRIO_INHERIT and
* PTHREAD_PRIO_PROTECT mutexes).
*
* Pending owner is an owner that was given the mutex but has not been able
* to enter its critical section yet because of contention for CPU, so if
* some other (with higher priority) task tries to lock the mutex now it can
* get it without corrupting anything.
*
* Using direct pointers to owners' task_struct's is OK since the kernel
* maintains the list of all objects the process holds, and when the owner
* dies it parses the list and unlocks all encountered mutexes leaving no
* hanging pointers behind.
*/
struct mutex_desc {
struct raw_spinlock lock;
s16 next_free;
char private;
char desc_type;
struct plist_head wait_list;
struct task_struct *pending_owner;
/* For priority inheritance and priority protection mutexes */
struct task_struct *owner;
char protocol;
char type;
char robust;
/* Prioceiling (converted to kernel values). */
#if MAX_RT_PRIO > 256
# error char is not enough
#endif
unsigned char prioceiling;
union {
struct list_head pi;
struct plist_node pp;
} mutex_list_entry;
};
/**
* struct cond_desc - descriptor of a userspace POSIX condition variable.
* @lock: the internal lock.
* @next_free: if this descriptor is free, then next_free points to the next
* free descriptor, thus forming a single-linked list of free descriptors.
* Otherwise stores '-1'.
* @private: is the condition variable private?
* @desc_type: descriptor's type (CONDITION).
* @wait_list: list of all tasks waiting for this condition.
* @m_desc: the descriptor of the associated mutex (NULL if there aren't any).
*/
struct cond_desc {
struct raw_spinlock lock;
s16 next_free;
char private;
char desc_type;
struct plist_head wait_list;
struct mutex_desc *m_desc;
};
/**
* struct barr_desc - descriptor of a userspace POSIX barrier.
* @lock: the internal lock.
* @next_free: if this descriptor is free, then next_free points to the next
* free descriptor, thus forming a single-linked list of free descriptors.
* Otherwise stores '-1'.
* @private: is the barrier private?
* @desc_type: descriptor's type (BARRIER).
* @wait_list: list of all tasks waiting on this barrier.
* @present: number of threads that have already arrived at this barrier.
*/
struct barr_desc {
struct raw_spinlock lock;
s16 next_free;
char private;
char desc_type;
struct plist_head wait_list;
unsigned int present;
};
/**
* struct sem_desc - descriptor of a userspace POSIX semaphore.
* @lock: the internal lock.
* @next_free: if this descriptor is free, then next_free points to the next
* free descriptor, thus forming a single-linked list of free descriptors.
* Otherwise stores '-1'.
* @private: is the semaphore private?
* @desc_type: descriptor's type (SEMAPHORE).
* @wait_list: list of all tasks waiting on this semaphore.
* @waiters_nr: total number of waiters (for semaphores located partly in
* kernel space and partly in user space).
* @value: semaphore's value (for semaphores located entirely in kernel space).
*/
struct sem_desc {
struct raw_spinlock lock;
s16 next_free;
char private;
char desc_type;
struct plist_head wait_list;
#if defined ARCH_HAS_ATOMIC_CMPXCHG
int waiters_nr;
#else
int value;
#endif
};
/**
* struct el_waiter - used for waiting on mutexes and condition variables.
* @state: state of the waiting thread (see enum waiting_states).
* @list_entry: the list entry which is queued into descriptor's wait_list.
* @task: points to the task_struct of the blocked task.
* @pi_desc: points to the descriptor if the task is waiting on a mutex.
* @timedout: is set to 1 when the waiting task times out.
* @pi_list_entry: used to implement PTHREAD_PRIO_INHERIT and
* PTHREAD_PRIO_PROTECT protocols.
*
* struct el_waiter is allocated in the stack.
*
* The first four fields must be the same as in 'struct el_barrier_waiter'
* (which is basically the same but only for barriers). */
struct el_waiter {
int state;
struct plist_node list_entry;
struct task_struct *task;
void *pi_desc;
int timedout;
struct plist_node pi_list_entry;
};
/*
* Defines used when allocating descriptors.
*/
#define DESCS_NUMBER_BITS 6
#define DESCS_NUMBER (1 << DESCS_NUMBER_BITS)
#define BLOCKS_NUMBER_BITS 8
#define BLOCKS_NUMBER (1 << BLOCKS_NUMBER_BITS)
#define DESC_ALIGN 32
/* DESCS_NUMBER_BITS cannot be 16 because one bit
 * is used to mark process shared descs. */
#if BLOCKS_NUMBER_BITS > 16 || DESCS_NUMBER_BITS > 15
# error Bad configuration
#endif
#define DESC_INDEX_SHIFT BLOCKS_NUMBER_BITS
#define DESC_PSHARED_FLAG ((1 << BLOCKS_NUMBER_BITS) << DESCS_NUMBER_BITS)
#define BLOCK_INDEX_MASK ((1 << BLOCKS_NUMBER_BITS) - 1)
#define DESC_INDEX_MASK (((1 << DESCS_NUMBER_BITS) - 1) << BLOCKS_NUMBER_BITS)
#define GET_PRIVATE(desc) (!(desc & DESC_PSHARED_FLAG))
#define GET_BLOCK_INDEX(desc) (desc & BLOCK_INDEX_MASK)
#define GET_DESC_INDEX(desc) ((desc & DESC_INDEX_MASK) >> DESC_INDEX_SHIFT)
#define SET_DESC(private, block_index, desc_index) \
((private ? 0 : DESC_PSHARED_FLAG) \
| (desc_index << DESC_INDEX_SHIFT) | block_index)
/* Zero descriptor index is not allowed (it is used
* for static initialization, and first descriptor
* is used as descriptors' list head). */
#define GOOD_DESC(desc) (desc >= (1 << DESC_INDEX_SHIFT))
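/*
 * Worked example of the encoding above (the values follow from
 * DESCS_NUMBER_BITS = 6 and BLOCKS_NUMBER_BITS = 8, so
 * DESC_PSHARED_FLAG == 1 << 14):
 *
 *	int d = SET_DESC(0, 3, 5);	// shared, block 3, descriptor 5
 *	// d == DESC_PSHARED_FLAG | (5 << 8) | 3 == 0x4503
 *	GET_PRIVATE(d);			// 0 (process-shared)
 *	GET_BLOCK_INDEX(d);		// 3
 *	GET_DESC_INDEX(d);		// 5
 *	GOOD_DESC(d);			// true: descriptor index is non-zero
 */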
/**
* struct common_desc - kind of a 'parent class' of all descriptor types
* (struct mutex_desc, struct cond_desc, etc), contains all shared
* fields.
* @lock: the internal lock.
* @next_free: if this descriptor is free, then next_free points to the next
* free descriptor, thus forming a single-linked list of free descriptors.
* Otherwise stores '-1'.
* @private: is the corresponding object private?
* @desc_type: descriptor's type (see 'enum types').
* @wait_list: list of all tasks waiting for this object.
*
* This structure is used by (de)allocation routines which are the same
* for all descriptor types.
*/
struct common_desc {
struct raw_spinlock lock;
s16 next_free;
char private;
char desc_type;
struct plist_head wait_list;
};
/**
* struct zero_cell - first cell in a block of descriptors stores
* head of the free descriptors' list.
* @free_desc: index of the first free descriptor in the block.
* @used_descs: how many descriptors in the block have been initialized
* at least once.
*/
struct zero_cell {
s16 free_desc;
s16 used_descs;
};
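/*
 * Allocation sketch: the free descriptors of a block form a singly
 * linked list threaded through .next_free and headed by the zero
 * cell's .free_desc (illustrative; desc_at() is a hypothetical
 * accessor returning the common_desc of cell idx):
 *
 *	struct zero_cell *zc = (struct zero_cell *) block;
 *	s16 idx = zc->free_desc;		// head of the free list
 *	if (idx > 0) {
 *		struct common_desc *d = desc_at(block, idx);
 *		zc->free_desc = d->next_free;	// pop from the free list
 *		d->next_free = -1;		// mark the cell as in use
 *	}
 */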
struct common_cell_private {
struct common_desc desc;
};
struct mutex_cell_private {
union {
struct common_desc common_desc;
struct mutex_desc m_desc;
} desc;
void *mutex;
} __attribute__((__aligned__(DESC_ALIGN)));
struct other_cell_private {
union desc {
struct common_desc common_desc;
struct cond_desc c_desc;
struct barr_desc b_desc;
struct sem_desc s_desc;
} desc;
void *object;
} __attribute__((__aligned__(DESC_ALIGN)));
struct mutex_block_private {
struct mutex_cell_private descs[DESCS_NUMBER];
};
struct other_block_private {
struct other_cell_private descs[DESCS_NUMBER];
};
struct allocated_descs_common {
int free_block;
int used_blocks;
/* .blocks is an array of BLOCKS_NUMBER pointers to arrays
* consisting of DESCS_NUMBER cells. */
void *blocks[BLOCKS_NUMBER];
u16 next_free[BLOCKS_NUMBER];
};
/* There is one instance of this structure per process. */
struct allocated_private_mutex_descs {
int free_block;
int used_blocks;
/* .blocks is an array of BLOCKS_NUMBER pointers to arrays
* consisting of DESCS_NUMBER 'mutex_cell_private' structures. */
struct mutex_block_private *blocks[BLOCKS_NUMBER];
u16 next_free[BLOCKS_NUMBER];
struct mutex_block_private first_block;
};
/* There is one instance of this structure per process. */
struct allocated_private_other_descs {
int free_block;
int used_blocks;
/* .blocks is an array of BLOCKS_NUMBER pointers to arrays
* consisting of DESCS_NUMBER 'other_cell_private' structures. */
struct other_block_private *blocks[BLOCKS_NUMBER];
u16 next_free[BLOCKS_NUMBER];
struct other_block_private first_block;
};
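/*
 * Taken together, these structures form a two-level table indexed by the
 * encoded descriptor. A minimal sketch of the lookup performed by
 * get_desc() below (error checking omitted):
 *
 *	block = all_blocks->blocks[GET_BLOCK_INDEX(__desc)];
 *	cell = block + GET_DESC_INDEX(__desc) * get_sz(private, type);
 *	desc = cell_to_desc(cell, private);
 *
 * Cell 0 of every block is not a real descriptor: it holds a
 * 'struct zero_cell' with the head of the block's free list, so only
 * DESCS_NUMBER - 1 descriptors per block are actually usable.
 */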
/*
* Structures used to store descriptors for shared objects
*/
/* This union is the same as 'union futex_key' used in futex code. */
union key_shared {
struct {
unsigned long pgoff;
struct inode *inode;
int offset;
} shared;
struct {
unsigned long address;
struct mm_struct *mm;
int offset;
} private;
struct {
unsigned long word;
void *ptr;
int offset;
} both;
};
static __always_inline int key_cmp(union key_shared *k1, union key_shared *k2)
{
return unlikely(k1->both.word != k2->both.word
|| k1->both.ptr != k2->both.ptr
|| k1->both.offset != k2->both.offset);
}
struct common_cell_shared {
struct list_head shared_descs_list_entry;
struct common_desc desc;
};
struct mutex_cell_shared {
struct list_head shared_descs_list_entry;
struct mutex_desc desc;
union key_shared key;
int __desc;
struct user_struct *user;
} __attribute__((__aligned__(DESC_ALIGN)));
struct other_cell_shared {
struct list_head shared_descs_list_entry;
union {
struct cond_desc c_desc;
struct barr_desc b_desc;
struct sem_desc s_desc;
} desc;
union key_shared key;
int __desc;
struct user_struct *user;
} __attribute__((__aligned__(DESC_ALIGN)));
struct mutex_block_shared {
struct mutex_cell_shared descs[DESCS_NUMBER];
};
struct other_block_shared {
struct other_cell_shared descs[DESCS_NUMBER];
};
struct allocated_shared_mutex_descs {
int free_block;
int used_blocks;
/* .blocks is an array of BLOCKS_NUMBER pointers to arrays
* consisting of DESCS_NUMBER 'mutex_cell_shared' structures. */
struct mutex_block_shared *blocks[BLOCKS_NUMBER];
u16 next_free[BLOCKS_NUMBER];
struct mutex_block_shared first_block;
};
struct allocated_shared_other_descs {
int free_block;
int used_blocks;
/* .blocks is an array of BLOCKS_NUMBER pointers to arrays
* consisting of DESCS_NUMBER 'other_cell_shared' structures. */
struct other_block_shared *blocks[BLOCKS_NUMBER];
u16 next_free[BLOCKS_NUMBER];
struct other_block_shared first_block;
};
/*
 * Statically allocated storage for shared objects' descriptors.
*/
static struct {
struct rw_semaphore lock;
struct allocated_shared_mutex_descs *mutexes;
struct allocated_shared_other_descs *others;
} shared;
static struct allocated_shared_mutex_descs shared_mutexes_struct = {
.free_block = 1,
.used_blocks = 1,
.blocks = {
[1] = &shared_mutexes_struct.first_block
}
};
static struct allocated_shared_other_descs shared_others_struct = {
.free_block = 1,
.used_blocks = 1,
.blocks = {
[1] = &shared_others_struct.first_block
}
};
static inline void *cell_to_desc(void *cell, int private)
{
if (private)
return &((struct common_cell_private *) cell)->desc;
else
return &((struct common_cell_shared *) cell)->desc;
}
/* On PREEMPT_COUNT kernels page fault handling is disabled by
* calls to preempt_disable() from raw_spin_lock*() functions. */
#ifdef CONFIG_PREEMPT_COUNT
# define el_pagefault_disable()
# define el_pagefault_enable()
#else
# define el_pagefault_disable() pagefault_disable()
# define el_pagefault_enable() pagefault_enable()
#endif
static void block_init(s8 *block, int sz, int private)
{
int i;
/* Block must be cleared already */
for (i = 1; i < DESCS_NUMBER; i++) {
struct common_desc *common_desc =
cell_to_desc(block + i * sz, private);
		/* This place may confuse lockdep since all these locks
		 * will look to it as if they were of the same type. */
raw_spin_lock_init(&common_desc->lock);
plist_head_init(&common_desc->wait_list);
}
}
static __always_inline union key_shared *desc_to_key(void *desc,
const enum types type)
{
switch (type) {
case MUTEX:
return &container_of(desc, struct mutex_cell_shared, desc)->key;
case CONDITION:
case BARRIER:
case SEMAPHORE:
return &container_of(desc, struct other_cell_shared, desc)->key;
default:
return NULL;
}
}
static __always_inline struct user_struct **desc_to_user(void *desc,
const enum types type)
{
switch (type) {
case MUTEX:
return &container_of(desc, struct mutex_cell_shared,
desc)->user;
case CONDITION:
case BARRIER:
case SEMAPHORE:
return &container_of(desc, struct other_cell_shared,
desc)->user;
default:
return NULL;
}
}
static __always_inline int **desc_to_object(void *desc, const enum types type)
{
switch (type) {
case MUTEX:
return (int **) &container_of(desc, struct mutex_cell_private,
desc)->mutex;
case CONDITION:
case BARRIER:
case SEMAPHORE:
return (int **) &container_of(desc, struct other_cell_private,
desc)->object;
default:
return NULL;
}
}
static __always_inline void *desc_get_object(void *desc, const enum types type)
{
int **object_ptr = desc_to_object(desc, type);
return (void *) *object_ptr;
}
static __always_inline int get_sz(const int private, const enum types type)
{
int sz;
switch (type) {
case MUTEX:
sz = private ? sizeof(struct mutex_cell_private)
: sizeof(struct mutex_cell_shared);
break;
case CONDITION:
case BARRIER:
case SEMAPHORE:
case OTHER:
sz = private ? sizeof(struct other_cell_private)
: sizeof(struct other_cell_shared);
break;
default:
sz = 0;
break;
}
return sz;
}
static __always_inline struct allocated_descs_common *get_all_blocks(
struct task_struct *const task,
const int private, const enum types type)
{
struct allocated_descs_common *all_blocks;
switch (type) {
case MUTEX:
if (private)
all_blocks = (struct allocated_descs_common *)
task->mm->el_posix.mutexes;
else
all_blocks = (struct allocated_descs_common *)
shared.mutexes;
break;
case CONDITION:
case BARRIER:
case SEMAPHORE:
if (private)
all_blocks = (struct allocated_descs_common *)
task->mm->el_posix.others;
else
all_blocks = (struct allocated_descs_common *)
shared.others;
break;
default:
all_blocks = NULL;
break;
}
return all_blocks;
}
/* For shared objects the key is (page->index,
* vma->vm_file->f_path.dentry->d_inode, offset_within_page).
* The key words are stored in *key on success.
* Returns 0 on success. */
static int get_shared_key(unsigned long uaddr, union key_shared *key,
const int get_reference)
{
struct page *page;
int err;
key->both.offset = uaddr % PAGE_SIZE;
WARN_ON(key->both.offset & 1);
again:
err = get_user_pages_fast(uaddr - key->both.offset, 1, 1, &page);
if (err < 0)
goto out;
page = compound_head(page);
lock_page(page);
if (!page->mapping) {
unlock_page(page);
put_page(page);
goto again;
}
if (PageAnon(page)) {
/* Mapping is actually private. */
key->private.address = uaddr;
key->private.mm = current->mm;
key->private.offset |= 1;
/* Do not increase mm reference counter: if mm is destroyed
* before the descriptor, then descriptor will be moved to
* the global freed_shared_descs list. */
} else {
key->shared.pgoff = page->index;
key->shared.inode = page->mapping->host;
if (unlikely(get_reference))
atomic_inc(&key->shared.inode->i_count);
}
unlock_page(page);
put_page(page);
err = 0;
out:
DbgPos("get_shared_key: err=%d, word=%ld, ptr=%p, offset=%d\n",
err, key->both.word, key->both.ptr, key->both.offset);
return err;
}
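/*
 * Example of the resulting keys (a sketch of the code above): for an
 * object in a file-backed shared mapping the key is
 * (page->index, inode, uaddr % PAGE_SIZE); for an object in an anonymous
 * mapping it degenerates to (uaddr, current->mm, (uaddr % PAGE_SIZE) | 1).
 * Bit 0 of .offset thus distinguishes the two cases, which is why user
 * objects must be at least 2-byte aligned (see the WARN_ON() above).
 */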
static void sub_descriptors_count(const int private, int how_many,
struct user_struct *const user)
{
int current_num;
int *counter;
if (private)
counter = &user->el_posix.private_objects;
else
counter = &user->el_posix.shared_objects;
again:
current_num = READ_ONCE(*counter);
if (how_many > current_num) {
WARN(1, "%s objects underflow in %d: was %d, trying to subtract %d (user %p)\n",
private ? "private" : "shared",
current->pid, current_num, how_many, user);
return;
}
if (unlikely(cmpxchg(counter, current_num, current_num - how_many) !=
current_num)) {
cpu_relax();
goto again;
}
DbgPos("sub_descriptors_count: user %lx, private=%d, decreasing by %d, was %d\n",
user, private, how_many, current_num);
}
static int add_descriptors_count(const int private, int how_many,
struct user_struct *const user)
{
int current_num, new_num, descs_limit;
int *counter;
if (private) {
counter = &user->el_posix.private_objects;
descs_limit = INT_MAX;
} else {
counter = &user->el_posix.shared_objects;
/* 1/2 of all allocated shared descriptors */
descs_limit = (DESCS_NUMBER - 1) * (BLOCKS_NUMBER - 1) / 2;
}
again:
current_num = READ_ONCE(*counter);
new_num = current_num + how_many;
if (unlikely(new_num < 0)) {
WARN_ONCE(1, "%s objects overflow in %d: was %d, trying to add %d (user %p)\n",
private ? "private" : "shared", current->pid,
current_num, how_many, user);
return -EAGAIN;
}
if (unlikely(new_num > descs_limit && !capable(CAP_SYS_RESOURCE)))
return -EAGAIN;
if (unlikely(cmpxchg(counter, current_num, new_num) != current_num)) {
cpu_relax();
goto again;
}
DbgPos("add_descriptors_count: user %lx, private=%d, increasing by %d, was %d\n",
user, private, how_many, current_num);
return 0;
}
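/*
 * Both accounting helpers above rely on the same lock-free
 * read/check/cmpxchg pattern instead of a per-user spinlock; the
 * general shape is:
 *
 *	again:
 *		old = READ_ONCE(*counter);
 *		new = old + delta;	(plus limit/underflow checks)
 *		if (cmpxchg(counter, old, new) != old)
 *			goto again;
 *
 * Retries are expected to be rare: two tasks of the same user must hit
 * the counter at exactly the same time for the cmpxchg() to fail.
 */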
void el_posix_switch_user(struct user_struct *old_user,
struct user_struct *new_user)
{
struct mm_struct *mm = current->mm;
int descs_number;
if (!mm)
return;
down_write(&mm->el_posix.lock);
if (!mm->el_posix.user || mm->el_posix.user != old_user)
goto out_unlock;
descs_number = 0;
if (mm->el_posix.mutexes)
descs_number += mm->el_posix.mutexes->used_blocks
* DESCS_NUMBER;
if (mm->el_posix.others)
descs_number += mm->el_posix.others->used_blocks * DESCS_NUMBER;
if (descs_number) {
DbgPos("el_posix_switch_user: moving %d private objects from user %lx to user %lx\n",
descs_number, old_user, new_user);
if (add_descriptors_count(1, descs_number, new_user))
goto out_unlock;
sub_descriptors_count(1, descs_number, old_user);
mm->el_posix.user = new_user;
get_uid(new_user);
free_uid(old_user);
}
out_unlock:
up_write(&mm->el_posix.lock);
}
static DEFINE_RAW_SPINLOCK(freed_shared_descs_lock);
static LIST_HEAD(freed_shared_descs);
/* Called from destroy_inode(). */
void el_posix_inode_free(struct inode *inode)
{
unsigned long flags;
if (list_empty(&inode->el_posix_objects))
return;
raw_spin_lock_irqsave(&freed_shared_descs_lock, flags);
list_splice_init(&inode->el_posix_objects, &freed_shared_descs);
raw_spin_unlock_irqrestore(&freed_shared_descs_lock, flags);
}
static int desc_free(int, const enum types, void *, const int);
/* Must be called with shared.lock held to avoid race conditions
* with other tasks that are calling desc_free/desc_alloc. */
static void free_unused_shared_descs(void)
{
while (!list_empty(&freed_shared_descs)) {
struct list_head *list_entry;
struct common_cell_shared *cell;
int __desc;
raw_spin_lock_irq(&freed_shared_descs_lock);
if (!list_empty(&freed_shared_descs)) {
list_entry = freed_shared_descs.next;
list_del_init(list_entry);
} else {
list_entry = NULL;
}
raw_spin_unlock_irq(&freed_shared_descs_lock);
if (!list_entry)
return;
cell = container_of(list_entry, struct common_cell_shared,
shared_descs_list_entry);
switch (cell->desc.desc_type) {
case MUTEX:
__desc = ((struct mutex_cell_shared *) cell)->__desc;
break;
case CONDITION:
case BARRIER:
case SEMAPHORE:
__desc = ((struct other_cell_shared *) cell)->__desc;
break;
default:
WARN_ON(1);
continue;
}
DbgPos("free_unused_shared_descs: found unused descriptor at "
"%p.\n", &cell->desc);
desc_free(__desc, cell->desc.desc_type, NULL, 0);
}
}
static void add_desc_to_inode(void *desc, struct inode *inode,
const enum types type)
{
struct common_cell_shared *cell;
cell = container_of(desc, struct common_cell_shared, desc);
raw_spin_lock_irq(&freed_shared_descs_lock);
list_add(&cell->shared_descs_list_entry, &inode->el_posix_objects);
raw_spin_unlock_irq(&freed_shared_descs_lock);
}
static void add_desc_to_mm(void *desc, struct mm_struct *mm,
const enum types type)
{
struct common_cell_shared *cell;
cell = container_of(desc, struct common_cell_shared, desc);
raw_spin_lock_irq(&freed_shared_descs_lock);
list_add(&cell->shared_descs_list_entry, &mm->el_posix.shared_objects);
raw_spin_unlock_irq(&freed_shared_descs_lock);
}
static void remove_desc_from_inode_or_mm(void *desc, const enum types type)
{
struct common_cell_shared *cell;
cell = container_of(desc, struct common_cell_shared, desc);
raw_spin_lock_irq(&freed_shared_descs_lock);
list_del_init(&cell->shared_descs_list_entry);
raw_spin_unlock_irq(&freed_shared_descs_lock);
}
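/*
 * The helpers above implement automatic reclamation of shared
 * descriptors: a descriptor is linked into its inode's (or mm's)
 * el_posix_objects list when allocated and unlinked when explicitly
 * destroyed. If the inode dies first, el_posix_inode_free() splices the
 * whole list onto freed_shared_descs, and free_unused_shared_descs()
 * reaps it under shared.lock on the next allocation.
 */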
struct mutex_init_args {
char protocol;
char type;
char robust;
unsigned char prioceiling;
};
/**
* desc_init() - initializes the newly allocated descriptor.
* @desc: the pointer to the descriptor in question.
* @private: is the descriptor private?
* @type: the type of the descriptor (mutex, barrier, etc).
* @init_args: the pointer to additional initialization arguments
* which are dependent on descriptor's type.
*
* We assume that the spinlock and the waitqueue of the descriptor
* have already been initialized.
*/
static void desc_init(void *desc, const int private,
const enum types type, void *init_args)
{
struct mutex_desc *m_desc;
struct cond_desc *c_desc;
struct barr_desc *b_desc;
struct sem_desc *s_desc;
struct mutex_init_args *m_args;
switch (type) {
case MUTEX:
m_desc = (struct mutex_desc *) desc;
m_args = (struct mutex_init_args *) init_args;
m_desc->private = private;
m_desc->pending_owner = NULL;
m_desc->owner = NULL;
m_desc->protocol = m_args->protocol;
m_desc->type = m_args->type;
m_desc->robust = m_args->robust;
m_desc->prioceiling = m_args->prioceiling;
if (m_desc->type == PTHREAD_PRIO_INHERIT)
INIT_LIST_HEAD(&m_desc->mutex_list_entry.pi);
m_desc->desc_type = MUTEX;
break;
case CONDITION:
c_desc = (struct cond_desc *) desc;
c_desc->m_desc = NULL;
c_desc->private = private;
c_desc->desc_type = CONDITION;
break;
case BARRIER:
b_desc = (struct barr_desc *) desc;
b_desc->present = 0;
b_desc->private = private;
b_desc->desc_type = BARRIER;
break;
case SEMAPHORE:
s_desc = (struct sem_desc *) desc;
#if defined ARCH_HAS_ATOMIC_CMPXCHG
s_desc->waiters_nr = 0;
#else
s_desc->value = *((int *) init_args);
#endif
s_desc->private = private;
s_desc->desc_type = SEMAPHORE;
break;
case OTHER:
break;
}
}
/**
* creation_lock() - lock a global lock protecting all descriptor allocations.
* @private: type of a descriptor being (de)allocated.
*/
static void creation_lock(int private)
{
if (private)
down_write(&current->mm->el_posix.lock);
else
down_write(&shared.lock);
}
/**
* creation_unlock() - unlock a global lock protecting all descriptor
* allocations.
* @private: type of a descriptor being (de)allocated.
*/
static void creation_unlock(int private)
{
if (private)
up_write(&current->mm->el_posix.lock);
else
up_write(&shared.lock);
}
/**
* desc_alloc() - creates a new descriptor, allocates memory for it
* if necessary and initializes it.
* @private: is the new descriptor private?
* @desc: points to pointer to the descriptor.
* @__desc: points to index of the new descriptor.
* @type: type of the descriptor to be allocated.
* @addr: address of the corresponding object in userspace.
* @init_args: the pointer to additional initialization arguments
* which are dependent on descriptor's type.
*
 * If desc_alloc() returns 0, then the pointer to the new descriptor and its
* index are saved to *desc and *__desc.
*/
static int desc_alloc(const int private, void *desc, int *__desc,
const enum types type, unsigned long addr, void *init_args)
{
struct allocated_descs_common *all_blocks;
s8 *block;
struct user_struct *user = NULL;
struct zero_cell *zero_cell;
int block_index, desc_index, sz, rval;
union key_shared key;
struct common_desc *common_desc;
sz = get_sz(private, type);
all_blocks = get_all_blocks(current, private, type);
DbgPos("desc_alloc: start, type=%d, all_blocks=%p, sz=%d\n",
type, all_blocks, sz);
if (unlikely(!all_blocks)) {
if (private) {
struct allocated_private_mutex_descs *mutexes;
struct allocated_private_other_descs *others;
/* First thing check permissions. */
if (!current->mm->el_posix.user)
current->mm->el_posix.user = get_current_user();
rval = add_descriptors_count(1, DESCS_NUMBER,
current->mm->el_posix.user);
if (rval)
return rval;
switch (type) {
case MUTEX:
mutexes = kzalloc(sizeof(*mutexes), GFP_USER);
if (!mutexes) {
sub_descriptors_count(1, DESCS_NUMBER,
current->mm->el_posix.user);
return -ENOMEM;
}
mutexes->free_block = 1;
mutexes->used_blocks = 1;
mutexes->blocks[1] = &mutexes->first_block;
block_init((s8 *) &mutexes->first_block, sz, 1);
current->mm->el_posix.mutexes = mutexes;
break;
case CONDITION:
case BARRIER:
case SEMAPHORE:
others = kzalloc(sizeof(*others), GFP_USER);
if (!others) {
sub_descriptors_count(1, DESCS_NUMBER,
current->mm->el_posix.user);
return -ENOMEM;
}
others->free_block = 1;
others->used_blocks = 1;
others->blocks[1] = &others->first_block;
block_init((s8 *) &others->first_block, sz, 1);
current->mm->el_posix.others = others;
break;
case OTHER:
break;
}
} else {
switch (type) {
case MUTEX:
block_init((s8 *)
&shared_mutexes_struct.first_block,
sz, 0);
/* A pair to smp_read_barrier_depends()
* in get_desc(). */
smp_wmb();
shared.mutexes = &shared_mutexes_struct;
break;
case CONDITION:
case BARRIER:
case SEMAPHORE:
block_init((s8 *)
&shared_others_struct.first_block,
sz, 0);
/* A pair to smp_read_barrier_depends()
* in get_desc(). */
smp_wmb();
shared.others = &shared_others_struct;
break;
case OTHER:
break;
}
}
all_blocks = get_all_blocks(current, private, type);
}
	/* Check if some descriptors are unused but were not freed by the user. */
if (!private)
free_unused_shared_descs();
/* Test permissions. */
if (!private) {
rval = get_shared_key(addr, &key, 1);
if (unlikely(rval))
return rval;
if (!(key.both.offset & 1)) {
user = alloc_uid(key.shared.inode->i_uid);
if (unlikely(!user)) {
DbgPos("desc_alloc: alloc_uid failed\n");
iput(key.shared.inode);
return -ENOMEM;
}
} else {
user = current_user();
}
rval = add_descriptors_count(0, 1, user);
if (unlikely(rval))
goto out_put_key;
} else {
if (!current->mm->el_posix.user)
current->mm->el_posix.user = get_current_user();
user = current->mm->el_posix.user;
}
/* Find block with unused descriptor */
if (unlikely(!all_blocks->free_block)) {
if (all_blocks->used_blocks < (BLOCKS_NUMBER - 1)) {
if (private) {
rval = add_descriptors_count(1, DESCS_NUMBER,
user);
if (unlikely(rval))
return rval;
}
block = kzalloc(sz * DESCS_NUMBER, GFP_USER);
if (!block) {
if (private) {
sub_descriptors_count(1, DESCS_NUMBER,
user);
return -ENOMEM;
} else {
rval = -ENOMEM;
goto out_sub_count;
}
}
block_init(block, sz, private);
/* A pair to smp_read_barrier_depends()
* in get_desc(). */
smp_wmb();
block_index = ++all_blocks->used_blocks;
all_blocks->blocks[block_index] = block;
all_blocks->free_block = block_index;
} else {
static int printed;
if (!printed) {
printk(KERN_WARNING "%d el_posix object "
"initialization: all available %s "
"descriptors used!\n", current->pid,
private ? "private" : "shared");
printed = 1;
}
DbgPos("el_posix object initialization: all "
"available descriptors used!\n");
rval = -EAGAIN;
goto out_sub_count;
}
} else {
block_index = all_blocks->free_block;
}
block = (s8 *) all_blocks->blocks[block_index];
zero_cell = (struct zero_cell *) block;
/* Find an unused descriptor within the block */
if (!zero_cell->free_desc) {
if (zero_cell->used_descs < (DESCS_NUMBER - 1)) {
desc_index = ++zero_cell->used_descs;
} else {
WARN_ON(1);
rval = -EINVAL;
goto out_sub_count;
}
} else {
desc_index = (int) zero_cell->free_desc;
/* Read 'next_free' field of the [zero_cell->free_desc] cell. */
zero_cell->free_desc = ((struct common_desc *)
cell_to_desc(block + ((int) zero_cell->free_desc) * sz,
private))->next_free;
}
/* Point of no return */
/* Check whether the block is full now */
if (!zero_cell->free_desc && unlikely(
zero_cell->used_descs == (DESCS_NUMBER - 1)))
all_blocks->free_block = all_blocks->next_free[block_index];
/* Found descriptor to use */
common_desc = cell_to_desc(block + desc_index * sz, private);
*(struct common_desc **) desc = common_desc;
*__desc = SET_DESC(private, block_index, desc_index);
if (private) {
int **object = desc_to_object(common_desc, type);
*object = (int *) addr;
} else {
union key_shared *desc_key = desc_to_key(common_desc, type);
struct user_struct **desc_user = desc_to_user(common_desc,
type);
*desc_key = key;
*desc_user = user;
DbgPos("desc_alloc: key stored at %p: offset=%d, ptr=%p, "
"word=%ld\n", desc_key, key.both.offset,
key.both.ptr, key.both.word);
if (!(key.both.offset & 1)) {
/* inode based mapping. Enable automatic freeing
* of this descriptor. */
switch (type) {
case MUTEX:
((struct mutex_cell_shared *) (block +
desc_index * sz))->__desc = *__desc;
break;
case CONDITION:
case BARRIER:
case SEMAPHORE:
((struct other_cell_shared *) (block +
desc_index * sz))->__desc = *__desc;
break;
case OTHER:
break;
}
add_desc_to_inode(common_desc, key.shared.inode,
type);
iput(key.shared.inode);
} else {
add_desc_to_mm(common_desc, key.private.mm, type);
}
}
raw_spin_lock_irq(&common_desc->lock);
desc_init(common_desc, private, type, init_args);
/* Now that the descriptor is allocated and initialized we set
* the 'next_free' field to -1, indicating that it is in use
* (we will use this for runtime checks, see desc_in_use()). */
common_desc->next_free = -1;
raw_spin_unlock_irq(&common_desc->lock);
DbgPos("desc_alloc: success __desc=%d, block=%d, desc=%d, private=%d\n",
*__desc, block_index, desc_index, private);
return 0;
out_sub_count:
if (!private)
sub_descriptors_count(0, 1, user);
out_put_key:
if (!private) {
if (unlikely(!key.both.ptr))
WARN_ON_ONCE(1);
if (!(key.both.offset & 1)) {
/* inode based mapping. */
iput(key.shared.inode);
free_uid(user);
}
}
return rval;
}
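/*
 * A typical call, as done by do_mutex_init() below:
 *
 *	struct mutex_desc *m_desc;
 *	int __m_desc;
 *
 *	rval = desc_alloc(private, &m_desc, &__m_desc, MUTEX,
 *			(unsigned long) mutex, &m_args);
 *
 * On success the small integer index __m_desc is published to userspace
 * (mutex->__m_desc) so that later calls can locate the descriptor again
 * through get_desc(), while kernel code keeps using the pointer m_desc
 * directly.
 */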
/**
* desc_in_use() - returns 1 if the descriptor is used and 0 otherwise.
* @desc: the descriptor in question.
*/
static __always_inline int desc_in_use(void *desc)
{
int rval = likely(((struct common_desc *) desc)->next_free == -1);
#if DEBUG_POSIX
if (!rval)
DbgPos("desc_in_use: bad descriptor at %p!!!\n", desc);
#endif
return rval;
}
/**
* desc_private() - returns 1 if the descriptor is private and 0 otherwise.
* @desc: the descriptor in question.
*/
static __always_inline int desc_private(void *desc)
{
return likely(((struct common_desc *) desc)->private);
}
/**
 * desc_check_type() - returns 0 if the descriptor type matches
 * and 1 otherwise (mutex descriptors live in their own table, so their
 * type is implied and never actually checked).
* @desc: the descriptor in question.
* @type: expected type to check against.
*/
static __always_inline int desc_check_type(void *desc, enum types type)
{
int rval;
switch (type) {
case MUTEX:
rval = 0;
break;
case CONDITION:
case BARRIER:
case SEMAPHORE:
rval = unlikely(((char) type) !=
((struct common_desc *) desc)->desc_type);
break;
default:
rval = 1;
break;
}
#if DEBUG_POSIX
if (rval)
DbgPos("desc_check_type: bad descriptor at %p (%d != %d)!!!\n",
desc, type,
(int) ((struct common_desc *) desc)->desc_type);
#endif
return rval;
}
/**
 * check_desc() - returns 0 if the descriptor is good and 1 otherwise.
* @desc: the descriptor in question.
* @type: expected type to check against.
* @object: address of the corresponding userspace object (for private
* descriptors only).
*/
static __always_inline int check_desc(void *desc, enum types type, void *object)
{
return unlikely(desc_in_use(desc) == 0
|| (desc_get_object(desc, type) != object &&
desc_private(desc))
|| desc_check_type(desc, type));
}
/**
* desc_is_busy() - returns 1 if the descriptor's userspace object is in use
* and 0 otherwise.
* @desc: the descriptor in question.
* @type: the descriptor's type.
* @free_arg: meaning depends on type, used to perform additional checks.
*/
static int desc_is_busy(void *desc, const enum types type, const int free_arg)
{
struct mutex_desc *m_desc;
struct barr_desc *b_desc;
switch (type) {
case MUTEX:
m_desc = (struct mutex_desc *) desc;
if (unlikely((free_arg && m_desc->robust <= OWNER_DEAD)
|| m_desc->owner || m_desc->pending_owner)) {
DbgPos("mutex_destroy: locked mutex, lock=%d\n",
free_arg);
return 1;
}
break;
case BARRIER:
b_desc = (struct barr_desc *) desc;
if (unlikely(b_desc->present))
return 1;
break;
default:
break;
}
return 0;
}
/**
* desc_free() - frees the descriptor.
* @__desc: index of the descriptor in question.
* @type: the descriptor's type.
* @object: address of the corresponding userspace object (for private
* descriptors only).
* @free_arg: meaning depends on type, used to perform additional checks.
*/
static int desc_free(int __desc, const enum types type, void *object,
const int free_arg)
{
struct allocated_descs_common *all_blocks;
s8 *block;
struct common_desc *desc;
struct zero_cell *zero_cell;
int private, block_index, desc_index, sz;
private = GET_PRIVATE(__desc);
block_index = GET_BLOCK_INDEX(__desc);
desc_index = GET_DESC_INDEX(__desc);
sz = get_sz(private, type);
all_blocks = get_all_blocks(current, private, type);
if (unlikely(!all_blocks || block_index > all_blocks->used_blocks))
return -EINVAL;
block = all_blocks->blocks[block_index];
if (!block)
return -EINVAL;
zero_cell = (struct zero_cell *) block;
if (desc_index > zero_cell->used_descs)
return -EINVAL;
desc = cell_to_desc(block + desc_index * sz, private);
/* Before we actually do anything, we must make sure that
* no one is waiting on this descriptor. Return -EBUSY
* if there is someone. */
raw_spin_lock_irq(&desc->lock);
if (check_desc(desc, type, object)) {
raw_spin_unlock_irq(&desc->lock);
return -EINVAL;
}
if (desc_is_busy(desc, type, free_arg)) {
raw_spin_unlock_irq(&desc->lock);
return -EBUSY;
}
/* There is no need to check here whether mutex_list_entry is empty,
* because if it was not then desc_is_busy() would return true
* after checking 'owner' field. */
if (plist_head_empty(&desc->wait_list))
desc->next_free = 0;
raw_spin_unlock_irq(&desc->lock);
if (desc->next_free != 0)
return -EBUSY;
/* All checks passed, proceed to actual freeing. */
/* Permissions and accounting stuff */
if (!private) {
union key_shared *desc_key = desc_to_key(desc, type);
struct user_struct **desc_user = desc_to_user(desc, type);
/* This must be done before calling free_uid(). */
sub_descriptors_count(0, 1, *desc_user);
/* Clear the key of the freed descriptor. */
if (unlikely(!desc_key->both.ptr)) {
WARN_ON_ONCE(1);
} else {
/* Disable automatic freeing of this descriptor. */
remove_desc_from_inode_or_mm(desc, type);
if (!(desc_key->both.offset & 1))
/* inode based mapping. */
free_uid(*desc_user);
desc_key->both.ptr = NULL;
}
desc_key->both.word = 0;
desc_key->both.offset = 0;
} else {
int **object = desc_to_object(desc, type);
*object = NULL;
}
desc->desc_type = 0;
if (unlikely(!zero_cell->free_desc
&& zero_cell->used_descs == (DESCS_NUMBER - 1))) {
/* This block was full, but now it has an unused descriptor */
all_blocks->next_free[block_index] = all_blocks->free_block;
all_blocks->free_block = block_index;
}
/* Set 'next_free' field of the [desc_index] cell. */
desc->next_free = zero_cell->free_desc;
zero_cell->free_desc = (s16) desc_index;
DbgPos("desc_free: success_desc=%d, block=%d, desc=%d, private=%d\n",
__desc, block_index, desc_index, private);
return 0;
}
/**
* get_desc() - returns the descriptor's address.
* @task: the pointer to current task_struct.
* @__desc: index of the descriptor in question.
* @type: the descriptor's type.
* @addr: address of the corresponding userspace object.
 * @force_check: if nonzero then the validity of a shared object's
 * descriptor is always checked; if 0 then it is checked only when the
 * task->mm->el_posix.unsafe_shared_objects variable is not set.
*
* Descriptor's index __desc must be checked with GOOD_DESC() macro
* before calling this function.
*/
static __always_inline void *get_desc(struct task_struct *const task,
const int __desc, const enum types type, void *addr,
const int force_check)
{
struct allocated_descs_common *all_blocks;
s8 *block, *desc;
int block_index, desc_index, sz;
const int private = GET_PRIVATE(__desc);
all_blocks = get_all_blocks(task, private, type);
block_index = GET_BLOCK_INDEX(__desc);
if (unlikely(!all_blocks)) {
DbgPos("get_desc: all_blocks is zero!\n");
return ERR_PTR(-EINVAL);
}
block = all_blocks->blocks[block_index];
desc_index = GET_DESC_INDEX(__desc);
sz = get_sz(private, type);
if (unlikely(!block)) {
DbgPos("get_desc: the block is zero!\n");
return ERR_PTR(-EINVAL);
}
smp_read_barrier_depends();
desc = cell_to_desc(block + sz * desc_index, private);
/* Check descriptor's type. */
if (desc_check_type(desc, type))
return ERR_PTR(-EINVAL);
if (unlikely(!private && (force_check
|| !task->mm->el_posix.unsafe_shared_objects))) {
int rval;
union key_shared user_key;
union key_shared *desc_key;
rval = get_shared_key((unsigned long) addr, &user_key, 0);
if (unlikely(rval))
return ERR_PTR(rval);
desc_key = desc_to_key(desc, type);
if (key_cmp(desc_key, &user_key)) {
DbgPos("bad key at %p: %ld, %ld, %p, %p, %d, %d\n",
desc_key, desc_key->both.word,
user_key.both.word, desc_key->both.ptr,
user_key.both.ptr, desc_key->both.offset,
user_key.both.offset);
return ERR_PTR(-EINVAL);
}
}
return (void *) desc;
}
/* Initialize @mutex. */
static int do_mutex_init(struct pthread_mutex_s *mutex, const int __m_kind)
{
int rval = 0, __m_desc, private;
struct mutex_desc *m_desc;
struct mutex_init_args m_args;
char protocol, m_kind;
private = !(__m_kind & PTHREAD_MUTEXATTR_FLAG_PSHARED);
creation_lock(private);
if (__get_user(__m_desc, &mutex->__m_desc)) {
rval = -EFAULT;
goto out_unlock;
}
if (unlikely(GOOD_DESC(__m_desc))) {
/* Attempted to initialize an initialized mutex. */
rval = -EBUSY;
goto out_unlock;
}
protocol = (char) ((__m_kind & PTHREAD_MUTEXATTR_PROTOCOL_MASK)
>> PTHREAD_MUTEXATTR_PROTOCOL_SHIFT);
m_kind = (char) (__m_kind & ~PTHREAD_MUTEXATTR_FLAG_BITS);
if (unlikely((unsigned char) protocol > PTHREAD_PRIO_PROTECT
|| (unsigned char) m_kind > PTHREAD_MUTEX_ADAPTIVE_NP)) {
rval = -EINVAL;
goto out_unlock;
}
m_args.protocol = protocol;
m_args.type = m_kind;
if (__m_kind & PTHREAD_MUTEXATTR_FLAG_ROBUST) {
if (unlikely(m_args.protocol == PTHREAD_PRIO_NONE)) {
rval = -ENOSYS;
goto out_unlock;
}
m_args.robust = ROBUST;
} else {
m_args.robust = NOT_ROBUST;
}
if (unlikely(protocol == PTHREAD_PRIO_PROTECT)) {
int prioceiling = (__m_kind
& PTHREAD_MUTEXATTR_PRIO_CEILING_MASK)
>> PTHREAD_MUTEXATTR_PRIO_CEILING_SHIFT;
if (prioceiling < 1 || prioceiling > MAX_USER_RT_PRIO-1) {
rval = -EINVAL;
goto out_unlock;
}
if (!capable(CAP_SYS_NICE)) {
unsigned long flags, rlim_rtprio;
if (!lock_task_sighand(current, &flags)) {
rval = -EPERM;
goto out_unlock;
}
rlim_rtprio =
current->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
unlock_task_sighand(current, &flags);
if ((current->policy != SCHED_FIFO && !rlim_rtprio)
|| (prioceiling > current->rt_priority
&& prioceiling > rlim_rtprio)) {
rval = -EPERM;
goto out_unlock;
}
}
if (current->policy == SCHED_IDLE) {
rval = -EPERM;
goto out_unlock;
}
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_task_has_rt_runtime(current)) {
rval = -EPERM;
goto out_unlock;
}
#endif
rval = security_task_setscheduler(current);
if (rval)
goto out_unlock;
m_args.prioceiling = (unsigned char)
(MAX_RT_PRIO-1 - prioceiling);
}
/* Allocate a new descriptor */
rval = desc_alloc(private, &m_desc, &__m_desc, MUTEX,
(unsigned long) mutex, &m_args);
if (unlikely(rval))
goto out_unlock;
if (__put_user(__m_desc, &mutex->__m_desc)) {
desc_free(__m_desc, MUTEX, mutex, 0);
rval = -EFAULT;
goto out_unlock;
}
out_unlock:
creation_unlock(private);
DbgPos("pmutex_init: allocated descr %p for mutex %p, rval = %d\n",
m_desc, mutex, rval);
return rval;
}
/* Destroy @mutex. */
static int do_mutex_destroy(struct pthread_mutex_s *mutex, const int __m_desc)
{
int __m_lock, private, rval;
struct mutex_desc *m_desc;
if (__get_user(__m_lock, &mutex->__m_lock))
return -EFAULT;
if (unlikely(!__m_desc)) {
/* Looks like statically initialized mutex */
__put_user(1, &mutex->__m_desc);
return 0;
}
if (unlikely(!GOOD_DESC(__m_desc)))
return -EINVAL;
m_desc = get_desc(current, __m_desc, MUTEX, mutex, 1);
if (unlikely(IS_ERR(m_desc)))
return PTR_ERR(m_desc);
/* Deallocate the descriptor */
private = GET_PRIVATE(__m_desc);
creation_lock(private);
rval = desc_free(__m_desc, MUTEX, mutex, __m_lock);
creation_unlock(private);
if (!rval)
__put_user(1, &mutex->__m_desc);
return rval;
}
/* mutex_once() function is needed to dynamically allocate a descriptor
* if a mutex was initialized via static initializer. */
static struct mutex_desc *mutex_once(struct task_struct *const task,
struct pthread_mutex_s *const mutex, int __m_desc, int __m_kind)
{
if (unlikely(!GOOD_DESC(__m_desc))) {
int rval;
if (unlikely(__m_desc)) {
DbgPos("mutex_once: bad desc %x, rval = %d\n",
__m_desc, -EINVAL);
return ERR_PTR(-EINVAL);
}
/* Statically initialized mutex. do_mutex_init() will
* return -EBUSY when several threads simultaneously
* initialize the mutex. */
rval = do_mutex_init(mutex, __m_kind);
if (rval && rval != -EBUSY)
return ERR_PTR(rval);
if (unlikely(__get_user(__m_desc, &mutex->__m_desc)))
return ERR_PTR(-EFAULT);
}
return (struct mutex_desc *) get_desc(task, __m_desc, MUTEX, mutex, 0);
}
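/*
 * Example of the flow handled here: a mutex created with a static
 * initializer reaches the kernel with __m_desc == 0, so the first
 * locking thread allocates a descriptor via do_mutex_init(). Threads
 * racing with it get -EBUSY from do_mutex_init(), re-read the now
 * valid __m_desc and fall through to get_desc() as usual. cond_once(),
 * barr_once() and sem_once() below follow the same pattern.
 */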
/* Initialize @cond. */
static int do_cond_init(struct pthread_cond_s *cond)
{
int rval = 0, private, __c_desc, __c_value;
struct cond_desc *c_desc;
if (__get_user(__c_value, &cond->__c_value))
return -EFAULT;
private = !(__c_value & PTHREAD_CONDATTR_FLAG_PSHARED);
creation_lock(private);
if (__get_user(__c_desc, &cond->__c_desc)) {
rval = -EFAULT;
goto out_unlock;
}
if (unlikely(GOOD_DESC(__c_desc))) {
/* Attempted to initialize an initialized condition. */
rval = -EBUSY;
goto out_unlock;
}
/* Allocate a new descriptor */
rval = desc_alloc(private, &c_desc, &__c_desc, CONDITION,
(unsigned long) cond, NULL);
if (unlikely(rval))
goto out_unlock;
if (__put_user(__c_desc, &cond->__c_desc)) {
desc_free(__c_desc, CONDITION, cond, 0);
rval = -EFAULT;
goto out_unlock;
}
out_unlock:
creation_unlock(private);
DbgPos("pcond_init: allocated descr %p for cond %p, rval = %d\n",
c_desc, cond, rval);
return rval;
}
/* Destroy @cond. */
static int do_cond_destroy(struct pthread_cond_s *cond, int __c_desc)
{
int private, rval;
struct cond_desc *c_desc;
if (unlikely(!__c_desc)) {
/* Looks like statically initialized condition */
__put_user(1, &cond->__c_desc);
return 0;
}
if (unlikely(!GOOD_DESC(__c_desc)))
return -EINVAL;
c_desc = get_desc(current, __c_desc, CONDITION, cond, 1);
if (unlikely(IS_ERR(c_desc)))
return PTR_ERR(c_desc);
/* Deallocate the descriptor */
private = GET_PRIVATE(__c_desc);
creation_lock(private);
rval = desc_free(__c_desc, CONDITION, cond, 0);
creation_unlock(private);
if (!rval)
__put_user(1, &cond->__c_desc);
return rval;
}
/* cond_once() function is needed to dynamically allocate a descriptor
* if a condition was initialized via static initializer. */
static struct cond_desc *cond_once(struct task_struct *const task,
struct pthread_cond_s *const cond, int __c_desc)
{
if (unlikely(!GOOD_DESC(__c_desc))) {
int rval;
if (unlikely(__c_desc)) {
DbgPos("cond_once: bad desc %x, rval = %d\n",
__c_desc, -EINVAL);
return ERR_PTR(-EINVAL);
}
/* Statically initialized condition variable.
* do_cond_init() will return -EBUSY when several
* threads simultaneously initialize the variable. */
rval = do_cond_init(cond);
if (rval && rval != -EBUSY)
return ERR_PTR(rval);
if (unlikely(__get_user(__c_desc, &cond->__c_desc)))
return ERR_PTR(-EFAULT);
}
return (struct cond_desc *) get_desc(task, __c_desc,
CONDITION, cond, 0);
}
/* Initialize @barr. */
static int do_barrier_init(struct pthread_barrier_s *__restrict barr,
const int pshared)
{
int rval = 0, __b_desc;
struct barr_desc *b_desc;
creation_lock(!pshared);
if (__get_user(__b_desc, &barr->__b_desc)) {
rval = -EFAULT;
goto out_unlock;
}
if (unlikely(GOOD_DESC(__b_desc))) {
/* Attempted to initialize an initialized barrier. */
rval = -EBUSY;
goto out_unlock;
}
/* Allocate a new descriptor */
rval = desc_alloc(!pshared, &b_desc, &__b_desc, BARRIER,
(unsigned long) barr, NULL);
if (unlikely(rval))
goto out_unlock;
if (__put_user(__b_desc, &barr->__b_desc)) {
desc_free(__b_desc, BARRIER, barr, 0);
rval = -EFAULT;
goto out_unlock;
}
out_unlock:
creation_unlock(!pshared);
DbgPos("pbarrier_init: allocated descr %p for barrier %p, rval %d\n",
b_desc, barr, rval);
return rval;
}
/* Destroy @barr. */
static int do_barrier_destroy(struct pthread_barrier_s *barr, int __b_desc)
{
int private, rval;
struct barr_desc *b_desc;
if (unlikely(!__b_desc)) {
/* Looks like statically initialized barrier */
__put_user(1, &barr->__b_desc);
return 0;
}
if (unlikely(!GOOD_DESC(__b_desc)))
return -EINVAL;
b_desc = get_desc(current, __b_desc, BARRIER, barr, 1);
if (unlikely(IS_ERR(b_desc)))
return PTR_ERR(b_desc);
/* Deallocate the descriptor */
private = GET_PRIVATE(__b_desc);
creation_lock(private);
rval = desc_free(__b_desc, BARRIER, barr, 0);
creation_unlock(private);
if (!rval)
__put_user(1, &barr->__b_desc);
return rval;
}
static struct barr_desc *barr_once(struct task_struct *const task,
struct pthread_barrier_s *barr, int __b_desc)
{
if (unlikely(!GOOD_DESC(__b_desc))) {
int rval;
if (unlikely(__b_desc))
return ERR_PTR(-EINVAL);
/* Statically initialized barrier.
* do_barrier_init() will return -EBUSY when several
* threads simultaneously initialize the barrier. */
rval = do_barrier_init(barr, 0);
if (rval && rval != -EBUSY)
return ERR_PTR(rval);
if (unlikely(__get_user(__b_desc, &barr->__b_desc)))
return ERR_PTR(-EFAULT);
}
return (struct barr_desc *) get_desc(task, __b_desc, BARRIER, barr, 0);
}
/* Initialize @sem. */
static int do_sem_init(struct posix_sem_s *sem, const int pshared)
{
int rval = 0, __s_desc;
struct sem_desc *s_desc;
#if !defined ARCH_HAS_ATOMIC_CMPXCHG
int value;
if (unlikely(__get_user(value, &sem->__s_value)))
return -EFAULT;
#endif
creation_lock(!pshared);
if (__get_user(__s_desc, &sem->__s_desc)) {
rval = -EFAULT;
goto out_unlock;
}
if (unlikely(GOOD_DESC(__s_desc))) {
/* Attempted to initialize an initialized semaphore. */
rval = -EBUSY;
goto out_unlock;
}
/* Allocate a new descriptor */
#if !defined ARCH_HAS_ATOMIC_CMPXCHG
rval = desc_alloc(!pshared, &s_desc, &__s_desc, SEMAPHORE,
(unsigned long) sem, &value);
#else
rval = desc_alloc(!pshared, &s_desc, &__s_desc, SEMAPHORE,
(unsigned long) sem, NULL);
#endif
if (unlikely(rval))
goto out_unlock;
if (__put_user(__s_desc, &sem->__s_desc)) {
desc_free(__s_desc, SEMAPHORE, sem, 0);
rval = -EFAULT;
goto out_unlock;
}
out_unlock:
creation_unlock(!pshared);
DbgPos("sem_init: allocated descr %p for sem %p, rval = %d\n",
s_desc, sem, rval);
return rval;
}
/* Destroy @sem. */
static int do_sem_destroy(struct posix_sem_s *sem, int __s_desc)
{
int private, rval;
struct sem_desc *s_desc;
if (unlikely(!__s_desc)) {
/* Looks like statically initialized semaphore */
__put_user(1, &sem->__s_desc);
return 0;
}
if (unlikely(!GOOD_DESC(__s_desc)))
return -EINVAL;
s_desc = get_desc(current, __s_desc, SEMAPHORE, sem, 1);
if (unlikely(IS_ERR(s_desc)))
return PTR_ERR(s_desc);
/* Deallocate the descriptor */
private = GET_PRIVATE(__s_desc);
creation_lock(private);
rval = desc_free(__s_desc, SEMAPHORE, sem, 0);
creation_unlock(private);
if (!rval)
__put_user(1, &sem->__s_desc);
return rval;
}
/* sem_once() function is needed to dynamically allocate a descriptor
* if a semaphore was initialized via static initializer. */
static struct sem_desc *sem_once(struct task_struct *const task,
struct posix_sem_s *const sem, int __s_desc)
{
if (unlikely(!GOOD_DESC(__s_desc))) {
int rval;
if (unlikely(__s_desc))
return ERR_PTR(-EINVAL);
/* Statically initialized semaphore.
* do_sem_init() will return -EBUSY when several
* threads simultaneously initialize the semaphore. */
rval = do_sem_init(sem, 0);
if (rval && rval != -EBUSY)
return ERR_PTR(rval);
if (unlikely(__get_user(__s_desc, &sem->__s_desc)))
return ERR_PTR(-EFAULT);
}
return (struct sem_desc *) get_desc(task, __s_desc,
SEMAPHORE, sem, 0);
}
/**
* do_object_init_fini() - calls corresponding *_init() or *_destroy() function.
* @type: type of the object in question.
* @op: initialize the object if 0 and destroy otherwise.
* @obj: the pointer to the object
* @arg: initialization argument which is dependent on the object's type.
*/
static int do_object_init_fini(unsigned long type, void *op, void *obj, int arg)
{
int rval;
switch (type) {
case MUTEX:
if (BAD_USER_REGION(obj, struct pthread_mutex_s)) {
rval = -EINVAL;
break;
}
if (!op)
rval = do_mutex_init((struct pthread_mutex_s *) obj,
arg);
else
rval = do_mutex_destroy((struct pthread_mutex_s *) obj,
arg);
break;
case CONDITION:
if (BAD_USER_REGION(obj, struct pthread_cond_s)) {
rval = -EINVAL;
break;
}
if (!op)
rval = do_cond_init((struct pthread_cond_s *) obj);
else
rval = do_cond_destroy((struct pthread_cond_s *) obj,
arg);
break;
case BARRIER:
if (BAD_USER_REGION(obj, struct pthread_barrier_s)) {
rval = -EINVAL;
break;
}
if (!op)
rval = do_barrier_init((struct pthread_barrier_s *) obj,
arg);
else
rval = do_barrier_destroy(
(struct pthread_barrier_s *) obj, arg);
break;
case SEMAPHORE:
if (BAD_USER_REGION(obj, struct posix_sem_s)) {
rval = -EINVAL;
break;
}
if (!op)
rval = do_sem_init((struct posix_sem_s *) obj, arg);
else
rval = do_sem_destroy((struct posix_sem_s *) obj, arg);
break;
default:
rval = -EINVAL;
break;
}
return rval;
}
/*
* Functions used for priority boosting.
*/
/**
* task_has_pi_waiters() - returns 1 if the task's priority
* was temporarily boosted.
* @p: the task in question.
*/
static __always_inline int task_has_pi_waiters(struct task_struct *p)
{
return !plist_head_empty(&p->el_posix.pi_waiters);
}
/**
* __mutex_adjust_prio() - compares task's expected priority with
* actual priority and calls rt_mutex_setprio() on a mismatch.
* @task: the task in question.
* @has_pi_waiters: is task's priority boosted?
*/
static __always_inline void __mutex_adjust_prio(struct task_struct *task,
const int has_pi_waiters)
{
int prio;
/* Take waiters from rt mutexes into account. */
prio = el_posix_getprio(task, has_pi_waiters);
/* Do not allow transition from one non-RT priority to another. */
if (!rt_prio(prio) && !rt_prio(task->prio))
return;
DbgPos("__mutex_adjust_prio: boosting task %d from %d to %d\n",
task->pid, task->prio, prio);
if (task->prio != prio) {
rt_mutex_setprio(task, prio);
WARN_ON(task->prio != prio);
}
}
/**
* mutex_top_waiter() - get mutex's waiter with the highest priority.
* @m_desc: the descriptor of the mutex in question.
*/
static __always_inline struct el_waiter *mutex_top_waiter(
struct mutex_desc *m_desc)
{
return plist_first_entry(&m_desc->wait_list, struct el_waiter,
list_entry);
}
/**
* mutex_has_waiters() - does mutex have any waiters?
* @m_desc: the descriptor of the mutex in question.
*/
static __always_inline int mutex_has_waiters(struct mutex_desc *m_desc)
{
return !plist_head_empty(&m_desc->wait_list);
}
static __always_inline int __task_can_steal_mutex(struct mutex_desc *m_desc,
struct task_struct *task)
{
int owner_prio = m_desc->pending_owner->prio;
return !rt_prio(owner_prio) || task->prio < owner_prio;
}
/**
 * task_can_steal_mutex() - check if the mutex can be stolen, i.e. whether
* it has a pending owner with a lesser priority.
* @m_desc: the descriptor of the mutex in question.
* @task: the task that tries to lock the mutex.
*/
static __always_inline int task_can_steal_mutex(struct mutex_desc *m_desc,
struct task_struct *task)
{
if (unlikely(m_desc->pending_owner) &&
__task_can_steal_mutex(m_desc, task))
return 1;
else
return 0;
}
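/*
 * Note that kernel priority values are inverted: a lower 'prio' means a
 * higher priority, and a SCHED_FIFO priority p maps to
 * prio == MAX_RT_PRIO-1 - p. For example, a waiter at FIFO priority 20
 * (prio 79) may steal the mutex from a pending owner at FIFO priority 10
 * (prio 89), and a non-RT pending owner (!rt_prio(owner_prio)) can be
 * robbed by any waiter.
 */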
/*
* Functions used for priority inheritance.
*/
/* Max number of times we'll walk the boosting chain: */
static int l_max_lock_depth = 1024;
/**
* mutex_adjust_prio_chain() - adjusts priorities in a priority inheritance
* chain of tasks.
* @task: the task in the chain that had its priority changed.
* @orig_m_desc: descriptor of the very first mutex in the chain
 * @top_task: the just-blocked task from which the PI chain starts (used
* to catch some cases with loops in PI chain).
*/
static void mutex_adjust_prio_chain(struct task_struct *task,
struct mutex_desc *const orig_m_desc,
struct task_struct *const top_task)
{
unsigned long flags;
struct mutex_desc *m_desc;
struct el_waiter *waiter, *top_waiter;
int depth = 0;
DbgPos("mutex_adjust_prio_chain started (task=%d, orig_m_desc=%p, "
"top_task=%d\n", task->pid, orig_m_desc, top_task->pid);
again:
if (unlikely(++depth > l_max_lock_depth)) {
static int prev_max;
/* Print this only once. If the admin changes the limit,
* print a new message when reaching the limit again */
if (prev_max != l_max_lock_depth) {
prev_max = l_max_lock_depth;
printk(KERN_WARNING "Maximum lock depth %d reached task"
": %s\n", l_max_lock_depth, task->comm);
}
goto out_put_task;
}
retry:
/* Task can not go away as we did a get_task_struct() before */
raw_spin_lock_irqsave(&task->pi_lock, flags);
/* Check whether the end of the boosting chain has been reached or
* the state of the chain has changed while we dropped the locks.
* 'pi_blocked_on' field can be unset only while holding pi_lock,
* thus it can be safely checked here */
waiter = task->el_posix.pi_blocked_on;
DbgPos("mutex_adjust_prio_chain task=%d waiter=%p\n",
task->pid, waiter);
if (!waiter)
goto out_unlock_pi;
smp_read_barrier_depends();
m_desc = waiter->pi_desc;
DbgPos("mutex_adjust_prio_chain m_desc=%p\n", m_desc);
if (unlikely(!raw_spin_trylock(&m_desc->lock))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
cpu_relax();
goto retry;
}
/* Check if further priority adjustment is necessary. */
if (waiter->list_entry.prio == task->prio || (!rt_prio(task->prio) &&
!rt_prio(waiter->list_entry.prio))) {
raw_spin_unlock(&m_desc->lock);
goto out_unlock_pi;
}
/* Deadlock detection */
if (unlikely(m_desc == orig_m_desc)) {
raw_spin_unlock(&m_desc->lock);
goto out_unlock_pi;
}
/* Since el_posix.pi_blocked_on field was not empty,
* the mutex has at least one waiter (i.e. task itself).
* Remember old top waiter. */
DbgPos("mutex_adjust_prio_chain finding top waiter\n");
top_waiter = mutex_top_waiter(m_desc);
/* Requeue the waiter */
plist_del(&waiter->list_entry, &m_desc->wait_list);
waiter->list_entry.prio = min(task->prio, MAX_RT_PRIO);
plist_add(&waiter->list_entry, &m_desc->wait_list);
/* Release the task */
raw_spin_unlock(&task->pi_lock);
if (unlikely(m_desc->pending_owner)) {
struct el_waiter *new_top_waiter = mutex_top_waiter(m_desc);
DbgPos("mutex_adjust_prio_chain pending_owner\n");
/* The pending owner does not have to be the first in the wait
* queue. Such a situation can arise when a low priority task
* blocks on mutex, but its priority gets boosted before the
* task checks for stealing. Then the check will fail and the
* task will be queued with high priority.
* We have just dropped task->pi_lock, so task->prio may not
* equal waiter->list_entry.prio. That's why we try to give
* the mutex to the top waiter: since m_desc->lock is still
* held that information is reliable. */
if (waiter == new_top_waiter) {
/* Check if we can steal the mutex. */
if (__task_can_steal_mutex(m_desc, task)) {
/* Since task's priority is higher than pending
* owner's priority, the task is not the pending
* owner and it can steal the mutex.
* Note: it is still possible that @task is the
* pending owner, and in that very unlikely case
* task will just receive a second wakeup which
* is OK. */
m_desc->pending_owner = task;
wake_up_state(task, TASK_INTERRUPTIBLE);
}
} else if (top_waiter == waiter) {
if (__task_can_steal_mutex(m_desc,
new_top_waiter->task)) {
/* The task is not on top of wait list anymore,
* so give the mutex to the more appropriate
* waiter. (It is still possible that the new
* top waiter is the pending owner already).*/
m_desc->pending_owner = new_top_waiter->task;
wake_up_state(new_top_waiter->task,
TASK_INTERRUPTIBLE);
}
}
/* We finished walking pi chain. */
raw_spin_unlock_irqrestore(&m_desc->lock, flags);
goto out_put_task;
}
/* Grab the next task */
if (unlikely(!m_desc->owner)) {
/* This should not happen - mutex must have either owner
* or a pending owner. */
WARN_ON(m_desc->robust == ROBUST);
raw_spin_unlock_irqrestore(&m_desc->lock, flags);
DbgPos("el_posix error: owner of mutex is dead.\n");
goto out_put_task;
}
put_task_struct(task);
task = m_desc->owner;
get_task_struct(task);
raw_spin_lock(&task->pi_lock);
DbgPos("mutex_adjust_prio_chain (de)boosting\n");
if (waiter == mutex_top_waiter(m_desc)) {
/* (De)boost the owner */
plist_del(&top_waiter->pi_list_entry,
&task->el_posix.pi_waiters);
plist_node_init(&waiter->pi_list_entry,
waiter->list_entry.prio);
plist_add(&waiter->pi_list_entry, &task->el_posix.pi_waiters);
__mutex_adjust_prio(task, 1);
} else if (top_waiter == waiter) {
/* Deboost the owner */
plist_del(&waiter->pi_list_entry, &task->el_posix.pi_waiters);
waiter = mutex_top_waiter(m_desc);
plist_node_init(&waiter->pi_list_entry,
waiter->list_entry.prio);
plist_add(&waiter->pi_list_entry, &task->el_posix.pi_waiters);
__mutex_adjust_prio(task, 1);
}
raw_spin_unlock(&task->pi_lock);
DbgPos("mutex_adjust_prio_chain iteration ended\n");
top_waiter = mutex_top_waiter(m_desc);
raw_spin_unlock_irqrestore(&m_desc->lock, flags);
/* Return if priority of the mutex owner was not changed. */
if (waiter != top_waiter)
goto out_put_task;
/* Deadlock detection */
if (unlikely(task == top_task))
goto out_put_task;
goto again;
out_unlock_pi:
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
out_put_task:
put_task_struct(task);
return;
}
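/*
 * A concrete walk-through (a sketch): task C (prio 79) blocks on mutex
 * M1 owned by task A (prio 89), and A is itself blocked on M2 owned by
 * task B. Blocking boosts A to 79, and the chain walk is entered with
 * task == A: A's waiter on M2 is requeued with the new priority, B is
 * (de)boosted through its pi_waiters list, and if B is blocked too, the
 * walk advances to B's mutex. It stops at a task that is not blocked,
 * at a waiter whose queued priority already matches, or at the
 * deadlock/depth cut-offs above.
 */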
/**
* task_fast_locked_pi_mutex() - called when task locked PI mutex without
* blocking on it (i.e. without queuing).
* @task: points to the current task_struct.
* @m_desc: the descriptor of the mutex in question.
*/
static void task_fast_locked_pi_mutex(struct task_struct *task,
struct mutex_desc *const m_desc)
{
if (unlikely(m_desc->owner))
return;
/* Set the task as the new owner */
m_desc->owner = task;
raw_spin_lock(&task->pi_lock);
list_add(&m_desc->mutex_list_entry.pi, &task->el_posix.pi_mutex_list);
#if defined ARCH_HAS_ATOMIC_CMPXCHG
if (likely(mutex_has_waiters(m_desc))) {
#else
if (unlikely(mutex_has_waiters(m_desc))) {
#endif
struct el_waiter *top_waiter;
top_waiter = mutex_top_waiter(m_desc);
plist_node_init(&top_waiter->pi_list_entry,
top_waiter->list_entry.prio);
plist_add(&top_waiter->pi_list_entry,
&task->el_posix.pi_waiters);
__mutex_adjust_prio(task, 1);
}
raw_spin_unlock(&task->pi_lock);
}
/**
* __find_task_by_pid_check() - wrapper around find_task_by_vpid()
* with additional checks.
* @pid: the task's pid.
*/
static struct task_struct *__find_task_by_pid_check(pid_t pid)
{
struct task_struct *task;
const struct cred *cred, *task_cred;
task = find_task_by_vpid(pid);
if (unlikely(!task))
return NULL;
cred = current_cred();
task_cred = __task_cred(task);
if (unlikely(!uid_eq(cred->euid, task_cred->euid) &&
!uid_eq(cred->euid, task_cred->uid)))
return NULL;
return task;
}
#if defined ARCH_HAS_ATOMIC_CMPXCHG
/**
* task_fast_locked_pi_mutex_proxy() - same as task_fast_locked_pi_mutex()
* but owner is not current, thus checks for dead owner are required.
* @pid: owner's pid.
* @m_desc: the descriptor of the mutex in question.
*/
static struct task_struct *task_fast_locked_pi_mutex_proxy(const int pid,
struct mutex_desc *const m_desc)
{
struct task_struct *task;
int chain_walk = 0;
if (unlikely(pid == -1)) {
if (m_desc->owner || m_desc->pending_owner)
return NULL;
else
return ERR_PTR(-EOWNERDEAD);
}
/* Set the task as the new owner */
rcu_read_lock();
task = __find_task_by_pid_check(pid);
if (unlikely(!task)) {
int rval;
owner_dead:
rcu_read_unlock();
DbgPos("el_posix: owner of mutex is dead (pid %d).\n", pid);
/* Now mutex has -1 in '__m_lock' field but has no owner,
* so it can be acquired. */
switch (m_desc->robust) {
case ROBUST:
m_desc->robust = OWNER_DEAD;
rval = -EOWNERDEAD;
break;
case OWNER_DEAD:
WARN_ON_ONCE(1);
rval = -EOWNERDEAD;
break;
case NOT_RECOVERABLE:
WARN_ON_ONCE(1);
rval = -ENOTRECOVERABLE;
break;
default:
rval = 0;
break;
}
return ERR_PTR(rval);
}
prefetch(&task->flags);
raw_spin_lock(&task->pi_lock);
if (unlikely(task->flags & PF_EXITING)) {
/* This is the only function that deals with PI stuff on behalf
* of another task, so there is no need to check PF_EXITING
* flag anywhere else: we cannot race with ourselves. */
raw_spin_unlock(&task->pi_lock);
goto owner_dead;
}
m_desc->owner = task;
list_add(&m_desc->mutex_list_entry.pi, &task->el_posix.pi_mutex_list);
if (mutex_has_waiters(m_desc)) {
struct el_waiter *top_waiter;
top_waiter = mutex_top_waiter(m_desc);
plist_node_init(&top_waiter->pi_list_entry,
top_waiter->list_entry.prio);
plist_add(&top_waiter->pi_list_entry,
&task->el_posix.pi_waiters);
__mutex_adjust_prio(task, 1);
if (task->el_posix.pi_blocked_on)
chain_walk = 1;
}
raw_spin_unlock(&task->pi_lock);
if (unlikely(chain_walk))
get_task_struct(task);
rcu_read_unlock();
if (likely(!chain_walk))
return NULL;
else
return task;
}
#endif
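/*
 * Robust state transitions as handled above (a summary): 'robust' goes
 * from ROBUST to OWNER_DEAD when a dead owner is detected, and the
 * locker acquires the mutex together with -EOWNERDEAD. Per the usual
 * robust-mutex protocol, userspace is then expected either to repair
 * the protected state or to declare the mutex NOT_RECOVERABLE, after
 * which lock attempts fail with -ENOTRECOVERABLE.
 */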
/**
* task_slow_locked_pi_mutex() - task locked PI mutex after being blocked on it.
* @task: the current task_struct.
* @m_desc: the descriptor of the mutex in question.
 * @fast_unlock: (only for architectures with ARCH_HAS_ATOMIC_CMPXCHG set)
 * if nonzero then this function will not do any PI bookkeeping in
 * the kernel, so that the mutex can still be unlocked entirely in
 * userspace (by a cmpxchg(&mutex->__m_lock, pid, 0) operation).
*/
#if defined ARCH_HAS_ATOMIC_CMPXCHG
static __always_inline void task_slow_locked_pi_mutex(
struct task_struct *const task,
struct mutex_desc *const m_desc, const int fast_unlock)
{
if (unlikely(m_desc->owner))
return;
raw_spin_lock(&task->pi_lock);
task->el_posix.pi_blocked_on = NULL;
if (!fast_unlock && mutex_has_waiters(m_desc)) {
struct el_waiter *top_waiter = mutex_top_waiter(m_desc);
plist_node_init(&top_waiter->pi_list_entry,
top_waiter->list_entry.prio);
plist_add(&top_waiter->pi_list_entry,
&task->el_posix.pi_waiters);
__mutex_adjust_prio(task, 1);
}
/* Set the task as the new owner */
if (!fast_unlock)
list_add(&m_desc->mutex_list_entry.pi,
&task->el_posix.pi_mutex_list);
raw_spin_unlock(&task->pi_lock);
/* Set the task as the new owner */
if (!fast_unlock)
m_desc->owner = task;
}
#else
static void task_slow_locked_pi_mutex(
struct task_struct *const task,
struct mutex_desc *const m_desc)
{
if (unlikely(m_desc->owner))
return;
raw_spin_lock(&task->pi_lock);
list_add(&m_desc->mutex_list_entry.pi, &task->el_posix.pi_mutex_list);
task->el_posix.pi_blocked_on = NULL;
if (mutex_has_waiters(m_desc)) {
struct el_waiter *top_waiter = mutex_top_waiter(m_desc);
plist_node_init(&top_waiter->pi_list_entry,
top_waiter->list_entry.prio);
plist_add(&top_waiter->pi_list_entry,
&task->el_posix.pi_waiters);
__mutex_adjust_prio(task, 1);
}
raw_spin_unlock(&task->pi_lock);
/* Set the task as the new owner */
m_desc->owner = task;
}
#endif
/**
* pi_mutex_waiters_changed() - is called from pthread_cond_broadcast() after
* a number of waiters were requeued from a condition variable to a mutex,
* checks whether a PI chain parsing is needed.
* @m_desc: the descriptor of the mutex in question.
* @old_top_waiter: points to the waiter which had the highest priority before
* requeue.
* @new_top_waiter: points to the waiter which has the highest priority after
* requeue.
* @owner_pid: (only for architectures with ARCH_HAS_ATOMIC_CMPXCHG set)
* contains owner's pid if the mutex was fast locked.
*
* The check for stealing of PTHREAD_PRIO_INHERIT mutexes must be done
* after the setting of @task->el_posix.pi_blocked_on to avoid races
* when another thread changes our priority in between setting of
* pi_blocked_on and checking for stealing.
*
 * Returns the pointer to the first task in the PI chain if parsing is needed.
*/
#if defined ARCH_HAS_ATOMIC_CMPXCHG
static struct task_struct *pi_mutex_waiters_changed(
struct mutex_desc *const m_desc,
struct el_waiter *const old_top_waiter,
struct el_waiter *const new_top_waiter,
const int owner_pid)
{
struct task_struct *const owner = m_desc->owner;
int chain_walk = 0;
/* We cannot trust owner_pid because it is read from user space,
* so check m_desc->owner instead. */
if (!owner) {
/* Mutex has no owner (it may still have a pending owner).
* This means that either we set m_desc->owner based on
* owner_pid or there is no need to do PI stuff. */
if (unlikely(m_desc->pending_owner))
/* If the new top waiter has higher priority than the
* old one, it will steal the mutex. */
return NULL;
if (owner_pid == 0)
return NULL;
else
return task_fast_locked_pi_mutex_proxy(
owner_pid, m_desc);
}
if (old_top_waiter != new_top_waiter) {
raw_spin_lock(&owner->pi_lock);
if (old_top_waiter)
plist_del(&old_top_waiter->pi_list_entry,
&owner->el_posix.pi_waiters);
plist_node_init(&new_top_waiter->pi_list_entry,
new_top_waiter->list_entry.prio);
plist_add(&new_top_waiter->pi_list_entry,
&owner->el_posix.pi_waiters);
if (!old_top_waiter || old_top_waiter->pi_list_entry.prio
!= new_top_waiter->pi_list_entry.prio) {
__mutex_adjust_prio(owner, 1);
if (owner->el_posix.pi_blocked_on)
chain_walk = 1;
}
raw_spin_unlock(&owner->pi_lock);
}
if (!chain_walk) {
return NULL;
} else {
get_task_struct(owner);
return owner;
}
}
#else
static struct task_struct *pi_mutex_waiters_changed(
struct mutex_desc *const m_desc,
struct el_waiter *const old_top_waiter,
struct el_waiter *const new_top_waiter)
{
struct task_struct *const owner = m_desc->owner;
int chain_walk = 0;
if (!owner)
return NULL;
if (old_top_waiter != new_top_waiter) {
raw_spin_lock(&owner->pi_lock);
if (old_top_waiter)
plist_del(&old_top_waiter->pi_list_entry,
&owner->el_posix.pi_waiters);
plist_node_init(&new_top_waiter->pi_list_entry,
new_top_waiter->list_entry.prio);
plist_add(&new_top_waiter->pi_list_entry,
&owner->el_posix.pi_waiters);
if (!old_top_waiter || old_top_waiter->pi_list_entry.prio
!= new_top_waiter->pi_list_entry.prio) {
__mutex_adjust_prio(owner, 1);
if (owner->el_posix.pi_blocked_on)
chain_walk = 1;
}
raw_spin_unlock(&owner->pi_lock);
}
if (!chain_walk) {
return NULL;
} else {
get_task_struct(owner);
return owner;
}
}
#endif
/**
* task_blocks_on_pi_mutex() - is called when current task blocks on
* a PTHREAD_PRIO_INHERIT mutex, checks whether a PI chain parsing
* is needed.
* @task: current task's task_struct.
* @waiter: the pointer to the allocated and initialized el_waiter structure.
* @m_desc: the descriptor of the mutex in question.
* @owner_pid: (only for architectures with ARCH_HAS_ATOMIC_CMPXCHG set)
* contains owner's pid if the mutex was fast locked.
*
 * Returns the pointer to the first task in the PI chain if parsing is needed.
*/
#ifdef ARCH_HAS_ATOMIC_CMPXCHG
static struct task_struct *task_blocks_on_pi_mutex(
struct task_struct *const task,
struct el_waiter *const waiter,
struct mutex_desc *const m_desc,
const int owner_pid)
{
struct task_struct *owner;
struct el_waiter *top_waiter;
int prio, chain_walk = 0;
DbgPos("task_blocks_on_pi_mutex started\n");
init_list_entry:
prio = task->prio;
plist_node_init(&waiter->list_entry, min(prio, MAX_RT_PRIO));
/* The task is blocked on this mutex now. Corresponding
* smp_read_barrier_depends() is called from mutex_adjust_prio_chain()
* and el_posix_adjust_pi(). */
smp_wmb();
/* Now that waiter is initialized, we can set 'pi_blocked_on' field. */
task->el_posix.pi_blocked_on = waiter;
/* There was a small window between reading task->prio and writing
* task->el_posix.pi_blocked_on in which task's priority may have
* changed, so re-read it. This is faster than locking task->pi_lock.
* We may have old priority stored in waiter for some time, but it is
* OK since the m_desc->lock is locked now. */
smp_mb();
if (unlikely(task->prio != prio))
goto init_list_entry;
/* We cannot trust owner_pid because it is read from user space,
* so check m_desc->owner instead. */
if (!m_desc->owner) {
/* Mutex has no owner (it may still have a pending owner).
* This means that either we set m_desc->owner based on
* owner_pid or there is no need to do PI stuff. */
plist_add(&waiter->list_entry, &m_desc->wait_list);
if (unlikely(m_desc->pending_owner))
/* If the new waiter has higher priority than the
* mutex top waiter, it will steal the mutex. */
return NULL;
if (owner_pid == 0)
return NULL;
else
return task_fast_locked_pi_mutex_proxy(
owner_pid, m_desc);
}
/* So, m_desc->owner is set, and that means that PI waiters are
	 * queued. Check if the new waiter has the highest priority. */
/* Remember the top waiter on the lock */
if (mutex_has_waiters(m_desc))
top_waiter = mutex_top_waiter(m_desc);
else
top_waiter = waiter;
/* Add this task to the mutex waitqueue */
plist_add(&waiter->list_entry, &m_desc->wait_list);
/* Check if the top waiter changed and PI adjustments must be made */
if (waiter == mutex_top_waiter(m_desc)) {
/* Mutex top waiter changed, so we must
* change owner's pi_waiters */
owner = m_desc->owner;
raw_spin_lock(&owner->pi_lock);
/* plist_node_init must be called before plist_del, because
* sometimes top_waiter == waiter and pi_list_entry would be
* uninitialized! */
plist_node_init(&waiter->pi_list_entry,
waiter->list_entry.prio);
plist_del(&top_waiter->pi_list_entry,
&owner->el_posix.pi_waiters);
plist_add(&waiter->pi_list_entry, &owner->el_posix.pi_waiters);
__mutex_adjust_prio(owner, 1);
if (owner->el_posix.pi_blocked_on)
chain_walk = 1;
raw_spin_unlock(&owner->pi_lock);
} else {
/* Prevent compiler warning about uninitialized owner. */
owner = NULL;
}
if (!chain_walk) {
return NULL;
} else {
get_task_struct(owner);
return owner;
}
}
#else
static struct task_struct *task_blocks_on_pi_mutex(
struct task_struct *const task,
struct el_waiter *const waiter,
struct mutex_desc *const m_desc)
{
struct task_struct *owner;
struct el_waiter *top_waiter;
int prio, chain_walk = 0;
DbgPos("task_blocks_on_pi_mutex started\n");
init_list_entry:
prio = task->prio;
plist_node_init(&waiter->list_entry, min(prio, MAX_RT_PRIO));
/* The task is blocked on this mutex now. Corresponding
* smp_read_barrier_depends() is called from mutex_adjust_prio_chain()
* and el_posix_adjust_pi(). */
smp_wmb();
/* Now that waiter is initialized, we can set 'pi_blocked_on' field. */
task->el_posix.pi_blocked_on = waiter;
/* There was a small window between reading task->prio and writing
* task->el_posix.pi_blocked_on in which task's priority may have
* changed, so re-read it. This is faster than locking task->pi_lock.
* We may have old priority stored in waiter for some time, but it is
* OK since the m_desc->lock is locked now. */
smp_mb();
if (unlikely(task->prio != prio))
goto init_list_entry;
/* Remember the top waiter on the lock */
if (mutex_has_waiters(m_desc))
top_waiter = mutex_top_waiter(m_desc);
else
top_waiter = waiter;
/* Add this task to the mutex waitqueue */
plist_add(&waiter->list_entry, &m_desc->wait_list);
/* Check if the top waiter changed and PI adjustments must be made */
if (waiter == mutex_top_waiter(m_desc)) {
/* Mutex top waiter changed, so we must
* change owner's pi_waiters */
owner = m_desc->owner;
if (unlikely(owner == NULL))
/* Owner unlocked the mutex or is dead. */
return NULL;
raw_spin_lock(&owner->pi_lock);
/* plist_node_init must be called before plist_del, because
* sometimes top_waiter == waiter and pi_list_entry would be
* uninitialized! */
plist_node_init(&waiter->pi_list_entry,
waiter->list_entry.prio);
plist_del(&top_waiter->pi_list_entry,
&owner->el_posix.pi_waiters);
plist_add(&waiter->pi_list_entry, &owner->el_posix.pi_waiters);
__mutex_adjust_prio(owner, 1);
if (owner->el_posix.pi_blocked_on)
chain_walk = 1;
raw_spin_unlock(&owner->pi_lock);
}
if (!chain_walk) {
return NULL;
} else {
get_task_struct(owner);
return owner;
}
}
#endif
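/*
 * Illustrative sketch (not compiled) of the window that the
 * init_list_entry retry above closes; "booster" stands for any task
 * that changes our priority, e.g. via __mutex_adjust_prio():
 *
 *	blocking task				booster
 *	-------------				-------
 *	prio = task->prio;
 *						task->prio = new_prio;
 *						if (!task->el_posix.pi_blocked_on)
 *							; (nothing to propagate)
 *	task->el_posix.pi_blocked_on = waiter;
 *	smp_mb();
 *	if (task->prio != prio)
 *		goto init_list_entry;	(sees new_prio and retries)
 */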
/**
* give_up_on_pi_mutex() - unqueues the task and undoes PI boosting.
* @task: current's task_struct.
* @waiter: the pointer to the queued el_waiter structure.
 * @m_desc: the descriptor of the mutex in question.
 *
 * Returns the pointer to the first task in the PI chain if parsing is needed.
 */
static struct task_struct *give_up_on_pi_mutex(struct task_struct *const task,
struct el_waiter *const waiter,
struct mutex_desc *const m_desc)
{
int chain_walk = 0;
struct task_struct *const owner = m_desc->owner;
struct el_waiter *old_top_waiter, *new_top_waiter;
raw_spin_lock(&task->pi_lock);
task->el_posix.pi_blocked_on = NULL;
raw_spin_unlock(&task->pi_lock);
waiter->state = NOT_WAITING;
if (!owner) {
plist_del(&waiter->list_entry, &m_desc->wait_list);
		return NULL;
}
old_top_waiter = mutex_top_waiter(m_desc);
plist_del(&waiter->list_entry, &m_desc->wait_list);
if (mutex_has_waiters(m_desc))
new_top_waiter = mutex_top_waiter(m_desc);
else
new_top_waiter = NULL;
if (new_top_waiter != old_top_waiter) {
int has_pi_waiters;
raw_spin_lock(&owner->pi_lock);
plist_del(&old_top_waiter->pi_list_entry,
&owner->el_posix.pi_waiters);
if (new_top_waiter) {
plist_node_init(&new_top_waiter->pi_list_entry,
new_top_waiter->list_entry.prio);
plist_add(&new_top_waiter->pi_list_entry,
&owner->el_posix.pi_waiters);
has_pi_waiters = 1;
} else {
has_pi_waiters = task_has_pi_waiters(owner);
}
__mutex_adjust_prio(owner, has_pi_waiters);
if (owner->el_posix.pi_blocked_on)
chain_walk = 1;
raw_spin_unlock(&owner->pi_lock);
}
if (!chain_walk) {
		return NULL;
} else {
get_task_struct(owner);
return owner;
}
}
/**
* __task_unlocked_pi_mutex() - is called from pthread_mutex_unlock() and
* when task dies, undoes PI boosting on the current task.
* @task: current's task_struct.
* @m_desc: the descriptor of the mutex in question.
*
* The difference between this function and task_unlocked_pi_mutex() is that
* pi_lock is already held here and m_desc->owner is not zeroed.
*/
static void __task_unlocked_pi_mutex(
struct task_struct *const task,
struct mutex_desc *const m_desc)
{
DbgPos("task_unlocked_pi_mutex: contended=%d\n",
mutex_has_waiters(m_desc));
list_del(&m_desc->mutex_list_entry.pi);
if (mutex_has_waiters(m_desc)) {
/* Remove pi_list_entry */
struct el_waiter *top_waiter;
top_waiter = mutex_top_waiter(m_desc);
plist_del(&top_waiter->pi_list_entry,
&task->el_posix.pi_waiters);
__mutex_adjust_prio(task, task_has_pi_waiters(task));
}
}
/**
* task_unlocked_pi_mutex() - is called from pthread_mutex_unlock(),
* undoes PI boosting on the current task.
* @task: current's task_struct.
* @m_desc: the descriptor of the mutex in question.
*/
static void task_unlocked_pi_mutex(
struct task_struct *const task,
struct mutex_desc *const m_desc)
{
raw_spin_lock(&task->pi_lock);
__task_unlocked_pi_mutex(task, m_desc);
raw_spin_unlock(&task->pi_lock);
m_desc->owner = NULL;
}
/**
 * el_posix_adjust_pi() - called from sched_setscheduler(), this function
 * updates the PI chain state and (de)boosts tasks' priorities if needed.
* @task: the task that had its priority changed.
*/
void el_posix_adjust_pi(struct task_struct *task)
{
struct el_waiter *waiter;
unsigned long flags;
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->el_posix.pi_blocked_on;
if (!waiter)
goto out_unlock;
smp_read_barrier_depends();
if (waiter->list_entry.prio == task->prio || (!rt_prio(task->prio) &&
!rt_prio(waiter->list_entry.prio)))
goto out_unlock;
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
get_task_struct(task);
mutex_adjust_prio_chain(task, NULL, task);
return;
out_unlock:
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
}
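/*
 * Call-flow sketch (assumed from the description above; the callers
 * live outside of this file):
 *
 *	sched_setscheduler(task, ...)
 *	    -> el_posix_adjust_pi(task)
 *	        -> mutex_adjust_prio_chain(task, NULL, task)
 *	            which walks task->el_posix.pi_blocked_on -> m_desc->owner
 *	            and so on up the PI chain, (de)boosting as it goes.
 */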
/**
* boost_priority() - boosts current task's priority by queueing it as
* a PI waiter to itself.
* @prio: new effective priority.
* @pi_list_entry: initialized plist node to enqueue in pi_waiters list.
*
 * Must be called with irqs disabled.
*/
static void boost_priority(const int prio,
struct plist_node *const pi_list_entry)
{
struct task_struct *const task = current;
raw_spin_lock(&task->pi_lock);
if (unlikely(!plist_node_empty(pi_list_entry)))
plist_del(pi_list_entry, &task->el_posix.pi_waiters);
pi_list_entry->prio = prio;
plist_add(pi_list_entry, &task->el_posix.pi_waiters);
__mutex_adjust_prio(task, 1);
raw_spin_unlock(&task->pi_lock);
}
/**
* restore_priority() - restore priority back to the original value
* after boost_priority().
* @pi_list_entry: list entry that was passed to boost_priority().
*/
static void restore_priority(struct plist_node *const pi_list_entry)
{
struct task_struct *const task = current;
/* Remove this task from list */
raw_spin_lock_irq(&task->pi_lock);
if (likely(!plist_node_empty(pi_list_entry))) {
plist_del(pi_list_entry, &task->el_posix.pi_waiters);
/* Undo priority boost */
__mutex_adjust_prio(task, task_has_pi_waiters(task));
}
raw_spin_unlock_irq(&task->pi_lock);
}
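/*
 * Usage sketch of the boost/restore pair (illustrative only; it
 * mirrors the pattern used in do_cond_wake() below). The caller
 * temporarily boosts itself to the waiter's priority, drops the
 * spinlock, walks the PI chain and then undoes the boost:
 *
 *	struct plist_node pi_list_entry =
 *			PLIST_NODE_INIT(pi_list_entry, MAX_PRIO-1);
 *	...
 *	boost_priority(waiter->list_entry.prio, &pi_list_entry);
 *	raw_spin_unlock_irq(&m_desc->lock);
 *	mutex_adjust_prio_chain(first_in_pi_chain, m_desc, waiter->task);
 *	restore_priority(&pi_list_entry);
 */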
/*
* Functions used for priority protection.
*/
/**
* __task_locked_pp_mutex() - called when task locks a PP mutex.
* @task: the new owner's task_struct.
* @m_desc: the descriptor of the mutex in question.
*
* task->pi_lock and m_desc->lock must be held.
*/
static void __task_locked_pp_mutex(struct task_struct *const task,
struct mutex_desc *const m_desc)
{
const int prioceiling = (int) m_desc->prioceiling;
struct plist_node *old_top_entry, *new_top_entry;
if (plist_head_empty(&task->el_posix.pp_mutex_list))
old_top_entry = NULL;
else
old_top_entry = plist_first(&task->el_posix.pp_mutex_list);
plist_node_init(&m_desc->mutex_list_entry.pp, prioceiling);
plist_add(&m_desc->mutex_list_entry.pp, &task->el_posix.pp_mutex_list);
new_top_entry = plist_first(&task->el_posix.pp_mutex_list);
if (old_top_entry != new_top_entry) {
/* Boosting priority has changed. */
if (old_top_entry)
plist_del(&task->el_posix.pi_list_entry,
&task->el_posix.pi_waiters);
task->el_posix.pi_list_entry.prio = prioceiling;
plist_add(&task->el_posix.pi_list_entry,
&task->el_posix.pi_waiters);
/* Change priority */
__mutex_adjust_prio(task, 1);
}
}
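/*
 * Worked example (illustrative): a task holding two PP mutexes with
 * kernel prioceilings 60 and 40 has both entries queued on its
 * pp_mutex_list; plist_first() yields the node with prio 40
 * (numerically smaller means higher priority), so pi_list_entry.prio
 * becomes 40 and the task stays boosted to kernel priority 40 until
 * the corresponding mutex is unlocked.
 */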
#if defined ARCH_HAS_ATOMIC_CMPXCHG
/**
* task_locked_pp_mutex_proxy() - called when a PP mutex was fast locked
* by another task and we need to do all the priority protection stuff
* (i.e. to boost owner's priority to the priority ceiling).
* @pid: owner's pid.
* @m_desc: the descriptor of the mutex in question.
*/
static int task_locked_pp_mutex_proxy(const int pid,
struct mutex_desc *const m_desc)
{
struct task_struct *task;
if (unlikely(pid == -1)) {
if (printk_ratelimit())
pr_info("elpthread: possible memory corruption detected"
"in thread %d\n", pid);
return 0;
}
/* Set the task as the new owner */
rcu_read_lock();
task = __find_task_by_pid_check(pid);
if (unlikely(!task)) {
int rval;
owner_dead:
rcu_read_unlock();
DbgPos("el_posix: owner of mutex is dead (pid %d).\n", pid);
/* Now mutex has -1 in '__m_lock' field but has no owner,
* so it can be acquired. */
switch (m_desc->robust) {
case ROBUST:
m_desc->robust = OWNER_DEAD;
rval = -EOWNERDEAD;
break;
case OWNER_DEAD:
WARN_ON_ONCE(1);
rval = -EOWNERDEAD;
break;
case NOT_RECOVERABLE:
WARN_ON_ONCE(1);
rval = -ENOTRECOVERABLE;
break;
default:
rval = 0;
break;
}
return rval;
}
prefetch(&task->flags);
raw_spin_lock(&task->pi_lock);
if (unlikely(task->flags & PF_EXITING)) {
/* This is the only function that deals with PP stuff on behalf
* of another task, so there is no need to check PF_EXITING
* flag anywhere else: we cannot race with ourselves. */
raw_spin_unlock(&task->pi_lock);
goto owner_dead;
}
rcu_read_unlock();
__task_locked_pp_mutex(task, m_desc);
raw_spin_unlock(&task->pi_lock);
m_desc->owner = task;
return 0;
}
#endif
/**
* task_locked_pp_mutex() - called when current locks a PP mutex.
* @task: the current task_struct.
* @m_desc: the descriptor of the mutex in question.
*/
static void task_locked_pp_mutex(struct task_struct *const task,
struct mutex_desc *const m_desc)
{
raw_spin_lock(&task->pi_lock);
__task_locked_pp_mutex(task, m_desc);
raw_spin_unlock(&task->pi_lock);
m_desc->owner = task;
}
/**
* __task_unlocked_pp_mutex() - is called from pthread_mutex_unlock() and
* when task dies, undoes PP boosting on the current task.
* @task: current's task_struct.
* @m_desc: the descriptor of the mutex in question.
*
* The difference between this function and task_unlocked_pp_mutex() is that
* pi_lock is already held and m_desc->owner is not zeroed.
*/
static void __task_unlocked_pp_mutex(struct task_struct *task,
struct mutex_desc *m_desc)
{
struct plist_node *old_top_entry, *new_top_entry;
old_top_entry = plist_first(&current->el_posix.pp_mutex_list);
plist_del(&m_desc->mutex_list_entry.pp,
&current->el_posix.pp_mutex_list);
if (plist_head_empty(&current->el_posix.pp_mutex_list))
new_top_entry = NULL;
else
new_top_entry = plist_first(&current->el_posix.pp_mutex_list);
if (!new_top_entry || new_top_entry->prio != old_top_entry->prio) {
/* Boosting priority has changed. */
plist_del(&current->el_posix.pi_list_entry,
&current->el_posix.pi_waiters);
if (new_top_entry) {
current->el_posix.pi_list_entry.prio =
new_top_entry->prio;
plist_add(&current->el_posix.pi_list_entry,
&current->el_posix.pi_waiters);
}
/* Change priority */
__mutex_adjust_prio(current, task_has_pi_waiters(current));
}
}
/**
* task_unlocked_pp_mutex() - is called from pthread_mutex_unlock(),
* undoes PP boosting on the current task.
* @task: current's task_struct.
* @m_desc: the descriptor of the mutex in question.
*/
static void task_unlocked_pp_mutex(struct task_struct *const task,
struct mutex_desc *m_desc)
{
raw_spin_lock(&task->pi_lock);
__task_unlocked_pp_mutex(task, m_desc);
raw_spin_unlock(&task->pi_lock);
m_desc->owner = NULL;
}
/**
 * do_get_prio_protect() - returns the task's priority including boosting
 * by held PTHREAD_PRIO_PROTECT mutexes.
*
* This function is not required by POSIX and is used for testing
* PTHREAD_PRIO_PROTECT mutexes.
*/
static int do_get_prio_protect(void)
{
struct task_struct *const task = current;
int prio;
prio = task->rt_priority;
if (!plist_head_empty(&task->el_posix.pp_mutex_list)) {
struct plist_node *top_entry;
top_entry = plist_first(&task->el_posix.pp_mutex_list);
prio = max(prio, MAX_RT_PRIO-1 - top_entry->prio);
}
return prio;
}
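/*
 * Worked example (assuming the stock MAX_RT_PRIO of 100): a thread
 * with rt_priority 10 holding a PP mutex whose kernel prioceiling is
 * 49 gets
 *
 *	prio = max(10, 99 - 49) = 50,
 *
 * i.e. the returned value is expressed in user-visible rt_priority
 * units, where bigger means higher priority.
 */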
/**
* handle_fault() - handles a page fault on a given (aligned) address.
* @address: the faulted address.
*/
static int handle_fault(unsigned long address)
{
struct mm_struct *mm = current->mm;
int ret = 0;
down_read(&mm->mmap_sem);
ret = fixup_user_fault(current, mm, (unsigned long) address,
FAULT_FLAG_WRITE);
up_read(&mm->mmap_sem);
DbgPos("handle_fault in el_posix returned %d\n", ret);
return ret;
}
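/*
 * handle_fault() is typically used in the same drop-locks/fix/retry
 * pattern (sketch; see e.g. the handle_fault_in_mutex label in
 * do_cond_wake() below):
 *
 *	raw_spin_unlock_irq(&desc->lock);
 *	el_pagefault_enable();
 *	rval = handle_fault((unsigned long) &mutex->__m_lock);
 *	el_pagefault_disable();
 *	raw_spin_lock_irq(&desc->lock);
 *	if (!rval)
 *		goto restart;
 */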
/* Iterating backwards has superior performance when moving plists. */
#ifndef plist_for_each_entry_safe_reverse
#define plist_for_each_entry_safe_reverse(pos, n, head, m) \
list_for_each_entry_safe_reverse(pos, n, &(head)->node_list, \
m.node_list)
#endif
/**
* requeue_waiters() - moves maximum @count first entries from the plist @from
* at a condition variable to the plist @to at a mutex.
* @from: the list to move.
* @to: where to move.
* @count: how many entries to move.
* @m_desc: the descriptor containing the 'to' plist.
* @protocol: the mutex's protocol (the same as m_desc->protocol).
*/
static void requeue_waiters(struct plist_head *from,
struct plist_head *to, const int count,
struct mutex_desc *m_desc, const char protocol)
{
int moved, prio;
struct el_waiter *this, *tmp;
WARN_ON(plist_head_empty(from));
moved = 0;
plist_for_each_entry_safe_reverse(this, tmp, from, list_entry) {
/* Add this task to the mutex waitqueue */
DbgPos("requeue_waiters: moving task %d\n", this->task->pid);
this->state = WAITING_ON_MUTEX;
plist_del(&this->list_entry, from);
init_list_entry:
prio = this->task->prio;
this->list_entry.prio = min(prio, MAX_RT_PRIO);
if (protocol == PTHREAD_PRIO_INHERIT) {
this->pi_desc = m_desc;
/* The task is blocked on this mutex now.
* Corresponding smp_read_barrier_depends()
* is called from mutex_adjust_prio_chain()
* and el_posix_adjust_pi(). */
smp_wmb();
this->task->el_posix.pi_blocked_on = this;
/* There was a small window between reading
* task->prio and writing el_posix.pi_blocked_on
* in which task's priority may have changed,
* so re-read it. This is faster than locking
* task->pi_lock. We may have old priority
* stored in waiter for some time, but it is
* OK since the m_desc->lock is locked now. */
smp_mb();
if (unlikely(this->task->prio != prio))
goto init_list_entry;
}
plist_add(&this->list_entry, to);
if (unlikely(++moved >= count))
break;
}
return;
}
/**
* try_to_lock_mutex_proxy() - check mutex->__m_lock field to see whether
* the first waiter in mutex's waitqueue should be woken up.
* @mutex: the mutex in question.
* @m_desc: descriptor of the mutex.
* @protocol: priority protection protocol of the mutex (the same as
* m_desc->protocol).
*
* try_to_lock_mutex_proxy() should be called after requeue_waiters() -
* i.e. we first move waiters and then check whether to wake up the first one.
*
* Returns zero on success (if the mutex is available).
*
* Must be called with m_desc->lock held.
*/
static __always_inline int try_to_lock_mutex_proxy(
struct pthread_mutex_s *const mutex,
struct mutex_desc *const m_desc,
const char protocol)
{
int rval, oldval;
switch (protocol) {
case PTHREAD_PRIO_NONE:
/* Try to acquire the mutex for the thread-to-be-woken */
if (unlikely(__get_user(rval, &mutex->__m_lock)))
return -EFAULT;
if (likely(rval != -1)) {
rval = el_atomic_xchg_acq(oldval, &mutex->__m_lock, -1);
if (likely(!rval))
rval = !!oldval;
}
break;
#if !defined ARCH_HAS_ATOMIC_CMPXCHG
case PTHREAD_PRIO_INHERIT:
case PTHREAD_PRIO_PROTECT:
rval = m_desc->owner || m_desc->pending_owner;
break;
#endif
default:
BUG();
}
return rval;
}
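/*
 * Semantics sketch of the exchange above (assuming, as the error paths
 * here imply, that el_atomic_xchg_acq() returns non-zero only on a
 * fault): after
 *
 *	rval = el_atomic_xchg_acq(oldval, &mutex->__m_lock, -1);
 *
 * a successful call leaves -1 in __m_lock and the previous value in
 * oldval, so "!!oldval" is 0 exactly when the mutex was free and has
 * now been proxy-locked for the waiter about to be woken.
 */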
/* How many threads to move at once before re-enabling interrupts
* (smaller values improve irqs-off latency and decrease throughput of
* pthread_cond_broadcast() and pthread_barrier_wait()). Must be
* greater than 1. */
#define MOVE_AT_MOST 8
/* How many threads to wake at once */
#define WAKE_AT_MOST 2
/**
* do_cond_wake() - implements pthread_cond_signal() and
* pthread_cond_broadcast() functionality.
* @cond: the condition variable in question.
* @__c_desc: the key for finding condition variable's descriptor
* (the same as cond->__c_desc).
 * @up_mode:
 * 1 == MOVE_TO_MUTEX_ONE - move one waiter, wake it if the mutex is
 * available,
 * 0 == MOVE_TO_MUTEX_ALL - move all waiters, wake the first one if the
 * mutex is available.
*
* do_cond_wake() moves one or all threads waiting on the condition
* to the corresponding mutex and wakes up the first one of them if
* the mutex can be locked.
*/
static int do_cond_wake(
struct pthread_cond_s *const cond,
const int __c_desc,
const int up_mode)
{
struct pthread_mutex_s *mutex;
struct task_struct *first_in_pi_chain = NULL;
struct task_struct *const task = current;
struct cond_desc *const c_desc = cond_once(task, cond, __c_desc);
struct mutex_desc *m_desc;
struct el_waiter *waiter = NULL, *old_top_waiter, *temp_waiter;
struct plist_head detached_list;
struct plist_node pi_list_entry = PLIST_NODE_INIT(pi_list_entry,
MAX_PRIO-1);
int i, rval, do_wake_up = 0, waiting_owner;
#if defined ARCH_HAS_ATOMIC_CMPXCHG
int oldval;
#endif
restart:
DbgPos("do_cond_wake: up_mode %d, cond %p, c_desc %p\n",
up_mode, cond, c_desc);
if (unlikely(IS_ERR(c_desc)))
return PTR_ERR(c_desc);
el_pagefault_disable();
raw_spin_lock_irq(&c_desc->lock);
if (unlikely(__get_user(mutex, &cond->__c_mutex)))
goto handle_fault_in_condition;
/* There is no need to check with desc_in_use() here because we
* test whether the c_desc->wait_list is empty, and that check
* is enough (if it is not empty then desc_in_use() == 1, and
* if it is we do not care). */
if (desc_check_type(c_desc, CONDITION)
|| (desc_get_object(c_desc, CONDITION) != cond &&
desc_private(c_desc))) {
rval = -EINVAL;
goto out_error_unlock_cond;
}
if (unlikely(!mutex))
goto out_success_unlock_cond;
/* This check also ensures that c_desc->m_desc is not NULL. */
if (unlikely(plist_head_empty(&c_desc->wait_list))) {
/* Although the wait queue is empty, it is still possible that
* __c_mutex is not NULL (if fork() happened while there were
* waiters). Since we are at this instruction and we tested
* __c_mutex before, it actually happened. Make sure
* cond->__c_mutex has the up-to-date information. */
if (unlikely(__put_user(NULL, &cond->__c_mutex)))
goto handle_fault_in_condition;
goto out_success_unlock_cond;
}
m_desc = (void *) ((unsigned long) c_desc->m_desc & ~1UL);
waiting_owner = (int) ((unsigned long) c_desc->m_desc & 1UL);
/* For shared mutexes there is no fast way to retrieve pointer
* to the mutex for the current process (it might even be
* impossible). That's why for shared mutexes we do not try
* to take the mutex by ourselves and just wake the waiter. */
if (desc_private(m_desc)) {
/* Private mutex. */
if (BAD_USER_REGION(mutex, struct pthread_mutex_s)) {
DbgPos("do_cond_wake: bad private mutex address %p "
"(c_desc->m_desc=%p)\n",
mutex, c_desc->m_desc);
rval = -EINVAL;
goto out_error_unlock_cond;
}
}
switch (up_mode) {
case MOVE_TO_MUTEX_ONE:
waiter = plist_first_entry(&c_desc->wait_list,
struct el_waiter, list_entry);
if (c_desc->wait_list.node_list.next->next
== &c_desc->wait_list.node_list) {
/* Since there are no more threads waiting on
* condition, disassociate the mutex from it */
if (unlikely(__put_user(NULL, &cond->__c_mutex)))
goto handle_fault_in_condition;
c_desc->m_desc = NULL;
}
if ((!desc_private(m_desc) &&
#ifndef ARCH_HAS_ATOMIC_CMPXCHG
m_desc->protocol == PTHREAD_PRIO_NONE &&
#endif
!mutex_has_waiters(m_desc))
|| unlikely(waiting_owner)) {
/* Special case with locked recursive mutex. We should
			 * not move the waiter because it already holds the mutex.
* No need for priority inheritance stuff here.
*
* We also cannot move the waiter if the mutex
* is shared (this does not apply to mutexes
* located entirely in kernel space and mutexes
* with waiters which do not require access to
* mutex->__m_lock). */
c_desc->m_desc = (void *) ((unsigned long)
c_desc->m_desc & ~1UL);
just_wake_waiter:
plist_del(&waiter->list_entry, &c_desc->wait_list);
waiter->state = NOT_WAITING;
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
raw_spin_unlock_irq(&c_desc->lock);
el_pagefault_enable();
break;
}
/* We detach waiter from the descriptor so that if
* atomic operation on mutex fails, waiter will be
* able to wake itself. */
plist_del(&waiter->list_entry, &c_desc->wait_list);
INIT_LIST_HEAD(&waiter->list_entry.prio_list);
waiter->list_entry.node_list.next = &detached_list.node_list;
waiter->list_entry.node_list.prev = &detached_list.node_list;
detached_list.node_list.next = &waiter->list_entry.node_list;
detached_list.node_list.prev = &waiter->list_entry.node_list;
continue_signal:
raw_spin_lock(&m_desc->lock);
if (check_desc(m_desc, MUTEX, mutex)
|| unlikely(m_desc->robust == NOT_RECOVERABLE)
#if defined ARCH_HAS_ATOMIC_CMPXCHG
|| (unlikely(!desc_private(m_desc))
#else
|| (unlikely(!desc_private(m_desc) &&
m_desc->protocol == PTHREAD_PRIO_NONE)
#endif
&& !mutex_has_waiters(m_desc))) {
/* Oops. check_desc() should not fail in good user
			 * programs. Wake the waiter and let it sort it out.
* It does not matter that the waiter was moved:
* we still hold the condition variable's spinlock.
*
* If the mutex is in OWNER_DEAD state then it has
* an owner that will take care of everything, so
* there is no need to check for this case.
*
* Also check for shared mutex again since the previous
* check was done without holding the spinlock. */
raw_spin_unlock(&m_desc->lock);
goto just_wake_waiter;
}
switch (m_desc->protocol) {
case PTHREAD_PRIO_NONE:
/* We do not check here whether the mutex is shared or
* whether its owner died because we already know that
* if it is shared (or its owner died) then it has
* waiters (or a new owner), and the code below does
* exactly what we want to do in this case. */
if (mutex_has_waiters(m_desc)) {
/* The mutex has waiters so there is no point
* in trying to fast lock it. */
do_wake_up = 0;
} else {
do_wake_up = try_to_lock_mutex_proxy(mutex,
m_desc, PTHREAD_PRIO_NONE);
if (unlikely(do_wake_up == -EFAULT))
goto handle_fault_in_mutex;
do_wake_up = !do_wake_up;
}
plist_node_init(&waiter->list_entry,
waiter->list_entry.prio);
plist_add(&waiter->list_entry, &m_desc->wait_list);
/* State is changing under both spinlocks */
waiter->state = WAITING_ON_MUTEX;
raw_spin_unlock(&c_desc->lock);
if (do_wake_up || task_can_steal_mutex(m_desc,
waiter->task)) {
				/* Mutex is free or can be stolen */
m_desc->pending_owner = waiter->task;
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
}
raw_spin_unlock_irq(&m_desc->lock);
el_pagefault_enable();
break;
case PTHREAD_PRIO_INHERIT:
#if defined ARCH_HAS_ATOMIC_CMPXCHG
/* We do not check here whether the mutex is shared or
* whether its owner died because we already know that
* if it is shared (or its owner died) then it has
* waiters (or a new owner), and the code below does
* exactly what we want to do in this case. */
if (mutex_has_waiters(m_desc)) {
/* The mutex has waiters so we know already
* that mutex->__m_lock == -1. */
oldval = -1;
} else {
rval = el_atomic_xchg_acq(oldval,
&mutex->__m_lock, -1);
if (unlikely(rval))
goto handle_fault_in_mutex;
}
#endif
plist_node_init(&waiter->list_entry,
waiter->list_entry.prio);
waiter->state = WAITING_ON_MUTEX;
raw_spin_unlock(&c_desc->lock);
#if defined ARCH_HAS_ATOMIC_CMPXCHG
waiter->pi_desc = m_desc;
first_in_pi_chain = task_blocks_on_pi_mutex(
waiter->task, waiter, m_desc, oldval);
if (unlikely(IS_ERR(first_in_pi_chain))) {
/* PTR_ERR(first_in_pi_chain) == -EOWNERDEAD */
first_in_pi_chain = NULL;
do_wake_up = 1;
} else if (unlikely(first_in_pi_chain)) {
/* Mutex has an owner, thus it cannot
				 * be stolen. */
boost_priority(waiter->list_entry.prio,
&pi_list_entry);
do_wake_up = 0;
} else {
do_wake_up = !oldval || task_can_steal_mutex(
m_desc, waiter->task);
}
#else
waiter->pi_desc = m_desc;
first_in_pi_chain = task_blocks_on_pi_mutex(
waiter->task, waiter, m_desc);
if (unlikely(first_in_pi_chain)) {
boost_priority(waiter->list_entry.prio,
&pi_list_entry);
do_wake_up = 0;
} else {
do_wake_up = !m_desc->owner
&& (!m_desc->pending_owner
|| m_desc->pending_owner->prio
> waiter->task->prio);
}
#endif
if (do_wake_up) {
				/* Mutex is free or can be stolen */
m_desc->pending_owner = waiter->task;
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
}
raw_spin_unlock_irq(&m_desc->lock);
el_pagefault_enable();
if (unlikely(first_in_pi_chain)) {
WARN_ON(do_wake_up);
mutex_adjust_prio_chain(first_in_pi_chain,
m_desc, waiter->task);
restore_priority(&pi_list_entry);
}
break;
case PTHREAD_PRIO_PROTECT:
#if defined ARCH_HAS_ATOMIC_CMPXCHG
/* We do not check here whether the mutex is shared or
* whether its owner died because we already know that
* if it is shared (or its owner died) then it has
* waiters (or a new owner), and the code below does
* exactly what we want to do in this case. */
if (mutex_has_waiters(m_desc)) {
/* The mutex has waiters so we know already
* that mutex->__m_lock == -1. */
oldval = -1;
} else {
rval = el_atomic_xchg_acq(oldval,
&mutex->__m_lock, -1);
if (unlikely(rval))
goto handle_fault_in_mutex;
}
#endif
plist_node_init(&waiter->list_entry,
waiter->list_entry.prio);
plist_add(&waiter->list_entry, &m_desc->wait_list);
waiter->state = WAITING_ON_MUTEX;
raw_spin_unlock(&c_desc->lock);
#if defined ARCH_HAS_ATOMIC_CMPXCHG
if (unlikely(oldval > 0)) {
rval = task_locked_pp_mutex_proxy(oldval,
m_desc);
if (unlikely(rval)) {
/* rval == -EOWNERDEAD */
do_wake_up = 1;
} else {
do_wake_up = task_can_steal_mutex(
m_desc, waiter->task);
}
} else {
do_wake_up = !oldval || task_can_steal_mutex(
m_desc, waiter->task);
}
#else
do_wake_up = !m_desc->owner && (!m_desc->pending_owner
|| task_can_steal_mutex(m_desc,
waiter->task));
#endif
if (do_wake_up) {
				/* Mutex is free or it can be stolen */
m_desc->pending_owner = waiter->task;
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
}
raw_spin_unlock_irq(&m_desc->lock);
el_pagefault_enable();
break;
}
break;
case MOVE_TO_MUTEX_ALL:
/* Wake up one thread and move others from the
* waitqueue in condition to the waitqueue in mutex */
if (unlikely(__put_user(NULL, &cond->__c_mutex)))
goto handle_fault_in_condition;
/* Detach the list of waiting threads from
* the condition variable so that we can
* later drop the spinlock */
detached_list = c_desc->wait_list;
detached_list.node_list.next->prev = &detached_list.node_list;
detached_list.node_list.prev->next = &detached_list.node_list;
c_desc->m_desc = NULL;
plist_head_init(&c_desc->wait_list);
continue_broadcast:
raw_spin_lock(&m_desc->lock);
if (check_desc(m_desc, MUTEX, mutex)
|| unlikely(m_desc->robust == NOT_RECOVERABLE)
#if defined ARCH_HAS_ATOMIC_CMPXCHG
|| (unlikely(!desc_private(m_desc))
#else
|| (unlikely(!desc_private(m_desc) &&
m_desc->protocol == PTHREAD_PRIO_NONE)
#endif
&& !mutex_has_waiters(m_desc))) {
			/* check_desc() should not fail in good user programs;
			 * in this case we wake all the waiters and let them
			 * sort it out.
*
* If the mutex is shared and has no waiters then we
* do not have access to mutex->__m_lock field and
* the only solution is to wake all waiters.
*
* If the mutex is in OWNER_DEAD state then it has
* an owner that will take care of everything, so
* there is no need to check for this case.
*
* If the mutex is in a not recoverable state none of
* the waiters should be blocked. */
struct el_waiter *this, *tmp;
int i = 0;
DbgPos("do_wake_up: unlikely case, just wake them all\n");
/* m_desc is not needed, so we release the lock. */
raw_spin_unlock(&m_desc->lock);
/* Wake all waiters. */
plist_for_each_entry_safe(this, tmp, &detached_list,
list_entry) {
plist_del(&this->list_entry, &detached_list);
this->state = NOT_WAITING;
wake_up_state(this->task, TASK_INTERRUPTIBLE);
if (unlikely(++i >= WAKE_AT_MOST))
break;
}
raw_spin_unlock_irq(&c_desc->lock);
el_pagefault_enable();
goto broadcast_iteration_end;
}
if (unlikely(waiting_owner)) {
			/* This is a special case: when a thread calls
			 * pthread_cond_wait() while holding the mutex, it may
			 * not release the mutex, so we wake the owner. We know
			 * that the owner is the first in the list because
			 * it was queued with priority -1. */
DbgPos("do_cond_wake: owner is waiting\n");
waiter = plist_first_entry(&detached_list,
struct el_waiter, list_entry);
waiting_owner = 0;
plist_del(&waiter->list_entry, &detached_list);
waiter->state = NOT_WAITING;
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
}
/* Actually move tasks from the condition to the mutex */
switch (m_desc->protocol) {
case PTHREAD_PRIO_NONE:
/* We do not check here whether the mutex is shared or
* whether its owner died because we already know that
* if it is shared (or its owner died) then it has
* waiters (or a new owner), and the code below does
* exactly what we want to do in this case. */
if (mutex_has_waiters(m_desc)) {
DbgPos("do_cond_wake: mutex 0x%lx has waiters\n",
mutex);
/* The mutex has waiters so there is no point
* in trying to fast lock it. */
do_wake_up = 0;
} else {
do_wake_up = try_to_lock_mutex_proxy(mutex,
m_desc, PTHREAD_PRIO_NONE);
if (unlikely(do_wake_up == -EFAULT))
goto handle_fault_in_mutex;
do_wake_up = !do_wake_up;
DbgPos("do_cond_wake: proxy locking was %ssuccessfull\n",
(do_wake_up) ? "" : "un");
}
waiter = plist_first_entry(&detached_list,
struct el_waiter, list_entry);
requeue_waiters(&detached_list, &m_desc->wait_list,
MOVE_AT_MOST, m_desc, PTHREAD_PRIO_NONE);
raw_spin_unlock(&c_desc->lock);
break;
case PTHREAD_PRIO_INHERIT:
#if defined ARCH_HAS_ATOMIC_CMPXCHG
/* We do not check here whether the mutex is shared or
* whether its owner died because we already know that
* if it is shared (or its owner died) then it has
* waiters (or a new owner), and the code below does
* exactly what we want to do in this case. */
if (mutex_has_waiters(m_desc)) {
old_top_waiter = mutex_top_waiter(m_desc);
oldval = -1;
} else {
if (unlikely(__get_user(oldval,
&mutex->__m_lock)))
goto handle_fault_in_mutex;
if (likely(oldval != -1)) {
rval = el_atomic_xchg_acq(oldval,
&mutex->__m_lock, -1);
if (unlikely(rval))
goto handle_fault_in_mutex;
}
old_top_waiter = NULL;
}
requeue_waiters(&detached_list, &m_desc->wait_list,
MOVE_AT_MOST, m_desc, PTHREAD_PRIO_INHERIT);
#else
if (mutex_has_waiters(m_desc))
old_top_waiter = mutex_top_waiter(m_desc);
else
old_top_waiter = NULL;
requeue_waiters(&detached_list, &m_desc->wait_list,
MOVE_AT_MOST, m_desc, PTHREAD_PRIO_INHERIT);
#endif
raw_spin_unlock(&c_desc->lock);
waiter = mutex_top_waiter(m_desc);
#if defined ARCH_HAS_ATOMIC_CMPXCHG
first_in_pi_chain = pi_mutex_waiters_changed(
m_desc, old_top_waiter, waiter, oldval);
if (unlikely(IS_ERR(first_in_pi_chain))) {
WARN_ON(PTR_ERR(first_in_pi_chain)
== -ENOTRECOVERABLE);
/* PTR_ERR(first_in_pi_chain) == -EOWNERDEAD */
first_in_pi_chain = NULL;
do_wake_up = 1;
} else if (unlikely(first_in_pi_chain)) {
boost_priority(waiter->list_entry.prio,
&pi_list_entry);
do_wake_up = 0;
} else {
do_wake_up = !oldval;
}
#else
first_in_pi_chain = pi_mutex_waiters_changed(
m_desc, old_top_waiter, waiter);
if (unlikely(first_in_pi_chain))
boost_priority(waiter->list_entry.prio,
&pi_list_entry);
/* 'mutex' parameter is not used here. */
do_wake_up = !try_to_lock_mutex_proxy(NULL,
m_desc, PTHREAD_PRIO_INHERIT);
#endif
break;
case PTHREAD_PRIO_PROTECT:
#if defined ARCH_HAS_ATOMIC_CMPXCHG
/* We do not check here whether the mutex is shared or
* whether its owner died because we already know that
* if it is shared (or its owner died) then it has
* waiters (or a new owner), and the code below does
* exactly what we want to do in this case. */
if (mutex_has_waiters(m_desc)) {
/* The mutex has waiters so we know already
* that mutex->__m_lock == -1. */
oldval = -1;
} else {
if (unlikely(__get_user(oldval,
&mutex->__m_lock)))
goto handle_fault_in_mutex;
if (likely(oldval != -1)) {
rval = el_atomic_xchg_acq(oldval,
&mutex->__m_lock, -1);
if (unlikely(rval))
goto handle_fault_in_mutex;
}
}
#endif
/* Move threads waiting on the condition
* to the mutex waitqueue */
waiter = plist_first_entry(&detached_list,
struct el_waiter, list_entry);
requeue_waiters(&detached_list, &m_desc->wait_list,
MOVE_AT_MOST, m_desc,
PTHREAD_PRIO_PROTECT);
raw_spin_unlock(&c_desc->lock);
#if defined ARCH_HAS_ATOMIC_CMPXCHG
if (unlikely(oldval > 0)) {
rval = task_locked_pp_mutex_proxy(oldval,
m_desc);
if (unlikely(rval)) {
/* rval == -EOWNERDEAD */
do_wake_up = 1;
}
} else {
do_wake_up = !oldval;
}
#else
/* 'mutex' parameter is not used here. */
do_wake_up = !try_to_lock_mutex_proxy(NULL,
m_desc, PTHREAD_PRIO_PROTECT);
#endif
break;
}
if (do_wake_up || task_can_steal_mutex(m_desc, waiter->task)) {
			/* Mutex is free or can be stolen */
DbgPos("do_cond_wake: waking %d thread\n",
waiter->task->pid);
m_desc->pending_owner = waiter->task;
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
}
raw_spin_unlock_irq(&m_desc->lock);
el_pagefault_enable();
if (m_desc->protocol == PTHREAD_PRIO_INHERIT
&& unlikely(first_in_pi_chain)) {
mutex_adjust_prio_chain(first_in_pi_chain,
m_desc, waiter->task);
restore_priority(&pi_list_entry);
}
broadcast_iteration_end:
if (unlikely(!plist_head_empty(&detached_list))) {
cpu_relax();
raw_spin_lock_irq(&c_desc->lock);
if (!plist_head_empty(&detached_list)) {
el_pagefault_disable();
goto continue_broadcast;
}
raw_spin_unlock_irq(&c_desc->lock);
}
break;
}
goto out_success;
out_success_unlock_cond:
raw_spin_unlock_irq(&c_desc->lock);
el_pagefault_enable();
out_success:
DbgPos("do_cond_wake end\n");
return 0;
out_error_unlock_cond:
raw_spin_unlock_irq(&c_desc->lock);
el_pagefault_enable();
out_error:
DbgPos("do_cond_wake end, error=%d\n", rval);
return rval;
handle_fault_in_condition:
raw_spin_unlock_irq(&c_desc->lock);
el_pagefault_enable();
rval = handle_fault((unsigned long) &cond->__c_mutex);
if (!rval)
goto restart;
else
goto out_error;
handle_fault_in_mutex:
raw_spin_unlock(&m_desc->lock);
raw_spin_unlock_irq(&c_desc->lock);
el_pagefault_enable();
rval = handle_fault((unsigned long) &mutex->__m_lock);
el_pagefault_disable();
raw_spin_lock_irq(&c_desc->lock);
if (rval)
goto wake_all_in_detached_list;
if (!plist_head_empty(&detached_list)) {
switch (up_mode) {
case MOVE_TO_MUTEX_ONE:
goto continue_signal;
case MOVE_TO_MUTEX_ALL:
goto continue_broadcast;
}
} else {
DbgPos("do_cond_wake end after fault\n");
return 0;
}
wake_all_in_detached_list:
i = 0;
plist_for_each_entry_safe_reverse(waiter, temp_waiter, &detached_list,
list_entry) {
plist_del(&waiter->list_entry, &detached_list);
waiter->state = NOT_WAITING;
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
if (++i >= WAKE_AT_MOST)
break;
}
if (!plist_head_empty(&detached_list)) {
raw_spin_unlock_irq(&c_desc->lock);
cpu_relax();
raw_spin_lock_irq(&c_desc->lock);
goto wake_all_in_detached_list;
}
rval = -EFAULT;
goto out_error_unlock_cond;
}
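/*
 * Mapping sketch (assumed; the actual dispatch is done by the
 * el_pthread library through the el_posix() system call):
 *
 *	pthread_cond_signal(cond)    -> do_cond_wake(cond, __c_desc,
 *						     MOVE_TO_MUTEX_ONE);
 *	pthread_cond_broadcast(cond) -> do_cond_wake(cond, __c_desc,
 *						     MOVE_TO_MUTEX_ALL);
 */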
/**
* schedule_with_timeout() - sleep with an absolute timeout.
* @task: the current task_struct.
* @clock_id: the clock to use for an alarm.
* @abstime: absolute timeout.
*/
static int schedule_with_timeout(struct task_struct *task,
clockid_t clock_id,
struct timespec_64 *abstime)
{
struct hrtimer_sleeper hrtimer;
ktime_t k_abstime = ktime_set(abstime->tv_sec, abstime->tv_nsec);
int timedout = 0;
hrtimer_init_on_stack(&hrtimer.timer, clock_id, HRTIMER_MODE_ABS);
hrtimer_init_sleeper(&hrtimer, task);
hrtimer_set_expires_range_ns(&hrtimer.timer, k_abstime,
current->timer_slack_ns);
DbgPos("hrtimer set with slack of %ld\n", current->timer_slack_ns);
hrtimer_start_expires(&hrtimer.timer, HRTIMER_MODE_ABS);
if (!hrtimer_active(&hrtimer.timer))
hrtimer.task = NULL;
if (hrtimer.task)
schedule();
hrtimer_cancel(&hrtimer.timer);
if (hrtimer.task == NULL)
timedout = 1;
destroy_hrtimer_on_stack(&hrtimer.timer);
return timedout;
}
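/*
 * Usage sketch (mirrors __do_mutex_timedlock() below): the caller sets
 * TASK_INTERRUPTIBLE itself and treats a non-zero return value as
 * grounds for -ETIMEDOUT:
 *
 *	set_task_state(task, TASK_INTERRUPTIBLE);
 *	waiter.timedout = schedule_with_timeout(task, CLOCK_REALTIME,
 *						&iabstime);
 *	__set_task_state(task, TASK_RUNNING);
 */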
/**
* give_up_on_mutex() - stop waiting on mutex.
* @mutex: the mutex in question.
* @m_desc: the mutex's descriptor.
* @waiter: the pointer to the used el_waiter structure.
*
* Must be called with m_desc->lock held.
*/
static int give_up_on_mutex(struct pthread_mutex_s *mutex,
struct mutex_desc *m_desc, struct el_waiter *waiter)
{
struct task_struct *first_in_pi_chain = NULL;
int rval = 0;
if (unlikely(m_desc->robust > ROBUST))
goto skip_fixing_user_space;
restart:
/* Change synchronization variable as needed */
switch (m_desc->protocol) {
#if defined ARCH_HAS_ATOMIC_CMPXCHG
case PTHREAD_PRIO_INHERIT:
case PTHREAD_PRIO_PROTECT:
#endif
case PTHREAD_PRIO_NONE:
if (m_desc->wait_list.node_list.next->next ==
&m_desc->wait_list.node_list) {
/* There are no other waiters. */
if (unlikely(m_desc->pending_owner == current)) {
/* If only we were in the waitqueue and we were
* the pending owner, mutex will be free. */
if (unlikely(__put_user(0, &mutex->__m_lock)))
goto handle_fault;
}
#if defined ARCH_HAS_ATOMIC_CMPXCHG
/* When ARCH_HAS_ATOMIC_CMPXCHG is not defined
* we race here with zeroing __m_lock in
* pthread_mutex_unlock() in userspace library
* (it is still possible to use an atomic
* instruction here though). */
else if (m_desc->protocol == PTHREAD_PRIO_NONE) {
/* So mutex has an owner and the last waiter
* is leaving. Set __m_lock to 1 so that the
* owner will be able to fast unlock it. */
if (unlikely(__put_user(1, &mutex->__m_lock)))
goto handle_fault;
}
#endif
}
break;
}
skip_fixing_user_space:
/* Unqueue this task from mutex */
if (m_desc->protocol != PTHREAD_PRIO_INHERIT) {
plist_del(&waiter->list_entry, &m_desc->wait_list);
waiter->state = NOT_WAITING;
} else {
first_in_pi_chain = give_up_on_pi_mutex(waiter->task,
waiter, m_desc);
}
if (unlikely(m_desc->pending_owner == current)) {
/* This task was set as the pending owner,
* give the mutex to the next waiter */
if (mutex_has_waiters(m_desc)) {
struct task_struct *to_wake =
plist_first_entry(&m_desc->wait_list,
struct el_waiter, list_entry)->task;
m_desc->pending_owner = to_wake;
wake_up_state(to_wake, TASK_INTERRUPTIBLE);
} else {
m_desc->pending_owner = NULL;
}
}
DbgPos("give_up_on_mutex: mutex %p unqueued\n", mutex);
raw_spin_unlock_irq(&m_desc->lock);
el_pagefault_enable();
if (first_in_pi_chain)
mutex_adjust_prio_chain(first_in_pi_chain, m_desc,
waiter->task);
return rval;
handle_fault:
raw_spin_unlock_irq(&m_desc->lock);
el_pagefault_enable();
rval = handle_fault((unsigned long) &mutex->__m_lock);
el_pagefault_disable();
raw_spin_lock_irq(&m_desc->lock);
if (!rval)
goto restart;
else
goto skip_fixing_user_space;
}
/**
* normal_prio() - calculate the expected normal priority, i.e. priority
* without taking priority inheritance and priority protection into
* account. Returned priority is "kernel priority", i.e. 0 is the
* highest possible priority.
* @p: the task in question.
*/
static __always_inline int normal_prio(struct task_struct *p)
{
int prio;
if (likely(p->policy == SCHED_FIFO || p->policy == SCHED_RR))
prio = MAX_RT_PRIO-1 - p->rt_priority;
else
prio = p->static_prio;
return prio;
}
/* Be careful: if this function fails, it does not enable interrupts
 * or check for preemption. Must be called with the spinlock held. */
static int try_to_take_mutex(struct task_struct *const task,
struct pthread_mutex_s *const mutex,
struct mutex_desc *const m_desc,
struct el_waiter *const waiter)
{
#if DEBUG_POSIX
if (m_desc->pending_owner)
DbgPos("try_to_take_mutex: pending_owner %d (prio %d)\n",
m_desc->pending_owner->pid,
m_desc->pending_owner->prio);
else
DbgPos("try_to_take_mutex: no pending_owner\n");
#endif
if (likely(m_desc->pending_owner == task) ||
task_can_steal_mutex(m_desc, task)) {
/* Mutex is ours */
m_desc->pending_owner = NULL;
plist_del(&waiter->list_entry, &m_desc->wait_list);
waiter->state = NOT_WAITING;
switch (m_desc->protocol) {
case PTHREAD_PRIO_INHERIT:
/* Check if there are other waiters */
#if defined ARCH_HAS_ATOMIC_CMPXCHG
if (plist_head_empty(&m_desc->wait_list)
&& likely(m_desc->robust <= ROBUST)
&& likely(!__put_user(task->pid,
&mutex->__m_lock))) {
task_slow_locked_pi_mutex(task, m_desc, 1);
} else {
task_slow_locked_pi_mutex(task, m_desc, 0);
}
#else
task_slow_locked_pi_mutex(task, m_desc);
#endif
break;
case PTHREAD_PRIO_PROTECT:
#if defined ARCH_HAS_ATOMIC_CMPXCHG
if (plist_head_empty(&m_desc->wait_list)
&& likely(m_desc->robust <= ROBUST)
&& likely(!__put_user(task->pid,
&mutex->__m_lock))) {
/* Now the mutex has pid in mutex->__m_lock
* field and it has no waiters, so there is no
* need to boost current's priority: if some
* task blocks on the mutex then it will raise
* current's priority, otherwise boosting will
* just waste CPU time. */
} else
#endif
task_locked_pp_mutex(task, m_desc);
break;
case PTHREAD_PRIO_NONE:
/* Check if there are other waiters */
if (plist_head_empty(&m_desc->wait_list))
/* If this fails, fast unlocking
* will not be possible */
__put_user(1, &mutex->__m_lock);
break;
}
return 0;
} else {
		/* Either we caught a signal or the mutex was stolen. */
return 1;
}
}
/**
* __do_mutex_timedlock() - implements pthread_mutex_(timed)lock().
* @mutex: the mutex to be locked.
* @abstime: absolute timeout for CLOCK_REALTIME.
* @task: points to current task_struct.
* @m_desc: the mutex's descriptor.
*
* mutex->__m_lock field is used to synchronize threads. It is
* interpreted differently depending on whether the architecture
* has atomic compare-and-swap instruction and on the type of the
* mutex in question.
*
*
* 1) PTHREAD_PRIO_NONE mutex:
*
* 1.1) ARCH_HAS_ATOMIC_CMPXCHG is set.
* __m_lock == 0 - the mutex is free and there are no waiters.
* It can be acquired promptly from userspace with
* cmpxchg(&__m_lock,0,1).
* __m_lock == 1 - the mutex is locked but has no waiters.
* It can be freed promptly from userspace with
* cmpxchg(&__m_lock,1,0).
* __m_lock == -1 - the mutex is locked and may have waiters. It cannot
* be freed without checking the waitqueue
* (i.e. without a system call).
* Transition 0->1 and 1->0 are the only ones allowed to happen in
* userspace. All other transitions happen in kernel under the mutex's
* spinlock. That's why when we see that __m_lock == -1 and the waitqueue is
* empty while holding spinlock, we can just write '1' into __m_lock to
* permit fast unlocking.
* This is done, for example, when contention for a lock is low
 * and only one thread is waiting for it: when the owner gives the mutex to
 * this waiter, the waiter writes '1' before unlocking the spinlock (see
* try_to_take_mutex()). This optimization is also used in
* cond_signal/broadcast (see do_cond_wake() and try_to_lock_mutex_proxy()).
*
* 1.2) ARCH_HAS_ATOMIC_CMPXCHG is not set.
* __m_lock == 0 - the mutex is free and can be acquired promptly with
* xchg(&__m_lock,1). NOTE: In contrast to 1.1 (see above)
* the mutex may have waiters! And if it has, at least one
* of them is running and trying to acquire the mutex.
* __m_lock == 1 - the mutex is locked and can be freed promptly from
* userspace with xchg(&__m_lock,0). NOTE: Like with
* __m_lock == 0 it may have waiters!
* __m_lock == -1 - the mutex is locked and may have waiters.
* It cannot be freed without checking the waitqueue
* (i.e. without a system call).
* All transitions are allowed to happen in userspace (*->-1 only happens in
* mutex_trylock, *->1 happens in mutex_lock and *->0 happens in mutex_unlock),
* so the same technique as in 1.1 above can be used in try_to_take_mutex()
* and cond_signal/broadcast.
* When a thread adds itself to the waitqueue, it (like in 1.1 above) changes
* mutex state to -1. When owner unlocks the mutex with xchg(&__m_lock,0) and
* sees that __m_lock was set to -1, it enters the kernel and tries to lock
* the mutex for the thread-to-be-woken (see __do_mutex_unlock()).
* If the mutex had been locked while the owner was entering the kernel (i.e.
* xchg returned non-zero value), then it is left in '-1' state. If mutex is
* free at the moment of xchg (i.e. xchg returns 0), the owner (the one that
* wrote 0) will wake the first waiter.
 * One thing to pay attention to: some other thread may try to lock the mutex
 * with xchg(&__m_lock,1) while it is in the '-1' state. The owner in this case
 * will not do a system call when unlocking the mutex, so the obligation to
 * clean this up (i.e. to set __m_lock back to -1) lies with the thread that
 * saw '-1' in __m_lock.
*
*
* 2) PTHREAD_PRIO_INHERIT or PTHREAD_PRIO_PROTECT mutex and
* ARCH_HAS_ATOMIC_CMPXCHG is set:
* __m_lock == 0 - the mutex is free and there are no waiters.
* It can be acquired promptly from userspace with
* cmpxchg(&__m_lock,0,tid).
* __m_lock == owner's tid (pid in kernel terminology) -
* the mutex is locked but has no waiters.
* It can be freed promptly from userspace with
* cmpxchg(&__m_lock,tid,0).
* __m_lock == -1 - the mutex is locked and may have waiters. It cannot
* be freed without checking the corresponding waitqueue
* (i.e. without a system call).
* Priority inheritance and priority protection protocols require some
* additional fields to be set when locking/unlocking the mutex. For example,
* 'owner' field in mutex descriptor points to owner' task_struct. If a thread
* acquired the mutex in userspace without doing a system call, it cannot set
* those fields. So, '-1' has an additional meaning in this case: it means that
* all necessary priority inheritance stuff (like setting 'owner' field) has
* been done and will have to be undone when unlocking the mutex.
*
*
 * 3) All other cases (PTHREAD_PRIO_PROTECT and PTHREAD_PRIO_INHERIT
 * without ARCH_HAS_ATOMIC_CMPXCHG set):
* Since in these cases a system call is always done and the locking/unlocking
* thread can work directly with the mutex's waitqueue, there is no need in
* __m_lock field and it is always 0.
*/
static int __do_mutex_timedlock(
struct pthread_mutex_s *__restrict const mutex,
const struct timespec_64 *__restrict const abstime,
struct task_struct *const task,
struct mutex_desc *const m_desc)
{
int rval, oldval, protocol;
struct el_waiter waiter;
struct timespec_64 iabstime;
#if DEBUG_POSIX
int __m_lock, __m_owner;
__get_user(__m_lock, &mutex->__m_lock);
__get_user(__m_owner, &mutex->__m_owner);
#endif
DbgPos("mutex_lock mutex=%p: start, lock=%d, owner=%d\n",
mutex, __m_lock, __m_owner);
if (unlikely(abstime))
if (unlikely(copy_from_user(&iabstime, abstime,
sizeof(iabstime))))
return -EFAULT;
restart:
el_pagefault_disable();
raw_spin_lock_irq(&m_desc->lock);
if (check_desc(m_desc, MUTEX, mutex)) {
rval = -EINVAL;
goto out_unlock;
}
protocol = (int) m_desc->protocol;
switch (protocol) {
case PTHREAD_PRIO_PROTECT:
DbgPos("Testing prioceiling=%d, prio=%d\n",
(int) m_desc->prioceiling, normal_prio(task));
if (unlikely(((int) m_desc->prioceiling) > normal_prio(task))) {
rval = -EINVAL;
goto out_unlock;
}
/* FALLTHROUGH */
case PTHREAD_PRIO_INHERIT:
DbgPos("mutex_lock mutex=%p: protocol=%d, pending_owner=%d, "
"owner=%d\n", mutex, protocol,
(m_desc->pending_owner)
? (int) m_desc->pending_owner->pid : 0,
(m_desc->owner) ? m_desc->owner->pid : 0);
/* If mutex is in NOT_RECOVERABLE state return an error and
* if it is in OWNER_DEAD state then we'll have to block. */
if (unlikely(m_desc->robust == NOT_RECOVERABLE)) {
rval = -ENOTRECOVERABLE;
goto out_unlock;
}
#if defined ARCH_HAS_ATOMIC_CMPXCHG
		/* Read mutex->__m_lock beforehand. */
if (unlikely(__get_user(oldval, &mutex->__m_lock))) {
rval = -EFAULT;
goto out_unlock;
}
if (unlikely(m_desc->type == PTHREAD_MUTEX_ERRORCHECK_NP) &&
(m_desc->owner == task || oldval == task->pid)) {
rval = -EDEADLK;
goto out_unlock;
}
if (likely(oldval != -1)) {
rval = el_atomic_xchg_acq(oldval, &mutex->__m_lock, -1);
if (unlikely(rval))
goto out_unlock;
if (unlikely(oldval == 0)) {
/* Mutex was unlocked while we were
* entering the kernel. */
if (unlikely(__put_user(task->pid,
&mutex->__m_lock)))
/* Priority stuff must be done
* when __m_lock == -1. */
goto lock_protected_mutex;
goto success_unlock;
} else if (protocol == PTHREAD_PRIO_PROTECT) {
/* Mutex's owner priority was not
* boosted so do it now. */
rval = task_locked_pp_mutex_proxy(oldval,
m_desc);
if (rval)
/* Owner died and there are
* no waiters */
goto lock_protected_mutex;
}
} else if (task_can_steal_mutex(m_desc, task)) {
/* We can steal the mutex. Note that this check alone
* is not enough for PTHREAD_PRIO_INHERIT mutexes
* (see comment before task_blocks_on_pi_mutex()). */
m_desc->pending_owner = NULL;
lock_protected_mutex:
if (protocol == PTHREAD_PRIO_INHERIT)
task_fast_locked_pi_mutex(task, m_desc);
else
task_locked_pp_mutex(task, m_desc);
goto success_unlock;
}
#else
if (unlikely(m_desc->type == PTHREAD_MUTEX_ERRORCHECK_NP
&& m_desc->owner == task)) {
rval = -EDEADLK;
goto out_unlock;
}
if (!m_desc->owner && (!m_desc->pending_owner ||
__task_can_steal_mutex(m_desc, task))) {
/* Note that this check alone is not enough for
* PTHREAD_PRIO_INHERIT mutexes (see comment
* before task_blocks_on_pi_mutex()). */
if (m_desc->pending_owner)
/* Steal the mutex */
m_desc->pending_owner = NULL;
if (protocol == PTHREAD_PRIO_INHERIT)
task_fast_locked_pi_mutex(task, m_desc);
else
task_locked_pp_mutex(task, m_desc);
goto success_unlock;
}
#endif
break;
case PTHREAD_PRIO_NONE:
		/* Check if the owner unlocked the mutex while we were
		 * entering the kernel. It may also be possible to steal the
		 * mutex (since no priority protection protocol is used, we
		 * will steal the mutex regardless of the pending owner's
		 * priority). */
		/* Read mutex->__m_lock beforehand. */
if (unlikely(__get_user(oldval, &mutex->__m_lock))) {
rval = -EFAULT;
goto out_unlock;
}
if (likely(oldval != -1)) {
if (unlikely(el_atomic_xchg_acq(oldval,
&mutex->__m_lock, -1))) {
rval = -EFAULT;
goto out_unlock;
}
}
if (unlikely(oldval == 0)) {
#if !defined ARCH_HAS_ATOMIC_CMPXCHG
if (plist_head_empty(&m_desc->wait_list))
#endif
/* Mutex has no waiters, so
* we try to make fast unlock possible. */
__put_user(1, &mutex->__m_lock);
goto success_unlock;
}
if (task_can_steal_mutex(m_desc, task)) {
/* Success */
DbgPos("mutex_lock: stealed\n");
m_desc->pending_owner = NULL;
goto success_unlock;
}
break;
default:
rval = -EINVAL;
goto out_unlock;
}
/* Check timeout validity before blocking ourselves */
if (unlikely(abstime && (iabstime.tv_nsec < 0 ||
iabstime.tv_nsec >= 1000000000))) {
rval = -EINVAL;
goto out_unlock;
}
waiter.task = task;
waiter.timedout = 0;
waiter.state = WAITING_ON_MUTEX;
/* Queue ourselves. */
if (protocol == PTHREAD_PRIO_INHERIT) {
struct task_struct *first_in_pi_chain;
waiter.pi_desc = m_desc;
#if defined ARCH_HAS_ATOMIC_CMPXCHG
first_in_pi_chain = task_blocks_on_pi_mutex(task,
&waiter, m_desc, oldval);
if (unlikely(IS_ERR(first_in_pi_chain))) {
WARN_ON(PTR_ERR(first_in_pi_chain) == -ENOTRECOVERABLE);
(void) give_up_on_pi_mutex(task, &waiter, m_desc);
task_fast_locked_pi_mutex(task, m_desc);
rval = -EOWNERDEAD;
goto out_unlock;
}
#else
first_in_pi_chain = task_blocks_on_pi_mutex(task,
&waiter, m_desc);
#endif
/* Check for stealing _after_ enqueuing ourselves.
* This protects us from race with another task
* boosting our priority. */
if (task_can_steal_mutex(m_desc, task))
m_desc->pending_owner = task;
/* Since we are going to call schedule() right away,
* there is no need to check preemption. */
raw_spin_unlock_irq_no_resched(&m_desc->lock);
el_pagefault_enable();
/* Adjust priorities if necessary */
if (first_in_pi_chain)
mutex_adjust_prio_chain(first_in_pi_chain, m_desc,
task);
} else {
int prio;
/* This thread is added to wait queue either with
* its own priority if it is a real-time thread or
* with MAX_RT_PRIO if it is non-RT thread. This
* way RT threads get woken up in priority order
* and non-RT threads get woken up in FIFO order. */
prio = min(task->prio, MAX_RT_PRIO);
plist_node_init(&waiter.list_entry, prio);
plist_add(&waiter.list_entry, &m_desc->wait_list);
/* Since we are going to call schedule() right away,
* there is no need to check preemption. */
raw_spin_unlock_irq_no_resched(&m_desc->lock);
el_pagefault_enable();
}
sleep:
set_task_state(task, TASK_INTERRUPTIBLE);
DbgPos("mutex_lock: before schedule(), mutex=%p\n", mutex);
if (likely(m_desc->pending_owner != task
&& m_desc->robust != NOT_RECOVERABLE)) {
/* Sleep only when necessary */
if (likely(!abstime))
schedule();
else
waiter.timedout = schedule_with_timeout(task,
CLOCK_REALTIME, &iabstime);
} else {
preempt_check_resched();
}
__set_task_state(task, TASK_RUNNING);
#if DEBUG_POSIX
__get_user(__m_lock, &mutex->__m_lock);
#endif
DbgPos("mutex_lock: after schedule(), lock=%d\n", __m_lock);
el_pagefault_disable();
raw_spin_lock_irq(&m_desc->lock);
if (unlikely(waiter.timedout) ||
try_to_take_mutex(task, mutex, m_desc, &waiter) != 0) {
/* Timed out, signaled, or the mutex was stolen */
if (likely(!waiter.timedout) && !signal_pending(task)
&& likely(m_desc->robust != NOT_RECOVERABLE)) {
raw_spin_unlock_irq_no_resched(&m_desc->lock);
el_pagefault_enable();
goto sleep;
}
rval = give_up_on_mutex(mutex, m_desc, &waiter);
if (unlikely(m_desc->robust == NOT_RECOVERABLE)) {
rval = -ENOTRECOVERABLE;
} else if (likely(!rval)) {
if (waiter.timedout)
rval = -ETIMEDOUT;
else
rval = -EINTR;
}
goto out;
}
success_unlock:
raw_spin_unlock_irq(&m_desc->lock);
el_pagefault_enable();
if (unlikely(m_desc->robust > ROBUST)) {
/* Since we were able to lock the mutex, it
* should not be in NOT_RECOVERABLE state. */
WARN_ON_ONCE(m_desc->robust == NOT_RECOVERABLE);
rval = -EOWNERDEAD;
} else {
rval = 0;
}
#if DEBUG_POSIX
__get_user(__m_lock, &mutex->__m_lock);
#endif
DbgPos("mutex_lock success! mutex=%p: lock=%d, rval=%d\n",
mutex, __m_lock, rval);
return rval;
out_unlock:
raw_spin_unlock_irq(&m_desc->lock);
el_pagefault_enable();
if (unlikely(rval == -EFAULT)) {
if (!handle_fault((unsigned long) &mutex->__m_lock))
goto restart;
}
out:
#if DEBUG_POSIX
__get_user(__m_lock, &mutex->__m_lock);
#endif
DbgPos("mutex_lock mutex=%p: lock=%d, rval=%d\n",
mutex, __m_lock, rval);
WARN_ON(rval == 0);
return rval;
}
/**
* __do_mutex_trylock - kernel part of pthread_mutex_trylock() implementation
* @mutex: the mutex in question.
* @task: pointer to current task_struct.
* @m_desc: descriptor of the mutex.
*/
static int __do_mutex_trylock(
struct pthread_mutex_s *__restrict const mutex,
struct task_struct *const task,
struct mutex_desc *const m_desc)
{
#if defined ARCH_HAS_ATOMIC_CMPXCHG
struct task_struct *first_in_pi_chain;
int oldval;
#endif
const int protocol = (int) m_desc->protocol;
int rval;
restart:
el_pagefault_disable();
raw_spin_lock_irq(&m_desc->lock);
if (check_desc(m_desc, MUTEX, mutex)) {
rval = -EINVAL;
goto out_unlock;
}
DbgPos("mutex_trylock mutex=%p: start, pending_owner=%d\n",
mutex, (m_desc->pending_owner) ?
m_desc->pending_owner->pid : 0);
#if defined ARCH_HAS_ATOMIC_CMPXCHG
if (unlikely(protocol == PTHREAD_PRIO_NONE
|| m_desc->robust == NOT_ROBUST)) {
#else
if (unlikely(protocol == PTHREAD_PRIO_NONE)) {
#endif
rval = -EINVAL;
goto out_unlock;
}
/* So now we know that protocol is either
* PTHREAD_PRIO_INHERIT or PTHREAD_PRIO_PROTECT. */
if (unlikely(m_desc->robust == NOT_RECOVERABLE)) {
rval = -ENOTRECOVERABLE;
goto out_unlock;
}
if (protocol == PTHREAD_PRIO_PROTECT) {
DbgPos("Testing prioceiling=%d, prio=%d\n",
(int) m_desc->prioceiling, normal_prio(task));
if (unlikely(((int) m_desc->prioceiling) > normal_prio(task))) {
rval = -EINVAL;
goto out_unlock;
}
}
#if defined ARCH_HAS_ATOMIC_CMPXCHG
/* This is a robust mutex, so check
* everything with extra care. */
if (unlikely(__get_user(oldval, &mutex->__m_lock))) {
rval = -EFAULT;
goto out_unlock;
}
if (likely(oldval != -1)) {
rval = el_atomic_xchg_acq(oldval, &mutex->__m_lock, -1);
if (unlikely(rval))
goto out_unlock;
if (unlikely(oldval == 0)) {
/* Mutex was unlocked while we were
* entering the kernel. */
if (unlikely(__put_user(task->pid, &mutex->__m_lock)))
/* Priority stuff must be done
* when __m_lock == -1. */
goto lock_protected_mutex;
goto out_unlock;
}
} else if (task_can_steal_mutex(m_desc, task)) {
/* We can steal the mutex. */
m_desc->pending_owner = NULL;
rval = 0;
lock_protected_mutex:
if (protocol == PTHREAD_PRIO_INHERIT)
task_fast_locked_pi_mutex(task, m_desc);
else
task_locked_pp_mutex(task, m_desc);
goto out_unlock;
}
if (m_desc->owner || m_desc->pending_owner) {
/* The mutex is busy and has a valid owner. */
rval = -EBUSY;
goto out_unlock;
}
/* Since the mutex has no new waiters there should be
 * no need to walk the priority chain. */
if (protocol == PTHREAD_PRIO_INHERIT) {
first_in_pi_chain = task_fast_locked_pi_mutex_proxy(
oldval, m_desc);
if (IS_ERR(first_in_pi_chain)) {
task_fast_locked_pi_mutex(task, m_desc);
rval = PTR_ERR(first_in_pi_chain);
} else {
WARN_ON(first_in_pi_chain);
rval = -EBUSY;
}
} else {
rval = task_locked_pp_mutex_proxy(oldval, m_desc);
if (rval) {
/* Owner died and there are no waiters */
task_locked_pp_mutex(task, m_desc);
} else {
rval = -EBUSY;
}
}
goto out_unlock;
#else
if (!m_desc->owner && (!m_desc->pending_owner ||
__task_can_steal_mutex(m_desc, task))) {
if (m_desc->pending_owner)
/* Steal the mutex */
m_desc->pending_owner = NULL;
if (protocol == PTHREAD_PRIO_INHERIT)
task_fast_locked_pi_mutex(task, m_desc);
else
task_locked_pp_mutex(task, m_desc);
rval = 0;
} else {
rval = -EBUSY;
}
#endif
out_unlock:
raw_spin_unlock_irq(&m_desc->lock);
el_pagefault_enable();
if (unlikely(rval == -EFAULT)) {
if (!handle_fault((unsigned long) &mutex->__m_lock))
goto restart;
}
if (rval == 0) {
if (unlikely(m_desc->robust > ROBUST)) {
/* Since we were able to lock the mutex, it
* should not be in NOT_RECOVERABLE state. */
WARN_ON_ONCE(m_desc->robust == NOT_RECOVERABLE);
rval = -EOWNERDEAD;
}
}
DbgPos("mutex_trylock mutex=%p: rval=%d\n", mutex, rval);
return rval;
}
static int do_mutex_timedlock(
struct pthread_mutex_s *__restrict const mutex,
const struct timespec_64 *__restrict const abstime,
const int __m_kind, const int __m_desc)
{
struct task_struct *const task = current;
struct mutex_desc *const m_desc = mutex_once(task, mutex, __m_desc,
__m_kind);
if (unlikely(IS_ERR(m_desc)))
return PTR_ERR(m_desc);
if (likely(abstime != (void *) -1
#ifdef CONFIG_64BIT
/* On 64-bit kernels running 32-bit applications
 * we have to test for a 32-bit 'long' value. */
&& ((int) (unsigned long) abstime) != -1
#endif
))
return __do_mutex_timedlock(mutex, abstime, task, m_desc);
else
return __do_mutex_trylock(mutex, task, m_desc);
}
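/*
 * Note on the trylock encoding: the library requests
 * pthread_mutex_trylock() by passing an all-ones @abstime pointer
 * ((void *) -1, or 0xffffffff from a 32-bit application), while a NULL
 * @abstime means an untimed blocking lock. A minimal sketch of the
 * user side (illustrative only; the request code PMUTEX_TIMEDLOCK is
 * an assumption, not taken from this file):
 *
 *	el_posix(PMUTEX_TIMEDLOCK, mutex, NULL, ...);        // lock
 *	el_posix(PMUTEX_TIMEDLOCK, mutex, &abstime, ...);    // timedlock
 *	el_posix(PMUTEX_TIMEDLOCK, mutex, (void *) -1, ...); // trylock
 */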
/**
* __do_mutex_unlock() - implements pthread_mutex_unlock().
* @mutex: the mutex to be unlocked.
* @task: pointer to current task_struct.
* @m_desc: the mutex's descriptor.
*
* If __do_mutex_unlock() returns -ENOTRECOVERABLE, then the caller must
* call robust_mutex_wake_all() to wake all waiters on this mutex.
*
* Must be called with m_desc->lock held.
*/
static int __do_mutex_unlock(struct pthread_mutex_s *__restrict const mutex,
struct task_struct *const task,
struct mutex_desc *const m_desc)
{
int rval;
const int protocol = (int) m_desc->protocol;
struct el_waiter *waiter;
DbgPos("mutex_unlock mutex=%p: start\n", mutex);
switch (builtin_expect_wrapper(protocol, PTHREAD_PRIO_INHERIT)) {
case PTHREAD_PRIO_NONE:
if (likely(mutex_has_waiters(m_desc))) {
#if !defined ARCH_HAS_ATOMIC_CMPXCHG
int oldval;
rval = el_atomic_xchg_acq(oldval, &mutex->__m_lock, -1);
if (unlikely(rval || oldval))
/* If el_atomic_xchg_acq() failed with -EFAULT or
 * the mutex already has an owner (i.e. oldval is
 * 1 or -1), we do nothing. */
break;
#endif
waiter = plist_first_entry(&m_desc->wait_list,
struct el_waiter, list_entry);
m_desc->pending_owner = waiter->task;
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
rval = 0;
} else {
#if defined ARCH_HAS_ATOMIC_CMPXCHG
/* We do not know if there are waiters, so just exit
* (maybe somebody has not added himself to the
* waitqueue yet). */
rval = __put_user(0, &mutex->__m_lock);
#else
rval = 0;
#endif
}
break;
case PTHREAD_PRIO_INHERIT:
case PTHREAD_PRIO_PROTECT:
if (unlikely(task != m_desc->owner)) {
rval = -EPERM;
break;
}
if (protocol == PTHREAD_PRIO_INHERIT)
task_unlocked_pi_mutex(task, m_desc);
else
task_unlocked_pp_mutex(task, m_desc);
if (unlikely(m_desc->robust == OWNER_DEAD)) {
m_desc->robust = NOT_RECOVERABLE;
rval = -ENOTRECOVERABLE;
break;
}
if (likely(mutex_has_waiters(m_desc))) {
waiter = plist_first_entry(&m_desc->wait_list,
struct el_waiter, list_entry);
m_desc->pending_owner = waiter->task;
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
} else {
#ifdef ARCH_HAS_ATOMIC_CMPXCHG
if (unlikely(__put_user(0, &mutex->__m_lock))) {
rval = -EFAULT;
break;
}
#endif
}
rval = 0;
break;
default:
rval = -EINVAL;
break;
}
DbgPos("mutex_unlock mutex=%p: end, rval=%d\n", mutex, rval);
return rval;
}
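/*
 * Ownership handoff note: __do_mutex_unlock() does not hand the mutex
 * over directly. It makes the first (highest-priority) waiter the
 * m_desc->pending_owner and wakes it; the waiter then completes the
 * acquisition in try_to_take_mutex(). Until that happens a
 * sufficiently high-priority task may still steal the mutex (see
 * task_can_steal_mutex()), which is why woken waiters re-check the
 * mutex state instead of assuming ownership.
 */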
static void robust_mutex_wake_all(
struct pthread_mutex_s *__restrict const mutex,
struct mutex_desc *m_desc)
{
struct el_waiter *waiter;
int i;
if (!mutex_has_waiters(m_desc))
return;
raw_spin_lock_irq(&m_desc->lock);
waiter = plist_first_entry(&m_desc->wait_list,
struct el_waiter, list_entry);
continue_wake:
if (check_desc(m_desc, MUTEX, mutex)
|| m_desc->robust != NOT_RECOVERABLE) {
raw_spin_unlock_irq(&m_desc->lock);
return;
}
i = 0;
list_for_each_entry_from(waiter, &m_desc->wait_list.node_list,
list_entry.node_list) {
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
if (++i >= WAKE_AT_MOST)
break;
}
raw_spin_unlock_irq(&m_desc->lock);
if (mutex_has_waiters(m_desc)) {
cpu_relax();
raw_spin_lock_irq(&m_desc->lock);
goto continue_wake;
}
}
static int do_mutex_unlock(struct pthread_mutex_s *__restrict const mutex,
const int __m_kind, const int __m_desc)
{
int rval;
struct task_struct *const task = current;
struct mutex_desc *const m_desc = mutex_once(task, mutex, __m_desc,
__m_kind);
if (unlikely(IS_ERR(m_desc)))
return PTR_ERR(m_desc);
restart:
el_pagefault_disable();
raw_spin_lock_irq(&m_desc->lock);
if (!check_desc(m_desc, MUTEX, mutex))
rval = __do_mutex_unlock(mutex, task, m_desc);
else
rval = -EINVAL;
raw_spin_unlock_irq(&m_desc->lock);
el_pagefault_enable();
if (unlikely(rval == -EFAULT)) {
if (!handle_fault((unsigned long) &mutex->__m_lock))
goto restart;
} else if (unlikely(rval == -ENOTRECOVERABLE)) {
robust_mutex_wake_all(mutex, m_desc);
rval = 0;
}
return rval;
}
int do_cancel(pid_t tgid, pid_t *p, int signal)
{
pid_t pid;
if (unlikely(get_user(pid, p) || pid <= 0))
return -ESRCH;
DbgPos("do_cancel: cancelling thread %d\n", pid);
if (unlikely(sys_tgkill(tgid, pid, signal)))
return -ESRCH;
return 0;
}
/*
 * do_cond_lock() and do_cond_unlock() are used by do_cond_timedwait()
 * to perform the userspace side of mutex locking and unlocking here in
 * the kernel, so that we do not have to return to userspace just to
 * call pthread_mutex_lock() or pthread_mutex_unlock() from there.
 */
//TODO 3.14 support SCHED_DEADLINE
/* Returns zero on success */
static int cond_fast_lock(struct pthread_mutex_s *const mutex,
const char protocol, const int __m_lock)
{
int rval, oldval;
switch (protocol) {
case PTHREAD_PRIO_NONE:
if (unlikely(__m_lock)) {
rval = 1;
} else {
#if defined ARCH_HAS_ATOMIC_CMPXCHG
rval = el_atomic_cmpxchg_acq(oldval,
&mutex->__m_lock, 0, 1);
#else
rval = el_atomic_xchg_acq(oldval,
&mutex->__m_lock, 1);
#endif
if (likely(!rval))
rval = (oldval != 0);
}
break;
#if defined ARCH_HAS_ATOMIC_CMPXCHG
case PTHREAD_PRIO_INHERIT:
case PTHREAD_PRIO_PROTECT:
if (unlikely(__m_lock)) {
rval = 1;
} else {
rval = el_atomic_cmpxchg_acq(oldval,
&mutex->__m_lock, 0, current->pid);
if (likely(!rval))
rval = (oldval != 0);
}
break;
#endif
default:
rval = 1;
break;
}
return rval;
}
#ifdef CONFIG_SMP
/* Returns zero if @mutex is locked. */
static int mutex_is_free(struct pthread_mutex_s *const mutex,
const char protocol)
{
int rval, __m_lock;
switch (protocol) {
case PTHREAD_PRIO_NONE:
#if defined ARCH_HAS_ATOMIC_CMPXCHG
case PTHREAD_PRIO_INHERIT:
case PTHREAD_PRIO_PROTECT:
#endif
if (unlikely(__get_user(__m_lock,
(volatile int *) &mutex->__m_lock)))
rval = 1;
else
rval = unlikely(__m_lock == 0);
break;
default:
rval = 0;
break;
}
return rval;
}
#endif
/* Context switch cost for threads in processor cycles (must account for
* context switch time, system call time and cache invalidation). */
#ifdef CONFIG_SMP
/* Assumed thread context switch cost (in cycles). */
# define ASSUMED_SWITCH_COST 5000
static unsigned int __read_mostly context_switch_cost = ASSUMED_SWITCH_COST;
#else
static unsigned int __read_mostly context_switch_cost;
#endif
/* Maximum delay between reads when spinning in processor cycles. */
#define MAXIMUM_SPIN_DELAY 50
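/*
 * Sketch of the adaptive spinning strategy implemented by
 * do_cond_lock() below (a summary, not additional semantics):
 *
 *	spin for at most __m_spins cycles, doubling the poll delay
 *	from 12 up to MAXIMUM_SPIN_DELAY cycles between reads;
 *	after a successful acquisition:
 *		delta = context_switch_cost >> 4;
 *		if (waited > context_switch_cost)
 *			__m_spins -= delta;	// blocking looks cheaper
 *		else if (__m_spins < context_switch_cost)
 *			__m_spins += delta;	// spinning looks cheaper
 *
 * With the default context_switch_cost of 5000 cycles the per-mutex
 * spin budget thus moves in steps of 312 cycles and stays within
 * [0, context_switch_cost].
 */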
static int do_cond_lock(
struct pthread_mutex_s *__restrict const mutex,
struct mutex_desc *const m_desc, const int ptr_64)
{
int rval, __m_lock, __m_owner;
#ifdef CONFIG_SMP
struct task_struct *task;
int pid, cycles, __m_spins = 0;
# ifdef ARCH_HAS_GET_CYCLES
int contended = 0;
cycles_t start, waited = 0;
# else
unsigned int waited = 0;
# endif
#endif
if (unlikely(__get_user(__m_lock, &mutex->__m_lock)))
return -EFAULT;
DbgPos("cond_lock: mutex=%p started lock=%d\n", mutex, __m_lock);
switch (builtin_expect_wrapper(m_desc->type, PTHREAD_MUTEX_TIMED_NP)) {
case PTHREAD_MUTEX_ADAPTIVE_NP:
#ifdef CONFIG_SMP
rval = cond_fast_lock(mutex, m_desc->protocol, __m_lock);
if (likely(!rval)) {
return __put_user((unsigned int) get_cycles(),
&mutex->__m_count)
| __put_user(current->pid,
&mutex->__m_owner);
}
if (unlikely(rval == -EFAULT))
break;
waited = 0;
if (__m_lock == -1) {
/* The mutex has waiters and we definitely will not
* get it soon. */
# ifdef ARCH_HAS_GET_CYCLES
/* We do not want to adjust the spinning time for this
 * lock - the mutex is contended, so spinning here
 * makes no sense in any case (without pipelining,
 * that is).
 * Pipelining is when a new waiter can steal the mutex
 * from all the waiters already in the queue - a
 * complete unfairness. This library is intended for
 * real-time applications, so pipelining is avoided at
 * all costs (it is used only for PTHREAD_PRIO_NONE
 * mutexes on architectures without cmpxchg). */
contended = 1;
# endif
break;
} else {
# ifdef ARCH_HAS_GET_CYCLES
contended = 0;
# endif
}
if (unlikely(__get_user(__m_spins,
&mutex->__m_spins - ptr_64))) {
rval = -EFAULT;
break;
}
# ifdef ARCH_HAS_GET_CYCLES
/* Note: for the '!mutex->__m_spins' check to work,
* context_switch_cost must not be adjusted at run-time
* as __m_spins is changed only by multiples of
* (context_switch_cost >> 4) (otherwise after a change
* it might never be 0 again). */
# endif
if (unlikely(!__m_spins))
break;
/* __m_spins is passed from user so we cannot trust it */
if (unlikely(__m_spins > context_switch_cost))
__m_spins = context_switch_cost;
# ifdef ARCH_HAS_GET_CYCLES
start = get_cycles();
# endif
/* Check that the owner is running now. */
if (unlikely(__get_user(pid, &mutex->__m_owner))) {
rval = -EFAULT;
break;
}
if (unlikely(!pid))
break;
rcu_read_lock();
task = __find_task_by_pid_check(pid);
if (unlikely(!task || !task_curr(task))) {
# if DEBUG_POSIX
if (!task)
DbgPos("el_posix: owner of adaptive mutex %p"
" not found.\n", mutex);
# endif
rcu_read_unlock();
break;
}
get_task_struct(task);
rcu_read_unlock();
cycles = 12; /* Just some small value. */
# ifdef ARCH_HAS_GET_CYCLES
/* Wait until the mutex is freed. */
do {
__delay(cycles);
if (cycles < MAXIMUM_SPIN_DELAY)
cycles *= 2;
waited = get_cycles() - start;
/* Try to take the mutex.
*
* We do not check for -EFAULT now because we
* already checked, and anyway if there is some
* problem then the user is seriously screwed
* and a little spinning won't do any harm. */
if (mutex_is_free(mutex, m_desc->protocol)
&& !__get_user(__m_lock,
&mutex->__m_lock)
&& !cond_fast_lock(mutex,
m_desc->protocol, __m_lock)) {
/* We successfully acquired the mutex. */
cycles_t delta;
put_task_struct(task);
adjust_adaptive_spin_strategy:
delta = context_switch_cost >> 4;
if (waited > context_switch_cost) {
/* We were waiting for a long time,
* looks like blocking is better than
* spinning. */
if (__m_spins >= delta) {
__m_spins -= delta;
__put_user(__m_spins,
&mutex->__m_spins -
ptr_64);
}
} else {
/* We were waiting for a short time,
* looks like spinning is better than
* blocking. */
if (__m_spins < context_switch_cost) {
__m_spins += delta;
__put_user(__m_spins,
&mutex->__m_spins -
ptr_64);
}
}
skip_adjust_adaptive_spin_strategy:
return __put_user((unsigned int) get_cycles(),
&mutex->__m_count)
| __put_user(current->pid,
&mutex->__m_owner);
}
} while (task_curr(task) && ((get_cycles() - start + cycles) <
(cycles_t) __m_spins));
# else
/* Wait until the mutex is freed for __m_spins
* processor cycles. */
waited = 0;
do {
__delay(cycles);
waited += cycles + 100;
if (cycles < MAXIMUM_SPIN_DELAY)
cycles *= 2;
/* Try to take the mutex. */
if (mutex_is_free(mutex, m_desc->protocol)
&& !__get_user(__m_lock,
&mutex->__m_lock)
&& !cond_fast_lock(mutex,
m_desc->protocol, __m_lock)) {
/* We successfully acquired the mutex. */
put_task_struct(task);
return __put_user(current->pid,
&mutex->__m_owner);
}
} while (waited + cycles < __m_spins);
# endif
put_task_struct(task);
break;
#endif
case PTHREAD_MUTEX_TIMED_NP:
rval = cond_fast_lock(mutex, m_desc->protocol, __m_lock);
if (likely(rval == 0))
return 0;
break;
case PTHREAD_MUTEX_RECURSIVE_NP:
if (unlikely(__get_user(__m_owner, &mutex->__m_owner)))
return -EFAULT;
if (__m_owner == current->pid) {
int __m_count;
if (unlikely(__get_user(__m_count, &mutex->__m_count)))
return -EFAULT;
if (unlikely(__m_count + 1 == 0))
/* Overflow of the counter */
return -EAGAIN;
return __put_user(__m_count + 1, &mutex->__m_count);
}
/* FALLTHROUGH */
case PTHREAD_MUTEX_ERRORCHECK_NP:
rval = cond_fast_lock(mutex, m_desc->protocol, __m_lock);
if (likely(rval == 0))
return __put_user(current->pid, &mutex->__m_owner);
break;
default:
return -EINVAL;
}
if (likely(rval != -EFAULT)) {
rval = __do_mutex_timedlock(mutex, NULL, current, m_desc);
if (likely(rval == 0)) {
#if defined CONFIG_SMP && defined ARCH_HAS_GET_CYCLES
if (unlikely(m_desc->type ==
PTHREAD_MUTEX_ADAPTIVE_NP)) {
/* We successfully acquired the mutex. */
cycles_t crit_section_length;
unsigned int __m_count;
# ifdef ARCH_HAS_ATOMIC_CMPXCHG
if (unlikely(contended || __get_user(__m_count,
&mutex->__m_count)))
# else
if (unlikely(contended
|| m_desc->protocol !=
PTHREAD_PRIO_NONE
|| __get_user(__m_count,
&mutex->__m_count)))
# endif
goto skip_adjust_adaptive_spin_strategy;
crit_section_length = (cycles_t) __m_count;
/* Estimate the time between the last 'lock'
* and 'unlock' operations. 'waited' stands
* for how much we were busy waiting, and
* '(crit_section_length - waited) / 2' stands
* for how much more we probably should have
* busy waited before the owner would unlock
* the mutex. */
if (crit_section_length > waited)
waited += (crit_section_length - waited) / 2;
goto adjust_adaptive_spin_strategy;
}
#endif
rval = __put_user(current->pid, &mutex->__m_owner);
}
}
#if DEBUG_POSIX
__get_user(__m_lock, &mutex->__m_lock);
#endif
DbgPos("cond_lock mutex=%p: ended __m_lock=%d rval=%d\n",
mutex, __m_lock, rval);
return rval;
}
/* Returns zero on success, -EFAULT if the page is not available
* and any other number on failure */
static int cond_fast_unlock(struct pthread_mutex_s *const mutex,
const char protocol, const int __m_lock)
{
int rval, oldval;
#if defined ARCH_HAS_ATOMIC_CMPXCHG
int pid;
switch (protocol) {
case PTHREAD_PRIO_NONE:
if (unlikely(__m_lock != 1)) {
rval = 1;
} else {
rval = el_atomic_cmpxchg_rel(oldval,
&mutex->__m_lock, 1, 0);
if (likely(!rval))
rval = (oldval != 1);
}
break;
case PTHREAD_PRIO_INHERIT:
case PTHREAD_PRIO_PROTECT:
pid = current->pid;
if (unlikely(__m_lock != pid)) {
rval = 1;
} else {
rval = el_atomic_cmpxchg_rel(oldval,
&mutex->__m_lock, pid, 0);
if (likely(!rval))
rval = (oldval != pid);
}
break;
default:
rval = 1;
break;
}
#else
if (unlikely(protocol == PTHREAD_PRIO_NONE)) {
rval = el_atomic_xchg_rel(oldval, &mutex->__m_lock, 0);
if (likely(!rval))
rval = (oldval != 1);
} else {
rval = 1;
}
#endif
return rval;
}
static int do_cond_unlock(
struct task_struct *const task,
struct pthread_mutex_s *__restrict const mutex,
struct mutex_desc *const m_desc,
unsigned long *fault_address, const int ptr_64)
{
int rval, __m_owner, __m_lock;
#ifdef CONFIG_SMP
# ifdef ARCH_HAS_GET_CYCLES
unsigned int __m_count;
# endif
#endif
DbgPos("do_cond_unlock mutex=%p: start\n", mutex);
if (unlikely(__get_user(__m_lock, &mutex->__m_lock))) {
*fault_address = (unsigned long) &mutex->__m_lock;
return -EFAULT;
}
switch (builtin_expect_wrapper(m_desc->type, PTHREAD_MUTEX_TIMED_NP)) {
case PTHREAD_MUTEX_ADAPTIVE_NP:
#ifdef CONFIG_SMP
# ifdef ARCH_HAS_GET_CYCLES
if (unlikely(__get_user(__m_count, &mutex->__m_count))) {
*fault_address = (unsigned long) &mutex->__m_count;
return -EFAULT;
}
__m_count = ((unsigned int) get_cycles()) - __m_count;
if (unlikely(__put_user(__m_count, &mutex->__m_count))) {
*fault_address = (unsigned long) &mutex->__m_count;
return -EFAULT;
}
# endif
if (unlikely(__put_user(0, &mutex->__m_owner))) {
*fault_address = (unsigned long) &mutex->__m_owner;
return -EFAULT;
}
#endif
/* FALLTHROUGH */
case PTHREAD_MUTEX_TIMED_NP:
simple:
rval = cond_fast_unlock(mutex, m_desc->protocol, __m_lock);
break;
case PTHREAD_MUTEX_ERRORCHECK_NP:
if (unlikely(__get_user(__m_owner, &mutex->__m_owner))) {
*fault_address = (unsigned long) &mutex->__m_owner;
return -EFAULT;
}
if (likely(__m_owner == current->pid)) {
if (unlikely(__put_user(0, &mutex->__m_owner))) {
*fault_address = (unsigned long)
&mutex->__m_owner;
return -EFAULT;
}
goto simple;
} else {
return -EPERM;
}
break;
case PTHREAD_MUTEX_RECURSIVE_NP:
if (unlikely(__get_user(__m_owner, &mutex->__m_owner))) {
*fault_address = (unsigned long) &mutex->__m_owner;
return -EFAULT;
}
if (likely(__m_owner == current->pid)) {
int __m_count;
if (unlikely(__get_user(__m_count, &mutex->__m_count))){
*fault_address = (unsigned long)
&mutex->__m_count;
return -EFAULT;
}
if (__m_count) {
/* Just decrease the counter */
if (unlikely(__put_user(__m_count - 1,
&mutex->__m_count))) {
*fault_address = (unsigned long)
&mutex->__m_count;
return -EFAULT;
}
return 0;
} else {
if (unlikely(__put_user(0, &mutex->__m_owner))){
*fault_address = (unsigned long)
&mutex->__m_owner;
return -EFAULT;
}
goto simple;
}
} else {
return -EPERM;
}
break;
default:
return -EINVAL;
}
switch (builtin_expect_wrapper(rval, 0)) {
case 0:
break;
case -EFAULT:
*fault_address = (unsigned long) &mutex->__m_lock;
break;
default:
el_pagefault_disable();
raw_spin_lock(&m_desc->lock);
if (!check_desc(m_desc, MUTEX, mutex))
rval = __do_mutex_unlock(mutex, task, m_desc);
else
rval = -EINVAL;
raw_spin_unlock(&m_desc->lock);
el_pagefault_enable();
if (unlikely(rval == -EFAULT))
*fault_address = (unsigned long) &mutex->__m_lock;
break;
}
DbgPos("do_cond_unlock mutex=%p: rval=%d\n", mutex, rval);
return rval;
}
static __always_inline int queue_on_condition(
struct task_struct *const task,
struct el_waiter *const waiter,
struct cond_desc *const c_desc,
struct pthread_cond_s *const cond,
struct mutex_desc *const m_desc,
struct pthread_mutex_s *const mutex,
const int ptr_64)
{
struct mutex_desc *prev_m_desc;
struct pthread_mutex_s *prev_mutex;
int prio, rval = 0;
unsigned long fault_address = 0;
char m_kind;
restart:
el_pagefault_disable();
raw_spin_lock_irq(&c_desc->lock);
prev_m_desc = c_desc->m_desc;
if (unlikely(__get_user(prev_mutex, &cond->__c_mutex))) {
fault_address = (unsigned long) &cond->__c_mutex;
goto out_error_unlock;
}
if (check_desc(c_desc, CONDITION, cond)) {
rval = -EINVAL;
goto out_error_unlock;
}
/* After fork() cond->__c_mutex may be left in bad state.
* That's why we use c_desc->m_desc here instead. */
if (prev_m_desc) {
if (unlikely((void *) ((unsigned long) prev_m_desc & ~1UL)
!= m_desc)) {
DbgPos("queue_on_condition: different mutex descriptors"
" (%p != %p)\n", c_desc->m_desc, m_desc);
rval = -EINVAL;
goto out_error_unlock;
}
} else {
/* For shared mutexes the value stored in __c_mutex cannot
 * be used because it is process-local; but since the mutex
 * type (private or process-shared) is stored in its
 * descriptor, we simply do not use the __c_mutex field in
 * that case. */
if (unlikely(__put_user(mutex, &cond->__c_mutex))) {
fault_address = (unsigned long) &cond->__c_mutex;
goto out_error_unlock;
}
c_desc->m_desc = m_desc;
}
/* Unlock the mutex */
m_kind = m_desc->type;
rval = do_cond_unlock(task, mutex, m_desc, &fault_address, ptr_64);
if (unlikely(rval) && rval != -ENOTRECOVERABLE) {
__put_user(prev_mutex, &cond->__c_mutex);
c_desc->m_desc = prev_m_desc;
goto out_error_unlock;
}
/* Check for multiple times locked recursive mutex */
if (unlikely(m_kind == PTHREAD_MUTEX_RECURSIVE_NP)) {
int tmp;
if (unlikely(__get_user(tmp, &mutex->__m_owner))) {
fault_address = (unsigned long) &mutex->__m_owner;
goto out_error_unlock;
}
if (unlikely(tmp == task->pid)) {
/* We add the mutex owner (i.e. this thread) to
* the top of condition variable's waitqueue. */
c_desc->m_desc = (void *) ((unsigned long)
c_desc->m_desc | 1UL);
prio = -1;
goto prio_is_set;
}
}
/*
* This thread is added to wait queue either with
* its own priority if it is a real-time thread or
* with MAX_RT_PRIO if it is non-RT thread. This
* way RT threads get woken up in priority order
* and non-RT threads get woken up in FIFO order.
*/
prio = min(task->prio, MAX_RT_PRIO);
prio_is_set:
/* Initialize the waitqueue */
plist_node_init(&waiter->list_entry, prio);
plist_add(&waiter->list_entry, &c_desc->wait_list);
waiter->task = task;
waiter->timedout = 0;
waiter->state = WAITING_ON_CONDITION;
/* Since we are going to call schedule() right away,
* there is no need to check preemption. */
raw_spin_unlock_irq_no_resched(&c_desc->lock);
el_pagefault_enable();
return rval;
out_error_unlock:
raw_spin_unlock_irq(&c_desc->lock);
el_pagefault_enable();
if (unlikely(fault_address)) {
if (!handle_fault(fault_address)) {
fault_address = 0;
goto restart;
}
rval = -EFAULT;
}
return rval;
}
static __always_inline int unqueue_from_condition(
struct el_waiter *const waiter,
struct pthread_cond_s *const cond,
struct cond_desc *const c_desc)
{
restart:
raw_spin_lock_irq(&c_desc->lock);
if (unlikely(waiter->state != WAITING_ON_CONDITION)) {
raw_spin_unlock_irq(&c_desc->lock);
return 1;
}
el_pagefault_disable();
/* This check works even if this waiter was moved
* to a temporary list by signal or broadcast */
if (plist_head_empty(&c_desc->wait_list) ||
plist_first(&c_desc->wait_list) ==
plist_last(&c_desc->wait_list) &&
plist_first_entry(&c_desc->wait_list, struct el_waiter,
list_entry) == waiter) {
/* Since there are no more threads waiting on this
* condition, disassociate the mutex from it */
if (unlikely(__put_user(NULL, &cond->__c_mutex))) {
raw_spin_unlock_irq(&c_desc->lock);
el_pagefault_enable();
if (!handle_fault((unsigned long) &cond->__c_mutex))
goto restart;
el_pagefault_disable();
raw_spin_lock_irq(&c_desc->lock);
}
c_desc->m_desc = NULL;
}
plist_del(&waiter->list_entry, &c_desc->wait_list);
waiter->state = NOT_WAITING;
raw_spin_unlock_irq(&c_desc->lock);
el_pagefault_enable();
return 0;
}
static int do_cond_timedwait(
struct pthread_cond_s *const cond,
struct pthread_mutex_s *const mutex,
const struct timespec_64 *const abstime, const int ptr_64)
{
int rval, __c_value = 0;
struct task_struct *const task = current;
const int __m_kind = ({
int tmp;
if (unlikely(((unsigned int) ptr_64) > 1))
return -EINVAL;
if (unlikely(__get_user(tmp, &mutex->__m_kind + ptr_64)))
return -EFAULT;
tmp;
});
const int __c_desc = ({
int tmp;
if (unlikely(__get_user(tmp, &cond->__c_desc)))
return -EFAULT;
tmp;
});
const int __m_desc = ({
int tmp;
if (unlikely(__get_user(tmp, &mutex->__m_desc)))
return -EFAULT;
tmp;
});
struct cond_desc *const c_desc = cond_once(task, cond, __c_desc);
struct mutex_desc *const m_desc = mutex_once(task, mutex, __m_desc,
__m_kind);
struct el_waiter waiter;
struct timespec_64 iabstime;
DbgPos("cond_timedwait cond=%p mutex=%p: start\n", cond, mutex);
if (unlikely(IS_ERR(c_desc)))
return PTR_ERR(c_desc);
if (unlikely(IS_ERR(m_desc)))
return PTR_ERR(m_desc);
if (unlikely(abstime)) {
if (unlikely(copy_from_user(&iabstime, abstime,
sizeof(iabstime))))
return -EFAULT;
if (unlikely(iabstime.tv_nsec < 0
|| iabstime.tv_nsec >= 1000000000)) {
DbgPos("%d cond_timedwait cond=%p mutex=%p: bad nsec "
"timeout (%lld)\n", task->pid, cond,
mutex, iabstime.tv_nsec);
return -EINVAL;
}
if (unlikely(__get_user(__c_value, &cond->__c_value)))
return -EFAULT;
}
rval = queue_on_condition(task, &waiter, c_desc,
cond, m_desc, mutex, ptr_64);
if (unlikely(rval)) {
if (rval != -ENOTRECOVERABLE)
return rval;
robust_mutex_wake_all(mutex, m_desc);
}
DbgPos("cond_timedwait cond=%p mutex=%p: before schedule()\n",
cond, mutex);
set_task_state(task, TASK_INTERRUPTIBLE);
if (likely(waiter.state == WAITING_ON_CONDITION)) {
if (likely(!abstime)) {
schedule();
} else {
clockid_t clock_id;
clock_id = (__c_value & PTHREAD_CONDATTR_CLOCK_ID_MASK)
>> PTHREAD_CONDATTR_CLOCK_ID_SHIFT;
if (clock_id != CLOCK_REALTIME
&& clock_id != CLOCK_MONOTONIC)
clock_id = CLOCK_REALTIME;
waiter.timedout = schedule_with_timeout(task,
clock_id, &iabstime);
}
} else {
preempt_check_resched();
}
__set_task_state(task, TASK_RUNNING);
DbgPos("cond_timedwait cond=%p mutex=%p, state=%d: after schedule()\n",
cond, mutex, waiter.state);
retry:
switch (builtin_expect_wrapper(waiter.state, WAITING_ON_MUTEX)) {
case WAITING_ON_MUTEX:
/* We are standing in mutex's waitqueue.
* Try to acquire it. */
try_taking_mutex_again:
el_pagefault_disable();
raw_spin_lock_irq(&m_desc->lock);
if (unlikely(m_desc->protocol == PTHREAD_PRIO_PROTECT
&& ((int) m_desc->prioceiling)
> normal_prio(task))) {
DbgPos("cond_timedwait cond=%p mutex=%p: thread "
"priority is bigger than mutex "
"prioceiling.\n", cond, mutex);
rval = give_up_on_mutex(mutex, m_desc, &waiter);
if (likely(!rval))
rval = -EINVAL;
break;
}
if (try_to_take_mutex(task, mutex, m_desc, &waiter) == 0) {
/* Success. */
raw_spin_unlock_irq(&m_desc->lock);
el_pagefault_enable();
rval = __put_user(task->pid, &mutex->__m_owner);
if (unlikely(rval))
break;
/* Check robust attribute. */
if (unlikely(m_desc->robust > ROBUST)) {
/* Since we were able to lock the mutex, it
* should not be in NOT_RECOVERABLE state. */
WARN_ON(m_desc->robust == NOT_RECOVERABLE);
rval = -EOWNERDEAD;
} else if (unlikely(waiter.timedout) && !rval) {
rval = -ETIMEDOUT;
}
} else {
/* We were signaled or the mutex was stolen. */
if (!signal_pending(task) && likely(
m_desc->robust != NOT_RECOVERABLE)) {
raw_spin_unlock_irq_no_resched(&m_desc->lock);
el_pagefault_enable();
set_task_state(task, TASK_INTERRUPTIBLE);
if (likely(m_desc->pending_owner != task
&& m_desc->robust
!= NOT_RECOVERABLE))
schedule();
else
preempt_check_resched();
__set_task_state(task, TASK_RUNNING);
goto try_taking_mutex_again;
}
DbgPos("%d cond_timedwait cond=%p mutex=%p: "
"signal_pending() = %d, "
"pending.signal = %lx : %lx\n",
task->pid, cond, mutex,
signal_pending(task),
task->pending.signal.sig[0],
task->pending.signal.sig[1]);
rval = give_up_on_mutex(mutex, m_desc, &waiter);
if (unlikely(m_desc->robust == NOT_RECOVERABLE))
rval = -ENOTRECOVERABLE;
else if (likely(!rval))
rval = -EINTR;
}
break;
case WAITING_ON_CONDITION:
/* We were not woken by cond_signal or cond_broadcast,
* i.e. we timed out or caught a signal. */
if (unlikely(unqueue_from_condition(&waiter, cond, c_desc)))
/* Condition was signaled in the small window
* after wakeup. */
goto retry;
if (unlikely(!signal_pending(task)))
/* We timed out or the signal was handled
* by another thread */
goto retry;
rval = -EINTR;
break;
case NOT_WAITING:
/* We were not moved to the mutex waitqueue.
* Acquire the mutex by ourselves, but first wait to make
* sure that there will be no wake up signals sent after
* we leave kernel. */
raw_spin_unlock_wait(&c_desc->lock);
rval = do_cond_lock(mutex, m_desc, ptr_64);
if (rval == 0 && unlikely(waiter.timedout))
rval = -ETIMEDOUT;
break;
default:
WARN_ON(1);
rval = -EINVAL;
break;
}
DbgPos("cond_timedwait cond=%p mutex=%p: end rval=%d\n",
cond, mutex, rval);
return rval;
}
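/*
 * Waiter state machine of do_cond_timedwait():
 *
 *	WAITING_ON_CONDITION - still queued on the condition variable;
 *		a wakeup here means a timeout or a signal.
 *	WAITING_ON_MUTEX - cond_signal()/cond_broadcast() moved us to
 *		the mutex waitqueue; the mutex must be (re)acquired
 *		before returning.
 *	NOT_WAITING - woken without being requeued on the mutex, so
 *		the mutex is reacquired directly via do_cond_lock().
 */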
static __always_inline void queue_on_barrier(struct task_struct *const task,
struct el_barrier_waiter *const waiter,
struct barr_desc *const b_desc)
{
++b_desc->present;
/*
* This thread is added to wait queue either with
* its own priority if it is a real-time thread or
* with MAX_RT_PRIO if it is non-RT thread. This
* way RT threads get woken up in priority order
* and non-RT threads get woken up in FIFO order.
*/
plist_node_init(&waiter->list_entry, min(task->prio, MAX_RT_PRIO));
plist_add(&waiter->list_entry, &b_desc->wait_list);
waiter->task = task;
waiter->b_desc = b_desc;
waiter->state = WAITING_ON_BARRIER;
}
/* Wake all from queue in the descriptor @b_desc. Maximum possible number
* of waiters is @list_size while actual number can be less.
* Must be called with the spinlock held. */
static void wake_barrier_waiters(struct task_struct *const task,
const unsigned int list_size, struct barr_desc *const b_desc)
{
struct el_waiter *waiter;
struct plist_node pi_list_entry = PLIST_NODE_INIT(pi_list_entry,
MAX_PRIO-1);
if (list_size <= WAKE_AT_MOST) {
DbgPos("waking waiters: list not detached, waking %d threads\n",
list_size);
plist_for_each_entry(waiter, &b_desc->wait_list, list_entry) {
waiter->state = NOT_WAITING;
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
}
plist_head_init(&b_desc->wait_list);
} else {
int i;
struct plist_head to_move_list;
struct el_waiter *next;
/* Do not disable interrupts for long periods of time.
 * Detach the list of waiting threads from the barrier
 * so that we can drop the spinlock from time to time. */
if (unlikely(plist_head_empty(&b_desc->wait_list))) {
raw_spin_unlock_irq(&b_desc->lock);
return;
}
to_move_list = b_desc->wait_list;
to_move_list.node_list.next->prev = &to_move_list.node_list;
to_move_list.node_list.prev->next = &to_move_list.node_list;
plist_head_init(&b_desc->wait_list);
i = WAKE_AT_MOST >> 1;
continue_wake:
plist_for_each_entry_safe(waiter, next, &to_move_list,
list_entry) {
plist_del(&waiter->list_entry, &to_move_list);
waiter->state = NOT_WAITING;
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
if (unlikely(++i >= WAKE_AT_MOST
&& !plist_head_empty(&to_move_list))) {
boost_priority(next->list_entry.prio,
&pi_list_entry);
raw_spin_unlock_irq(&b_desc->lock);
cpu_relax();
raw_spin_lock_irq(&b_desc->lock);
i = 0;
goto continue_wake;
}
}
}
raw_spin_unlock_irq(&b_desc->lock);
/* Restore normal priority if we detached the waiters list. */
if (list_size > WAKE_AT_MOST)
restore_priority(&pi_list_entry);
}
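/*
 * Chunked wakeup note: for large barriers the waiters list is detached
 * and woken WAKE_AT_MOST threads at a time, briefly re-enabling
 * interrupts in between. boost_priority() temporarily raises the
 * waker to the priority of the next waiter to be woken - presumably so
 * that already-woken threads cannot preempt it before the whole list
 * is processed - and restore_priority() drops the boost afterwards.
 */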
static int do_barrier_wait(struct pthread_barrier_s *const barr,
const unsigned int required, const int restarted,
const int __b_desc)
{
struct task_struct *const task = current;
struct el_barrier_waiter *waiter;
struct barr_desc *const b_desc = barr_once(task, barr, __b_desc);
DbgPos("pbarrier_wait: started for barr %p, descr=%p\n", barr, b_desc);
if (unlikely(IS_ERR(b_desc)))
return PTR_ERR(b_desc);
if (unlikely(restarted)) {
waiter = &task->el_posix.barr_waiter;
goto again;
}
raw_spin_lock_irq(&b_desc->lock);
if (check_desc(b_desc, BARRIER, barr)) {
raw_spin_unlock_irq(&b_desc->lock);
return -EINVAL;
}
if (unlikely((b_desc->present + 1) == required)
&& likely(!restarted)) {
/* Wake everyone */
b_desc->present = 0;
DbgPos("pbarrier_wait: barr %p, I am the waker\n", barr);
wake_barrier_waiters(task, required, b_desc);
/* One thread must return PTHREAD_BARRIER_SERIAL_THREAD */
return 1;
} else {
/* Queue ourselves */
waiter = &task->el_posix.barr_waiter;
queue_on_barrier(task, waiter, b_desc);
/* Since we are going to call schedule() right away,
* there is no need to check preemption. */
raw_spin_unlock_irq_no_resched(&b_desc->lock);
again:
DbgPos("pbarrier_wait: barr %p, before schedule\n", barr);
set_task_state(task, TASK_INTERRUPTIBLE);
if (likely(waiter->state == WAITING_ON_BARRIER))
schedule();
else
preempt_check_resched();
__set_task_state(task, TASK_RUNNING);
DbgPos("pbarrier_wait: barr %p, after schedule\n", barr);
if (unlikely(waiter->state == WAITING_ON_BARRIER)) {
/* Signal caught */
if (unlikely(!signal_pending(task)))
goto again;
if (!restarted)
return -EINTR;
else
return -ERESTARTNOINTR;
}
return 0;
}
}
struct el_sem_waiter {
int state;
struct plist_node list_entry;
struct task_struct *task;
};
#if !defined ARCH_HAS_ATOMIC_CMPXCHG
static int do_sem_getvalue(struct posix_sem_s *__restrict const sem,
const int __s_desc)
{
struct sem_desc *s_desc = sem_once(current, sem, __s_desc);
int rval;
if (unlikely(IS_ERR(s_desc)))
return PTR_ERR(s_desc);
raw_spin_lock_irq(&s_desc->lock);
if (check_desc(s_desc, SEMAPHORE, sem))
rval = -EINVAL;
else
rval = s_desc->value;
raw_spin_unlock_irq(&s_desc->lock);
return rval;
}
static int do_sem_post(struct posix_sem_s *__restrict const sem,
const int __s_desc)
{
struct sem_desc *s_desc = sem_once(current, sem, __s_desc);
DbgPos("sem_post: sem %p - started\n", sem);
if (unlikely(IS_ERR(s_desc)))
return PTR_ERR(s_desc);
raw_spin_lock_irq(&s_desc->lock);
if (check_desc(s_desc, SEMAPHORE, sem)) {
raw_spin_unlock_irq(&s_desc->lock);
return -EINVAL;
}
if (unlikely(s_desc->value == INT_MAX)) {
raw_spin_unlock_irq(&s_desc->lock);
return -EOVERFLOW;
}
++s_desc->value;
if (!plist_head_empty(&s_desc->wait_list)) {
struct el_sem_waiter *this;
int i;
/* Wake the waiters. */
i = 0;
plist_for_each_entry(this, &s_desc->wait_list, list_entry) {
this->state = NOT_WAITING;
wake_up_state(this->task, TASK_INTERRUPTIBLE);
++i;
if (i >= WAKE_AT_MOST || i == s_desc->value)
break;
}
}
raw_spin_unlock_irq(&s_desc->lock);
return 0;
}
static __always_inline void queue_on_semaphore(
struct task_struct *const task,
struct el_sem_waiter *const waiter,
struct posix_sem_s *const sem,
struct sem_desc *const s_desc)
{
/*
* This thread is added to wait queue either with
* its own priority if it is a real-time thread or
* with MAX_RT_PRIO if it is non-RT thread. This
* way RT threads get woken up in priority order
* and non-RT threads get woken up in FIFO order.
*/
waiter->task = task;
waiter->state = WAITING_ON_SEMAPHORE;
plist_node_init(&waiter->list_entry, min(task->prio, MAX_RT_PRIO));
plist_add(&waiter->list_entry, &s_desc->wait_list);
}
static int do_sem_timedwait(struct posix_sem_s *__restrict const sem,
struct timespec_64 *__restrict const abstime,
const int __s_desc, const int try)
{
struct timespec_64 iabstime;
struct task_struct *const task = current;
struct sem_desc *s_desc = sem_once(task, sem, __s_desc);
struct el_sem_waiter waiter;
int rval, timedout;
DbgPos("sem_wait: sem %p - started\n", sem);
if (unlikely(IS_ERR(s_desc)))
return PTR_ERR(s_desc);
if (unlikely(abstime)) {
if (unlikely(copy_from_user(&iabstime, abstime,
sizeof(iabstime))))
return -EFAULT;
if (unlikely(iabstime.tv_nsec < 0
|| iabstime.tv_nsec >= 1000000000))
return -EINVAL;
}
raw_spin_lock_irq(&s_desc->lock);
if (check_desc(s_desc, SEMAPHORE, sem)) {
rval = -EINVAL;
goto out_unlock;
}
if (s_desc->value) {
--s_desc->value;
goto success_unlock;
}
if (unlikely(try)) {
rval = -EAGAIN;
goto out_unlock;
}
queue_on_semaphore(task, &waiter, sem, s_desc);
raw_spin_unlock_irq_no_resched(&s_desc->lock);
sleep:
DbgPos("sem_wait sem=%p: before schedule()\n", sem);
set_current_state(TASK_INTERRUPTIBLE);
timedout = 0;
if (likely(waiter.state == WAITING_ON_SEMAPHORE)) {
if (likely(!abstime))
schedule();
else
timedout = schedule_with_timeout(task,
CLOCK_REALTIME, &iabstime);
} else {
preempt_check_resched();
}
__set_task_state(task, TASK_RUNNING);
DbgPos("sem_wait semaphore %p: after schedule\n", sem);
raw_spin_lock_irq(&s_desc->lock);
if (unlikely(waiter.state == WAITING_ON_SEMAPHORE)) {
/* Task caught signal or timed out */
if (unlikely(!signal_pending(task) && !timedout)) {
raw_spin_unlock_irq_no_resched(&s_desc->lock);
goto sleep;
}
if (timedout)
rval = -ETIMEDOUT;
else
rval = -EINTR;
plist_del(&waiter.list_entry, &s_desc->wait_list);
goto out_unlock;
}
if (unlikely(!s_desc->value)) {
/* Someone was faster. */
waiter.state = WAITING_ON_SEMAPHORE;
raw_spin_unlock_irq_no_resched(&s_desc->lock);
goto sleep;
}
plist_del(&waiter.list_entry, &s_desc->wait_list);
--s_desc->value;
success_unlock:
raw_spin_unlock_irq(&s_desc->lock);
return 0;
out_unlock:
raw_spin_unlock_irq(&s_desc->lock);
return rval;
}
#else
static int do_sem_post(struct posix_sem_s *__restrict const sem,
const int __s_desc)
{
struct task_struct *const task = current;
struct sem_desc *s_desc = sem_once(task, sem, __s_desc);
int __s_value;
DbgPos("sem_post: sem %p - started\n", sem);
if (unlikely(IS_ERR(s_desc)))
return PTR_ERR(s_desc);
if (unlikely(__get_user(__s_value, &sem->__s_value)))
return -EINVAL;
/* We can safely read s_desc->waiters_nr here even if the descriptor
* is bad since the only effect would be immediate return. */
if (unlikely(!__s_value || !s_desc->waiters_nr))
/* Somebody else has done sem_wait() while we were
* entering the kernel or there are no waiters. */
return 0;
restart:
raw_spin_lock_irq(&s_desc->lock);
if (check_desc(s_desc, SEMAPHORE, sem)) {
raw_spin_unlock_irq(&s_desc->lock);
return -EINVAL;
}
el_pagefault_disable();
if (s_desc->waiters_nr) {
struct el_sem_waiter *this;
int i;
/* Find how many waiters we will wake. */
i = 0;
plist_for_each_entry(this, &s_desc->wait_list, list_entry) {
++i;
if (i >= WAKE_AT_MOST)
break;
}
if (i > __s_value)
i = __s_value;
/* Store the new waiters number in sem->__s_waiters. */
if (unlikely(__put_user(s_desc->waiters_nr - i,
&sem->__s_waiters))) {
raw_spin_unlock_irq(&s_desc->lock);
el_pagefault_enable();
if (handle_fault((unsigned long) &sem->__s_waiters))
return -EFAULT;
goto restart;
}
/* Store the new waiters number in s_desc->waiters_nr. */
s_desc->waiters_nr -= i;
/* Wake the waiters. */
for (; i > 0; i--) {
this = plist_first_entry(&s_desc->wait_list,
struct el_sem_waiter, list_entry);
plist_del(&this->list_entry, &s_desc->wait_list);
this->state = NOT_WAITING;
wake_up_state(this->task, TASK_INTERRUPTIBLE);
}
}
raw_spin_unlock_irq(&s_desc->lock);
el_pagefault_enable();
return 0;
}
static __always_inline int queue_on_semaphore(
struct task_struct *const task,
struct el_sem_waiter *const waiter,
struct posix_sem_s *const sem,
struct sem_desc *const s_desc)
{
restart:
raw_spin_lock_irq(&s_desc->lock);
if (check_desc(s_desc, SEMAPHORE, sem)) {
raw_spin_unlock_irq(&s_desc->lock);
return -EINVAL;
}
el_pagefault_disable();
if (unlikely(__put_user(s_desc->waiters_nr + 1, &sem->__s_waiters))) {
raw_spin_unlock_irq(&s_desc->lock);
el_pagefault_enable();
if (handle_fault((unsigned long) &sem->__s_waiters))
return -EFAULT;
goto restart;
}
++s_desc->waiters_nr;
/*
* This thread is added to wait queue either with
* its own priority if it is a real-time thread or
* with MAX_RT_PRIO if it is non-RT thread. This
* way RT threads get woken up in priority order
* and non-RT threads get woken up in FIFO order.
*/
waiter->task = task;
waiter->state = WAITING_ON_SEMAPHORE;
plist_node_init(&waiter->list_entry, min(task->prio, MAX_RT_PRIO));
plist_add(&waiter->list_entry, &s_desc->wait_list);
raw_spin_unlock_irq_no_resched(&s_desc->lock);
el_pagefault_enable();
return 0;
}
static int unqueue_from_semaphore(
struct el_sem_waiter *const waiter,
struct posix_sem_s *const sem,
struct sem_desc *const s_desc)
{
restart:
raw_spin_lock_irq(&s_desc->lock);
if (unlikely(waiter->state != WAITING_ON_SEMAPHORE)) {
raw_spin_unlock_irq(&s_desc->lock);
return 1;
}
el_pagefault_disable();
if (unlikely(__put_user(s_desc->waiters_nr - 1, &sem->__s_waiters))) {
raw_spin_unlock_irq(&s_desc->lock);
el_pagefault_enable();
if (handle_fault((unsigned long) &sem->__s_waiters))
return -EFAULT;
goto restart;
}
--s_desc->waiters_nr;
plist_del(&waiter->list_entry, &s_desc->wait_list);
waiter->state = NOT_WAITING;
raw_spin_unlock_irq(&s_desc->lock);
el_pagefault_enable();
return 0;
}
static int do_sem_timedwait(struct posix_sem_s *__restrict const sem,
struct timespec_64 *__restrict const abstime,
const int __s_desc)
{
struct timespec_64 iabstime;
struct task_struct *const task = current;
struct sem_desc *s_desc = sem_once(task, sem, __s_desc);
struct el_sem_waiter waiter;
int rval, __s_value, timedout;
DbgPos("sem_post: sem %p - started\n", sem);
if (unlikely(IS_ERR(s_desc)))
return PTR_ERR(s_desc);
if (unlikely(abstime)) {
if (unlikely(copy_from_user(&iabstime, abstime,
sizeof(iabstime))))
return -EFAULT;
if (unlikely(iabstime.tv_nsec < 0
|| iabstime.tv_nsec >= 1000000000))
return -EINVAL;
}
rval = queue_on_semaphore(task, &waiter, sem, s_desc);
if (unlikely(rval))
return rval;
/* Order is important here: first we set __s_waiters field in
* queue_on_semaphore(), and only after that we read __s_value field. */
smp_mb();
if (unlikely(__get_user(__s_value, &sem->__s_value))) {
if (unlikely(__get_user(__s_value, &sem->__s_value)))
return -EFAULT;
}
sleep:
DbgPos("sem_wait sem=%p: before schedule()\n", sem);
set_current_state(TASK_INTERRUPTIBLE);
timedout = 0;
if (likely(!__s_value && waiter.state == WAITING_ON_SEMAPHORE)) {
if (likely(!abstime))
schedule();
else
timedout = schedule_with_timeout(task,
CLOCK_REALTIME, &iabstime);
} else {
preempt_check_resched();
}
__set_task_state(task, TASK_RUNNING);
DbgPos("sem_wait semaphore %p: after schedule\n", sem);
if (unlikely(waiter.state == WAITING_ON_SEMAPHORE)) {
/* __s_value was not 0, task caught signal or timed out */
if (unlikely(!__s_value && !signal_pending(task)
&& !timedout)) {
if (likely(!__get_user(__s_value, &sem->__s_value)))
goto sleep;
if (unqueue_from_semaphore(&waiter, sem, s_desc) == 1)
goto success;
rval = -EFAULT;
} else {
rval = unqueue_from_semaphore(&waiter, sem, s_desc);
if (unlikely(__s_value > 0 || rval == 1))
goto success;
if (likely(!rval)) {
if (timedout)
rval = -ETIMEDOUT;
else
rval = -EINTR;
}
}
return rval;
}
success:
return 0;
}
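/*
 * Ordering between sem_wait() and sem_post() in this variant: the
 * waiter first publishes itself in sem->__s_waiters (in
 * queue_on_semaphore()) and only then reads sem->__s_value, with the
 * smp_mb() above in between; the poster is assumed to update
 * __s_value in userspace before reading the waiter count (checked via
 * s_desc->waiters_nr in do_sem_post()). This way at least one side
 * observes the other, so a post cannot be lost between the value
 * check and schedule().
 */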
#endif
#ifdef CONFIG_SMP
struct thread_data {
struct completion done;
unsigned long long start_time;
unsigned long long end_time;
long khz;
unsigned int iterations;
};
/* Placing the semaphore on the stack defeats the lockdep mechanism, so make it global. */
struct semaphore cs_sem __initdata = __SEMAPHORE_INITIALIZER(cs_sem, 0);
static int __init cs_thread(void *data)
{
struct thread_data *td = (struct thread_data *) data;
int i;
down(&cs_sem);
td->start_time = sched_clock();
up(&cs_sem);
for (i = 0; (kthread_should_stop() == 0)
&& ((i < (td->khz >> 7)) || i < 10); i++)
yield();
td->end_time = sched_clock();
td->iterations = i;
complete(&td->done);
/* Wait for termination */
set_current_state(TASK_INTERRUPTIBLE);
while (kthread_should_stop() == 0) {
schedule();
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
return 0;
}
static int __init
elpthread_init(void)
{
struct sched_param param;
struct task_struct *th1, *th2;
struct thread_data thread1_data, thread2_data;
unsigned long long start_time, end_time;
unsigned int switch_cost, iterations;
long khz;
int rval;
init_rwsem(&shared.lock);
khz = sys_el_posix(EL_GET_CPU_KHZ, 0, 0, 0, 0);
if (khz <= 0)
return 0;
thread1_data.iterations = 0;
thread1_data.khz = khz;
thread1_data.done = COMPLETION_INITIALIZER_ONSTACK(thread1_data.done);
thread2_data.iterations = 0;
thread2_data.khz = khz;
thread2_data.done = COMPLETION_INITIALIZER_ONSTACK(thread2_data.done);
th1 = kthread_create(&cs_thread, &thread1_data, "cs_thread1");
if (IS_ERR(th1))
goto failed;
th2 = kthread_create(&cs_thread, &thread2_data, "cs_thread2");
if (IS_ERR(th2))
goto failed_cleanup_thread;
kthread_bind(th1, 0);
kthread_bind(th2, 0);
param.sched_priority = MAX_RT_PRIO - 1;
if ((rval = sched_setscheduler_nocheck(th1, SCHED_FIFO, &param))
|| (rval = sched_setscheduler_nocheck(th2, SCHED_FIFO,
&param)))
goto failed_cleanup_threads;
printk(KERN_INFO "Measuring thread context switch cost...\n");
wake_up_process(th1);
wake_up_process(th2);
up(&cs_sem);
wait_for_completion(&thread1_data.done);
kthread_stop(th1);
wait_for_completion(&thread2_data.done);
kthread_stop(th2);
iterations = thread1_data.iterations + thread2_data.iterations;
if (iterations == 0)
goto failed;
start_time = (thread1_data.start_time > thread2_data.start_time)
? thread1_data.start_time : thread2_data.start_time;
end_time = (thread1_data.end_time < thread2_data.end_time)
? thread1_data.end_time : thread2_data.end_time;
/* Use 3x multiplier to account for cache invalidation,
* system call cost and time spent in the library. */
switch_cost = 3 * (((unsigned int) (end_time - start_time)) /
iterations);
printk(KERN_INFO "%d iterations in %lld nanoseconds\n",
iterations, end_time - start_time);
/* Convert to processor cycles. */
switch_cost = (switch_cost * ((unsigned int) (khz >> 10))) >> 10;
if (switch_cost > (4 * ASSUMED_SWITCH_COST))
/* Looks like there was some other high-priority thread
* interfering with this computation. */
goto failed;
printk(KERN_INFO "Thread context switch cost is %d cycles "
"(CPU at %ld kHz).\n", switch_cost, khz);
context_switch_cost = switch_cost;
return 0;
failed_cleanup_threads:
kthread_stop(th2);
up(&cs_sem);
wait_for_completion(&thread2_data.done);
failed_cleanup_thread:
kthread_stop(th1);
up(&cs_sem);
wait_for_completion(&thread1_data.done);
failed:
printk(KERN_INFO "Measuring thread context switch cost failed\n");
return 0;
}
late_initcall(elpthread_init);
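/*
 * A worked example of the calibration above: suppose the two threads
 * together made 2048 yield() iterations in 4096000 ns on a 1000000 kHz
 * (1 GHz) processor. Then:
 *
 *	switch_cost = 3 * (4096000 / 2048);		// 6000 ns
 *	switch_cost = (6000 * (1000000 >> 10)) >> 10;	// 5718 cycles
 *
 * The result replaces the ASSUMED_SWITCH_COST default unless it is
 * implausibly large (more than 4 * ASSUMED_SWITCH_COST), in which
 * case the measurement is discarded.
 */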
#endif
/**
* do_main_init() - the first and only initialization function
* a process must call before anything else.
* @cs_cost: the context switch cost measured by the kernel is written
* at this address and is used to implement adaptive spinning.
* @kernel_flags: a set of flags indicating some ABI peculiarities of
* the kernel el_posix implementation is passed to the library here.
*/
static int do_main_init(unsigned int *cs_cost, unsigned int *kernel_flags)
{
int new_flags;
DbgPos("pthread_main_init\n");
if (cs_cost && __put_user(context_switch_cost, cs_cost))
return -EFAULT;
if (!kernel_flags) {
if (printk_ratelimit())
pr_info("elpthread library is too old, "
"please update\n");
return -ENOSYS;
}
/*
* Bits in 'kernel_flags':
* 0x1 - 'kernel_flags' parameter is supported (always set to 1)
* 0x2 - kernel uses atomic cmpxchg instruction
* 0x4 - updated PTHREAD_PRIO_PROTECT implementation which supports
* fast unlocking.
* 0x8 - use -1 instead of 2 in mutex->__m_lock for PTHREAD_PRIO_NONE
* mutexes to mark fast unlocking impossible (only has meaning for
* !ARCH_HAS_ATOMIC_CMPXCHG architectures because only in them
* library actually uses this value).
*/
new_flags = 0x1 | 0x4;
#if defined ARCH_HAS_ATOMIC_CMPXCHG
new_flags |= 0x2;
#else
new_flags |= 0x8;
#endif
if (__put_user(new_flags, kernel_flags))
return -EFAULT;
return 0;
}
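/*
 * A minimal sketch of how the library side might consume these values
 * (illustrative only; the request code EL_MAIN_INIT and variable names
 * are assumptions, not taken from this file):
 *
 *	unsigned int cs_cost, flags;
 *	if (el_posix(EL_MAIN_INIT, &cs_cost, &flags, NULL, NULL) == 0) {
 *		use_cmpxchg = !!(flags & 0x2);
 *		have_fast_pp_unlock = !!(flags & 0x4);
 *		locked_no_waiters = (flags & 0x8) ? -1 : 2;
 *	}
 */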
void el_posix_lock(unsigned long clone_flags)
{
if (!(clone_flags & (CLONE_VM | CLONE_VFORK))
&& current->mm
&& current->mm->el_posix.others)
/* This task uses el_posix and is forking. */
down_read(&current->mm->el_posix.lock);
}
void el_posix_unlock(unsigned long clone_flags)
{
if (!(clone_flags & (CLONE_VM | CLONE_VFORK))
&& current->mm
&& current->mm->el_posix.others)
/* This task uses el_posix and is forking. */
up_read(&current->mm->el_posix.lock);
}
void el_posix_init(struct task_struct *task)
{
INIT_LIST_HEAD(&task->el_posix.pi_mutex_list);
plist_head_init(&task->el_posix.pp_mutex_list);
plist_node_init(&task->el_posix.pi_list_entry, MAX_PRIO-1);
plist_head_init(&task->el_posix.pi_waiters);
task->el_posix.barr_waiter.state = NOT_WAITING;
task->el_posix.pi_blocked_on = NULL;
}
static void copy_block(s8 *from, s8 *to, int sz, enum types type)
{
int i;
*((struct zero_cell *) to) = *((struct zero_cell *) from);
for (i = 1; i < DESCS_NUMBER; i++) {
struct common_desc *from_desc =
(struct common_desc *) (from + i * sz);
struct common_desc *to_desc =
(struct common_desc *) (to + i * sz);
to_desc->next_free = from_desc->next_free;
if (type != MUTEX) {
to_desc->desc_type = from_desc->desc_type;
} else {
((struct mutex_desc *) to_desc)->protocol =
((struct mutex_desc *) from_desc)->protocol;
((struct mutex_desc *) to_desc)->type =
((struct mutex_desc *) from_desc)->type;
}
raw_spin_lock_init(&to_desc->lock);
plist_head_init(&to_desc->wait_list);
}
}
/* Copies private descriptors when forking */
static int copy_blocks(struct allocated_descs_common *old,
struct allocated_descs_common **to, const enum types type)
{
int i, j, sz;
struct allocated_descs_common *new;
if (!old)
return 0;
switch (type) {
case MUTEX:
new = kzalloc(sizeof(struct allocated_private_mutex_descs),
GFP_USER);
if (!new)
goto bad;
new->blocks[1] = &((struct allocated_private_mutex_descs *)
new)->first_block;
break;
case OTHER:
new = kzalloc(sizeof(struct allocated_private_other_descs),
GFP_USER);
if (!new)
goto bad;
new->blocks[1] = &((struct allocated_private_other_descs *)
new)->first_block;
break;
default:
goto bad;
}
sz = get_sz(1, type);
new->free_block = old->free_block;
new->used_blocks = old->used_blocks;
copy_block(old->blocks[1], new->blocks[1], sz, type);
for (i = 2; i < BLOCKS_NUMBER && old->blocks[i]; i++) {
new->blocks[i] = kzalloc(DESCS_NUMBER * sz, GFP_USER);
if (!new->blocks[i])
goto bad_cleanup;
copy_block(old->blocks[i], new->blocks[i], sz, type);
}
*to = new;
return 0;
bad_cleanup:
for (j = 2; j < i; j++)
kfree(new->blocks[j]);
kfree(new);
bad:
return -ENOMEM;
}
static void free_blocks(struct allocated_descs_common *all_blocks)
{
int i;
if (!all_blocks)
return;
for (i = 2; i < BLOCKS_NUMBER && all_blocks->blocks[i]; i++)
kfree(all_blocks->blocks[i]);
kfree(all_blocks);
}
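/*
 * Block layout note: blocks[1] points into the allocated_private_*
 * structure itself (its first_block member is embedded), while
 * blocks[2..] are separate kzalloc() allocations. That is why both
 * copy_blocks() and free_blocks() iterate from index 2 and never free
 * blocks[1] on its own.
 */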
/* Called when forking */
int dup_mm_el_posix(struct mm_struct *oldmm, struct mm_struct *mm,
unsigned long clone_flags)
{
int rval, copied_descs_num;
if (clone_flags & CLONE_VFORK) {
/* The new task will not use these. */
mm->el_posix.mutexes = NULL;
mm->el_posix.others = NULL;
return 0;
}
if (oldmm->el_posix.mutexes) {
struct allocated_private_mutex_descs *mutexes;
mutexes = kzalloc(sizeof(*mutexes), GFP_USER);
if (!mutexes) {
rval = -ENOMEM;
goto bad;
}
mutexes->free_block = 1;
mutexes->used_blocks = 1;
mutexes->blocks[1] = &mutexes->first_block;
block_init((s8 *) &mutexes->first_block, get_sz(1, MUTEX), 1);
mm->el_posix.mutexes = mutexes;
}
rval = copy_blocks((struct allocated_descs_common *)
oldmm->el_posix.others,
(struct allocated_descs_common **)
&mm->el_posix.others,
OTHER);
if (rval)
goto bad_cleanup_mutexes;
copied_descs_num = 0;
if (mm->el_posix.mutexes)
copied_descs_num += mm->el_posix.mutexes->used_blocks
* DESCS_NUMBER;
if (mm->el_posix.others)
copied_descs_num += mm->el_posix.others->used_blocks
* DESCS_NUMBER;
if (oldmm->el_posix.user) {
rval = add_descriptors_count(1, copied_descs_num,
oldmm->el_posix.user);
if (rval)
goto bad_cleanup_others;
mm->el_posix.user = get_uid(oldmm->el_posix.user);
}
return 0;
bad_cleanup_others:
free_blocks((struct allocated_descs_common *) mm->el_posix.others);
mm->el_posix.others = NULL;
bad_cleanup_mutexes:
free_blocks((struct allocated_descs_common *) mm->el_posix.mutexes);
mm->el_posix.mutexes = NULL;
bad:
return rval;
}
/* Called from __mmdrop(). */
void el_posix_mm_destroy(struct mm_struct *mm)
{
int freed_descs_number;
if (!list_empty(&mm->el_posix.shared_objects)) {
unsigned long flags;
raw_spin_lock_irqsave(&freed_shared_descs_lock, flags);
list_splice_init(&mm->el_posix.shared_objects,
&freed_shared_descs);
raw_spin_unlock_irqrestore(&freed_shared_descs_lock, flags);
}
freed_descs_number = 0;
if (mm->el_posix.mutexes) {
freed_descs_number += mm->el_posix.mutexes->used_blocks
* DESCS_NUMBER;
free_blocks((struct allocated_descs_common *)
mm->el_posix.mutexes);
mm->el_posix.mutexes = NULL;
}
if (mm->el_posix.others) {
freed_descs_number += mm->el_posix.others->used_blocks
* DESCS_NUMBER;
free_blocks((struct allocated_descs_common *)
mm->el_posix.others);
mm->el_posix.others = NULL;
}
if (mm->el_posix.user) {
sub_descriptors_count(1, freed_descs_number, mm->el_posix.user);
free_uid(mm->el_posix.user);
mm->el_posix.user = NULL;
}
}
/* Called when task is dying and unlocking all the mutexes it owns. */
static void remove_mutex_desc(struct mutex_desc *const m_desc,
struct task_struct *const task, const int protocol)
{
unsigned long flags;
#if defined ARCH_HAS_ATOMIC_CMPXCHG
struct pthread_mutex_s *mutex;
#endif
/* We can do this check without any locks because
* we own the mutex so it cannot be freed. */
WARN_ON((int) m_desc->protocol != protocol);
#if defined ARCH_HAS_ATOMIC_CMPXCHG
again:
#endif
el_pagefault_disable();
raw_spin_lock_irqsave(&m_desc->lock, flags);
if (desc_in_use(m_desc) == 0
|| desc_check_type((void *) m_desc, MUTEX)) {
/* This should not happen because descriptors
* queued in mutex_list cannot be freed. */
WARN_ON_ONCE(1);
if (m_desc->protocol == PTHREAD_PRIO_INHERIT) {
list_del(&m_desc->mutex_list_entry.pi);
} else if (m_desc->protocol == PTHREAD_PRIO_PROTECT) {
plist_del(&m_desc->mutex_list_entry.pp,
&current->el_posix.pp_mutex_list);
if (plist_head_empty(&current->el_posix.pp_mutex_list)
&& !plist_node_empty(
&current->el_posix.pi_list_entry))
plist_del(&current->el_posix.pi_list_entry,
&current->el_posix.pi_waiters);
}
goto out_unlock;
}
DbgPos("exit_el_posix: found mutex with descriptor at %p, robust=%d\n",
m_desc, (int) m_desc->robust);
/*
* If mutex is robust and does not have any waiters then
* we write an invalid pid into mutex->__m_lock.
*
* If mutex is robust and does have waiters we just make
* one of them the next owner.
*/
#if defined ARCH_HAS_ATOMIC_CMPXCHG
if ((m_desc->robust == ROBUST || m_desc->robust == OWNER_DEAD)
&& !mutex_has_waiters(m_desc)) {
int __m_lock;
		/* We have to write an invalid pid into the mutex->__m_lock
		 * field. For details see the explanation before
		 * enum robust_state. */
/* Make sure that OWNER_DEAD mutexes
* always have an owner. */
if (m_desc->robust == OWNER_DEAD)
m_desc->robust = ROBUST;
/* Write invalid pid into mutex->__m_lock */
if (m_desc->private) {
/* Private mutex */
mutex = (struct pthread_mutex_s *)
*desc_to_object(m_desc, MUTEX);
private_mapping:
if (__get_user(__m_lock, &mutex->__m_lock))
goto handle_fault;
DbgPos("exit_el_posix: replacing %d with %ld in "
"mutex->__m_lock for mutex %p\n",
__m_lock, PID_MAX_LIMIT, mutex);
if (__m_lock != -1) {
DbgPos("exit_el_posix: bad __m_lock "
"(mutex %p)\n", mutex);
goto skip_fixing_robust_mutex;
}
if (__put_user(PID_MAX_LIMIT, &mutex->__m_lock))
goto handle_fault;
} else {
/* Shared mutex */
union key_shared *key = desc_to_key(m_desc, MUTEX);
struct page *page;
void *kaddr;
# ifdef CONFIG_HIGHMEM
int page_mapped = 0;
# endif
DbgPos("exit_el_posix: robust mutex (m_desc %p), "
"shared key %p\n", m_desc, key);
if (key->both.offset & 1) {
/* Private mapping */
mutex = (struct pthread_mutex_s *)
key->private.address;
goto private_mapping;
}
/* Shared mapping */
page = find_get_page(key->shared.inode->i_mapping,
key->shared.pgoff);
if (!page) {
DbgPos("exit_el_posix: zero page!\n");
goto skip_fixing_robust_mutex;
}
kaddr = page_address(page);
if (!kaddr) {
DbgPos("exit_el_posix: page_address() "
"returned 0 (high = %d)\n",
PageHighMem(page));
# ifdef CONFIG_HIGHMEM
if (!PageHighMem(page)) {
/* Something strange is happening... */
page_cache_release(page);
goto skip_fixing_robust_mutex;
}
/* So this is a page from high memory, map it */
kaddr = kmap_atomic(page);
page_mapped = 1;
# else
/* Something strange is happening... */
page_cache_release(page);
goto skip_fixing_robust_mutex;
# endif
}
mutex = (struct pthread_mutex_s *)
(kaddr + key->shared.offset);
DbgPos("exit_el_posix: replacing %d with %ld in "
"mutex->__m_lock for mutex with "
"desc %p\n", mutex->__m_lock,
PID_MAX_LIMIT, m_desc);
if (mutex->__m_lock == -1)
mutex->__m_lock = PID_MAX_LIMIT;
else
DbgPos("exit_el_posix: bad __m_lock "
"(desc %p)\n", m_desc);
# ifdef CONFIG_HIGHMEM
if (page_mapped)
kunmap_atomic(kaddr);
# endif
page_cache_release(page);
}
} else
#endif
if (m_desc->robust == ROBUST) {
/* For robust mutexes with waiters
* we change the state to OWNER_DEAD. */
m_desc->robust = OWNER_DEAD;
}
#if defined ARCH_HAS_ATOMIC_CMPXCHG
skip_fixing_robust_mutex:
#endif
raw_spin_lock(&task->pi_lock);
m_desc->owner = NULL;
switch (m_desc->protocol) {
case PTHREAD_PRIO_INHERIT:
__task_unlocked_pi_mutex(task, m_desc);
break;
case PTHREAD_PRIO_PROTECT:
__task_unlocked_pp_mutex(task, m_desc);
break;
default:
		/* Only PP and PI mutexes can have the robust attribute
		 * set and be freed when their owner dies. */
WARN_ON(1);
break;
}
raw_spin_unlock(&task->pi_lock);
if (mutex_has_waiters(m_desc)) {
struct el_waiter *waiter = plist_first_entry(&m_desc->wait_list,
struct el_waiter, list_entry);
m_desc->pending_owner = waiter->task;
wake_up_state(waiter->task, TASK_INTERRUPTIBLE);
}
out_unlock:
raw_spin_unlock_irqrestore(&m_desc->lock, flags);
el_pagefault_enable();
return;
#ifdef ARCH_HAS_ATOMIC_CMPXCHG
handle_fault:
raw_spin_unlock_irqrestore(&m_desc->lock, flags);
el_pagefault_enable();
if (handle_fault((unsigned long) &mutex->__m_lock)) {
		/* Since the underlying mapping has been destroyed
		 * there is no one to inform about robust states
		 * anyway, so just skip it. */
		DbgPos("exit_el_posix: mutex at %p is inaccessible!\n", mutex);
el_pagefault_disable();
raw_spin_lock_irqsave(&m_desc->lock, flags);
goto skip_fixing_robust_mutex;
} else {
goto again;
}
#endif
}
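/*
 * The again/handle_fault dance above is the usual futex-style pattern
 * for touching user memory under a spinlock: the access is attempted
 * with page faults disabled, and on failure the lock is dropped, the
 * page is faulted in and the whole operation retried.  A minimal
 * sketch, assuming a hypothetical fault_in() fixup helper (not part of
 * this file):
 */
#if 0	/* illustrative sketch only, not compiled */
static int poke_user_word(raw_spinlock_t *lock, int __user *uaddr, int val)
{
	int ret;
again:
	pagefault_disable();
	raw_spin_lock(lock);
	ret = __put_user(val, uaddr);	/* may fail: faults are disabled */
	raw_spin_unlock(lock);
	pagefault_enable();
	if (ret) {
		if (fault_in(uaddr))	/* fault the page in, may sleep */
			return -EFAULT;	/* the mapping really is gone */
		goto again;
	}
	return 0;
}
#endif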
void exit_el_posix(struct task_struct *task)
{
struct mm_struct *mm = task->mm;
if (!mm)
return;
if (mm->el_posix.others) {
struct el_barrier_waiter *waiter = &task->el_posix.barr_waiter;
if (unlikely(waiter->state == WAITING_ON_BARRIER)) {
struct barr_desc *const b_desc = waiter->b_desc;
			/* b_desc is a valid address, because this function is
			 * called from the beginning of mm_release() when the
			 * memory descriptor is still valid. */
raw_spin_lock_irq(&b_desc->lock);
if (waiter->state == WAITING_ON_BARRIER) {
				/* There is no way for us to know whether
				 * the wait_list was detached from the
				 * barrier's descriptor, so we cannot
				 * do --b_desc->present, but this is okay. */
plist_del(&waiter->list_entry,
&b_desc->wait_list);
waiter->state = NOT_WAITING;
}
raw_spin_unlock_irq(&b_desc->lock);
}
}
if (mm->el_posix.mutexes) {
struct mutex_desc *m_desc, *next;
WARN_ON(task->el_posix.pi_blocked_on);
		/* Set all robust mutexes owned by the task to the OWNER_DEAD
		 * state. PTHREAD_PRIO_PROTECT and PTHREAD_PRIO_INHERIT
		 * mutexes are queued even if the robust attribute is not set;
		 * this makes it possible to walk the priority chain through
		 * dead tasks and does not notably affect performance. */
list_for_each_entry_safe(m_desc, next,
&task->el_posix.pi_mutex_list,
mutex_list_entry.pi) {
remove_mutex_desc(m_desc, task, PTHREAD_PRIO_INHERIT);
}
plist_for_each_entry_safe(m_desc, next,
&task->el_posix.pp_mutex_list,
mutex_list_entry.pp) {
remove_mutex_desc(m_desc, task, PTHREAD_PRIO_PROTECT);
}
}
WARN_ON(!list_empty(&task->el_posix.pi_mutex_list));
WARN_ON(!plist_head_empty(&task->el_posix.pp_mutex_list));
WARN_ON(!plist_node_empty(&task->el_posix.pi_list_entry));
WARN_ON(!plist_head_empty(&task->el_posix.pi_waiters));
}
static int do_mutex_set_ceiling(struct pthread_mutex_s *const mutex,
const int __m_desc, const int __m_kind_new, const int ptr_64)
{
struct task_struct *const task = current;
unsigned long fault_address = 0;
struct sched_param param;
struct mutex_desc *m_desc;
int __m_kind, rval, prioceiling, rval_unlock;
if (unlikely(((unsigned int) ptr_64) > 1))
return -EINVAL;
if (unlikely(__get_user(__m_kind, &mutex->__m_kind + ptr_64)))
return -EFAULT;
DbgPos("do_mutex_set_ceiling started: mutex %p, __m_kind %x, "
"new __m_kind %x\n", mutex, __m_kind, __m_kind_new);
/*
* Check permissions
*/
m_desc = mutex_once(task, mutex, __m_desc, __m_kind);
if (unlikely(IS_ERR(m_desc)))
return PTR_ERR(m_desc);
if (unlikely(m_desc->protocol != PTHREAD_PRIO_PROTECT)) {
DbgPos("do_mutex_set_ceiling: mutex %p is not protected!\n",
mutex);
return -EINVAL;
}
prioceiling = (__m_kind_new & PTHREAD_MUTEXATTR_PRIO_CEILING_MASK)
>> PTHREAD_MUTEXATTR_PRIO_CEILING_SHIFT;
if (prioceiling < 1 || prioceiling > MAX_USER_RT_PRIO-1) {
DbgPos("do_mutex_set_ceiling: mutex %p, bad prioceiling %d\n",
mutex, prioceiling);
return -EINVAL;
}
/* Allow unprivileged RT tasks to decrease priority. */
if (!capable(CAP_SYS_NICE)) {
unsigned long flags, rlim_rtprio;
if (!lock_task_sighand(task, &flags))
return -EPERM;
rlim_rtprio = task->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
unlock_task_sighand(task, &flags);
if ((SCHED_FIFO != task->policy && !rlim_rtprio)
|| (prioceiling > task->rt_priority
&& prioceiling > rlim_rtprio))
return -EPERM;
}
if (task->policy == SCHED_IDLE)
return -EPERM;
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_task_has_rt_runtime(task))
return -EPERM;
#endif
param.sched_priority = prioceiling;
rval = security_task_setscheduler(task);
if (rval)
return rval;
/*
* Lock the mutex
*/
rval = do_cond_lock(mutex, m_desc, ptr_64);
if (rval)
return rval;
/*
* Change priority ceiling
*/
if (m_desc->protocol == PTHREAD_PRIO_PROTECT) {
		/* We can safely set prioceiling without locking the spinlock
		 * because a locked mutex cannot be freed. (Actually the user
		 * can free it by writing 0 to mutex->__m_lock directly and
		 * then calling mutex_destroy(), but no sane application does
		 * that, and even if one did, nothing too bad would happen.) */
m_desc->prioceiling = (unsigned char)
(MAX_RT_PRIO-1 - prioceiling);
rval = __put_user(__m_kind_new, &mutex->__m_kind + ptr_64);
} else {
rval = -EINVAL;
}
/*
* Unlock the mutex
*/
restart_unlock:
rval_unlock = do_cond_unlock(task, mutex, m_desc,
&fault_address, ptr_64);
DbgPos("do_mutex_set_ceiling: mutex %p, rval_unlock %d, rval %d\n",
mutex, rval_unlock, rval);
if (unlikely(rval_unlock == -EFAULT)) {
if (!handle_fault(fault_address))
goto restart_unlock;
} else if (unlikely(rval_unlock == -ENOTRECOVERABLE)) {
robust_mutex_wake_all(mutex, m_desc);
rval_unlock = 0;
}
if (rval_unlock)
rval = rval_unlock;
return rval;
}
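/*
 * Userspace reaches do_mutex_set_ceiling() through
 * pthread_mutex_setprioceiling(), which per POSIX must lock the mutex,
 * change the ceiling and unlock it, exactly as done above.  A userspace
 * sketch using the standard pthread API (the value 42 is arbitrary and
 * must fall within the 1..MAX_USER_RT_PRIO-1 range checked above):
 */
#if 0	/* illustrative sketch only, not compiled */
#include <pthread.h>
#include <stdio.h>

static void raise_ceiling(pthread_mutex_t *m)
{
	int old, err;

	/* Locks m, updates the ceiling, unlocks m. */
	err = pthread_mutex_setprioceiling(m, 42, &old);
	if (err)
		fprintf(stderr, "setprioceiling failed: %d\n", err);
	else
		printf("ceiling raised from %d to 42\n", old);
}
#endif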
static int do_mutex_consistent(struct pthread_mutex_s *const mutex,
const int __m_kind, const int __m_desc)
{
struct task_struct *const task = current;
struct mutex_desc *const m_desc = mutex_once(task, mutex, __m_desc,
__m_kind);
int rval;
if (unlikely(IS_ERR(m_desc)))
return PTR_ERR(m_desc);
raw_spin_lock_irq(&m_desc->lock);
if (check_desc(m_desc, MUTEX, mutex)) {
rval = -EINVAL;
goto out_unlock;
}
if (m_desc->robust == OWNER_DEAD) {
m_desc->robust = ROBUST;
rval = 0;
} else {
rval = -EINVAL;
}
out_unlock:
raw_spin_unlock_irq(&m_desc->lock);
return rval;
}
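/*
 * do_mutex_consistent() is the kernel half of pthread_mutex_consistent():
 * it moves a mutex whose owner died (OWNER_DEAD) back into the usable
 * ROBUST state.  The expected userspace sequence, using the standard
 * robust mutex API (not specific to this implementation):
 */
#if 0	/* illustrative sketch only, not compiled */
#include <pthread.h>
#include <errno.h>

static int lock_robust(pthread_mutex_t *m)
{
	int err = pthread_mutex_lock(m);

	if (err == EOWNERDEAD) {
		/* The previous owner died; repair the shared state here,
		 * then mark the mutex consistent again (this maps to the
		 * OWNER_DEAD -> ROBUST transition above). */
		err = pthread_mutex_consistent(m);
	}
	return err;	/* 0 on success, with the mutex held */
}
#endif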
static int do_set_unsafe_shared(pid_t pid, int *old_unsafe, int unsafe)
{
struct task_struct *task;
int rval;
#ifdef CONFIG_MCST_RT
if (!capable(CAP_SYS_RESOURCE) && !rts_mode)
#else
if (!capable(CAP_SYS_RESOURCE))
#endif
return -EPERM;
if ((unsigned int) unsafe > 1)
return -EINVAL;
if (!pid) {
task = current;
} else {
rcu_read_lock();
task = __find_task_by_pid_check(pid);
if (task && task->mm)
get_task_struct(task);
rcu_read_unlock();
if (!task || !task->mm)
return -ESRCH;
}
if (old_unsafe) {
if (put_user((int) task->mm->el_posix.unsafe_shared_objects,
old_unsafe)) {
rval = -EFAULT;
goto out_put_task;
}
smp_mb();
}
task->mm->el_posix.unsafe_shared_objects = unsafe;
rval = 0;
out_put_task:
if (pid)
put_task_struct(task);
return rval;
}
#endif /* CONFIG_HAVE_EL_POSIX_SYSCALL */
#ifdef CONFIG_MCST_RT
#ifdef EL_TIMERFD_USING
static inline int el_ctx_lock_irq(struct el_timerfd_ctx *ctx)
{
again:
raw_spin_lock_irq(&ctx->lock);
if (ctx->locked) {
raw_spin_unlock_irq(&ctx->lock);
if (signal_pending(current))
return -ERESTARTSYS;
goto again;
}
return 0;
}
static inline void el_ctx_unlock_irq(struct el_timerfd_ctx *ctx)
{
raw_spin_unlock_irq(&ctx->lock);
}
static int el_timerfd_release(struct inode *inode, struct file *file)
{
struct el_timerfd_ctx *ctx = file->private_data;
hrtimer_cancel(&ctx->tmr);
kfree(ctx);
return 0;
}
static inline int eltfd_populate_user_buf(char __user *buf, size_t count,
u64 ticks, s64 wu_time, ktime_t cb_timeout,
s64 intr_timeout, ktime_t expiried)
{
int res = 0;
s64 nsec;
/* Number of missed ticks */
if (copy_to_user(buf, &ticks, sizeof(s64)))
return -EFAULT;
res = sizeof(s64);
/* Wake up time */
if (count >= 2 * sizeof(s64)) {
buf += sizeof(s64);
if (copy_to_user(buf, &wu_time, sizeof(s64)))
return -EFAULT;
res += sizeof(s64);
}
/* Callback timeout */
if (count >= 3 * sizeof(s64)) {
buf += sizeof(s64);
nsec = ktime_to_ns(cb_timeout);
if (copy_to_user(buf, &nsec, sizeof(s64)))
return -EFAULT;
res += sizeof(s64);
}
/* Latency of hrtimer_interrupt start */
if (count >= 4 * sizeof(s64)) {
buf += sizeof(s64);
nsec = intr_timeout;
if (copy_to_user(buf, &nsec, sizeof(s64)))
return -EFAULT;
res += sizeof(s64);
}
/* Time of timer expiration */
if (count >= 5 * sizeof(s64)) {
buf += sizeof(s64);
nsec = ktime_to_ns(expiried);
if (copy_to_user(buf, &nsec, sizeof(s64)))
return -EFAULT;
res += sizeof(s64);
}
return res;
}
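/*
 * The read() payload built above is a flat array of up to five s64
 * values; how many the caller gets back is governed purely by count.
 * A userspace sketch decoding all five fields (field order taken from
 * the code above; the fd is assumed to come from el_open_timerfd()):
 */
#if 0	/* illustrative sketch only, not compiled */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static void read_tick(int fd)
{
	/* ticks, wakeup ns, callback ns, irq latency ns, expiry ns */
	int64_t v[5];
	ssize_t n = read(fd, v, sizeof(v));

	if (n >= (ssize_t) sizeof(int64_t))
		printf("missed ticks: %lld\n", (long long) v[0]);
	if (n == sizeof(v))
		printf("wakeup %lld cb %lld irq %lld expiry %lld\n",
		       (long long) v[1], (long long) v[2],
		       (long long) v[3], (long long) v[4]);
}
#endif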
static ssize_t el_timerfd_read(struct file *file, char __user *buf, size_t count,
loff_t *ppos)
{
struct el_timerfd_ctx *ctx = file->private_data;
struct el_wait_queue_head wait = { .task = current,
.wuc_time = KTIME_MAX };
s64 wu_time, intr_timeout;
u64 ticks;
ktime_t remaining;
ktime_t cb_timeout;
ktime_t expiried;
int res;
if (count < sizeof(s64))
return -EINVAL;
if (el_ctx_lock_irq(ctx))
return -ERESTARTSYS;
ticks = ctx->ticks;
/* Have we missed at least a tick? */
if (ctx->handled_ticks != ticks) {
ctx->handled_ticks = ticks;
cb_timeout = ctx->cb_timeout;
intr_timeout = ctx->tmr.intr_timeout;
expiried = ctx->expiried;
ctx->tmr.intr_timeout = 0;
el_ctx_unlock_irq(ctx);
wu_time = 0;
goto copy_to_user;
}
	/* We have to wait for the next tick */
list_add(&wait.task_list, &ctx->wqh.task_list);
set_current_state(TASK_INTERRUPTIBLE);
el_ctx_unlock_irq(ctx);
while (1) {
ktime_t now;
schedule();
now = ktime_get();
		if (el_ctx_lock_irq(ctx)) {
			/* Interrupted by a signal: take the raw lock
			 * directly to unhook our on-stack wait entry
			 * (the "locked" holder never touches the queue). */
			raw_spin_lock_irq(&ctx->lock);
			list_del(&wait.task_list);
			raw_spin_unlock_irq(&ctx->lock);
			__set_current_state(TASK_RUNNING);
			return -ERESTARTSYS;
		}
		/* Did we get a new tick? */
if (ticks != ctx->ticks) {
ticks = ctx->ticks;
remaining = ktime_sub(now, wait.wuc_time);
wu_time = ktime_to_ns(remaining);
cb_timeout = ctx->cb_timeout;
intr_timeout = ctx->tmr.intr_timeout;
expiried = ctx->expiried;
ctx->handled_ticks = ticks;
ctx->tmr.intr_timeout = 0;
WARN_ON_ONCE(wait.wuc_time == KTIME_MAX);
res = 0;
break;
}
set_current_state(TASK_INTERRUPTIBLE);
el_ctx_unlock_irq(ctx);
}
list_del(&wait.task_list);
__set_current_state(TASK_RUNNING);
el_ctx_unlock_irq(ctx);
	if (wu_time < 0)
		return -EAGAIN; /* should not happen: the timer must have expired */
copy_to_user:
res = eltfd_populate_user_buf(buf, count, ticks, wu_time, cb_timeout,
intr_timeout, expiried);
	return res;
}
static const struct file_operations el_timerfd_fops = {
.release = el_timerfd_release,
.read = el_timerfd_read,
};
static int el_open_timerfd(void)
{
struct el_timerfd_ctx *ctx;
int ufd;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
hrtimer_init(&ctx->tmr, CLOCK_REALTIME, HRTIMER_MODE_ABS_HARD);
raw_spin_lock_init(&ctx->lock);
INIT_LIST_HEAD(&ctx->wqh.task_list);
ctx->locked = 0;
ctx->ticks = 0;
ctx->handled_ticks = 0;
ufd = anon_inode_getfd("[el_timerfd]", &el_timerfd_fops, ctx, 0);
if (ufd < 0)
kfree(ctx);
return ufd;
}
static struct file *el_timerfd_fget(int fd)
{
struct file *file;
file = fget(fd);
if (!file)
return ERR_PTR(-EBADF);
if (file->f_op != &el_timerfd_fops) {
fput(file);
return ERR_PTR(-EINVAL);
}
return file;
}
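/*
 * el_timerfd_fget() uses the fops pointer itself as a type tag:
 * comparing file->f_op against &el_timerfd_fops is the idiomatic way to
 * check that a user-supplied fd was created by anon_inode_getfd() above
 * and not by something else.  The same check, distilled into a generic
 * helper (hypothetical, not part of this file):
 */
#if 0	/* illustrative sketch only, not compiled */
static struct file *typed_fget(int fd, const struct file_operations *fops)
{
	struct file *file = fget(fd);

	if (!file)
		return ERR_PTR(-EBADF);
	if (file->f_op != fops) {	/* fops pointer acts as a type tag */
		fput(file);
		return ERR_PTR(-EINVAL);
	}
	return file;
}
#endif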
enum hrtimer_restart el_timerfd_tmrproc(struct hrtimer *htmr)
{
struct el_timerfd_ctx *ctx = container_of(htmr, struct el_timerfd_ctx, tmr);
struct el_wait_queue_head *wait;
ktime_t now = ctx->run_time;
BUG_ON(!irqs_disabled());
raw_spin_lock(&ctx->lock);
ctx->ticks++;
ctx->expiried = hrtimer_get_expires(htmr);
ctx->cb_timeout = ktime_sub(now, ctx->expiried);
list_for_each_entry(wait, &ctx->wqh.task_list, task_list) {
if (wait->wuc_time == KTIME_MAX) {
wait->wuc_time = ktime_get();
wake_up_state(wait->task, TASK_NORMAL);
}
}
raw_spin_unlock(&ctx->lock);
hrtimer_forward_now(htmr, ctx->tintv);
return HRTIMER_RESTART;
}
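/*
 * el_timerfd_tmrproc() is a self-rearming periodic hrtimer: the callback
 * pushes its own expiry forward by one interval and returns
 * HRTIMER_RESTART.  A minimal standalone sketch of the same pattern
 * (my_tick/my_interval/my_start are illustrative names, not part of
 * this file):
 */
#if 0	/* illustrative sketch only, not compiled */
static ktime_t my_interval;

static enum hrtimer_restart my_tick(struct hrtimer *t)
{
	/* ... per-tick work, under the appropriate lock ... */
	hrtimer_forward_now(t, my_interval);	/* re-arm relative to now */
	return HRTIMER_RESTART;			/* keep the timer running */
}

static void my_start(struct hrtimer *t)
{
	my_interval = ms_to_ktime(10);
	hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	t->function = my_tick;
	hrtimer_start(t, my_interval, HRTIMER_MODE_REL);
}
#endif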
static int do_el_timerfd_settime(int ufd, struct itimerspec *ktmr)
{
struct file *file;
struct el_timerfd_ctx *ctx;
file = el_timerfd_fget(ufd);
if (IS_ERR(file))
return PTR_ERR(file);
ctx = file->private_data;
BUG_ON(irqs_disabled());
for (;;) {
raw_spin_lock_irq(&ctx->lock);
if (ctx->locked) {
raw_spin_unlock_irq(&ctx->lock);
continue;
}
		/* Prevent parallel settime and read */
ctx->locked = 1;
raw_spin_unlock_irq(&ctx->lock);
if (hrtimer_try_to_cancel(&ctx->tmr) >= 0)
break;
raw_spin_lock_irq(&ctx->lock);
ctx->locked = 0;
raw_spin_unlock_irq(&ctx->lock);
cpu_relax();
}
raw_spin_lock_irq(&ctx->lock);
ctx->tmr.function = el_timerfd_tmrproc;
ctx->tintv = timespec_to_ktime(ktmr->it_interval);
hrtimer_set_expires(&ctx->tmr, ctx->tintv);
/* Return the first timer expiration time */
ktmr->it_value = ktime_to_timespec(ctx->tmr.node.expires);
ctx->tmr.intr_timeout = 0;
raw_spin_unlock_irq(&ctx->lock);
if (ctx->tintv != 0)
hrtimer_start(&ctx->tmr, ctx->tintv, HRTIMER_MODE_REL);
raw_spin_lock_irq(&ctx->lock);
ctx->locked = 0;
raw_spin_unlock_irq(&ctx->lock);
fput(file);
return 0;
}
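/*
 * The for (;;) loop above is the standard "cancel or back off" dance:
 * hrtimer_try_to_cancel() returns -1 while the callback is running, in
 * which case the ctx->locked claim must be dropped so the callback and
 * any reader can make progress before we retry.  Distilled (the
 * claim_exclusive()/release_exclusive() helpers are hypothetical
 * stand-ins for the ctx->locked handshake):
 */
#if 0	/* illustrative sketch only, not compiled */
	for (;;) {
		claim_exclusive(ctx);		/* ctx->locked = 1 */
		if (hrtimer_try_to_cancel(&ctx->tmr) >= 0)
			break;			/* cancelled, or not queued */
		release_exclusive(ctx);		/* let the callback finish */
		cpu_relax();
	}
#endif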
static int el_timerfd_settime(int ufd, struct itimerspec __user *tmr)
{
struct itimerspec ktmr;
int ret;
if (copy_from_user(&ktmr, tmr, sizeof(ktmr)))
return -EFAULT;
ret = do_el_timerfd_settime(ufd, &ktmr);
if (ret < 0)
return ret;
	return copy_to_user(tmr, &ktmr, sizeof(ktmr)) ? -EFAULT : 0;
}
#ifdef CONFIG_COMPAT
static int compat_el_timerfd_settime(int ufd, struct compat_itimerspec __user *tmr)
{
struct compat_itimerspec c_ktmr;
struct itimerspec ktmr;
int ret;
if (copy_from_user(&c_ktmr, tmr, sizeof(c_ktmr)))
return -EFAULT;
ktmr.it_interval.tv_sec = c_ktmr.it_interval.tv_sec;
ktmr.it_interval.tv_nsec = c_ktmr.it_interval.tv_nsec;
ktmr.it_value.tv_sec = c_ktmr.it_value.tv_sec;
ktmr.it_value.tv_nsec = c_ktmr.it_value.tv_nsec;
ret = do_el_timerfd_settime(ufd, &ktmr);
if (ret < 0)
return ret;
c_ktmr.it_interval.tv_sec = ktmr.it_interval.tv_sec;
c_ktmr.it_interval.tv_nsec = ktmr.it_interval.tv_nsec;
c_ktmr.it_value.tv_sec = ktmr.it_value.tv_sec;
c_ktmr.it_value.tv_nsec = ktmr.it_value.tv_nsec;
	return copy_to_user(tmr, &c_ktmr, sizeof(c_ktmr)) ? -EFAULT : 0;
}
#endif /* CONFIG_COMPAT */
#endif /* EL_TIMERFD_USING */
#endif