src - OpenBSD base system

diff options


context:
space:
mode:

author	Paul Irofti <pirofti@cvs.openbsd.org>	2011-09-18 02:23:19 +0000
committer	Paul Irofti <pirofti@cvs.openbsd.org>	2011-09-18 02:23:19 +0000
commit	72ee693a0a18b82f6a4f31598021be1c6ad1c7ef (patch)
tree	b9512bac6044d92e0dd777720a304e5f698c1f2c /sys/compat/linux/linux_futex.c
parent	ff7f5dbd755848cb234d75c66c4b49b3d8fa8eb5 (diff)

Add futex support for compat/linux.

Based on Emmanuel Dreyfus work with additions, tricks and adaptations by me. Lots of help and reviewing by guenther@ and oga@. Okay guenther@.

Diffstat (limited to 'sys/compat/linux/linux_futex.c')

-rw-r--r--

sys/compat/linux/linux_futex.c

690

1 files changed, 690 insertions, 0 deletions

diff --git a/sys/compat/linux/linux_futex.c b/sys/compat/linux/linux_futex.c
new file mode 100644
index 00000000000..2db9e5cdfdd
--- /dev/null
+++ b/sys/compat/linux/linux_futex.c

@@ -0,0 +1,690 @@

+/* $OpenBSD: linux_futex.c,v 1.1 2011/09/18 02:23:18 pirofti Exp $ */

+/* $NetBSD: linux_futex.c,v 1.26 2010/07/07 01:30:35 chs Exp $ */

+/*-

+ *

+ * Redistribution and use in source and binary forms, with or without

+ * modification, are permitted provided that the following conditions

+ * are met:

+ * 1. Redistributions of source code must retain the above copyright

+ * notice, this list of conditions and the following disclaimer.

+ * 2. Redistributions in binary form must reproduce the above copyright

+ * notice, this list of conditions and the following disclaimer in the

+ * documentation and/or other materials provided with the distribution.

+ * 3. All advertising materials mentioning features or use of this software

+ * must display the following acknowledgement:

+ * This product includes software developed by Emmanuel Dreyfus

+ * 4. The name of the author may not be used to endorse or promote

+ * products derived from this software without specific prior written

+ * permission.

+ *

+ * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''

+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,

+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR

+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS

+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+ * POSSIBILITY OF SUCH DAMAGE.

+ */

+#include <sys/param.h>

+#include <sys/ucred.h>

+#include <sys/malloc.h>

+#include <sys/mount.h>

+#include <sys/signal.h>

+#include <sys/stdint.h>

+#include <sys/time.h>

+#include <sys/systm.h>

+#include <sys/proc.h>

+#include <sys/pool.h>

+#include <sys/kernel.h>

+#include <sys/syscallargs.h>

+#include <compat/linux/linux_types.h>

+#include <compat/linux/linux_fcntl.h>

+#include <compat/linux/linux_misc.h>

+#include <compat/linux/linux_mmap.h>

+#include <compat/linux/linux_sched.h>

+#include <compat/linux/linux_signal.h>

+#include <compat/linux/linux_syscallargs.h>

+#include <compat/linux/linux_util.h>

+#include <compat/linux/linux_dirent.h>

+#include <compat/linux/linux_emuldata.h>

+#include <compat/linux/linux_time.h>

+#include <compat/linux/linux_futex.h>

+#ifdef COMPATFUTEX_DEBUG

+#define DPRINTF(x) printf x

+#else

+#define DPRINTF(x)

+#endif

+struct pool futex_pool;

+struct pool futex_wp_pool;

+struct futex;

+struct waiting_proc {

+ struct proc *p;

+ struct futex *wp_new_futex;

+ TAILQ_ENTRY(waiting_proc) wp_list;

+ TAILQ_ENTRY(waiting_proc) wp_rqlist;

+};

+struct futex {

+ void *f_uaddr;

+ int f_refcount;

+ LIST_ENTRY(futex) f_list;

+ TAILQ_HEAD(, waiting_proc) f_waiting_proc;

+ TAILQ_HEAD(, waiting_proc) f_requeue_proc;

+};

+static LIST_HEAD(futex_list, futex) futex_list;

+struct mutex futex_lock;

+void futex_pool_init(void);

+int linux_do_futex(struct proc *, const struct linux_sys_futex_args *,

+ register_t *, struct timespec *);

+struct futex * futex_get(void *);

+void futex_ref(struct futex *);

+void futex_put(struct futex *);

+int futex_sleep(struct futex **, struct proc *, int, struct waiting_proc *);

+int futex_wake(struct futex *, int, struct futex *, int);

+int futex_atomic_op(struct proc *, int, void *);

+int futex_itimespecfix(struct timespec *ts);

+int

+linux_sys_futex(struct proc *p, void *v, register_t *retval)

+ struct linux_sys_futex_args /* {

+ syscallarg(int *) uaddr;

+ syscallarg(int) op;

+ syscallarg(int) val;

+ syscallarg(const struct linux_timespec *) timeout;

+ syscallarg(int *) uaddr2;

+ syscallarg(int) val3;

+ } */ *uap = v;

+ struct l_timespec lts;

+ struct timespec ts = {0, 0};

+ int error;

+ if ((SCARG(uap, op) & ~LINUX_FUTEX_PRIVATE_FLAG) == LINUX_FUTEX_WAIT &&

+ SCARG(uap, timeout) != NULL) {

+ if ((error = copyin(SCARG(uap, timeout),

+ &lts, sizeof(lts))) != 0) {

+ return error;

+ }

+ linux_to_native_timespec(&ts, &lts);

+ }

+ return linux_do_futex(p, uap, retval, &ts);

+int

+linux_do_futex(struct proc *p, const struct linux_sys_futex_args *uap,

+ register_t *retval, struct timespec *ts)

+ /* {

+ syscallarg(int *) uaddr;

+ syscallarg(int) op;

+ syscallarg(int) val;

+ syscallarg(const struct linux_timespec *) timeout;

+ syscallarg(int *) uaddr2;

+ syscallarg(int) val3;

+ } */

+ int val;

+ int ret;

+ int error = 0;

+ struct futex *f;

+ struct futex *newf;

+ int timeout_hz;

+ struct timeval tv;

+ struct futex *f2;

+ struct waiting_proc *wp;

+ int op_ret;

+ int args_val = SCARG(uap, val);

+ /*

+ * Our implementation provides only private futexes. Most of the apps

+ * should use private futexes but don't claim so. Therefore we treat

+ * all futexes as private by clearing the FUTEX_PRIVATE_FLAG. It works

+ * in most cases (ie. when futexes are not shared on file descriptor

+ * or between different processes).

+ */

+ switch (SCARG(uap, op) & ~LINUX_FUTEX_PRIVATE_FLAG) {

+ case LINUX_FUTEX_WAIT:

+ mtx_enter(&futex_lock);

+ if ((error = copyin(SCARG(uap, uaddr),

+ &val, sizeof(val))) != 0) {

+ mtx_leave(&futex_lock);

+ return error;

+ }

+ if (val != args_val) {

+ mtx_leave(&futex_lock);

+ return EWOULDBLOCK;

+ }

+ DPRINTF(("FUTEX_WAIT %d: val = %d, uaddr = %p, "

+ "*uaddr = %d, timeout = %lld.%09ld\n",

+ p->p_pid, args_val, SCARG(uap, uaddr), val,

+ (long long)ts->tv_sec, ts->tv_nsec));

+ if ((error = futex_itimespecfix(ts)) != 0) {

+ mtx_leave(&futex_lock);

+ return error;

+ }

+ TIMESPEC_TO_TIMEVAL(&tv, ts);

+ timeout_hz = tvtohz(&tv);

+ /*

+ * If the user process requests a non null timeout,

+ * make sure we do not turn it into an infinite

+ * timeout because timeout_hz is 0.

+ *

+ * We use a minimal timeout of 1/hz. Maybe it would make

+ * sense to just return ETIMEDOUT without sleeping.

+ */

+ if (SCARG(uap, timeout) != NULL && timeout_hz == 0)

+ timeout_hz = 1;

+ wp = pool_get(&futex_wp_pool, PR_WAITOK);

+ f = futex_get(SCARG(uap, uaddr));

+ ret = futex_sleep(&f, p, timeout_hz, wp);

+ futex_put(f);

+ mtx_leave(&futex_lock);

+ pool_put(&futex_wp_pool, wp);

+ DPRINTF(("FUTEX_WAIT %d: uaddr = %p, "

+ "ret = %d\n", p->p_pid,

+ SCARG(uap, uaddr), ret));

+ switch (ret) {

+ case EWOULDBLOCK: /* timeout */

+ return ETIMEDOUT;

+ break;

+ case EINTR: /* signal */

+ return EINTR;

+ break;

+ case 0: /* FUTEX_WAKE received */

+ DPRINTF(("FUTEX_WAIT %d: uaddr = %p, got it\n",

+ p->p_pid, SCARG(uap, uaddr)));

+ return 0;

+ break;

+ default:

+ DPRINTF(("FUTEX_WAIT: unexpected ret = %d\n", ret));

+ break;

+ }

+ /* NOTREACHED */

+ break;

+ case LINUX_FUTEX_WAKE:

+ /*

+ * XXX: Linux is able cope with different addresses

+ * corresponding to the same mapped memory in the sleeping

+ * and the waker process(es).

+ */

+ DPRINTF(("FUTEX_WAKE %d: uaddr = %p, val = %d\n",

+ p->p_pid, SCARG(uap, uaddr), args_val));

+ if (args_val < 0)

+ return EINVAL;

+ mtx_enter(&futex_lock);

+ f = futex_get(SCARG(uap, uaddr));

+ *retval = futex_wake(f, args_val, NULL, 0);

+ futex_put(f);

+ mtx_leave(&futex_lock);

+ break;

+ case LINUX_FUTEX_CMP_REQUEUE:

+ if (args_val < 0)

+ return EINVAL;

+ mtx_enter(&futex_lock);

+ if ((error = copyin(SCARG(uap, uaddr),

+ &val, sizeof(val))) != 0) {

+ mtx_leave(&futex_lock);

+ return error;

+ }

+ if (val != SCARG(uap, val3)) {

+ mtx_leave(&futex_lock);

+ return EAGAIN;

+ }

+ DPRINTF(("FUTEX_CMP_REQUEUE %d: uaddr = %p, val = %d, "

+ "uaddr2 = %p, val2 = %d\n",

+ p->p_pid, SCARG(uap, uaddr), args_val, SCARG(uap, uaddr2),

+ (int)(unsigned long)SCARG(uap, timeout)));

+ f = futex_get(SCARG(uap, uaddr));

+ newf = futex_get(SCARG(uap, uaddr2));

+ *retval = futex_wake(f, args_val, newf,

+ (int)(unsigned long)SCARG(uap, timeout));

+ futex_put(f);

+ futex_put(newf);

+ mtx_leave(&futex_lock);

+ break;

+ case LINUX_FUTEX_REQUEUE:

+ DPRINTF(("FUTEX_REQUEUE %d: uaddr = %p, val = %d, "

+ "uaddr2 = %p, val2 = %d\n",

+ p->p_pid, SCARG(uap, uaddr), args_val, SCARG(uap, uaddr2),

+ (int)(unsigned long)SCARG(uap, timeout)));

+ if (args_val < 0)

+ return EINVAL;

+ mtx_enter(&futex_lock);

+ f = futex_get(SCARG(uap, uaddr));

+ newf = futex_get(SCARG(uap, uaddr2));

+ *retval = futex_wake(f, args_val, newf,

+ (int)(unsigned long)SCARG(uap, timeout));

+ futex_put(f);

+ futex_put(newf);

+ mtx_leave(&futex_lock);

+ break;

+ case LINUX_FUTEX_FD:

+ DPRINTF(("linux_sys_futex: unimplemented op %d\n",

+ SCARG(uap, op)));

+ return ENOSYS;

+ case LINUX_FUTEX_WAKE_OP:

+ DPRINTF(("FUTEX_WAKE_OP %d: uaddr = %p, op = %d, "

+ "val = %d, uaddr2 = %p, val2 = %d\n",

+ p->p_pid, SCARG(uap, uaddr), SCARG(uap, op), args_val,

+ SCARG(uap, uaddr2),

+ (int)(unsigned long)SCARG(uap, timeout)));

+ if (args_val < 0)

+ return EINVAL;

+ mtx_enter(&futex_lock);

+ f = futex_get(SCARG(uap, uaddr));

+ f2 = futex_get(SCARG(uap, uaddr2));

+ mtx_leave(&futex_lock);

+ /*

+ * This function returns a positive number as results and

+ * negative as errors

+ */

+ op_ret = futex_atomic_op(p, SCARG(uap, val3),

+ SCARG(uap, uaddr2));

+ if (op_ret < 0) {

+ futex_put(f);

+ futex_put(f2);

+ return -op_ret;

+ }

+ mtx_enter(&futex_lock);

+ ret = futex_wake(f, args_val, NULL, 0);

+ futex_put(f);

+ if (op_ret > 0) {

+ op_ret = 0;

+ /*

+ * Linux abuses the address of the timespec parameter

+ * as the number of retries

+ */

+ op_ret += futex_wake(f2,

+ (int)(unsigned long)SCARG(uap, timeout), NULL, 0);

+ ret += op_ret;

+ }

+ futex_put(f2);

+ mtx_leave(&futex_lock);

+ *retval = ret;

+ break;

+ default:

+ DPRINTF(("linux_sys_futex: unknown op %d\n",

+ SCARG(uap, op)));

+ return ENOSYS;

+ }

+ return 0;

+void

+futex_pool_init(void)

+ DPRINTF(("Inside futex_pool_init()\n"));

+ pool_init(&futex_pool, sizeof(struct futex), 0, 0, PR_DEBUGCHK,

+ "futexpl", &pool_allocator_nointr);

+ pool_init(&futex_wp_pool, sizeof(struct waiting_proc), 0, 0,

+ PR_DEBUGCHK, "futexwppl", &pool_allocator_nointr);

+/*

+ * Get a futex.

+ * If we have an existing one, we will return that with the refcount bumped.

+ * Otherwise we will allocate and hook up a new one.

+ * Must be called with futex_lock held, but we may unlock it in order to

+ * sleep for allocation.

+ */

+struct futex *

+futex_get(void *uaddr)

+ struct futex *f, *newf;

+ MUTEX_ASSERT_LOCKED(&futex_lock);

+ LIST_FOREACH(f, &futex_list, f_list) {

+ if (f->f_uaddr == uaddr) {

+ f->f_refcount++;

+ return f;

+ }

+ mtx_leave(&futex_lock);

+ /* Not found, create it */

+ newf = pool_get(&futex_pool, PR_WAITOK|PR_ZERO);

+ mtx_enter(&futex_lock);

+ /* Did someone else create it in the meantime? */

+ LIST_FOREACH(f, &futex_list, f_list) {

+ if (f->f_uaddr == uaddr) {

+ f->f_refcount++;

+ pool_put(&futex_pool, newf);

+ return f;

+ }

+ newf->f_uaddr = uaddr;

+ newf->f_refcount = 1;

+ TAILQ_INIT(&newf->f_waiting_proc);

+ TAILQ_INIT(&newf->f_requeue_proc);

+ LIST_INSERT_HEAD(&futex_list, newf, f_list);

+ return newf;

+/*

+ * Grab a reference on a futex.

+ * The futex lock must be locked.

+ */

+void

+futex_ref(struct futex *f)

+ MUTEX_ASSERT_LOCKED(&futex_lock);

+ f->f_refcount++;

+/*

+ * Release our reference on the futex.

+ * must be called with the futex_lock held.

+ */

+void

+futex_put(struct futex *f)

+ MUTEX_ASSERT_LOCKED(&futex_lock);

+ f->f_refcount--;

+ if (f->f_refcount == 0) {

+ KASSERT(TAILQ_EMPTY(&f->f_waiting_proc));

+ KASSERT(TAILQ_EMPTY(&f->f_requeue_proc));

+ LIST_REMOVE(f, f_list);

+ pool_put(&futex_pool, f);

+ }

+int

+futex_sleep(struct futex **fp, struct proc *p, int timeout,

+ struct waiting_proc *wp)

+ struct futex *f, *newf;

+ int ret;

+ MUTEX_ASSERT_LOCKED(&futex_lock);

+ f = *fp;

+ wp->p = p;

+ wp->wp_new_futex = NULL;

+requeue:

+ TAILQ_INSERT_TAIL(&f->f_waiting_proc, wp, wp_list);

+ ret = msleep(&f, &futex_lock, PUSER | PCATCH, "futex_sleep", timeout);

+ TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);

+ /* if futex_wake() tells us to requeue ... */

+ newf = wp->wp_new_futex;

+ if (ret == 0 && newf != NULL) {

+ /* ... requeue ourselves on the new futex */

+ futex_put(f);

+ wp->wp_new_futex = NULL;

+ TAILQ_REMOVE(&newf->f_requeue_proc, wp, wp_rqlist);

+ *fp = f = newf;

+ goto requeue;

+ }

+ return ret;

+int

+futex_wake(struct futex *f, int n, struct futex *newf, int n2)

+ struct waiting_proc *wp;

+ int count;

+ KASSERT(newf != f);

+ MUTEX_ASSERT_LOCKED(&futex_lock);

+ count = newf ? 0 : 1;

+ /*

+ * first, wake up any threads sleeping on this futex.

+ * note that sleeping threads are not in the process of requeueing.

+ */

+ if (!TAILQ_EMPTY(&f->f_waiting_proc))

+ wakeup(&f); /* only call wakeup once */

+ TAILQ_FOREACH(wp, &f->f_waiting_proc, wp_list) {

+ KASSERT(wp->wp_new_futex == NULL);

+ DPRINTF(("futex_wake: signal f %p ref %d\n",

+ f, f->f_refcount));

+ if (count <= n) {

+ count++;

+ } else {

+ if (newf == NULL)

+ break;

+ /* matching futex_put() is called by the other thread. */

+ futex_ref(newf);

+ wp->wp_new_futex = newf;

+ TAILQ_INSERT_TAIL(&newf->f_requeue_proc, wp, wp_rqlist);

+ DPRINTF(("futex_wake: requeue newf %p ref %d\n",

+ newf, newf->f_refcount));

+ if (count - n >= n2)

+ goto out;

+ }

+ /*

+ * next, deal with threads that are requeuing to this futex.

+ * we don't need to signal these threads, any thread on the

+ * requeue list has already been signaled but hasn't had a chance

+ * to run and requeue itself yet. if we would normally wake

+ * a thread, just remove the requeue info. if we would normally

+ * requeue a thread, change the requeue target.

+ */

+ while ((wp = TAILQ_FIRST(&f->f_requeue_proc)) != NULL) {

+ /* XXX: talk to oga, should mtx_enter again, recursive */

+ KASSERT(wp->wp_new_futex == f);

+ DPRINTF(("futex_wake: unrequeue f %p ref %d\n",

+ f, f->f_refcount));

+ wp->wp_new_futex = NULL;

+ TAILQ_REMOVE(&f->f_requeue_proc, wp, wp_rqlist);

+ futex_put(f);

+ if (count <= n) {

+ count++;

+ } else {

+ if (newf == NULL) {

+ break;

+ }

+ /*matching futex_put() is called by the other thread.*/

+ futex_ref(newf);

+ wp->wp_new_futex = newf;

+ TAILQ_INSERT_TAIL(&newf->f_requeue_proc, wp, wp_rqlist);

+ DPRINTF(("futex_wake: rerequeue newf %p ref %d\n",

+ newf, newf->f_refcount));

+ if (count - n >= n2)

+ break;

+ }

+out:

+ return count;

+int

+futex_atomic_op(struct proc *p, int encoded_op, void *uaddr)

+ const int op = (encoded_op >> 28) & 7;

+ const int cmp = (encoded_op >> 24) & 15;

+ const int cmparg = (encoded_op << 20) >> 20;

+ int oparg = (encoded_op << 8) >> 20;

+ int error, oldval, cval;

+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))

+ oparg = 1 << oparg;

+ /* XXX: linux verifies access here and returns EFAULT */

+ if (copyin(uaddr, &cval, sizeof(int)) != 0)

+ return -EFAULT;

+ for (;;) {

+ int nval;

+ switch (op) {

+ case FUTEX_OP_SET:

+ nval = oparg;

+ break;

+ case FUTEX_OP_ADD:

+ nval = cval + oparg;

+ break;

+ case FUTEX_OP_OR:

+ nval = cval | oparg;

+ break;

+ case FUTEX_OP_ANDN:

+ nval = cval & ~oparg;

+ break;

+ case FUTEX_OP_XOR:

+ nval = cval ^ oparg;

+ break;

+ default:

+ return -ENOSYS;

+ }

+ oldval = nval;

+ error = atomic_ucas_32(uaddr, cval, nval);

+ if (oldval == cval || error) {

+ break;

+ }

+ cval = oldval;

+ }

+ if (error)

+ return -EFAULT;

+ switch (cmp) {

+ case FUTEX_OP_CMP_EQ:

+ return (oldval == cmparg);

+ case FUTEX_OP_CMP_NE:

+ return (oldval != cmparg);

+ case FUTEX_OP_CMP_LT:

+ return (oldval < cmparg);

+ case FUTEX_OP_CMP_GE:

+ return (oldval >= cmparg);

+ case FUTEX_OP_CMP_LE:

+ return (oldval <= cmparg);

+ case FUTEX_OP_CMP_GT:

+ return (oldval > cmparg);

+ default:

+ return -ENOSYS;

+ }

+int

+linux_sys_set_robust_list(struct proc *p, void *v, register_t *retval)

+ struct linux_sys_set_robust_list_args /* {

+ syscallarg(struct linux_robust_list_head *) head;

+ syscallarg(size_t) len;

+ } */ *uap = v;

+ struct linux_emuldata *led;

+ if (SCARG(uap, len) != sizeof(struct linux_robust_list_head))

+ return EINVAL;

+ led = p->p_emuldata;

+ led->led_robust_head = SCARG(uap, head);

+ *retval = 0;

+ return 0;

+int

+linux_sys_get_robust_list(struct proc *p, void *v, register_t *retval)

+ struct linux_sys_get_robust_list_args /* {

+ syscallarg(int) pid;

+ syscallarg(struct linux_robust_list_head **) head;

+ syscallarg(size_t *) len;

+ } */ *uap = v;

+ struct proc *q;

+ struct linux_emuldata *led;

+ struct linux_robust_list_head *head;

+ size_t len;

+ int error = 0;

+ if (!SCARG(uap, pid)) {

+ led = p->p_emuldata;

+ head = led->led_robust_head;

+ } else {

+ if (!SCARG(uap, pid))

+ q = p;

+ else if ((q = pfind(SCARG(uap, pid))) == NULL)

+ return ESRCH;

+ else if (p->p_p != q->p_p)

+ return EPERM;

+ led = q->p_emuldata;

+ head = led->led_robust_head;

+ }

+ len = sizeof(*head);

+ error = copyout(&len, SCARG(uap, len), sizeof(len));

+ if (error)

+ return error;

+ return copyout(&head, SCARG(uap, head), sizeof(head));

+int

+futex_itimespecfix(struct timespec *ts)

+ if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)

+ return EINVAL;

+ if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000)

+ ts->tv_nsec = tick * 1000;

+ return 0;