summaryrefslogtreecommitdiff
path: root/sys/kern
diff options
context:
space:
mode:
authorMartin Pieuchot <mpi@cvs.openbsd.org>2017-04-28 13:50:56 +0000
committerMartin Pieuchot <mpi@cvs.openbsd.org>2017-04-28 13:50:56 +0000
commite8355c7596eba1f524f096ea8857a4fc65321cfa (patch)
treee67d6e87c5f1ea43102f6628c554222913d06ed6 /sys/kern
parent38e6404586671a5cee5b13b7c54b8be33e33daed (diff)
Add futex(2) syscall based on a sane subset of its Linux equivalent.
The syscall is marked NOLOCK and only FUTEX_WAIT grabs the KERNEL_LOCK() because of PCATCH and the signal nightmare. Serialization of threads is currently done with a global & exclusive rwlock. Note that the current implementation still uses copyin(9), which is not guaranteed to be atomic. Committing now so that the remaining issues can be addressed in-tree. With input from guenther@, kettenis@ and visa@. ok deraadt@, visa@
Diffstat (limited to 'sys/kern')
-rw-r--r--sys/kern/init_main.c8
-rw-r--r--sys/kern/kern_pledge.c3
-rw-r--r--sys/kern/sys_futex.c287
-rw-r--r--sys/kern/syscalls.master5
4 files changed, 299 insertions, 4 deletions
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 9b6291d4352..83550d8c55c 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: init_main.c,v 1.268 2017/04/20 12:59:36 visa Exp $ */
+/* $OpenBSD: init_main.c,v 1.269 2017/04/28 13:50:55 mpi Exp $ */
/* $NetBSD: init_main.c,v 1.84.4.1 1996/06/02 09:08:06 mrg Exp $ */
/*
@@ -144,6 +144,7 @@ void db_ctf_init(void);
void prof_init(void);
void init_exec(void);
void kqueue_init(void);
+void futex_init(void);
void taskq_init(void);
void timeout_proc_init(void);
void pool_gc_pages(void *);
@@ -264,6 +265,11 @@ main(void *framep)
*/
kqueue_init();
+ /*
+ * Initialize futexes.
+ */
+ futex_init();
+
/* Create credentials. */
p->p_ucred = crget();
p->p_ucred->cr_ngroups = 1; /* group 0 */
diff --git a/sys/kern/kern_pledge.c b/sys/kern/kern_pledge.c
index d99a60d2a1e..4bea704b4cf 100644
--- a/sys/kern/kern_pledge.c
+++ b/sys/kern/kern_pledge.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: kern_pledge.c,v 1.205 2017/04/20 15:21:53 deraadt Exp $ */
+/* $OpenBSD: kern_pledge.c,v 1.206 2017/04/28 13:50:55 mpi Exp $ */
/*
* Copyright (c) 2015 Nicholas Marriott <nicm@openbsd.org>
@@ -259,6 +259,7 @@ const uint64_t pledge_syscalls[SYS_MAXSYSCALL] = {
[SYS___tfork] = PLEDGE_STDIO,
[SYS_sched_yield] = PLEDGE_STDIO,
[SYS___thrsleep] = PLEDGE_STDIO,
+ [SYS_futex] = PLEDGE_STDIO,
[SYS___thrwakeup] = PLEDGE_STDIO,
[SYS___threxit] = PLEDGE_STDIO,
[SYS___thrsigdivert] = PLEDGE_STDIO,
diff --git a/sys/kern/sys_futex.c b/sys/kern/sys_futex.c
new file mode 100644
index 00000000000..0db6a10c7f3
--- /dev/null
+++ b/sys/kern/sys_futex.c
@@ -0,0 +1,287 @@
+/* $OpenBSD: sys_futex.c,v 1.1 2017/04/28 13:50:55 mpi Exp $ */
+
+/*
+ * Copyright (c) 2016-2017 Martin Pieuchot
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/syscallargs.h>
+#include <sys/pool.h>
+#include <sys/time.h>
+#include <sys/rwlock.h>
+#include <sys/futex.h>
+
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+/*
+ * Kernel representation of a futex.
+ *
+ * A futex is identified by the pair (ft_uaddr, ft_pid): futex_get()
+ * matches on both fields.  It lives on the global ``ftlist'' and is
+ * released by futex_put() once ft_refcnt drops to zero.
+ */
+struct futex {
+ LIST_ENTRY(futex) ft_list; /* linkage on the list of all futexes */
+ TAILQ_HEAD(, proc) ft_threads; /* threads sleeping on this futex */
+ uint32_t *ft_uaddr; /* userspace address (lookup key) */
+ pid_t ft_pid; /* process identifier (lookup key) */
+ unsigned int ft_refcnt; /* # of references; freed at zero */
+};
+
+/*
+ * Syscall helpers.  They run with ``ftlock'' write-locked and their
+ * return value is what sys_futex() stores in *retval.
+ */
+int futex_wait(uint32_t *, uint32_t, const struct timespec *);
+int futex_wake(uint32_t *, uint32_t);
+int futex_requeue(uint32_t *, uint32_t, uint32_t *, uint32_t);
+
+/* Flags for futex_get(). */
+#define FT_CREATE 0x1 /* Create a futex if it doesn't exist. */
+
+struct futex *futex_get(uint32_t *, int);
+void futex_put(struct futex *);
+
+/*
+ * The global futex lock serializes futex(2) calls so that no wakeup
+ * event is lost; it also protects the global list of all futexes and
+ * their state.
+ */
+struct rwlock ftlock = RWLOCK_INITIALIZER("futex");
+static LIST_HEAD(, futex) ftlist; /* every futex currently allocated */
+struct pool ftpool; /* allocator for struct futex */
+
+/*
+ * Set up the pool that struct futex objects are allocated from.
+ * Called once from main() during kernel initialization.
+ */
+void
+futex_init(void)
+{
+ pool_init(&ftpool, sizeof(struct futex), 0, IPL_NONE, 0, "futexpl",
+ NULL);
+}
+
+/*
+ * futex(2) system call entry point.
+ *
+ * Dispatches on ``op'' to the matching helper with the global futex lock
+ * write-held.  The outcome of the operation is handed to userland through
+ * ``retval'': an errno value for FUTEX_WAIT, the number of threads
+ * processed for FUTEX_WAKE and FUTEX_REQUEUE, and ENOSYS for an unknown
+ * operation.  The syscall handler itself always returns 0.
+ */
+int
+sys_futex(struct proc *p, void *v, register_t *retval)
+{
+	struct sys_futex_args /* {
+		syscallarg(uint32_t *) f;
+		syscallarg(int) op;
+		syscallarg(int) val;
+		syscallarg(const struct timespec *) timeout;
+		syscallarg(uint32_t *) g;
+	} */ *uap = v;
+	uint32_t *uaddr = SCARG(uap, f);
+	int op = SCARG(uap, op);
+	uint32_t val = SCARG(uap, val);
+	const struct timespec *timeout = SCARG(uap, timeout);
+	void *g = SCARG(uap, g);
+
+	switch (op) {
+	case FUTEX_WAIT:
+		/* futex_wait() sleeps with PCATCH, so take the KERNEL_LOCK(). */
+		KERNEL_LOCK();
+		rw_enter_write(&ftlock);
+		*retval = futex_wait(uaddr, val, timeout);
+		rw_exit_write(&ftlock);
+		KERNEL_UNLOCK();
+		break;
+	case FUTEX_WAKE:
+		rw_enter_write(&ftlock);
+		*retval = futex_wake(uaddr, val);
+		rw_exit_write(&ftlock);
+		break;
+	case FUTEX_REQUEUE:
+		/* For this op the ``timeout'' slot carries the requeue count. */
+		rw_enter_write(&ftlock);
+		*retval = futex_requeue(uaddr, val, g, (unsigned long)timeout);
+		rw_exit_write(&ftlock);
+		break;
+	default:
+		/*
+		 * Report the unsupported operation like every other result
+		 * of this syscall.  A handler must not return -1 for this:
+		 * inside the kernel -1 is ERESTART, which would restart the
+		 * syscall instead of failing it.
+		 */
+		*retval = ENOSYS;
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Look up the futex of the current process at userspace address
+ * ``uaddr'' and take a reference on it.
+ *
+ * When no such futex exists, a new one holding a single reference is
+ * created if FT_CREATE is set in ``flag''; otherwise NULL is returned.
+ * The caller must hold ``ftlock'' exclusively.
+ */
+struct futex *
+futex_get(uint32_t *uaddr, int flag)
+{
+	const pid_t pid = curproc->p_p->ps_pid;
+	struct futex *ft;
+
+	rw_assert_wrlock(&ftlock);
+
+	LIST_FOREACH(ft, &ftlist, ft_list) {
+		if (ft->ft_pid != pid || ft->ft_uaddr != uaddr)
+			continue;
+
+		ft->ft_refcnt++;
+		return ft;
+	}
+
+	if (!(flag & FT_CREATE))
+		return NULL;
+
+	/*
+	 * The write lock guarantees that nobody else can insert the
+	 * same futex behind our back.
+	 */
+	ft = pool_get(&ftpool, PR_WAITOK);
+	ft->ft_uaddr = uaddr;
+	ft->ft_pid = pid;
+	ft->ft_refcnt = 1;
+	TAILQ_INIT(&ft->ft_threads);
+	LIST_INSERT_HEAD(&ftlist, ft, ft_list);
+
+	return ft;
+}
+
+/*
+ * Drop one reference on futex ``f''.
+ *
+ * Dropping the last reference unlinks the futex from the global list
+ * and returns it to the pool; its sleep queue must be empty by then.
+ * The caller must hold ``ftlock'' exclusively.
+ */
+void
+futex_put(struct futex *f)
+{
+	rw_assert_wrlock(&ftlock);
+	KASSERT(f->ft_refcnt > 0);
+
+	f->ft_refcnt -= 1;
+	if (f->ft_refcnt != 0)
+		return;
+
+	/* Last reference gone: nobody may still be queued on it. */
+	KASSERT(TAILQ_EMPTY(&f->ft_threads));
+	LIST_REMOVE(f, ft_list);
+	pool_put(&ftpool, f);
+}
+
+/*
+ * Put the current thread on the sleep queue of the futex at address
+ * ``uaddr''. Let it sleep for the specified ``timeout'' time, or
+ * indefinitely if the argument is NULL.
+ *
+ * Returns 0 when woken by futex_requeue(), EAGAIN if the futex word no
+ * longer holds ``val'', EINVAL for a malformed timeout, ETIMEDOUT when
+ * the timeout expired, EINTR when interrupted by a signal, or the
+ * copyin(9) error if a userspace argument could not be read.
+ */
+int
+futex_wait(uint32_t *uaddr, uint32_t val, const struct timespec *timeout)
+{
+	struct proc *p = curproc;
+	struct futex *f;
+	uint64_t to_ticks = 0;
+	uint32_t cval;
+	int error;
+
+	/*
+	 * After reading the value a race is still possible but
+	 * we deal with it by serializing all futex syscalls.
+	 */
+	rw_assert_wrlock(&ftlock);
+
+	/*
+	 * Read user space futex value
+	 *
+	 * XXX copyin(9) is not guaranteed to be atomic.
+	 */
+	if ((error = copyin(uaddr, &cval, sizeof(cval))))
+		return error;
+
+	/* If the value changed, stop here. */
+	if (cval != val)
+		return EAGAIN;
+
+	if (timeout != NULL) {
+		struct timespec ts;
+
+		if ((error = copyin(timeout, &ts, sizeof(ts))))
+			return error;
+#ifdef KTRACE
+		/*
+		 * Trace the kernel copy: ``timeout'' is a userspace
+		 * pointer and must not be dereferenced directly.
+		 */
+		if (KTRPOINT(p, KTR_STRUCT))
+			ktrabstimespec(p, &ts);
+#endif
+		/* Reject values the tick conversion below cannot represent. */
+		if (ts.tv_sec < 0 || ts.tv_nsec < 0 ||
+		    ts.tv_nsec >= 1000000000L)
+			return EINVAL;
+
+		/* Round up to whole ticks, plus one for the current tick. */
+		to_ticks = (uint64_t)hz * ts.tv_sec +
+		    (ts.tv_nsec + tick * 1000 - 1) / (tick * 1000) + 1;
+		if (to_ticks > INT_MAX)
+			to_ticks = INT_MAX;
+	}
+
+	/* Queue ourselves; the reference is dropped by whoever dequeues us. */
+	f = futex_get(uaddr, FT_CREATE);
+	TAILQ_INSERT_TAIL(&f->ft_threads, p, p_fut_link);
+	p->p_futex = f;
+
+	error = rwsleep(p, &ftlock, PUSER|PCATCH, "fsleep", (int)to_ticks);
+	if (error == ERESTART)
+		error = EINTR;
+	else if (error == EWOULDBLOCK) {
+		/* A race occurred between a wakeup and a timeout. */
+		if (p->p_futex == NULL)
+			error = 0;
+		else
+			error = ETIMEDOUT;
+	}
+
+	/* Remove ourselves if we have not been woken up. */
+	if ((f = p->p_futex) != NULL) {
+		p->p_futex = NULL;
+		TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
+		futex_put(f);
+	}
+
+	return error;
+}
+
+/*
+ * Wakeup at most ``n'' sibling threads sleeping on a futex at address
+ * ``uaddr'' and requeue at most ``m'' sibling threads on a futex at
+ * address ``uaddr2''.
+ *
+ * Nothing is requeued when ``uaddr2'' is NULL or equal to ``uaddr''.
+ * Returns the number of threads woken up plus the number requeued.
+ * The caller must hold ``ftlock'' exclusively.
+ */
+int
+futex_requeue(uint32_t *uaddr, uint32_t n, uint32_t *uaddr2, uint32_t m)
+{
+	struct futex *f, *g;
+	struct proc *p;
+	uint64_t limit;
+	uint32_t count = 0;
+
+	rw_assert_wrlock(&ftlock);
+
+	f = futex_get(uaddr, 0);
+	if (f == NULL)
+		return 0;
+
+	/*
+	 * Compute the bound in 64 bits so that ``n + m'' cannot wrap.
+	 * Without a distinct target there is nowhere to requeue to:
+	 * dequeuing such threads would leave them asleep on no futex,
+	 * and requeuing onto the same futex would only spin on the same
+	 * waiters while holding the lock.
+	 */
+	limit = n;
+	if (uaddr2 != NULL && uaddr2 != uaddr)
+		limit += m;
+
+	while (count < limit && (p = TAILQ_FIRST(&f->ft_threads)) != NULL) {
+		/* Dequeue the thread and drop the reference it held. */
+		p->p_futex = NULL;
+		TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
+		futex_put(f);
+
+		if (count < n) {
+			wakeup_one(p);
+		} else {
+			/* The thread now holds a reference on the target. */
+			g = futex_get(uaddr2, FT_CREATE);
+			TAILQ_INSERT_TAIL(&g->ft_threads, p, p_fut_link);
+			p->p_futex = g;
+		}
+		count++;
+	}
+
+	/* Release the reference taken by the lookup above. */
+	futex_put(f);
+
+	return count;
+}
+
+/*
+ * Wakeup at most ``n'' sibling threads sleeping on a futex at address
+ * ``uaddr''.
+ *
+ * Implemented as futex_requeue() with no requeue target and a requeue
+ * count of zero; its return value is passed through unchanged.
+ */
+int
+futex_wake(uint32_t *uaddr, uint32_t n)
+{
+ return futex_requeue(uaddr, n, NULL, 0);
+}
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index 0ce9ba5238a..15cf55de433 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -1,4 +1,4 @@
-; $OpenBSD: syscalls.master,v 1.175 2017/04/13 04:06:46 guenther Exp $
+; $OpenBSD: syscalls.master,v 1.176 2017/04/28 13:50:55 mpi Exp $
; $NetBSD: syscalls.master,v 1.32 1996/04/23 10:24:21 mycroft Exp $
; @(#)syscalls.master 8.2 (Berkeley) 1/13/94
@@ -187,7 +187,8 @@
const gid_t *gidset); }
81 STD { int sys_getpgrp(void); }
82 STD { int sys_setpgid(pid_t pid, pid_t pgid); }
-83 OBSOL osendsyslog
+83 STD NOLOCK { int sys_futex(uint32_t *f, int op, int val, \
+ const struct timespec *timeout, uint32_t *g); }
84 STD { int sys_utimensat(int fd, const char *path, \
const struct timespec *times, int flag); }
85 STD { int sys_futimens(int fd, \