diff options
author | Martin Pieuchot <mpi@cvs.openbsd.org> | 2017-04-28 13:50:56 +0000 |
---|---|---|
committer | Martin Pieuchot <mpi@cvs.openbsd.org> | 2017-04-28 13:50:56 +0000 |
commit | e8355c7596eba1f524f096ea8857a4fc65321cfa (patch) | |
tree | e67d6e87c5f1ea43102f6628c554222913d06ed6 /sys/kern | |
parent | 38e6404586671a5cee5b13b7c54b8be33e33daed (diff) |
Add futex(2) syscall based on a sane subset of its Linux equivalent.
The syscall is marked NOLOCK and only FUTEX_WAIT grabs the KERNEL_LOCK()
because of PCATCH and the signal nightmare.
Serialization of threads is currently done with a global & exclusive
rwlock.
Note that the current implementation still uses copyin(9), which is not
guaranteed to be atomic. Committing now such that remaining issues can
be addressed in-tree.
With inputs from guenther@, kettenis@ and visa@.
ok deraadt@, visa@
Diffstat (limited to 'sys/kern')
-rw-r--r-- | sys/kern/init_main.c | 8 | ||||
-rw-r--r-- | sys/kern/kern_pledge.c | 3 | ||||
-rw-r--r-- | sys/kern/sys_futex.c | 287 | ||||
-rw-r--r-- | sys/kern/syscalls.master | 5 |
4 files changed, 299 insertions, 4 deletions
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 9b6291d4352..83550d8c55c 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -1,4 +1,4 @@ -/* $OpenBSD: init_main.c,v 1.268 2017/04/20 12:59:36 visa Exp $ */ +/* $OpenBSD: init_main.c,v 1.269 2017/04/28 13:50:55 mpi Exp $ */ /* $NetBSD: init_main.c,v 1.84.4.1 1996/06/02 09:08:06 mrg Exp $ */ /* @@ -144,6 +144,7 @@ void db_ctf_init(void); void prof_init(void); void init_exec(void); void kqueue_init(void); +void futex_init(void); void taskq_init(void); void timeout_proc_init(void); void pool_gc_pages(void *); @@ -264,6 +265,11 @@ main(void *framep) */ kqueue_init(); + /* + * Initialize futexes. + */ + futex_init(); + /* Create credentials. */ p->p_ucred = crget(); p->p_ucred->cr_ngroups = 1; /* group 0 */ diff --git a/sys/kern/kern_pledge.c b/sys/kern/kern_pledge.c index d99a60d2a1e..4bea704b4cf 100644 --- a/sys/kern/kern_pledge.c +++ b/sys/kern/kern_pledge.c @@ -1,4 +1,4 @@ -/* $OpenBSD: kern_pledge.c,v 1.205 2017/04/20 15:21:53 deraadt Exp $ */ +/* $OpenBSD: kern_pledge.c,v 1.206 2017/04/28 13:50:55 mpi Exp $ */ /* * Copyright (c) 2015 Nicholas Marriott <nicm@openbsd.org> @@ -259,6 +259,7 @@ const uint64_t pledge_syscalls[SYS_MAXSYSCALL] = { [SYS___tfork] = PLEDGE_STDIO, [SYS_sched_yield] = PLEDGE_STDIO, [SYS___thrsleep] = PLEDGE_STDIO, + [SYS_futex] = PLEDGE_ALWAYS, [SYS___thrwakeup] = PLEDGE_STDIO, [SYS___threxit] = PLEDGE_STDIO, [SYS___thrsigdivert] = PLEDGE_STDIO, diff --git a/sys/kern/sys_futex.c b/sys/kern/sys_futex.c new file mode 100644 index 00000000000..0db6a10c7f3 --- /dev/null +++ b/sys/kern/sys_futex.c @@ -0,0 +1,287 @@ +/* $OpenBSD: sys_futex.c,v 1.1 2017/04/28 13:50:55 mpi Exp $ */ + +/* + * Copyright (c) 2016-2017 Martin Pieuchot + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/syscallargs.h> +#include <sys/pool.h> +#include <sys/time.h> +#include <sys/rwlock.h> +#include <sys/futex.h> + +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +/* + * Kernel representation of a futex. + */ +struct futex { + LIST_ENTRY(futex) ft_list; /* list of all futexes */ + TAILQ_HEAD(, proc) ft_threads; /* sleeping queue */ + uint32_t *ft_uaddr; /* userspace address */ + pid_t ft_pid; /* process identifier */ + unsigned int ft_refcnt; /* # of references */ +}; + +/* Syscall helpers. */ +int futex_wait(uint32_t *, uint32_t, const struct timespec *); +int futex_wake(uint32_t *, uint32_t); +int futex_requeue(uint32_t *, uint32_t, uint32_t *, uint32_t); + +/* Flags for futex_get(). */ +#define FT_CREATE 0x1 /* Create a futex if it doesn't exist. */ + +struct futex *futex_get(uint32_t *, int); +void futex_put(struct futex *); + +/* + * The global futex lock serialize futex(2) calls such that no wakeup + * event are lost, protect the global list of all futexes and their + * states. 
+ */ +struct rwlock ftlock = RWLOCK_INITIALIZER("futex"); +static LIST_HEAD(, futex) ftlist; +struct pool ftpool; + + +void +futex_init(void) +{ + pool_init(&ftpool, sizeof(struct futex), 0, IPL_NONE, 0, "futexpl", + NULL); +} + +int +sys_futex(struct proc *p, void *v, register_t *retval) +{ + struct sys_futex_args /* { + syscallarg(uint32_t *) f; + syscallarg(int) op; + syscallarg(inr) val; + syscallarg(const struct timespec *) timeout; + syscallarg(uint32_t *) g; + } */ *uap = v; + uint32_t *uaddr = SCARG(uap, f); + int op = SCARG(uap, op); + uint32_t val = SCARG(uap, val); + const struct timespec *timeout = SCARG(uap, timeout); + void *g = SCARG(uap, g); + int error = 0; + + switch (op) { + case FUTEX_WAIT: + KERNEL_LOCK(); + rw_enter_write(&ftlock); + *retval = futex_wait(uaddr, val, timeout); + rw_exit_write(&ftlock); + KERNEL_UNLOCK(); + break; + case FUTEX_WAKE: + rw_enter_write(&ftlock); + *retval = futex_wake(uaddr, val); + rw_exit_write(&ftlock); + break; + case FUTEX_REQUEUE: + rw_enter_write(&ftlock); + *retval = futex_requeue(uaddr, val, g, (unsigned long)timeout); + rw_exit_write(&ftlock); + break; + default: + error = ENOSYS; + break; + } + + return (error ? -1 : 0); +} + +/* + * Return an existing futex matching userspace address ``uaddr''. + * + * If such futex does not exist and FT_CREATE is given, create it. + */ +struct futex * +futex_get(uint32_t *uaddr, int flag) +{ + struct futex *f; + + rw_assert_wrlock(&ftlock); + + LIST_FOREACH(f, &ftlist, ft_list) { + if (f->ft_uaddr == uaddr && f->ft_pid == curproc->p_p->ps_pid) { + f->ft_refcnt++; + break; + } + } + + if ((f == NULL) && (flag & FT_CREATE)) { + /* + * We rely on the rwlock to ensure that no other thread + * create the same futex. + */ + f = pool_get(&ftpool, PR_WAITOK); + TAILQ_INIT(&f->ft_threads); + f->ft_uaddr = uaddr; + f->ft_pid = curproc->p_p->ps_pid; + f->ft_refcnt = 1; + LIST_INSERT_HEAD(&ftlist, f, ft_list); + } + + return f; +} + +/* + * Release a given futex. 
+ */ +void +futex_put(struct futex *f) +{ + rw_assert_wrlock(&ftlock); + + KASSERT(f->ft_refcnt > 0); + + --f->ft_refcnt; + if (f->ft_refcnt == 0) { + KASSERT(TAILQ_EMPTY(&f->ft_threads)); + LIST_REMOVE(f, ft_list); + pool_put(&ftpool, f); + } +} + +/* + * Put the current thread on the sleep queue of the futex at address + * ``uaddr''. Let it sleep for the specified ``timeout'' time, or + * indefinitly if the argument is NULL. + */ +int +futex_wait(uint32_t *uaddr, uint32_t val, const struct timespec *timeout) +{ + struct proc *p = curproc; + struct futex *f; + uint64_t to_ticks = 0; + uint32_t cval; + int error; + + /* + * After reading the value a race is still possible but + * we deal with it by serializing all futex syscalls. + */ + rw_assert_wrlock(&ftlock); + + /* + * Read user space futex value + * + * XXX copyin(9) is not guaranteed to be atomic. + */ + if ((error = copyin(uaddr, &cval, sizeof(cval)))) + return error; + + /* If the value changed, stop here. */ + if (cval != val) + return EAGAIN; + + if (timeout != NULL) { + struct timespec ts; + + if ((error = copyin(timeout, &ts, sizeof(ts)))) + return error; +#ifdef KTRACE + if (KTRPOINT(p, KTR_STRUCT)) + ktrabstimespec(p, timeout); +#endif + to_ticks = (uint64_t)hz * ts.tv_sec + + (ts.tv_nsec + tick * 1000 - 1) / (tick * 1000) + 1; + if (to_ticks > INT_MAX) + to_ticks = INT_MAX; + } + + f = futex_get(uaddr, FT_CREATE); + TAILQ_INSERT_TAIL(&f->ft_threads, p, p_fut_link); + p->p_futex = f; + + error = rwsleep(p, &ftlock, PUSER|PCATCH, "fsleep", (int)to_ticks); + if (error == ERESTART) + error = EINTR; + else if (error == EWOULDBLOCK) { + /* A race occured between a wakeup and a timeout. */ + if (p->p_futex == NULL) + error = 0; + else + error = ETIMEDOUT; + } + + /* Remove ourself if we haven't been awaken. 
*/ + if ((f = p->p_futex) != NULL) { + p->p_futex = NULL; + TAILQ_REMOVE(&f->ft_threads, p, p_fut_link); + futex_put(f); + } + + return error; +} + +/* + * Wakeup at most ``n'' sibling threads sleeping on a futex at address + * ``uaddr'' and requeue at most ``m'' sibling threads on a futex at + * address ``uaddr2''. + */ +int +futex_requeue(uint32_t *uaddr, uint32_t n, uint32_t *uaddr2, uint32_t m) +{ + struct futex *f, *g; + struct proc *p; + uint32_t count = 0; + + rw_assert_wrlock(&ftlock); + + f = futex_get(uaddr, 0); + if (f == NULL) + return 0; + + while ((p = TAILQ_FIRST(&f->ft_threads)) != NULL && (count < (n + m))) { + p->p_futex = NULL; + TAILQ_REMOVE(&f->ft_threads, p, p_fut_link); + futex_put(f); + + if (count < n) { + wakeup_one(p); + } else if (uaddr2 != NULL) { + g = futex_get(uaddr2, FT_CREATE); + TAILQ_INSERT_TAIL(&g->ft_threads, p, p_fut_link); + p->p_futex = g; + } + count++; + } + + futex_put(f); + + return count; +} + +/* + * Wakeup at most ``n'' sibling threads sleeping on a futex at address + * ``uaddr''. + */ +int +futex_wake(uint32_t *uaddr, uint32_t n) +{ + return futex_requeue(uaddr, n, NULL, 0); +} diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 0ce9ba5238a..15cf55de433 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -1,4 +1,4 @@ -; $OpenBSD: syscalls.master,v 1.175 2017/04/13 04:06:46 guenther Exp $ +; $OpenBSD: syscalls.master,v 1.176 2017/04/28 13:50:55 mpi Exp $ ; $NetBSD: syscalls.master,v 1.32 1996/04/23 10:24:21 mycroft Exp $ ; @(#)syscalls.master 8.2 (Berkeley) 1/13/94 @@ -187,7 +187,8 @@ const gid_t *gidset); } 81 STD { int sys_getpgrp(void); } 82 STD { int sys_setpgid(pid_t pid, pid_t pgid); } -83 OBSOL osendsyslog +83 STD NOLOCK { int sys_futex(uint32_t *f, int op, int val, \ + const struct timespec *timeout, uint32_t *g); } 84 STD { int sys_utimensat(int fd, const char *path, \ const struct timespec *times, int flag); } 85 STD { int sys_futimens(int fd, \ |