/* $OpenBSD: linux_futex.c,v 1.10 2012/11/19 15:03:55 pirofti Exp $ */ /* $NetBSD: linux_futex.c,v 1.26 2010/07/07 01:30:35 chs Exp $ */ /*- * Copyright (c) 2011 Paul Irofti * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Emmanuel Dreyfus * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPATFUTEX_DEBUG #define DPRINTF(x) printf x #else #define DPRINTF(x) #endif struct pool futex_pool; struct pool futex_wp_pool; struct futex; struct waiting_proc { struct proc *p; struct futex *wp_new_futex; TAILQ_ENTRY(waiting_proc) wp_list; TAILQ_ENTRY(waiting_proc) wp_rqlist; }; struct futex { void *f_uaddr; int f_refcount; LIST_ENTRY(futex) f_list; TAILQ_HEAD(, waiting_proc) f_waiting_proc; TAILQ_HEAD(, waiting_proc) f_requeue_proc; }; static LIST_HEAD(futex_list, futex) futex_list; struct mutex futex_lock; void futex_pool_init(void); int linux_do_futex(struct proc *, const struct linux_sys_futex_args *, register_t *, struct timespec *); struct futex * futex_get(void *); void futex_ref(struct futex *); void futex_put(struct futex *); int futex_sleep(struct futex **, struct proc *, int, struct waiting_proc *); int futex_wake(struct futex *, int, struct futex *, int); int futex_atomic_op(struct proc *, int, void *); int futex_itimespecfix(struct timespec *ts); int linux_sys_futex(struct proc *p, void *v, register_t *retval) { struct linux_sys_futex_args /* { syscallarg(int *) uaddr; syscallarg(int) op; syscallarg(int) val; syscallarg(const struct linux_timespec *) timeout; syscallarg(int *) uaddr2; syscallarg(int) val3; } */ *uap = v; struct l_timespec lts; struct timespec ts = {0, 0}; int error; if ((SCARG(uap, op) & ~LINUX_FUTEX_PRIVATE_FLAG) == LINUX_FUTEX_WAIT && SCARG(uap, timeout) != NULL) { if ((error = copyin(SCARG(uap, timeout), <s, sizeof(lts))) != 0) { return error; } linux_to_native_timespec(&ts, <s); } return linux_do_futex(p, uap, retval, &ts); } int linux_do_futex(struct proc *p, const struct linux_sys_futex_args *uap, register_t *retval, struct timespec *ts) { /* { syscallarg(int *) uaddr; syscallarg(int) op; syscallarg(int) val; syscallarg(const struct linux_timespec *) timeout; syscallarg(int *) uaddr2; syscallarg(int) val3; } */ int ret; int error = 0; struct futex *f; struct futex *newf; int timeout_hz; struct timeval tv; struct futex *f2; struct waiting_proc *wp; int op_ret; int uaddr_val; int args_val = SCARG(uap, val); #ifdef COMPATFUTEX_DEBUG int tid = p->p_pid + THREAD_PID_OFFSET; #endif /* * Our implementation provides only private futexes. Most of the apps * should use private futexes but don't claim so. Therefore we treat * all futexes as private by clearing the FUTEX_PRIVATE_FLAG. It works * in most cases (ie. when futexes are not shared on file descriptor * or between different processes). */ switch (SCARG(uap, op) & ~LINUX_FUTEX_PRIVATE_FLAG) { case LINUX_FUTEX_WAIT: mtx_enter(&futex_lock); if ((error = copyin(SCARG(uap, uaddr), &uaddr_val, sizeof(uaddr_val))) != 0) { mtx_leave(&futex_lock); return error; } DPRINTF(("FUTEX_WAIT %d: Sleep if [%8.8X] %d == %d\n", tid, SCARG(uap, uaddr), uaddr_val, args_val)); /* Sleep only if [uaddr] == val */ if (uaddr_val != args_val) { mtx_leave(&futex_lock); return EWOULDBLOCK; } /* Check for infinity */ if (ts->tv_sec == 0 && ts->tv_nsec == 0) { timeout_hz = 0; } else { if ((error = futex_itimespecfix(ts)) != 0) { mtx_leave(&futex_lock); return error; } TIMESPEC_TO_TIMEVAL(&tv, ts); timeout_hz = tvtohz(&tv); } /* * If the user process requests a non null timeout, * make sure we do not turn it into an infinite * timeout because timeout_hz is 0. * * We use a minimal timeout of 1/hz. Maybe it would make * sense to just return ETIMEDOUT without sleeping. */ if (SCARG(uap, timeout) != NULL && timeout_hz == 0) timeout_hz = 1; DPRINTF(("FUTEX_WAIT %d: Sleep for %lld.%09lds (%dhz)\n", tid, (long long)ts->tv_sec, ts->tv_nsec, timeout_hz)); wp = pool_get(&futex_wp_pool, PR_NOWAIT); if (wp == NULL) { DPRINTF(("FUTEX_WAIT %d: Couldn't fetch a new wp from" "futex_wp_pool.\n", tid)); mtx_leave(&futex_lock); return EAGAIN; } f = futex_get(SCARG(uap, uaddr)); ret = futex_sleep(&f, p, timeout_hz, wp); futex_put(f); mtx_leave(&futex_lock); pool_put(&futex_wp_pool, wp); DPRINTF(("FUTEX_WAIT %d: Woke up from uaddr %8.8X with " "ret = %d\n", tid, SCARG(uap, uaddr), ret)); *retval = ret ? -1 : 0; switch (ret) { case EWOULDBLOCK: /* timeout */ return ETIMEDOUT; break; case EINTR: /* signal */ return EINTR; break; case 0: /* FUTEX_WAKE received */ DPRINTF(("FUTEX_WAIT %d: uaddr = %p, got it\n", tid, SCARG(uap, uaddr))); return 0; break; default: DPRINTF(("FUTEX_WAIT: unexpected ret = %d\n", ret)); break; } /* NOTREACHED */ break; case LINUX_FUTEX_WAKE: /* * XXX: Linux is able cope with different addresses * corresponding to the same mapped memory in the sleeping * and the waker process(es). */ DPRINTF(("FUTEX_WAKE %d: uaddr = %p, val = %d\n", tid, SCARG(uap, uaddr), args_val)); if (args_val < 0) return EINVAL; mtx_enter(&futex_lock); f = futex_get(SCARG(uap, uaddr)); *retval = futex_wake(f, args_val, NULL, 0); futex_put(f); mtx_leave(&futex_lock); break; case LINUX_FUTEX_CMP_REQUEUE: if (args_val < 0) return EINVAL; /* * Don't allow using the same address for requeueing. * * glibc seems to cope with this. */ if (SCARG(uap, uaddr) == SCARG(uap, uaddr2)) { return EINVAL; } mtx_enter(&futex_lock); if ((error = copyin(SCARG(uap, uaddr), &uaddr_val, sizeof(uaddr_val))) != 0) { mtx_leave(&futex_lock); return error; } DPRINTF(("FUTEX_CMP_REQUEUE %d: Check for possible race " "condition [%8.8X] %d != %d", tid, SCARG(uap, uaddr), uaddr_val, SCARG(uap, val3))); if (uaddr_val != SCARG(uap, val3)) { mtx_leave(&futex_lock); return EAGAIN; } /* Following code is the same as REQUEUE */ DPRINTF(("FUTEX_CMP_REQUEUE %d: Wakeup %d processes and requeue" " waiters at %8.8X\n", tid, SCARG(uap, val), SCARG(uap, uaddr2))); f = futex_get(SCARG(uap, uaddr)); newf = futex_get(SCARG(uap, uaddr2)); /* * Check if uaddr2 is in use. * If true, return EINVAL to avoid deadlock. * * glibc seems to cope with this. */ if (newf->f_refcount != 1) { futex_put(f); futex_put(newf); return EINVAL; } *retval = futex_wake(f, args_val, newf, (int)(unsigned long)SCARG(uap, timeout)); futex_put(f); futex_put(newf); mtx_leave(&futex_lock); break; case LINUX_FUTEX_REQUEUE: DPRINTF(("FUTEX_REQUEUE %d: Wakeup %d processes and requeue " "waiters at %8.8X\n", tid, SCARG(uap, val), SCARG(uap, uaddr2))); if (args_val < 0) return EINVAL; mtx_enter(&futex_lock); f = futex_get(SCARG(uap, uaddr)); newf = futex_get(SCARG(uap, uaddr2)); *retval = futex_wake(f, args_val, newf, (int)(unsigned long)SCARG(uap, timeout)); futex_put(f); futex_put(newf); mtx_leave(&futex_lock); break; case LINUX_FUTEX_FD: DPRINTF(("linux_sys_futex: unimplemented op %d\n", SCARG(uap, op))); return ENOSYS; case LINUX_FUTEX_WAKE_OP: DPRINTF(("FUTEX_WAKE_OP %d: uaddr = %p, op = %d, " "val = %d, uaddr2 = %p, val3 = %d\n", tid, SCARG(uap, uaddr), SCARG(uap, op), args_val, SCARG(uap, uaddr2), SCARG(uap, val3))); if (args_val < 0) return EINVAL; mtx_enter(&futex_lock); f = futex_get(SCARG(uap, uaddr)); f2 = futex_get(SCARG(uap, uaddr2)); mtx_leave(&futex_lock); /* * This function returns a positive number as results and * negative as errors */ op_ret = futex_atomic_op(p, SCARG(uap, val3), SCARG(uap, uaddr2)); if (op_ret < 0) { futex_put(f); futex_put(f2); return -op_ret; } mtx_enter(&futex_lock); ret = futex_wake(f, args_val, NULL, 0); futex_put(f); if (op_ret > 0) { op_ret = 0; /* * Linux abuses the address of the timespec parameter * as the number of retries */ op_ret += futex_wake(f2, (int)(unsigned long)SCARG(uap, timeout), NULL, 0); ret += op_ret; } futex_put(f2); mtx_leave(&futex_lock); *retval = ret; break; default: DPRINTF(("linux_sys_futex: unknown op %d\n", SCARG(uap, op))); return ENOSYS; } return 0; } void futex_pool_init(void) { DPRINTF(("Inside futex_pool_init()\n")); pool_init(&futex_pool, sizeof(struct futex), 0, 0, PR_DEBUGCHK, "futexpl", &pool_allocator_nointr); pool_init(&futex_wp_pool, sizeof(struct waiting_proc), 0, 0, PR_DEBUGCHK, "futexwppl", &pool_allocator_nointr); } /* * Get a futex. * If we have an existing one, we will return that with the refcount bumped. * Otherwise we will allocate and hook up a new one. * Must be called with futex_lock held, but we may unlock it in order to * sleep for allocation. */ struct futex * futex_get(void *uaddr) { struct futex *f, *newf; MUTEX_ASSERT_LOCKED(&futex_lock); LIST_FOREACH(f, &futex_list, f_list) { if (f->f_uaddr == uaddr) { f->f_refcount++; return f; } } mtx_leave(&futex_lock); /* Not found, create it */ newf = pool_get(&futex_pool, PR_WAITOK|PR_ZERO); mtx_enter(&futex_lock); /* Did someone else create it in the meantime? */ LIST_FOREACH(f, &futex_list, f_list) { if (f->f_uaddr == uaddr) { f->f_refcount++; pool_put(&futex_pool, newf); return f; } } newf->f_uaddr = uaddr; newf->f_refcount = 1; TAILQ_INIT(&newf->f_waiting_proc); TAILQ_INIT(&newf->f_requeue_proc); LIST_INSERT_HEAD(&futex_list, newf, f_list); return newf; } /* * Grab a reference on a futex. * The futex lock must be locked. */ void futex_ref(struct futex *f) { MUTEX_ASSERT_LOCKED(&futex_lock); f->f_refcount++; } /* * Release our reference on the futex. * must be called with the futex_lock held. */ void futex_put(struct futex *f) { MUTEX_ASSERT_LOCKED(&futex_lock); f->f_refcount--; if (f->f_refcount == 0) { KASSERT(TAILQ_EMPTY(&f->f_waiting_proc)); KASSERT(TAILQ_EMPTY(&f->f_requeue_proc)); LIST_REMOVE(f, f_list); pool_put(&futex_pool, f); } } int futex_sleep(struct futex **fp, struct proc *p, int timeout, struct waiting_proc *wp) { struct futex *f, *newf; int ret; MUTEX_ASSERT_LOCKED(&futex_lock); f = *fp; wp->p = p; wp->wp_new_futex = NULL; DPRINTF(("futex_sleep: sleep on futex %p\n", f)); requeue: TAILQ_INSERT_TAIL(&f->f_waiting_proc, wp, wp_list); ret = msleep(f, &futex_lock, PUSER | PCATCH, "futex_sleep", timeout); TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); /* if futex_wake() tells us to requeue ... */ newf = wp->wp_new_futex; if (ret == 0 && newf != NULL) { /* ... requeue ourselves on the new futex */ DPRINTF(("futex_sleep: requeue futex %p\n", newf)); futex_put(f); wp->wp_new_futex = NULL; TAILQ_REMOVE(&newf->f_requeue_proc, wp, wp_rqlist); *fp = f = newf; goto requeue; } return ret; } int futex_wake(struct futex *f, int n, struct futex *newf, int n2) { struct waiting_proc *wp; int count = 0; KASSERT(newf != f); MUTEX_ASSERT_LOCKED(&futex_lock); /* * first, wake up any threads sleeping on this futex. * note that sleeping threads are not in the process of requeueing. */ if (!TAILQ_EMPTY(&f->f_waiting_proc)) wakeup(f); /* only call wakeup once */ TAILQ_FOREACH(wp, &f->f_waiting_proc, wp_list) { KASSERT(wp->wp_new_futex == NULL); DPRINTF(("futex_wake: signal futex %p ref %d\n", f, f->f_refcount)); if (count <= n) { count++; } else { if (newf == NULL) break; /* matching futex_put() is called by the other thread */ futex_ref(newf); wp->wp_new_futex = newf; TAILQ_INSERT_TAIL(&newf->f_requeue_proc, wp, wp_rqlist); DPRINTF(("futex_wake: requeue newf %p ref %d\n", newf, newf->f_refcount)); if (count - n >= n2) goto out; } } /* * next, deal with threads that are requeuing to this futex. * we don't need to signal these threads, any thread on the * requeue list has already been signaled but hasn't had a chance * to run and requeue itself yet. if we would normally wake * a thread, just remove the requeue info. if we would normally * requeue a thread, change the requeue target. */ while ((wp = TAILQ_FIRST(&f->f_requeue_proc)) != NULL) { /* XXX: talk to oga, should mtx_enter again, recursive */ KASSERT(wp->wp_new_futex == f); DPRINTF(("futex_wake: unrequeue f %p ref %d\n", f, f->f_refcount)); wp->wp_new_futex = NULL; TAILQ_REMOVE(&f->f_requeue_proc, wp, wp_rqlist); futex_put(f); if (count <= n) { count++; } else { if (newf == NULL) { break; } /*matching futex_put() is called by the other thread.*/ futex_ref(newf); wp->wp_new_futex = newf; TAILQ_INSERT_TAIL(&newf->f_requeue_proc, wp, wp_rqlist); DPRINTF(("futex_wake: rerequeue newf %p ref %d\n", newf, newf->f_refcount)); if (count - n >= n2) break; } } out: return count; } int futex_atomic_op(struct proc *p, int encoded_op, void *uaddr) { const int op = (encoded_op >> 28) & 7; const int cmp = (encoded_op >> 24) & 15; const int cmparg = (encoded_op << 20) >> 20; int oparg = (encoded_op << 8) >> 20; int error, oldval, cval; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; if (copyin(uaddr, &cval, sizeof(int)) != 0) return -EFAULT; for (;;) { int nval; switch (op) { case FUTEX_OP_SET: nval = oparg; break; case FUTEX_OP_ADD: nval = cval + oparg; break; case FUTEX_OP_OR: nval = cval | oparg; break; case FUTEX_OP_ANDN: nval = cval & ~oparg; break; case FUTEX_OP_XOR: nval = cval ^ oparg; break; default: return -ENOSYS; } oldval = nval; error = futex_atomic_ucas_int32(uaddr, cval, nval); if (oldval == cval || error) { break; } cval = oldval; } if (error) return -EFAULT; switch (cmp) { case FUTEX_OP_CMP_EQ: return (oldval == cmparg); case FUTEX_OP_CMP_NE: return (oldval != cmparg); case FUTEX_OP_CMP_LT: return (oldval < cmparg); case FUTEX_OP_CMP_GE: return (oldval >= cmparg); case FUTEX_OP_CMP_LE: return (oldval <= cmparg); case FUTEX_OP_CMP_GT: return (oldval > cmparg); default: return -ENOSYS; } } int linux_sys_set_robust_list(struct proc *p, void *v, register_t *retval) { struct linux_sys_set_robust_list_args /* { syscallarg(struct linux_robust_list_head *) head; syscallarg(size_t) len; } */ *uap = v; struct linux_emuldata *led; if (SCARG(uap, len) != sizeof(struct linux_robust_list_head)) return EINVAL; led = p->p_emuldata; led->led_robust_head = SCARG(uap, head); *retval = 0; return 0; } int linux_sys_get_robust_list(struct proc *p, void *v, register_t *retval) { struct linux_sys_get_robust_list_args /* { syscallarg(int) pid; syscallarg(struct linux_robust_list_head **) head; syscallarg(size_t *) len; } */ *uap = v; struct proc *q; struct linux_emuldata *led; struct linux_robust_list_head *head; size_t len; int error = 0; if (!SCARG(uap, pid)) { led = p->p_emuldata; head = led->led_robust_head; } else { if (!SCARG(uap, pid)) q = p; else if ((q = pfind(SCARG(uap, pid))) == NULL) return ESRCH; else if (p->p_p != q->p_p) return EPERM; led = q->p_emuldata; head = led->led_robust_head; } len = sizeof(*head); error = copyout(&len, SCARG(uap, len), sizeof(len)); if (error) return error; return copyout(&head, SCARG(uap, head), sizeof(head)); } int futex_itimespecfix(struct timespec *ts) { if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return EINVAL; if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000) ts->tv_nsec = tick * 1000; return 0; }