author | David Gwynne <dlg@cvs.openbsd.org> | 2020-07-06 03:56:52 +0000
---|---|---
committer | David Gwynne <dlg@cvs.openbsd.org> | 2020-07-06 03:56:52 +0000
commit | 3e050f6d7149fc0653707ceba8ea50fd0eeef516 (patch) |
tree | 2f4e656a98b5c35c37377a2c80d54cc0a1649a90 /sys/dev/kstat.c |
parent | c3cc4803f315c77e9c28de5cd68b11a918d0745e (diff) |
add kstat(4), a subsystem to let the kernel expose statistics to userland.
a kstat is an arbitrary chunk of data that a part of the kernel
wants to expose to userland. data could mean just a chunk of raw
bytes, but generally a kernel subsystem will provide a series of
kstat key/value chunks.
this code is loosely modelled on kstat in solaris, but with a bunch
of simplifications (we don't want to provide write support for
example). the named or key/value structure is significantly richer
in this version too. eg, solaris kstat named data supports integer
types, but this version offers differentiation between counters
(like the number of packets transmitted on an interface) and gauges
(like how long the transmit queue is) and lets kernel providers say
what the units are (eg, packets vs bytes vs cycles).
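for example, a provider could describe a counter and a gauge like
the sketch below. this is a hypothetical fragment, not from the
diff: the names are made up, kstat_kv_init()/kstat_kv_unit_init()
are the helpers added by this commit, and the KSTAT_KV_T_* and
KSTAT_KV_U_* values are assumed to come from sys/kstat.h (which is
not in this diff).

	/* hypothetical fragment; assumes KSTAT_KV_* values from sys/kstat.h */
	struct kstat_kv kv_pkts, kv_qlen;

	/* a counter: monotonically increasing, measured in packets */
	kstat_kv_unit_init(&kv_pkts, "tx-packets",
	    KSTAT_KV_T_COUNTER64, KSTAT_KV_U_PACKETS);

	/* a gauge: goes up and down, no particular unit */
	kstat_kv_init(&kv_qlen, "txq-len", KSTAT_KV_T_UINT32);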
the main motivation for this is to improve the visibility of what
the kernel is doing while it's running. i wrote this as part of the
recent work we've been doing on multiqueue and rss/toeplitz so i
could verify that network load is actually spread across multiple
rings on a single nic. without this we would be wasting memory and
interrupt vectors on multiple rings and still just using the 1st
one, and no one would know because there's no way to see what rings are
being used.
another thing that can become visible is the different counters
that various network cards provide. i'm particularly interested in
seeing if packets get dropped because the rings aren't filled fully,
which is an effect we've never really observed directly.
a small part of wanting this is because i spend an annoying amount of
time instrumenting the kernel when hacking code in it. if most of
the scaffolding for the instrumentation is already there, i can
avoid repeatedly writing that code and save time.
iterated a few times with claudio@ and deraadt@
Diffstat (limited to 'sys/dev/kstat.c')
-rw-r--r-- | sys/dev/kstat.c | 689 |
1 file changed, 689 insertions, 0 deletions
diff --git a/sys/dev/kstat.c b/sys/dev/kstat.c
new file mode 100644
index 00000000000..11e3ed8dd0d
--- /dev/null
+++ b/sys/dev/kstat.c
@@ -0,0 +1,689 @@
+/*	$OpenBSD: kstat.c,v 1.1 2020/07/06 03:56:51 dlg Exp $ */
+
+/*
+ * Copyright (c) 2020 David Gwynne <dlg@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/time.h>
+
+/* for kstat_set_cpu */
+#include <sys/proc.h>
+#include <sys/sched.h>
+
+#include <sys/kstat.h>
+
+RBT_HEAD(kstat_id_tree, kstat);
+
+static inline int
+kstat_id_cmp(const struct kstat *a, const struct kstat *b)
+{
+	if (a->ks_id > b->ks_id)
+		return (1);
+	if (a->ks_id < b->ks_id)
+		return (-1);
+
+	return (0);
+}
+
+RBT_PROTOTYPE(kstat_id_tree, kstat, ks_id_entry, kstat_id_cmp);
+
+RBT_HEAD(kstat_pv_tree, kstat);
+
+static inline int
+kstat_pv_cmp(const struct kstat *a, const struct kstat *b)
+{
+	int rv;
+
+	rv = strcmp(a->ks_provider, b->ks_provider);
+	if (rv != 0)
+		return (rv);
+
+	if (a->ks_instance > b->ks_instance)
+		return (1);
+	if (a->ks_instance < b->ks_instance)
+		return (-1);
+
+	rv = strcmp(a->ks_name, b->ks_name);
+	if (rv != 0)
+		return (rv);
+
+	if (a->ks_unit > b->ks_unit)
+		return (1);
+	if (a->ks_unit < b->ks_unit)
+		return (-1);
+
+	return (0);
+}
+
+RBT_PROTOTYPE(kstat_pv_tree, kstat, ks_pv_entry, kstat_pv_cmp);
+
+RBT_HEAD(kstat_nm_tree, kstat);
+
+static inline int
+kstat_nm_cmp(const struct kstat *a, const struct kstat *b)
+{
+	int rv;
+
+	rv = strcmp(a->ks_name, b->ks_name);
+	if (rv != 0)
+		return (rv);
+
+	if (a->ks_unit > b->ks_unit)
+		return (1);
+	if (a->ks_unit < b->ks_unit)
+		return (-1);
+
+	rv = strcmp(a->ks_provider, b->ks_provider);
+	if (rv != 0)
+		return (rv);
+
+	if (a->ks_instance > b->ks_instance)
+		return (1);
+	if (a->ks_instance < b->ks_instance)
+		return (-1);
+
+	return (0);
+}
+
+RBT_PROTOTYPE(kstat_nm_tree, kstat, ks_nm_entry, kstat_nm_cmp);
+
+struct kstat_lock_ops {
+	void	(*enter)(void *);
+	void	(*leave)(void *);
+};
+
+#define kstat_enter(_ks) (_ks)->ks_lock_ops->enter((_ks)->ks_lock)
+#define kstat_leave(_ks) (_ks)->ks_lock_ops->leave((_ks)->ks_lock)
+
+const struct kstat_lock_ops kstat_rlock_ops = {
+	(void (*)(void *))rw_enter_read,
+	(void (*)(void *))rw_exit_read,
+};
+
+const struct kstat_lock_ops kstat_wlock_ops = {
+	(void (*)(void *))rw_enter_write,
+	(void (*)(void *))rw_exit_write,
+};
+
+const struct kstat_lock_ops kstat_mutex_ops = {
+	(void (*)(void *))mtx_enter,
+	(void (*)(void *))mtx_leave,
+};
+
+void	kstat_cpu_enter(void *);
+void	kstat_cpu_leave(void *);
+
+const struct kstat_lock_ops kstat_cpu_ops = {
+	kstat_cpu_enter,
+	kstat_cpu_leave,
+};
+
+struct rwlock kstat_lock = RWLOCK_INITIALIZER("kstat");
+
+/*
+ * The global state is versioned so changes to the set of kstats
+ * can be detected. This is an int so it can be read atomically on
+ * any arch, which is a ridiculous optimisation, really.
+ */
+unsigned int kstat_version = 0;
+
+/*
+ * kstat structures have a unique identifier so they can be found
+ * quickly. Identifiers are 64bit in the hope that it won't wrap
+ * during the runtime of a system. The identifiers start at 1 so that
+ * 0 can be used as the first value for userland to iterate with.
+ */
+uint64_t kstat_next_id = 1;
+
+struct kstat_id_tree kstat_id_tree = RBT_INITIALIZER();
+struct kstat_pv_tree kstat_pv_tree = RBT_INITIALIZER();
+struct kstat_nm_tree kstat_nm_tree = RBT_INITIALIZER();
+struct pool kstat_pool;
+
+struct rwlock kstat_default_lock = RWLOCK_INITIALIZER("kstatlk");
+
+int	kstat_read(struct kstat *);
+int	kstat_copy(struct kstat *, void *);
+
+int
+kstatattach(int num)
+{
+	/* XXX install system stats here */
+	return (0);
+}
+
+int
+kstatopen(dev_t dev, int flag, int mode, struct proc *p)
+{
+	return (0);
+}
+
+int
+kstatclose(dev_t dev, int flag, int mode, struct proc *p)
+{
+	return (0);
+}
+
+int
+kstatioc_enter(struct kstat_req *ksreq)
+{
+	int error;
+
+	error = rw_enter(&kstat_lock, RW_READ | RW_INTR);
+	if (error != 0)
+		return (error);
+
+	if (!ISSET(ksreq->ks_rflags, KSTATIOC_F_IGNVER) &&
+	    ksreq->ks_version != kstat_version) {
+		error = EINVAL;
+		goto error;
+	}
+
+	return (0);
+
+error:
+	rw_exit(&kstat_lock);
+	return (error);
+}
+
+int
+kstatioc_leave(struct kstat_req *ksreq, struct kstat *ks)
+{
+	void *buf = NULL;
+	size_t klen = 0, ulen = 0;
+	struct timespec updated;
+	int error = 0;
+
+	if (ks == NULL) {
+		error = ENOENT;
+		goto error;
+	}
+
+	switch (ks->ks_state) {
+	case KSTAT_S_CREATED:
+		ksreq->ks_updated = ks->ks_created;
+		ksreq->ks_interval.tv_sec = 0;
+		ksreq->ks_interval.tv_nsec = 0;
+		ksreq->ks_datalen = 0;
+		ksreq->ks_dataver = 0;
+		break;
+
+	case KSTAT_S_INSTALLED:
+		ksreq->ks_dataver = ks->ks_dataver;
+		ksreq->ks_interval = ks->ks_interval;
+
+		if (ksreq->ks_data == NULL) {
+			/* userland doesn't want actual data, so shortcut */
+			kstat_enter(ks);
+			ksreq->ks_datalen = ks->ks_datalen;
+			ksreq->ks_updated = ks->ks_updated;
+			kstat_leave(ks);
+			break;
+		}
+
+		klen = ks->ks_datalen; /* KSTAT_F_REALLOC */
+		buf = malloc(klen, M_TEMP, M_WAITOK|M_CANFAIL);
+		if (buf == NULL) {
+			error = ENOMEM;
+			goto error;
+		}
+
+		kstat_enter(ks);
+		error = (*ks->ks_read)(ks);
+		if (error == 0) {
+			updated = ks->ks_updated;
+
+			/* KSTAT_F_REALLOC */
+			KASSERTMSG(ks->ks_datalen == klen,
+			    "kstat doesnt support resized data yet");
+
+			error = (*ks->ks_copy)(ks, buf);
+		}
+		kstat_leave(ks);
+
+		if (error != 0)
+			goto error;
+
+		ulen = ksreq->ks_datalen;
+		ksreq->ks_datalen = klen; /* KSTAT_F_REALLOC */
+		ksreq->ks_updated = updated;
+		break;
+	default:
+		panic("ks %p unexpected state %u", ks, ks->ks_state);
+	}
+
+	ksreq->ks_version = kstat_version;
+	ksreq->ks_id = ks->ks_id;
+
+	if (strlcpy(ksreq->ks_provider, ks->ks_provider,
+	    sizeof(ksreq->ks_provider)) >= sizeof(ksreq->ks_provider))
+		panic("kstat %p provider string has grown", ks);
+	ksreq->ks_instance = ks->ks_instance;
+	if (strlcpy(ksreq->ks_name, ks->ks_name,
+	    sizeof(ksreq->ks_name)) >= sizeof(ksreq->ks_name))
+		panic("kstat %p name string has grown", ks);
+	ksreq->ks_unit = ks->ks_unit;
+
+	ksreq->ks_created = ks->ks_created;
+	ksreq->ks_type = ks->ks_type;
+	ksreq->ks_state = ks->ks_state;
+
+error:
+	rw_exit(&kstat_lock);
+
+	if (buf != NULL) {
+		if (error == 0)
+			error = copyout(buf, ksreq->ks_data, min(klen, ulen));
+
+		free(buf, M_TEMP, klen);
+	}
+
+	return (error);
+}
+
+int
+kstatioc_find_id(struct kstat_req *ksreq)
+{
+	struct kstat *ks, key;
+	int error;
+
+	error = kstatioc_enter(ksreq);
+	if (error != 0)
+		return (error);
+
+	key.ks_id = ksreq->ks_id;
+
+	ks = RBT_FIND(kstat_id_tree, &kstat_id_tree, &key);
+
+	return (kstatioc_leave(ksreq, ks));
+}
+
+int
+kstatioc_nfind_id(struct kstat_req *ksreq)
+{
+	struct kstat *ks, key;
+	int error;
+
+	error = kstatioc_enter(ksreq);
+	if (error != 0)
+		return (error);
+
+	key.ks_id = ksreq->ks_id;
+
+	ks = RBT_NFIND(kstat_id_tree, &kstat_id_tree, &key);
+
+	return (kstatioc_leave(ksreq, ks));
+}
+
+int
+kstatioc_find_pv(struct kstat_req *ksreq)
+{
+	struct kstat *ks, key;
+	int error;
+
+	error = kstatioc_enter(ksreq);
+	if (error != 0)
+		return (error);
+
+	key.ks_provider = ksreq->ks_provider;
+	key.ks_instance = ksreq->ks_instance;
+	key.ks_name = ksreq->ks_name;
+	key.ks_unit = ksreq->ks_unit;
+
+	ks = RBT_FIND(kstat_pv_tree, &kstat_pv_tree, &key);
+
+	return (kstatioc_leave(ksreq, ks));
+}
+
+int
+kstatioc_nfind_pv(struct kstat_req *ksreq)
+{
+	struct kstat *ks, key;
+	int error;
+
+	error = kstatioc_enter(ksreq);
+	if (error != 0)
+		return (error);
+
+	key.ks_provider = ksreq->ks_provider;
+	key.ks_instance = ksreq->ks_instance;
+	key.ks_name = ksreq->ks_name;
+	key.ks_unit = ksreq->ks_unit;
+
+	ks = RBT_NFIND(kstat_pv_tree, &kstat_pv_tree, &key);
+
+	return (kstatioc_leave(ksreq, ks));
+}
+
+int
+kstatioc_find_nm(struct kstat_req *ksreq)
+{
+	struct kstat *ks, key;
+	int error;
+
+	error = kstatioc_enter(ksreq);
+	if (error != 0)
+		return (error);
+
+	key.ks_name = ksreq->ks_name;
+	key.ks_unit = ksreq->ks_unit;
+	key.ks_provider = ksreq->ks_provider;
+	key.ks_instance = ksreq->ks_instance;
+
+	ks = RBT_FIND(kstat_nm_tree, &kstat_nm_tree, &key);
+
+	return (kstatioc_leave(ksreq, ks));
+}
+
+int
+kstatioc_nfind_nm(struct kstat_req *ksreq)
+{
+	struct kstat *ks, key;
+	int error;
+
+	error = kstatioc_enter(ksreq);
+	if (error != 0)
+		return (error);
+
+	key.ks_name = ksreq->ks_name;
+	key.ks_unit = ksreq->ks_unit;
+	key.ks_provider = ksreq->ks_provider;
+	key.ks_instance = ksreq->ks_instance;
+
+	ks = RBT_NFIND(kstat_nm_tree, &kstat_nm_tree, &key);
+
+	return (kstatioc_leave(ksreq, ks));
+}
+
+int
+kstatioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
+{
+	struct kstat_req *ksreq = (struct kstat_req *)data;
+	int error = 0;
+
+	KERNEL_UNLOCK();
+
+	switch (cmd) {
+	case KSTATIOC_VERSION:
+		*(unsigned int *)data = kstat_version;
+		break;
+
+	case KSTATIOC_FIND_ID:
+		error = kstatioc_find_id(ksreq);
+		break;
+	case KSTATIOC_NFIND_ID:
+		error = kstatioc_nfind_id(ksreq);
+		break;
+	case KSTATIOC_FIND_PROVIDER:
+		error = kstatioc_find_pv(ksreq);
+		break;
+	case KSTATIOC_NFIND_PROVIDER:
+		error = kstatioc_nfind_pv(ksreq);
+		break;
+	case KSTATIOC_FIND_NAME:
+		error = kstatioc_find_nm(ksreq);
+		break;
+	case KSTATIOC_NFIND_NAME:
+		error = kstatioc_nfind_nm(ksreq);
+		break;
+
+	default:
+		error = ENOTTY;
+		break;
+	}
+
+	KERNEL_LOCK();
+
+	return (error);
+}
+
+void
+kstat_init(void)
+{
+	static int initialized = 0;
+
+	if (initialized)
+		return;
+
+	pool_init(&kstat_pool, sizeof(struct kstat), 0, IPL_NONE,
+	    PR_WAITOK | PR_RWLOCK, "kstatmem", NULL);
+
+	initialized = 1;
+}
+
+int
+kstat_strcheck(const char *str)
+{
+	size_t i, l;
+
+	l = strlen(str);
+	if (l == 0 || l >= KSTAT_STRLEN)
+		return (-1);
+	for (i = 0; i < l; i++) {
+		int ch = str[i];
+		if (ch >= 'a' && ch <= 'z')
+			continue;
+		if (ch >= 'A' && ch <= 'Z')
+			continue;
+		if (ch >= '0' && ch <= '9')
+			continue;
+		switch (ch) {
+		case '-':
+		case '_':
+		case '.':
+			break;
+		default:
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+
+struct kstat *
+kstat_create(const char *provider, unsigned int instance,
+    const char *name, unsigned int unit,
+    unsigned int type, unsigned int flags)
+{
+	struct kstat *ks, *oks;
+
+	if (kstat_strcheck(provider) == -1)
+		panic("invalid provider string");
+	if (kstat_strcheck(name) == -1)
+		panic("invalid name string");
+
+	kstat_init();
+
+	ks = pool_get(&kstat_pool, PR_WAITOK|PR_ZERO);
+
+	ks->ks_provider = provider;
+	ks->ks_instance = instance;
+	ks->ks_name = name;
+	ks->ks_unit = unit;
+	ks->ks_flags = flags;
+	ks->ks_type = type;
+	ks->ks_state = KSTAT_S_CREATED;
+
+	getnanouptime(&ks->ks_created);
+	ks->ks_updated = ks->ks_created;
+
+	ks->ks_lock = &kstat_default_lock;
+	ks->ks_lock_ops = &kstat_wlock_ops;
+	ks->ks_read = kstat_read;
+	ks->ks_copy = kstat_copy;
+
+	rw_enter_write(&kstat_lock);
+	ks->ks_id = kstat_next_id;
+
+	oks = RBT_INSERT(kstat_pv_tree, &kstat_pv_tree, ks);
+	if (oks == NULL) {
+		/* commit */
+		kstat_next_id++;
+		kstat_version++;
+
+		oks = RBT_INSERT(kstat_nm_tree, &kstat_nm_tree, ks);
+		if (oks != NULL)
+			panic("kstat name collision! (%llu)", ks->ks_id);
+
+		oks = RBT_INSERT(kstat_id_tree, &kstat_id_tree, ks);
+		if (oks != NULL)
+			panic("kstat id collision! (%llu)", ks->ks_id);
+	}
+	rw_exit_write(&kstat_lock);
+
+	if (oks != NULL) {
+		pool_put(&kstat_pool, ks);
+		return (NULL);
+	}
+
+	return (ks);
+}
+
+void
+kstat_set_rlock(struct kstat *ks, struct rwlock *rwl)
+{
+	KASSERT(ks->ks_state == KSTAT_S_CREATED);
+
+	ks->ks_lock = rwl;
+	ks->ks_lock_ops = &kstat_rlock_ops;
+}
+
+void
+kstat_set_wlock(struct kstat *ks, struct rwlock *rwl)
+{
+	KASSERT(ks->ks_state == KSTAT_S_CREATED);
+
+	ks->ks_lock = rwl;
+	ks->ks_lock_ops = &kstat_wlock_ops;
+}
+
+void
+kstat_set_mutex(struct kstat *ks, struct mutex *mtx)
+{
+	KASSERT(ks->ks_state == KSTAT_S_CREATED);
+
+	ks->ks_lock = mtx;
+	ks->ks_lock_ops = &kstat_mutex_ops;
+}
+
+void
+kstat_cpu_enter(void *p)
+{
+	struct cpu_info *ci = p;
+	sched_peg_curproc(ci);
+}
+
+void
+kstat_cpu_leave(void *p)
+{
+	atomic_clearbits_int(&curproc->p_flag, P_CPUPEG);
+}
+
+void
+kstat_set_cpu(struct kstat *ks, struct cpu_info *ci)
+{
+	KASSERT(ks->ks_state == KSTAT_S_CREATED);
+
+	ks->ks_lock = ci;
+	ks->ks_lock_ops = &kstat_cpu_ops;
+}
+
+int
+kstat_read_nop(struct kstat *ks)
+{
+	return (0);
+}
+
+void
+kstat_install(struct kstat *ks)
+{
+	if (!ISSET(ks->ks_flags, KSTAT_F_REALLOC)) {
+		KASSERTMSG(ks->ks_copy != NULL || ks->ks_data != NULL,
+		    "kstat %p %s:%u:%s:%u must provide ks_copy or ks_data", ks,
+		    ks->ks_provider, ks->ks_instance, ks->ks_name, ks->ks_unit);
+		KASSERT(ks->ks_datalen > 0);
+	}
+
+	rw_enter_write(&kstat_lock);
+	ks->ks_state = KSTAT_S_INSTALLED;
+	rw_exit_write(&kstat_lock);
+}
+
+void
+kstat_destroy(struct kstat *ks)
+{
+	rw_enter_write(&kstat_lock);
+	RBT_REMOVE(kstat_id_tree, &kstat_id_tree, ks);
+	RBT_REMOVE(kstat_pv_tree, &kstat_pv_tree, ks);
+	RBT_REMOVE(kstat_nm_tree, &kstat_nm_tree, ks);
+	kstat_version++;
+	rw_exit_write(&kstat_lock);
+
+	pool_put(&kstat_pool, ks);
+}
+
+int
+kstat_read(struct kstat *ks)
+{
+	getnanouptime(&ks->ks_updated);
+	return (0);
+}
+
+int
+kstat_copy(struct kstat *ks, void *buf)
+{
+	memcpy(buf, ks->ks_data, ks->ks_datalen);
+	return (0);
+}
+
+RBT_GENERATE(kstat_id_tree, kstat, ks_id_entry, kstat_id_cmp);
+RBT_GENERATE(kstat_pv_tree, kstat, ks_pv_entry, kstat_pv_cmp);
+RBT_GENERATE(kstat_nm_tree, kstat, ks_nm_entry, kstat_nm_cmp);
+
+void
+kstat_kv_init(struct kstat_kv *kv, const char *name, enum kstat_kv_type type)
+{
+	memset(kv, 0, sizeof(*kv));
+	strlcpy(kv->kv_key, name, sizeof(kv->kv_key)); /* XXX truncated? */
+	kv->kv_type = type;
+	kv->kv_unit = KSTAT_KV_U_NONE;
+}
+
+void
+kstat_kv_unit_init(struct kstat_kv *kv, const char *name,
+    enum kstat_kv_type type, enum kstat_kv_unit unit)
+{
+	switch (type) {
+	case KSTAT_KV_T_COUNTER64:
+	case KSTAT_KV_T_COUNTER32:
+	case KSTAT_KV_T_UINT64:
+	case KSTAT_KV_T_INT64:
+	case KSTAT_KV_T_UINT32:
+	case KSTAT_KV_T_INT32:
+		break;
+	default:
+		panic("kv unit init %s: unit for non-integer type", name);
+	}
+
+	memset(kv, 0, sizeof(*kv));
+	strlcpy(kv->kv_key, name, sizeof(kv->kv_key)); /* XXX truncated? */
+	kv->kv_type = type;
+	kv->kv_unit = unit;
+}
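to show how the pieces above fit together, here is a sketch of a
hypothetical provider registering a key/value kstat. everything
prefixed mydrv_ is made up for illustration; kstat_create(),
kstat_set_mutex(), kstat_install() and the default kstat_copy()
behaviour are as defined in this file, while KSTAT_T_KV, the struct
kstat fields (ks_softc, ks_data, ks_datalen, ks_read) and the
kstat_kv_u64()/kstat_kv_u32() accessors are assumed to come from
sys/kstat.h, which is not part of this diff.

	/* hypothetical provider sketch; not part of this commit */

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/mutex.h>
	#include <sys/kstat.h>

	struct mydrv_ring_kstats {
		struct kstat_kv		 kv_tx_packets;	/* counter, in packets */
		struct kstat_kv		 kv_txq_len;	/* gauge */
	};

	struct mydrv_softc {
		struct mutex		 sc_mtx;	/* initialised at attach */
		uint64_t		 sc_tx_packets;
		uint32_t		 sc_txq_len;
		struct kstat		*sc_kstat;
		struct mydrv_ring_kstats sc_kvs;
	};

	int
	mydrv_kstat_read(struct kstat *ks)
	{
		struct mydrv_softc *sc = ks->ks_softc;
		struct mydrv_ring_kstats *kvs = ks->ks_data;

		/* refresh exported values; ks_lock (sc_mtx) is held here */
		kstat_kv_u64(&kvs->kv_tx_packets) = sc->sc_tx_packets;
		kstat_kv_u32(&kvs->kv_txq_len) = sc->sc_txq_len;
		getnanouptime(&ks->ks_updated);

		return (0);
	}

	int
	mydrv_kstat_attach(struct mydrv_softc *sc)
	{
		struct kstat *ks;

		/* provider "mydrv", instance 0, name "ring", unit 0 */
		ks = kstat_create("mydrv", 0, "ring", 0, KSTAT_T_KV, 0);
		if (ks == NULL)
			return (ENOMEM);

		kstat_kv_unit_init(&sc->sc_kvs.kv_tx_packets, "tx-packets",
		    KSTAT_KV_T_COUNTER64, KSTAT_KV_U_PACKETS);
		kstat_kv_init(&sc->sc_kvs.kv_txq_len, "txq-len",
		    KSTAT_KV_T_UINT32);

		/* serialise kstat reads against the rest of the driver */
		kstat_set_mutex(ks, &sc->sc_mtx);

		/* let the default kstat_copy() memcpy the kv array out */
		ks->ks_softc = sc;
		ks->ks_data = &sc->sc_kvs;
		ks->ks_datalen = sizeof(sc->sc_kvs);
		ks->ks_read = mydrv_kstat_read;

		sc->sc_kstat = ks;
		kstat_install(ks);

		return (0);
	}

setting ks_data/ks_datalen and only overriding ks_read keeps the
provider small: kstat_install()'s assertions are satisfied, and each
KSTATIOC_FIND_* ioctl takes the mutex, calls the read callback to
refresh the values, and copies the kv array out to userland.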