/* $OpenBSD: kern_tc.c,v 1.37 2019/01/31 05:00:18 cheloha Exp $ */ /* * Copyright (c) 2000 Poul-Henning Kamp * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * If we meet some day, and you think this stuff is worth it, you * can buy me a beer in return. Poul-Henning Kamp */ #include #include #include #include #include #include #include #include #include #include #include /* * A large step happens on boot. This constant detects such steps. * It is relatively small so that ntp_update_second gets called enough * in the typical 'missed a couple of seconds' case, but doesn't loop * forever when the time step is large. */ #define LARGE_STEP 200 u_int dummy_get_timecount(struct timecounter *); void ntp_update_second(int64_t *); int sysctl_tc_hardware(void *, size_t *, void *, size_t); int sysctl_tc_choice(void *, size_t *, void *, size_t); /* * Implement a dummy timecounter which we can use until we get a real one * in the air. This allows the console and other early stuff to use * time services. */ u_int dummy_get_timecount(struct timecounter *tc) { static u_int now; return (++now); } static struct timecounter dummy_timecounter = { dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000 }; struct timehands { /* These fields must be initialized by the driver. */ struct timecounter *th_counter; int64_t th_adjustment; u_int64_t th_scale; u_int th_offset_count; struct bintime th_boottime; struct bintime th_offset; struct timeval th_microtime; struct timespec th_nanotime; /* Fields not to be copied in tc_windup start with th_generation. */ volatile u_int th_generation; struct timehands *th_next; }; static struct timehands th0; static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th0}; static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th9}; static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th8}; static struct timehands th6 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th7}; static struct timehands th5 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th6}; static struct timehands th4 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th5}; static struct timehands th3 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th4}; static struct timehands th2 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th3}; static struct timehands th1 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th2}; static struct timehands th0 = { &dummy_timecounter, 0, (uint64_t)-1 / 1000000, 0, {0, 0}, {1, 0}, {0, 0}, {0, 0}, 1, &th1 }; /* * Protects writes to anything accessed during tc_windup(). * tc_windup() must be called before leaving this mutex. */ struct mutex timecounter_mtx = MUTEX_INITIALIZER(IPL_CLOCK); static struct timehands *volatile timehands = &th0; struct timecounter *timecounter = &dummy_timecounter; static struct timecounter *timecounters = &dummy_timecounter; volatile time_t time_second = 1; volatile time_t time_uptime = 0; struct bintime naptime; static int timestepwarnings; void tc_windup(void); /* * Return the difference between the timehands' counter value now and what * was when we copied it to the timehands' offset_count. */ static __inline u_int tc_delta(struct timehands *th) { struct timecounter *tc; tc = th->th_counter; return ((tc->tc_get_timecount(tc) - th->th_offset_count) & tc->tc_counter_mask); } /* * Functions for reading the time. We have to loop until we are sure that * the timehands that we operated on was not updated under our feet. See * the comment in for a description of these functions. */ void binboottime(struct bintime *bt) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; membar_consumer(); *bt = th->th_boottime; membar_consumer(); } while (gen == 0 || gen != th->th_generation); } void microboottime(struct timeval *tvp) { struct bintime bt; binboottime(&bt); bintime2timeval(&bt, tvp); } void binuptime(struct bintime *bt) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; membar_consumer(); *bt = th->th_offset; bintime_addx(bt, th->th_scale * tc_delta(th)); membar_consumer(); } while (gen == 0 || gen != th->th_generation); } void nanouptime(struct timespec *tsp) { struct bintime bt; binuptime(&bt); bintime2timespec(&bt, tsp); } void microuptime(struct timeval *tvp) { struct bintime bt; binuptime(&bt); bintime2timeval(&bt, tvp); } void bintime(struct bintime *bt) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; membar_consumer(); *bt = th->th_offset; bintime_addx(bt, th->th_scale * tc_delta(th)); bintime_add(bt, &th->th_boottime); membar_consumer(); } while (gen == 0 || gen != th->th_generation); } void nanotime(struct timespec *tsp) { struct bintime bt; bintime(&bt); bintime2timespec(&bt, tsp); } void microtime(struct timeval *tvp) { struct bintime bt; bintime(&bt); bintime2timeval(&bt, tvp); } void getnanouptime(struct timespec *tsp) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; membar_consumer(); bintime2timespec(&th->th_offset, tsp); membar_consumer(); } while (gen == 0 || gen != th->th_generation); } void getmicrouptime(struct timeval *tvp) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; membar_consumer(); bintime2timeval(&th->th_offset, tvp); membar_consumer(); } while (gen == 0 || gen != th->th_generation); } void getnanotime(struct timespec *tsp) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; membar_consumer(); *tsp = th->th_nanotime; membar_consumer(); } while (gen == 0 || gen != th->th_generation); } void getmicrotime(struct timeval *tvp) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; membar_consumer(); *tvp = th->th_microtime; membar_consumer(); } while (gen == 0 || gen != th->th_generation); } /* * Initialize a new timecounter and possibly use it. */ void tc_init(struct timecounter *tc) { u_int u; u = tc->tc_frequency / tc->tc_counter_mask; /* XXX: We need some margin here, 10% is a guess */ u *= 11; u /= 10; if (tc->tc_quality >= 0) { if (u > hz) { tc->tc_quality = -2000; printf("Timecounter \"%s\" frequency %lu Hz", tc->tc_name, (unsigned long)tc->tc_frequency); printf(" -- Insufficient hz, needs at least %u\n", u); } } tc->tc_next = timecounters; timecounters = tc; /* * Never automatically use a timecounter with negative quality. * Even though we run on the dummy counter, switching here may be * worse since this timecounter may not be monotonic. */ if (tc->tc_quality < 0) return; if (tc->tc_quality < timecounter->tc_quality) return; if (tc->tc_quality == timecounter->tc_quality && tc->tc_frequency < timecounter->tc_frequency) return; (void)tc->tc_get_timecount(tc); enqueue_randomness(tc->tc_get_timecount(tc)); timecounter = tc; } /* Report the frequency of the current timecounter. */ u_int64_t tc_getfrequency(void) { return (timehands->th_counter->tc_frequency); } /* * Step our concept of UTC, aka the realtime clock. * This is done by modifying our estimate of when we booted. */ void tc_setrealtimeclock(const struct timespec *ts) { struct timespec ts2; struct bintime bt, bt2; mtx_enter(&timecounter_mtx); binuptime(&bt2); timespec2bintime(ts, &bt); bintime_sub(&bt, &bt2); bintime_add(&bt2, &timehands->th_boottime); timehands->th_boottime = bt; /* XXX fiddle all the little crinkly bits around the fiords... */ tc_windup(); mtx_leave(&timecounter_mtx); enqueue_randomness(ts->tv_sec); if (timestepwarnings) { bintime2timespec(&bt2, &ts2); log(LOG_INFO, "Time stepped from %lld.%09ld to %lld.%09ld\n", (long long)ts2.tv_sec, ts2.tv_nsec, (long long)ts->tv_sec, ts->tv_nsec); } } /* * Step the monotonic and realtime clocks, triggering any timeouts that * should have occurred across the interval. */ void tc_setclock(const struct timespec *ts) { struct bintime bt, bt2; struct timespec earlier; static int first = 1; #ifndef SMALL_KERNEL long long adj_ticks; #endif /* * When we're called for the first time, during boot when * the root partition is mounted, we need to set boottime. */ if (first) { tc_setrealtimeclock(ts); first = 0; return; } enqueue_randomness(ts->tv_sec); mtx_enter(&timecounter_mtx); timespec2bintime(ts, &bt); bintime_sub(&bt, &timehands->th_boottime); /* * Don't rewind the offset. */ if (bt.sec < timehands->th_offset.sec || (bt.sec == timehands->th_offset.sec && bt.frac < timehands->th_offset.frac)) { mtx_leave(&timecounter_mtx); bintime2timespec(&bt, &earlier); printf("%s: cannot rewind uptime to %lld.%09ld\n", __func__, (long long)earlier.tv_sec, earlier.tv_nsec); return; } bt2 = timehands->th_offset; timehands->th_offset = bt; /* XXX fiddle all the little crinkly bits around the fiords... */ tc_windup(); mtx_leave(&timecounter_mtx); #ifndef SMALL_KERNEL /* convert the bintime to ticks */ bintime_sub(&bt, &bt2); bintime_add(&naptime, &bt); adj_ticks = (uint64_t)hz * bt.sec + (((uint64_t)1000000 * (uint32_t)(bt.frac >> 32)) >> 32) / tick; if (adj_ticks > 0) { if (adj_ticks > INT_MAX) adj_ticks = INT_MAX; timeout_adjust_ticks(adj_ticks); } #endif } /* * Initialize the next struct timehands in the ring and make * it the active timehands. Along the way we might switch to a different * timecounter and/or do seconds processing in NTP. Slightly magic. */ void tc_windup(void) { struct bintime bt; struct timehands *th, *tho; u_int64_t scale; u_int delta, ncount, ogen; int i; MUTEX_ASSERT_LOCKED(&timecounter_mtx); /* * Make the next timehands a copy of the current one, but do not * overwrite the generation or next pointer. While we update * the contents, the generation must be zero. */ tho = timehands; th = tho->th_next; ogen = th->th_generation; th->th_generation = 0; membar_producer(); memcpy(th, tho, offsetof(struct timehands, th_generation)); /* * Capture a timecounter delta on the current timecounter and if * changing timecounters, a counter value from the new timecounter. * Update the offset fields accordingly. */ delta = tc_delta(th); if (th->th_counter != timecounter) ncount = timecounter->tc_get_timecount(timecounter); else ncount = 0; th->th_offset_count += delta; th->th_offset_count &= th->th_counter->tc_counter_mask; bintime_addx(&th->th_offset, th->th_scale * delta); #ifdef notyet /* * Hardware latching timecounters may not generate interrupts on * PPS events, so instead we poll them. There is a finite risk that * the hardware might capture a count which is later than the one we * got above, and therefore possibly in the next NTP second which might * have a different rate than the current NTP second. It doesn't * matter in practice. */ if (tho->th_counter->tc_poll_pps) tho->th_counter->tc_poll_pps(tho->th_counter); #endif /* * Deal with NTP second processing. The for loop normally * iterates at most once, but in extreme situations it might * keep NTP sane if timeouts are not run for several seconds. * At boot, the time step can be large when the TOD hardware * has been read, so on really large steps, we call * ntp_update_second only twice. We need to call it twice in * case we missed a leap second. */ bt = th->th_offset; bintime_add(&bt, &th->th_boottime); i = bt.sec - tho->th_microtime.tv_sec; if (i > LARGE_STEP) i = 2; for (; i > 0; i--) ntp_update_second(&th->th_adjustment); /* Update the UTC timestamps used by the get*() functions. */ /* XXX shouldn't do this here. Should force non-`get' versions. */ bintime2timeval(&bt, &th->th_microtime); bintime2timespec(&bt, &th->th_nanotime); /* Now is a good time to change timecounters. */ if (th->th_counter != timecounter) { th->th_counter = timecounter; th->th_offset_count = ncount; } /*- * Recalculate the scaling factor. We want the number of 1/2^64 * fractions of a second per period of the hardware counter, taking * into account the th_adjustment factor which the NTP PLL/adjtime(2) * processing provides us with. * * The th_adjustment is nanoseconds per second with 32 bit binary * fraction and we want 64 bit binary fraction of second: * * x = a * 2^32 / 10^9 = a * 4.294967296 * * The range of th_adjustment is +/- 5000PPM so inside a 64bit int * we can only multiply by about 850 without overflowing, but that * leaves suitably precise fractions for multiply before divide. * * Divide before multiply with a fraction of 2199/512 results in a * systematic undercompensation of 10PPM of th_adjustment. On a * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. * * We happily sacrifice the lowest of the 64 bits of our result * to the goddess of code clarity. * */ scale = (u_int64_t)1 << 63; scale += (th->th_adjustment / 1024) * 2199; scale /= th->th_counter->tc_frequency; th->th_scale = scale * 2; /* * Now that the struct timehands is again consistent, set the new * generation number, making sure to not make it zero. */ if (++ogen == 0) ogen = 1; membar_producer(); th->th_generation = ogen; /* Go live with the new struct timehands. */ time_second = th->th_microtime.tv_sec; time_uptime = th->th_offset.sec; membar_producer(); timehands = th; } /* Report or change the active timecounter hardware. */ int sysctl_tc_hardware(void *oldp, size_t *oldlenp, void *newp, size_t newlen) { char newname[32]; struct timecounter *newtc, *tc; int error; tc = timecounter; strlcpy(newname, tc->tc_name, sizeof(newname)); error = sysctl_string(oldp, oldlenp, newp, newlen, newname, sizeof(newname)); if (error != 0 || strcmp(newname, tc->tc_name) == 0) return (error); for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { if (strcmp(newname, newtc->tc_name) != 0) continue; /* Warm up new timecounter. */ (void)newtc->tc_get_timecount(newtc); (void)newtc->tc_get_timecount(newtc); timecounter = newtc; return (0); } return (EINVAL); } /* Report or change the active timecounter hardware. */ int sysctl_tc_choice(void *oldp, size_t *oldlenp, void *newp, size_t newlen) { char buf[32], *spc, *choices; struct timecounter *tc; int error, maxlen; spc = ""; maxlen = 0; for (tc = timecounters; tc != NULL; tc = tc->tc_next) maxlen += sizeof(buf); choices = malloc(maxlen, M_TEMP, M_WAITOK); *choices = '\0'; for (tc = timecounters; tc != NULL; tc = tc->tc_next) { snprintf(buf, sizeof(buf), "%s%s(%d)", spc, tc->tc_name, tc->tc_quality); spc = " "; strlcat(choices, buf, maxlen); } error = sysctl_rdstring(oldp, oldlenp, newp, choices); free(choices, M_TEMP, maxlen); return (error); } /* * Timecounters need to be updated every so often to prevent the hardware * counter from overflowing. Updating also recalculates the cached values * used by the get*() family of functions, so their precision depends on * the update frequency. */ static int tc_tick; void tc_ticktock(void) { static int count; if (++count < tc_tick) return; if (!mtx_enter_try(&timecounter_mtx)) return; count = 0; tc_windup(); mtx_leave(&timecounter_mtx); } void inittimecounter(void) { #ifdef DEBUG u_int p; #endif /* * Set the initial timeout to * max(1, ). * People should probably not use the sysctl to set the timeout * to smaller than its initial value, since that value is the * smallest reasonable one. If they want better timestamps they * should use the non-"get"* functions. */ if (hz > 1000) tc_tick = (hz + 500) / 1000; else tc_tick = 1; #ifdef DEBUG p = (tc_tick * 1000000) / hz; printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); #endif /* warm up new timecounter (again) and get rolling. */ (void)timecounter->tc_get_timecount(timecounter); (void)timecounter->tc_get_timecount(timecounter); } /* * Return timecounter-related information. */ int sysctl_tc(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { if (namelen != 1) return (ENOTDIR); switch (name[0]) { case KERN_TIMECOUNTER_TICK: return (sysctl_rdint(oldp, oldlenp, newp, tc_tick)); case KERN_TIMECOUNTER_TIMESTEPWARNINGS: return (sysctl_int(oldp, oldlenp, newp, newlen, ×tepwarnings)); case KERN_TIMECOUNTER_HARDWARE: return (sysctl_tc_hardware(oldp, oldlenp, newp, newlen)); case KERN_TIMECOUNTER_CHOICE: return (sysctl_tc_choice(oldp, oldlenp, newp, newlen)); default: return (EOPNOTSUPP); } /* NOTREACHED */ } void ntp_update_second(int64_t *adjust) { int64_t adj; /* Skew time according to any adjtime(2) adjustments. */ if (adjtimedelta > 0) adj = MIN(5000, adjtimedelta); else adj = MAX(-5000, adjtimedelta); adjtimedelta -= adj; *adjust = (adj * 1000) << 32; *adjust += timecounter->tc_freq_adj; } int tc_adjfreq(int64_t *old, int64_t *new) { if (old != NULL) { *old = timecounter->tc_freq_adj; } if (new != NULL) { timecounter->tc_freq_adj = *new; } return 0; }