/* $OpenBSD: if_pfsync.c,v 1.322 2023/10/03 10:22:10 sthen Exp $ */ /* * Copyright (c) 2002 Michael Shalayeff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2009, 2022, 2023 David Gwynne * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "bpfilter.h" #include "pfsync.h" #include "kstat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif /* INET6 */ #include "carp.h" #if NCARP > 0 #include #endif #include #include #include #define PFSYNC_MINPKT ( \ sizeof(struct ip) + \ sizeof(struct pfsync_header)) struct pfsync_softc; struct pfsync_deferral { TAILQ_ENTRY(pfsync_deferral) pd_entry; struct pf_state *pd_st; struct mbuf *pd_m; uint64_t pd_deadline; }; TAILQ_HEAD(pfsync_deferrals, pfsync_deferral); #define PFSYNC_DEFER_NSEC 20000000ULL #define PFSYNC_DEFER_LIMIT 128 #define PFSYNC_BULK_SND_IVAL_MS 20 static struct pool pfsync_deferrals_pool; enum pfsync_bulk_req_state { PFSYNC_BREQ_S_NONE, PFSYNC_BREQ_S_START, PFSYNC_BREQ_S_SENT, PFSYNC_BREQ_S_BULK, PFSYNC_BREQ_S_DONE, }; static const char *pfsync_bulk_req_state_names[] = { [PFSYNC_BREQ_S_NONE] = "none", [PFSYNC_BREQ_S_START] = "start", [PFSYNC_BREQ_S_SENT] = "sent", [PFSYNC_BREQ_S_BULK] = "bulk", [PFSYNC_BREQ_S_DONE] = "done", }; enum pfsync_bulk_req_event { PFSYNC_BREQ_EVT_UP, PFSYNC_BREQ_EVT_DOWN, PFSYNC_BREQ_EVT_TMO, PFSYNC_BREQ_EVT_LINK, PFSYNC_BREQ_EVT_BUS_START, PFSYNC_BREQ_EVT_BUS_END, }; static const char *pfsync_bulk_req_event_names[] = { [PFSYNC_BREQ_EVT_UP] = "up", [PFSYNC_BREQ_EVT_DOWN] = "down", [PFSYNC_BREQ_EVT_TMO] = "timeout", [PFSYNC_BREQ_EVT_LINK] = "link", [PFSYNC_BREQ_EVT_BUS_START] = "bus-start", [PFSYNC_BREQ_EVT_BUS_END] = "bus-end", }; struct pfsync_slice { struct pfsync_softc *s_pfsync; struct mutex s_mtx; struct pf_state_queue s_qs[PFSYNC_S_COUNT]; TAILQ_HEAD(, tdb) s_tdb_q; size_t s_len; struct mbuf_list s_ml; struct taskq *s_softnet; struct task s_task; struct timeout s_tmo; struct mbuf_queue s_sendq; struct task s_send; struct pfsync_deferrals s_deferrals; unsigned int s_deferred; struct task s_deferrals_task; struct timeout s_deferrals_tmo; uint64_t s_stat_locks; uint64_t s_stat_contended; uint64_t s_stat_write_nop; uint64_t s_stat_task_add; uint64_t s_stat_task_run; uint64_t s_stat_enqueue; uint64_t s_stat_dequeue; uint64_t s_stat_defer_add; uint64_t s_stat_defer_ack; uint64_t s_stat_defer_run; uint64_t s_stat_defer_overlimit; struct kstat *s_kstat; } __aligned(CACHELINESIZE); #define PFSYNC_SLICE_BITS 1 #define PFSYNC_NSLICES (1 << PFSYNC_SLICE_BITS) struct pfsync_softc { struct ifnet sc_if; unsigned int sc_dead; unsigned int sc_up; struct refcnt sc_refs; /* config */ struct in_addr sc_syncpeer; unsigned int sc_maxupdates; unsigned int sc_defer; /* operation */ unsigned int sc_sync_ifidx; unsigned int sc_sync_if_down; void *sc_inm; struct task sc_ltask; struct task sc_dtask; struct ip sc_template; struct pfsync_slice sc_slices[PFSYNC_NSLICES]; struct { struct rwlock req_lock; struct timeout req_tmo; enum pfsync_bulk_req_state req_state; unsigned int req_tries; unsigned int req_demoted; } sc_bulk_req; struct { struct rwlock snd_lock; struct timeout snd_tmo; time_t snd_requested; struct pf_state *snd_next; struct pf_state *snd_tail; unsigned int snd_again; } sc_bulk_snd; }; static struct pfsync_softc *pfsyncif = NULL; static struct cpumem *pfsynccounters; static inline void pfsyncstat_inc(enum pfsync_counters c) { counters_inc(pfsynccounters, c); } static int pfsync_clone_create(struct if_clone *, int); static int pfsync_clone_destroy(struct ifnet *); static int pfsync_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); static void pfsync_start(struct ifqueue *); static int pfsync_ioctl(struct ifnet *, u_long, caddr_t); static int pfsync_up(struct pfsync_softc *); static int pfsync_down(struct pfsync_softc *); static int pfsync_set_mtu(struct pfsync_softc *, unsigned int); static int pfsync_set_parent(struct pfsync_softc *, const struct if_parent *); static int pfsync_get_parent(struct pfsync_softc *, struct if_parent *); static int pfsync_del_parent(struct pfsync_softc *); static int pfsync_get_ioc(struct pfsync_softc *, struct ifreq *); static int pfsync_set_ioc(struct pfsync_softc *, struct ifreq *); static void pfsync_syncif_link(void *); static void pfsync_syncif_detach(void *); static void pfsync_sendout(struct pfsync_softc *, struct mbuf *); static void pfsync_slice_drop(struct pfsync_softc *, struct pfsync_slice *); static void pfsync_slice_tmo(void *); static void pfsync_slice_task(void *); static void pfsync_slice_sendq(void *); static void pfsync_deferrals_tmo(void *); static void pfsync_deferrals_task(void *); static void pfsync_defer_output(struct pfsync_deferral *); static void pfsync_bulk_req_evt(struct pfsync_softc *, enum pfsync_bulk_req_event); static void pfsync_bulk_req_tmo(void *); static void pfsync_bulk_snd_tmo(void *); #if NKSTAT > 0 struct pfsync_kstat_data { struct kstat_kv pd_locks; struct kstat_kv pd_contended; struct kstat_kv pd_write_nop; struct kstat_kv pd_task_add; struct kstat_kv pd_task_run; struct kstat_kv pd_enqueue; struct kstat_kv pd_dequeue; struct kstat_kv pd_qdrop; struct kstat_kv pd_defer_len; struct kstat_kv pd_defer_add; struct kstat_kv pd_defer_ack; struct kstat_kv pd_defer_run; struct kstat_kv pd_defer_overlimit; }; static const struct pfsync_kstat_data pfsync_kstat_tpl = { KSTAT_KV_INITIALIZER("locks", KSTAT_KV_T_COUNTER64), KSTAT_KV_INITIALIZER("contended", KSTAT_KV_T_COUNTER64), KSTAT_KV_INITIALIZER("write-nops", KSTAT_KV_T_COUNTER64), KSTAT_KV_INITIALIZER("send-sched", KSTAT_KV_T_COUNTER64), KSTAT_KV_INITIALIZER("send-run", KSTAT_KV_T_COUNTER64), KSTAT_KV_INITIALIZER("enqueues", KSTAT_KV_T_COUNTER64), KSTAT_KV_INITIALIZER("dequeues", KSTAT_KV_T_COUNTER64), KSTAT_KV_UNIT_INITIALIZER("qdrops", KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS), KSTAT_KV_UNIT_INITIALIZER("defer-len", KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS), KSTAT_KV_INITIALIZER("defer-add", KSTAT_KV_T_COUNTER64), KSTAT_KV_INITIALIZER("defer-ack", KSTAT_KV_T_COUNTER64), KSTAT_KV_INITIALIZER("defer-run", KSTAT_KV_T_COUNTER64), KSTAT_KV_INITIALIZER("defer-over", KSTAT_KV_T_COUNTER64), }; static int pfsync_kstat_copy(struct kstat *ks, void *dst) { struct pfsync_slice *s = ks->ks_softc; struct pfsync_kstat_data *pd = dst; *pd = pfsync_kstat_tpl; kstat_kv_u64(&pd->pd_locks) = s->s_stat_locks; kstat_kv_u64(&pd->pd_contended) = s->s_stat_contended; kstat_kv_u64(&pd->pd_write_nop) = s->s_stat_write_nop; kstat_kv_u64(&pd->pd_task_add) = s->s_stat_task_add; kstat_kv_u64(&pd->pd_task_run) = s->s_stat_task_run; kstat_kv_u64(&pd->pd_enqueue) = s->s_stat_enqueue; kstat_kv_u64(&pd->pd_dequeue) = s->s_stat_dequeue; kstat_kv_u32(&pd->pd_qdrop) = mq_drops(&s->s_sendq); kstat_kv_u32(&pd->pd_defer_len) = s->s_deferred; kstat_kv_u64(&pd->pd_defer_add) = s->s_stat_defer_add; kstat_kv_u64(&pd->pd_defer_ack) = s->s_stat_defer_ack; kstat_kv_u64(&pd->pd_defer_run) = s->s_stat_defer_run; kstat_kv_u64(&pd->pd_defer_overlimit) = s->s_stat_defer_overlimit; return (0); } #endif /* NKSTAT > 0 */ #define PFSYNC_MAX_BULKTRIES 12 struct if_clone pfsync_cloner = IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy); void pfsyncattach(int npfsync) { pfsynccounters = counters_alloc(pfsyncs_ncounters); if_clone_attach(&pfsync_cloner); } static int pfsync_clone_create(struct if_clone *ifc, int unit) { struct pfsync_softc *sc; struct ifnet *ifp; size_t i, q; if (unit != 0) return (ENXIO); if (pfsync_deferrals_pool.pr_size == 0) { pool_init(&pfsync_deferrals_pool, sizeof(struct pfsync_deferral), 0, IPL_MPFLOOR, 0, "pfdefer", NULL); /* pool_cache_init(&pfsync_deferrals_pool); */ } sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL); if (sc == NULL) return (ENOMEM); /* sc_refs is "owned" by IFF_RUNNING */ sc->sc_syncpeer.s_addr = INADDR_PFSYNC_GROUP; sc->sc_maxupdates = 128; sc->sc_defer = 0; task_set(&sc->sc_ltask, pfsync_syncif_link, sc); task_set(&sc->sc_dtask, pfsync_syncif_detach, sc); rw_init(&sc->sc_bulk_req.req_lock, "pfsyncbreq"); /* need process context to take net lock to call ip_output */ timeout_set_proc(&sc->sc_bulk_req.req_tmo, pfsync_bulk_req_tmo, sc); rw_init(&sc->sc_bulk_snd.snd_lock, "pfsyncbsnd"); /* need process context to take net lock to call ip_output */ timeout_set_proc(&sc->sc_bulk_snd.snd_tmo, pfsync_bulk_snd_tmo, sc); ifp = &sc->sc_if; snprintf(ifp->if_xname, sizeof ifp->if_xname, "%s%d", ifc->ifc_name, unit); ifp->if_softc = sc; ifp->if_ioctl = pfsync_ioctl; ifp->if_output = pfsync_output; ifp->if_qstart = pfsync_start; ifp->if_type = IFT_PFSYNC; ifp->if_hdrlen = sizeof(struct pfsync_header); ifp->if_mtu = ETHERMTU; ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE; for (i = 0; i < nitems(sc->sc_slices); i++) { struct pfsync_slice *s = &sc->sc_slices[i]; s->s_pfsync = sc; mtx_init_flags(&s->s_mtx, IPL_SOFTNET, "pfslice", 0); s->s_softnet = net_tq(i); timeout_set(&s->s_tmo, pfsync_slice_tmo, s); task_set(&s->s_task, pfsync_slice_task, s); mq_init(&s->s_sendq, 16, IPL_SOFTNET); task_set(&s->s_send, pfsync_slice_sendq, s); s->s_len = PFSYNC_MINPKT; ml_init(&s->s_ml); for (q = 0; q < nitems(s->s_qs); q++) TAILQ_INIT(&s->s_qs[q]); TAILQ_INIT(&s->s_tdb_q); /* stupid NET_LOCK */ timeout_set(&s->s_deferrals_tmo, pfsync_deferrals_tmo, s); task_set(&s->s_deferrals_task, pfsync_deferrals_task, s); TAILQ_INIT(&s->s_deferrals); #if NKSTAT > 0 s->s_kstat = kstat_create(ifp->if_xname, 0, "pfsync-slice", i, KSTAT_T_KV, 0); kstat_set_mutex(s->s_kstat, &s->s_mtx); s->s_kstat->ks_softc = s; s->s_kstat->ks_datalen = sizeof(pfsync_kstat_tpl); s->s_kstat->ks_copy = pfsync_kstat_copy; kstat_install(s->s_kstat); #endif } if_counters_alloc(ifp); if_attach(ifp); if_alloc_sadl(ifp); #if NCARP > 0 if_addgroup(ifp, "carp"); #endif #if NBPFILTER > 0 bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN); #endif return (0); } static int pfsync_clone_destroy(struct ifnet *ifp) { struct pfsync_softc *sc = ifp->if_softc; #if NKSTAT > 0 size_t i; #endif NET_LOCK(); sc->sc_dead = 1; if (ISSET(ifp->if_flags, IFF_RUNNING)) pfsync_down(sc); NET_UNLOCK(); if_detach(ifp); #if NKSTAT > 0 for (i = 0; i < nitems(sc->sc_slices); i++) { struct pfsync_slice *s = &sc->sc_slices[i]; kstat_destroy(s->s_kstat); } #endif free(sc, M_DEVBUF, sizeof(*sc)); return (0); } static void pfsync_dprintf(struct pfsync_softc *sc, const char *fmt, ...) { struct ifnet *ifp = &sc->sc_if; va_list ap; if (!ISSET(ifp->if_flags, IFF_DEBUG)) return; printf("%s: ", ifp->if_xname); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("\n"); } static void pfsync_syncif_link(void *arg) { struct pfsync_softc *sc = arg; struct ifnet *ifp0; unsigned int sync_if_down = 1; ifp0 = if_get(sc->sc_sync_ifidx); if (ifp0 != NULL && LINK_STATE_IS_UP(ifp0->if_link_state)) { pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_LINK); sync_if_down = 0; } if_put(ifp0); #if NCARP > 0 if (sc->sc_sync_if_down != sync_if_down) { carp_group_demote_adj(&sc->sc_if, sync_if_down ? 1 : -1, "pfsync link"); } #endif sc->sc_sync_if_down = sync_if_down; } static void pfsync_syncif_detach(void *arg) { struct pfsync_softc *sc = arg; struct ifnet *ifp = &sc->sc_if; if (ISSET(ifp->if_flags, IFF_RUNNING)) { pfsync_down(sc); if_down(ifp); } sc->sc_sync_ifidx = 0; } static int pfsync_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt) { m_freem(m); /* drop packet */ return (EAFNOSUPPORT); } static int pfsync_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct pfsync_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; int error = ENOTTY; switch (cmd) { case SIOCSIFADDR: error = EOPNOTSUPP; break; case SIOCSIFFLAGS: if (ISSET(ifp->if_flags, IFF_UP)) { if (!ISSET(ifp->if_flags, IFF_RUNNING)) error = pfsync_up(sc); else error = ENETRESET; } else { if (ISSET(ifp->if_flags, IFF_RUNNING)) error = pfsync_down(sc); } break; case SIOCSIFMTU: error = pfsync_set_mtu(sc, ifr->ifr_mtu); break; case SIOCSIFPARENT: error = pfsync_set_parent(sc, (struct if_parent *)data); break; case SIOCGIFPARENT: error = pfsync_get_parent(sc, (struct if_parent *)data); break; case SIOCDIFPARENT: error = pfsync_del_parent(sc); break; case SIOCSETPFSYNC: error = pfsync_set_ioc(sc, ifr); break; case SIOCGETPFSYNC: error = pfsync_get_ioc(sc, ifr); break; default: break; } if (error == ENETRESET) error = 0; return (error); } static int pfsync_set_mtu(struct pfsync_softc *sc, unsigned int mtu) { struct ifnet *ifp = &sc->sc_if; struct ifnet *ifp0; int error = 0; ifp0 = if_get(sc->sc_sync_ifidx); if (ifp0 == NULL) return (EINVAL); if (mtu <= PFSYNC_MINPKT || mtu > ifp0->if_mtu) { error = EINVAL; goto put; } /* commit */ ifp->if_mtu = mtu; put: if_put(ifp0); return (error); } static int pfsync_set_parent(struct pfsync_softc *sc, const struct if_parent *p) { struct ifnet *ifp = &sc->sc_if; struct ifnet *ifp0; int error = 0; ifp0 = if_unit(p->ifp_parent); if (ifp0 == NULL) return (ENXIO); if (ifp0->if_index == sc->sc_sync_ifidx) goto put; if (ISSET(ifp->if_flags, IFF_RUNNING)) { error = EBUSY; goto put; } /* commit */ sc->sc_sync_ifidx = ifp0->if_index; put: if_put(ifp0); return (error); } static int pfsync_get_parent(struct pfsync_softc *sc, struct if_parent *p) { struct ifnet *ifp0; int error = 0; ifp0 = if_get(sc->sc_sync_ifidx); if (ifp0 == NULL) error = EADDRNOTAVAIL; else strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent)); if_put(ifp0); return (error); } static int pfsync_del_parent(struct pfsync_softc *sc) { struct ifnet *ifp = &sc->sc_if; if (ISSET(ifp->if_flags, IFF_RUNNING)) return (EBUSY); /* commit */ sc->sc_sync_ifidx = 0; return (0); } static int pfsync_get_ioc(struct pfsync_softc *sc, struct ifreq *ifr) { struct pfsyncreq pfsyncr; struct ifnet *ifp0; memset(&pfsyncr, 0, sizeof(pfsyncr)); ifp0 = if_get(sc->sc_sync_ifidx); if (ifp0 != NULL) { strlcpy(pfsyncr.pfsyncr_syncdev, ifp0->if_xname, sizeof(pfsyncr.pfsyncr_syncdev)); } if_put(ifp0); pfsyncr.pfsyncr_syncpeer = sc->sc_syncpeer; pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates; pfsyncr.pfsyncr_defer = sc->sc_defer; return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr))); } static int pfsync_set_ioc(struct pfsync_softc *sc, struct ifreq *ifr) { struct ifnet *ifp = &sc->sc_if; struct pfsyncreq pfsyncr; unsigned int sync_ifidx = sc->sc_sync_ifidx; int wantdown = 0; int error; error = suser(curproc); if (error != 0) return (error); error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr)); if (error != 0) return (error); if (pfsyncr.pfsyncr_maxupdates > 255) return (EINVAL); if (pfsyncr.pfsyncr_syncdev[0] != '\0') { /* set */ struct ifnet *ifp0 = if_unit(pfsyncr.pfsyncr_syncdev); if (ifp0 == NULL) return (ENXIO); if (ifp0->if_index != sync_ifidx) wantdown = 1; sync_ifidx = ifp0->if_index; if_put(ifp0); } else { /* del */ wantdown = 1; sync_ifidx = 0; } if (pfsyncr.pfsyncr_syncpeer.s_addr == INADDR_ANY) pfsyncr.pfsyncr_syncpeer.s_addr = INADDR_PFSYNC_GROUP; if (pfsyncr.pfsyncr_syncpeer.s_addr != sc->sc_syncpeer.s_addr) wantdown = 1; if (wantdown && ISSET(ifp->if_flags, IFF_RUNNING)) return (EBUSY); /* commit */ sc->sc_sync_ifidx = sync_ifidx; sc->sc_syncpeer = pfsyncr.pfsyncr_syncpeer; sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates; sc->sc_defer = pfsyncr.pfsyncr_defer; return (0); } static int pfsync_up(struct pfsync_softc *sc) { struct ifnet *ifp = &sc->sc_if; struct ifnet *ifp0; void *inm = NULL; int error = 0; struct ip *ip; NET_ASSERT_LOCKED(); KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING)); if (sc->sc_dead) return (ENXIO); /* * coordinate with pfsync_down(). if sc_up is still up and * we're here then something else is tearing pfsync down. */ if (sc->sc_up) return (EBUSY); if (sc->sc_syncpeer.s_addr == INADDR_ANY || sc->sc_syncpeer.s_addr == INADDR_BROADCAST) return (EDESTADDRREQ); ifp0 = if_get(sc->sc_sync_ifidx); if (ifp0 == NULL) return (ENXIO); if (IN_MULTICAST(sc->sc_syncpeer.s_addr)) { if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) { error = ENODEV; goto put; } inm = in_addmulti(&sc->sc_syncpeer, ifp0); if (inm == NULL) { error = ECONNABORTED; goto put; } } sc->sc_up = 1; ip = &sc->sc_template; memset(ip, 0, sizeof(*ip)); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; /* len and id are set later */ ip->ip_off = htons(IP_DF); ip->ip_ttl = PFSYNC_DFLTTL; ip->ip_p = IPPROTO_PFSYNC; ip->ip_src.s_addr = INADDR_ANY; ip->ip_dst.s_addr = sc->sc_syncpeer.s_addr; /* commit */ refcnt_init(&sc->sc_refs); /* IFF_RUNNING kind of owns this */ #if NCARP > 0 sc->sc_sync_if_down = 1; carp_group_demote_adj(&sc->sc_if, 1, "pfsync up"); #endif if_linkstatehook_add(ifp0, &sc->sc_ltask); if_detachhook_add(ifp0, &sc->sc_dtask); sc->sc_inm = inm; SET(ifp->if_flags, IFF_RUNNING); pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_UP); refcnt_take(&sc->sc_refs); /* give one to SMR */ SMR_PTR_SET_LOCKED(&pfsyncif, sc); pfsync_syncif_link(sc); /* try and push the bulk req state forward */ put: if_put(ifp0); return (error); } static struct mbuf * pfsync_encap(struct pfsync_softc *sc, struct mbuf *m) { struct { struct ip ip; struct pfsync_header ph; } __packed __aligned(4) *h; unsigned int mlen = m->m_pkthdr.len; m = m_prepend(m, sizeof(*h), M_DONTWAIT); if (m == NULL) return (NULL); h = mtod(m, void *); memset(h, 0, sizeof(*h)); mlen += sizeof(h->ph); h->ph.version = PFSYNC_VERSION; h->ph.len = htons(mlen); /* h->ph.pfcksum */ mlen += sizeof(h->ip); h->ip = sc->sc_template; h->ip.ip_len = htons(mlen); h->ip.ip_id = htons(ip_randomid()); return (m); } static void pfsync_bulk_req_send(struct pfsync_softc *sc) { struct { struct pfsync_subheader subh; struct pfsync_upd_req ur; } __packed __aligned(4) *h; unsigned mlen = max_linkhdr + sizeof(struct ip) + sizeof(struct pfsync_header) + sizeof(*h); struct mbuf *m; m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) goto fail; if (mlen > MHLEN) { MCLGETL(m, M_DONTWAIT, mlen); if (!ISSET(m->m_flags, M_EXT)) goto drop; } m_align(m, sizeof(*h)); m->m_len = m->m_pkthdr.len = sizeof(*h); h = mtod(m, void *); memset(h, 0, sizeof(*h)); h->subh.action = PFSYNC_ACT_UPD_REQ; h->subh.len = sizeof(h->ur) >> 2; h->subh.count = htons(1); h->ur.id = htobe64(0); h->ur.creatorid = htobe32(0); m = pfsync_encap(sc, m); if (m == NULL) goto fail; pfsync_sendout(sc, m); return; drop: m_freem(m); fail: printf("%s: unable to request bulk update\n", sc->sc_if.if_xname); } static void pfsync_bulk_req_nstate(struct pfsync_softc *sc, enum pfsync_bulk_req_state nstate, int seconds) { sc->sc_bulk_req.req_state = nstate; if (seconds > 0) timeout_add_sec(&sc->sc_bulk_req.req_tmo, seconds); else timeout_del(&sc->sc_bulk_req.req_tmo); } static void pfsync_bulk_req_invstate(struct pfsync_softc *sc, enum pfsync_bulk_req_event evt) { panic("%s: unexpected event %s in state %s", sc->sc_if.if_xname, pfsync_bulk_req_event_names[evt], pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state]); } static void pfsync_bulk_req_nstate_bulk(struct pfsync_softc *sc) { /* calculate the number of packets we expect */ int t = pf_pool_limits[PF_LIMIT_STATES].limit / ((sc->sc_if.if_mtu - PFSYNC_MINPKT) / sizeof(struct pfsync_state)); /* turn it into seconds */ t /= 1000 / PFSYNC_BULK_SND_IVAL_MS; if (t == 0) t = 1; pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_BULK, t * 4); } static inline void pfsync_bulk_req_nstate_done(struct pfsync_softc *sc) { pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_DONE, 0); KASSERT(sc->sc_bulk_req.req_demoted == 1); sc->sc_bulk_req.req_demoted = 0; #if NCARP > 0 carp_group_demote_adj(&sc->sc_if, -32, "pfsync done"); #endif } static void pfsync_bulk_req_evt(struct pfsync_softc *sc, enum pfsync_bulk_req_event evt) { struct ifnet *ifp = &sc->sc_if; rw_enter_write(&sc->sc_bulk_req.req_lock); pfsync_dprintf(sc, "%s state %s evt %s", __func__, pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state], pfsync_bulk_req_event_names[evt]); if (evt == PFSYNC_BREQ_EVT_DOWN) { /* unconditionally move down */ sc->sc_bulk_req.req_tries = 0; pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_NONE, 0); if (sc->sc_bulk_req.req_demoted) { sc->sc_bulk_req.req_demoted = 0; #if NCARP > 0 carp_group_demote_adj(&sc->sc_if, -32, "pfsync down"); #endif } } else switch (sc->sc_bulk_req.req_state) { case PFSYNC_BREQ_S_NONE: switch (evt) { case PFSYNC_BREQ_EVT_UP: KASSERT(sc->sc_bulk_req.req_demoted == 0); sc->sc_bulk_req.req_demoted = 1; #if NCARP > 0 carp_group_demote_adj(&sc->sc_if, 32, "pfsync start"); #endif pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_START, 30); break; default: pfsync_bulk_req_invstate(sc, evt); } break; case PFSYNC_BREQ_S_START: switch (evt) { case PFSYNC_BREQ_EVT_LINK: pfsync_bulk_req_send(sc); pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_SENT, 2); break; case PFSYNC_BREQ_EVT_TMO: pfsync_dprintf(sc, "timeout waiting for link"); pfsync_bulk_req_nstate_done(sc); break; case PFSYNC_BREQ_EVT_BUS_START: pfsync_bulk_req_nstate_bulk(sc); break; case PFSYNC_BREQ_EVT_BUS_END: /* ignore this */ break; default: pfsync_bulk_req_invstate(sc, evt); } break; case PFSYNC_BREQ_S_SENT: switch (evt) { case PFSYNC_BREQ_EVT_BUS_START: pfsync_bulk_req_nstate_bulk(sc); break; case PFSYNC_BREQ_EVT_BUS_END: case PFSYNC_BREQ_EVT_LINK: /* ignore this */ break; case PFSYNC_BREQ_EVT_TMO: if (++sc->sc_bulk_req.req_tries < PFSYNC_MAX_BULKTRIES) { pfsync_bulk_req_send(sc); pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_SENT, 2); break; } pfsync_dprintf(sc, "timeout waiting for bulk transfer start"); pfsync_bulk_req_nstate_done(sc); break; default: pfsync_bulk_req_invstate(sc, evt); } break; case PFSYNC_BREQ_S_BULK: switch (evt) { case PFSYNC_BREQ_EVT_BUS_START: case PFSYNC_BREQ_EVT_LINK: /* ignore this */ break; case PFSYNC_BREQ_EVT_BUS_END: pfsync_bulk_req_nstate_done(sc); break; case PFSYNC_BREQ_EVT_TMO: if (++sc->sc_bulk_req.req_tries < PFSYNC_MAX_BULKTRIES) { pfsync_bulk_req_send(sc); pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_SENT, 2); } pfsync_dprintf(sc, "timeout waiting for bulk transfer end"); pfsync_bulk_req_nstate_done(sc); break; default: pfsync_bulk_req_invstate(sc, evt); } break; case PFSYNC_BREQ_S_DONE: /* pfsync is up and running */ switch (evt) { case PFSYNC_BREQ_EVT_BUS_START: case PFSYNC_BREQ_EVT_BUS_END: case PFSYNC_BREQ_EVT_LINK: /* nops */ break; default: pfsync_bulk_req_invstate(sc, evt); } break; default: panic("%s: unknown event %d", ifp->if_xname, evt); /* NOTREACHED */ } rw_exit_write(&sc->sc_bulk_req.req_lock); } static void pfsync_bulk_req_tmo(void *arg) { struct pfsync_softc *sc = arg; NET_LOCK(); pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_TMO); NET_UNLOCK(); } static int pfsync_down(struct pfsync_softc *sc) { struct ifnet *ifp = &sc->sc_if; struct ifnet *ifp0; struct smr_entry smr; size_t i; void *inm = NULL; unsigned int sndbar = 0; struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds); struct pfsync_deferral *pd; NET_ASSERT_LOCKED(); KASSERT(ISSET(ifp->if_flags, IFF_RUNNING)); /* * tearing down pfsync involves waiting for pfsync to stop * running in various contexts including softnet taskqs. * this thread cannot hold netlock while waiting for a * barrier in softnet because softnet might be waiting for * the netlock. sc->sc_up is used to coordinate with * pfsync_up. */ CLR(ifp->if_flags, IFF_RUNNING); ifp0 = if_get(sc->sc_sync_ifidx); if (ifp0 != NULL) { if_linkstatehook_del(ifp0, &sc->sc_ltask); if_detachhook_del(ifp0, &sc->sc_dtask); } if_put(ifp0); #if NCARP > 0 if (sc->sc_sync_if_down) carp_group_demote_adj(&sc->sc_if, -1, "pfsync down"); #endif NET_UNLOCK(); KASSERTMSG(SMR_PTR_GET_LOCKED(&pfsyncif) == sc, "pfsyncif %p != sc %p", pfsyncif, sc); SMR_PTR_SET_LOCKED(&pfsyncif, NULL); smr_init(&smr); smr_call(&smr, (void (*)(void *))refcnt_rele_wake, &sc->sc_refs); /* stop pf producing work before cleaning up the timeouts and tasks */ refcnt_finalize(&sc->sc_refs, "pfsyncfini"); pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_DOWN); rw_enter_read(&pf_state_list.pfs_rwl); rw_enter_write(&sc->sc_bulk_snd.snd_lock); if (sc->sc_bulk_snd.snd_tail != NULL) { sndbar = !timeout_del(&sc->sc_bulk_snd.snd_tmo); sc->sc_bulk_snd.snd_again = 0; sc->sc_bulk_snd.snd_next = NULL; sc->sc_bulk_snd.snd_tail = NULL; } rw_exit_write(&sc->sc_bulk_snd.snd_lock); rw_exit_read(&pf_state_list.pfs_rwl); /* * do a single barrier for all the timeouts. because the * timeouts in each slice are configured the same way, the * barrier for one will work for all of them. */ for (i = 0; i < nitems(sc->sc_slices); i++) { struct pfsync_slice *s = &sc->sc_slices[i]; timeout_del(&s->s_tmo); task_del(s->s_softnet, &s->s_task); task_del(s->s_softnet, &s->s_send); timeout_del(&s->s_deferrals_tmo); task_del(s->s_softnet, &s->s_deferrals_task); } timeout_barrier(&sc->sc_slices[0].s_tmo); timeout_barrier(&sc->sc_bulk_req.req_tmo); /* XXX proc */ if (sndbar) { /* technically the preceding barrier does the same job */ timeout_barrier(&sc->sc_bulk_snd.snd_tmo); } net_tq_barriers("pfsyncbar"); /* pfsync is no longer running */ if (sc->sc_inm != NULL) { inm = sc->sc_inm; sc->sc_inm = NULL; } for (i = 0; i < nitems(sc->sc_slices); i++) { struct pfsync_slice *s = &sc->sc_slices[i]; struct pf_state *st; pfsync_slice_drop(sc, s); mq_purge(&s->s_sendq); while ((pd = TAILQ_FIRST(&s->s_deferrals)) != NULL) { TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry); st = pd->pd_st; st->sync_defer = NULL; TAILQ_INSERT_TAIL(&pds, pd, pd_entry); } s->s_deferred = 0; } NET_LOCK(); sc->sc_up = 0; if (inm != NULL) in_delmulti(inm); while ((pd = TAILQ_FIRST(&pds)) != NULL) { TAILQ_REMOVE(&pds, pd, pd_entry); pfsync_defer_output(pd); } return (0); } int pfsync_is_up(void) { int rv; smr_read_enter(); rv = SMR_PTR_GET(&pfsyncif) != NULL; smr_read_leave(); return (rv); } static void pfsync_start(struct ifqueue *ifq) { ifq_purge(ifq); } struct pfsync_q { void (*write)(struct pf_state *, void *); size_t len; u_int8_t action; }; static struct pfsync_slice * pfsync_slice_enter(struct pfsync_softc *sc, const struct pf_state *st) { unsigned int idx = st->key[0]->hash % nitems(sc->sc_slices); struct pfsync_slice *s = &sc->sc_slices[idx]; if (!mtx_enter_try(&s->s_mtx)) { mtx_enter(&s->s_mtx); s->s_stat_contended++; } s->s_stat_locks++; return (s); } static void pfsync_slice_leave(struct pfsync_softc *sc, struct pfsync_slice *s) { mtx_leave(&s->s_mtx); } /* we have one of these for every PFSYNC_S_ */ static void pfsync_out_state(struct pf_state *, void *); static void pfsync_out_iack(struct pf_state *, void *); static void pfsync_out_upd_c(struct pf_state *, void *); static void pfsync_out_del(struct pf_state *, void *); #if defined(IPSEC) static void pfsync_out_tdb(struct tdb *, void *); #endif static const struct pfsync_q pfsync_qs[] = { { pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK }, { pfsync_out_upd_c, sizeof(struct pfsync_upd_c), PFSYNC_ACT_UPD_C }, { pfsync_out_del, sizeof(struct pfsync_del_c), PFSYNC_ACT_DEL_C }, { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_INS }, { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_UPD } }; static void pfsync_out_state(struct pf_state *st, void *buf) { struct pfsync_state *sp = buf; mtx_enter(&st->mtx); pf_state_export(sp, st); mtx_leave(&st->mtx); } static void pfsync_out_iack(struct pf_state *st, void *buf) { struct pfsync_ins_ack *iack = buf; iack->id = st->id; iack->creatorid = st->creatorid; } static void pfsync_out_upd_c(struct pf_state *st, void *buf) { struct pfsync_upd_c *up = buf; memset(up, 0, sizeof(*up)); up->id = st->id; up->creatorid = st->creatorid; mtx_enter(&st->mtx); pf_state_peer_hton(&st->src, &up->src); pf_state_peer_hton(&st->dst, &up->dst); up->timeout = st->timeout; mtx_leave(&st->mtx); } static void pfsync_out_del(struct pf_state *st, void *buf) { struct pfsync_del_c *dp = buf; dp->id = st->id; dp->creatorid = st->creatorid; st->sync_state = PFSYNC_S_DEAD; } #if defined(IPSEC) static inline void pfsync_tdb_enter(struct tdb *tdb) { mtx_enter(&tdb->tdb_mtx); } static inline void pfsync_tdb_leave(struct tdb *tdb) { unsigned int snapped = ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); mtx_leave(&tdb->tdb_mtx); if (snapped) wakeup_one(&tdb->tdb_updates); } #endif /* defined(IPSEC) */ static void pfsync_slice_drop(struct pfsync_softc *sc, struct pfsync_slice *s) { struct pf_state *st; int q; #if defined(IPSEC) struct tdb *tdb; #endif for (q = 0; q < nitems(s->s_qs); q++) { if (TAILQ_EMPTY(&s->s_qs[q])) continue; while ((st = TAILQ_FIRST(&s->s_qs[q])) != NULL) { TAILQ_REMOVE(&s->s_qs[q], st, sync_list); #ifdef PFSYNC_DEBUG KASSERT(st->sync_state == q); #endif st->sync_state = PFSYNC_S_NONE; pf_state_unref(st); } } #if defined(IPSEC) while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) { TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry); pfsync_tdb_enter(tdb); KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC)); CLR(tdb->tdb_flags, TDBF_PFSYNC); pfsync_tdb_leave(tdb); } #endif /* defined(IPSEC) */ timeout_del(&s->s_tmo); s->s_len = PFSYNC_MINPKT; } static struct mbuf * pfsync_slice_write(struct pfsync_slice *s) { struct pfsync_softc *sc = s->s_pfsync; struct mbuf *m; struct ip *ip; struct pfsync_header *ph; struct pfsync_subheader *subh; unsigned int mlen = max_linkhdr + s->s_len; unsigned int q, count; caddr_t ptr; size_t off; MUTEX_ASSERT_LOCKED(&s->s_mtx); if (s->s_len == PFSYNC_MINPKT) { s->s_stat_write_nop++; return (NULL); } task_del(s->s_softnet, &s->s_task); m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) goto drop; if (mlen > MHLEN) { MCLGETL(m, M_DONTWAIT, mlen); if (!ISSET(m->m_flags, M_EXT)) goto drop; } m_align(m, s->s_len); m->m_len = m->m_pkthdr.len = s->s_len; ptr = mtod(m, caddr_t); off = 0; ip = (struct ip *)(ptr + off); off += sizeof(*ip); *ip = sc->sc_template; ip->ip_len = htons(m->m_pkthdr.len); ip->ip_id = htons(ip_randomid()); ph = (struct pfsync_header *)(ptr + off); off += sizeof(*ph); memset(ph, 0, sizeof(*ph)); ph->version = PFSYNC_VERSION; ph->len = htons(m->m_pkthdr.len - sizeof(*ip)); for (q = 0; q < nitems(s->s_qs); q++) { struct pf_state_queue *psq = &s->s_qs[q]; struct pf_state *st; if (TAILQ_EMPTY(psq)) continue; subh = (struct pfsync_subheader *)(ptr + off); off += sizeof(*subh); count = 0; while ((st = TAILQ_FIRST(psq)) != NULL) { TAILQ_REMOVE(psq, st, sync_list); count++; KASSERT(st->sync_state == q); /* the write handler below may override this */ st->sync_state = PFSYNC_S_NONE; pfsync_qs[q].write(st, ptr + off); off += pfsync_qs[q].len; pf_state_unref(st); } subh->action = pfsync_qs[q].action; subh->len = pfsync_qs[q].len >> 2; subh->count = htons(count); } #if defined(IPSEC) if (!TAILQ_EMPTY(&s->s_tdb_q)) { struct tdb *tdb; subh = (struct pfsync_subheader *)(ptr + off); off += sizeof(*subh); count = 0; while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) { TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry); count++; pfsync_tdb_enter(tdb); KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC)); /* get a consistent view of the counters */ pfsync_out_tdb(tdb, ptr + off); CLR(tdb->tdb_flags, TDBF_PFSYNC); pfsync_tdb_leave(tdb); off += sizeof(struct pfsync_tdb); } subh->action = PFSYNC_ACT_TDB; subh->len = sizeof(struct pfsync_tdb) >> 2; subh->count = htons(count); } #endif timeout_del(&s->s_tmo); s->s_len = PFSYNC_MINPKT; return (m); drop: m_freem(m); pfsyncstat_inc(pfsyncs_onomem); pfsync_slice_drop(sc, s); return (NULL); } static void pfsync_sendout(struct pfsync_softc *sc, struct mbuf *m) { struct ip_moptions imo; unsigned int len = m->m_pkthdr.len; #if NBPFILTER > 0 caddr_t if_bpf = sc->sc_if.if_bpf; if (if_bpf) bpf_mtap(if_bpf, m, BPF_DIRECTION_OUT); #endif imo.imo_ifidx = sc->sc_sync_ifidx; imo.imo_ttl = PFSYNC_DFLTTL; imo.imo_loop = 0; m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain; if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) == 0) { counters_pkt(sc->sc_if.if_counters, ifc_opackets, ifc_obytes, len); pfsyncstat_inc(pfsyncs_opackets); } else { counters_inc(sc->sc_if.if_counters, ifc_oerrors); pfsyncstat_inc(pfsyncs_oerrors); } } static void pfsync_slice_tmo(void *arg) { struct pfsync_slice *s = arg; task_add(s->s_softnet, &s->s_task); } static void pfsync_slice_sched(struct pfsync_slice *s) { s->s_stat_task_add++; task_add(s->s_softnet, &s->s_task); } static void pfsync_slice_task(void *arg) { struct pfsync_slice *s = arg; struct mbuf *m; mtx_enter(&s->s_mtx); s->s_stat_task_run++; m = pfsync_slice_write(s); mtx_leave(&s->s_mtx); if (m != NULL) { NET_LOCK(); pfsync_sendout(s->s_pfsync, m); NET_UNLOCK(); } } static void pfsync_slice_sendq(void *arg) { struct pfsync_slice *s = arg; struct mbuf_list ml; struct mbuf *m; mq_delist(&s->s_sendq, &ml); if (ml_empty(&ml)) return; mtx_enter(&s->s_mtx); s->s_stat_dequeue++; mtx_leave(&s->s_mtx); NET_LOCK(); while ((m = ml_dequeue(&ml)) != NULL) pfsync_sendout(s->s_pfsync, m); NET_UNLOCK(); } static void pfsync_q_ins(struct pfsync_slice *s, struct pf_state *st, unsigned int q) { size_t nlen = pfsync_qs[q].len; struct mbuf *m = NULL; MUTEX_ASSERT_LOCKED(&s->s_mtx); KASSERT(st->sync_state == PFSYNC_S_NONE); KASSERT(s->s_len >= PFSYNC_MINPKT); if (TAILQ_EMPTY(&s->s_qs[q])) nlen += sizeof(struct pfsync_subheader); if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) { m = pfsync_slice_write(s); if (m != NULL) { s->s_stat_enqueue++; if (mq_enqueue(&s->s_sendq, m) == 0) task_add(s->s_softnet, &s->s_send); } nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len; } s->s_len += nlen; pf_state_ref(st); TAILQ_INSERT_TAIL(&s->s_qs[q], st, sync_list); st->sync_state = q; if (!timeout_pending(&s->s_tmo)) timeout_add_sec(&s->s_tmo, 1); } static void pfsync_q_del(struct pfsync_slice *s, struct pf_state *st) { unsigned int q = st->sync_state; MUTEX_ASSERT_LOCKED(&s->s_mtx); KASSERT(st->sync_state < PFSYNC_S_NONE); st->sync_state = PFSYNC_S_NONE; TAILQ_REMOVE(&s->s_qs[q], st, sync_list); pf_state_unref(st); s->s_len -= pfsync_qs[q].len; if (TAILQ_EMPTY(&s->s_qs[q])) s->s_len -= sizeof(struct pfsync_subheader); } /* * the pfsync hooks that pf calls */ void pfsync_init_state(struct pf_state *st, const struct pf_state_key *skw, const struct pf_state_key *sks, int flags) { /* this is called before pf_state_insert */ if (skw->proto == IPPROTO_PFSYNC) SET(st->state_flags, PFSTATE_NOSYNC); if (ISSET(st->state_flags, PFSTATE_NOSYNC)) { st->sync_state = PFSYNC_S_DEAD; return; } if (ISSET(flags, PFSYNC_SI_IOCTL)) { /* all good */ return; } /* state came off the wire */ if (ISSET(flags, PFSYNC_SI_PFSYNC)) { if (ISSET(st->state_flags, PFSTATE_ACK)) { CLR(st->state_flags, PFSTATE_ACK); /* peer wants an iack, not an insert */ st->sync_state = PFSYNC_S_SYNC; } else st->sync_state = PFSYNC_S_PFSYNC; } } void pfsync_insert_state(struct pf_state *st) { struct pfsync_softc *sc; MUTEX_ASSERT_UNLOCKED(&st->mtx); if (ISSET(st->state_flags, PFSTATE_NOSYNC) || st->sync_state == PFSYNC_S_DEAD) return; smr_read_enter(); sc = SMR_PTR_GET(&pfsyncif); if (sc != NULL) { struct pfsync_slice *s = pfsync_slice_enter(sc, st); switch (st->sync_state) { case PFSYNC_S_UPD_C: /* we must have lost a race after insert */ pfsync_q_del(s, st); /* FALLTHROUGH */ case PFSYNC_S_NONE: pfsync_q_ins(s, st, PFSYNC_S_INS); break; case PFSYNC_S_SYNC: st->sync_state = PFSYNC_S_NONE; /* gross */ pfsync_q_ins(s, st, PFSYNC_S_IACK); pfsync_slice_sched(s); /* the peer is waiting */ break; case PFSYNC_S_PFSYNC: /* state was just inserted by pfsync */ st->sync_state = PFSYNC_S_NONE; break; default: panic("%s: state %p unexpected sync_state %d", __func__, st, st->sync_state); /* NOTREACHED */ } pfsync_slice_leave(sc, s); } smr_read_leave(); } void pfsync_update_state(struct pf_state *st) { struct pfsync_softc *sc; MUTEX_ASSERT_UNLOCKED(&st->mtx); if (ISSET(st->state_flags, PFSTATE_NOSYNC) || st->sync_state == PFSYNC_S_DEAD) return; smr_read_enter(); sc = SMR_PTR_GET(&pfsyncif); if (sc != NULL) { struct pfsync_slice *s = pfsync_slice_enter(sc, st); int sync = 0; switch (st->sync_state) { case PFSYNC_S_UPD_C: case PFSYNC_S_UPD: /* we're already handling it */ if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) { st->sync_updates++; if (st->sync_updates >= sc->sc_maxupdates) sync = 1; } /* FALLTHROUGH */ case PFSYNC_S_INS: case PFSYNC_S_DEL: case PFSYNC_S_DEAD: break; case PFSYNC_S_IACK: pfsync_q_del(s, st); /* FALLTHROUGH */ case PFSYNC_S_NONE: pfsync_q_ins(s, st, PFSYNC_S_UPD_C); st->sync_updates = 0; break; default: panic("%s: state %p unexpected sync_state %d", __func__, st, st->sync_state); /* NOTREACHED */ } if (!sync && (getuptime() - st->pfsync_time) < 2) sync = 1; if (sync) pfsync_slice_sched(s); pfsync_slice_leave(sc, s); } smr_read_leave(); } void pfsync_delete_state(struct pf_state *st) { struct pfsync_softc *sc; MUTEX_ASSERT_UNLOCKED(&st->mtx); if (ISSET(st->state_flags, PFSTATE_NOSYNC) || st->sync_state == PFSYNC_S_DEAD) return; smr_read_enter(); sc = SMR_PTR_GET(&pfsyncif); if (sc != NULL) { struct pfsync_slice *s = pfsync_slice_enter(sc, st); switch (st->sync_state) { case PFSYNC_S_INS: /* let's pretend this never happened */ pfsync_q_del(s, st); break; case PFSYNC_S_UPD_C: case PFSYNC_S_UPD: case PFSYNC_S_IACK: pfsync_q_del(s, st); /* FALLTHROUGH */ case PFSYNC_S_NONE: pfsync_q_ins(s, st, PFSYNC_S_DEL); st->sync_updates = 0; break; case PFSYNC_S_DEL: case PFSYNC_S_DEAD: /* XXX we should count this */ break; default: panic("%s: state %p unexpected sync_state %d", __func__, st, st->sync_state); /* NOTREACHED */ } pfsync_slice_leave(sc, s); } smr_read_leave(); } struct pfsync_subh_clr { struct pfsync_subheader subh; struct pfsync_clr clr; } __packed __aligned(4); void pfsync_clear_states(u_int32_t creatorid, const char *ifname) { struct pfsync_softc *sc; struct pfsync_subh_clr *h; struct mbuf *m; unsigned int hlen, mlen; smr_read_enter(); sc = SMR_PTR_GET(&pfsyncif); if (sc != NULL) refcnt_take(&sc->sc_refs); smr_read_leave(); if (sc == NULL) return; hlen = sizeof(sc->sc_template) + sizeof(struct pfsync_header) + sizeof(*h); mlen = max_linkhdr + hlen; m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) { /* count error */ goto leave; } if (mlen > MHLEN) { MCLGETL(m, M_DONTWAIT, mlen); if (!ISSET(m->m_flags, M_EXT)) { m_freem(m); goto leave; } } m_align(m, sizeof(*h)); h = mtod(m, struct pfsync_subh_clr *); h->subh.action = PFSYNC_ACT_CLR; h->subh.len = sizeof(h->clr) >> 2; h->subh.count = htons(1); strlcpy(h->clr.ifname, ifname, sizeof(h->clr.ifname)); h->clr.creatorid = creatorid; m->m_pkthdr.len = m->m_len = sizeof(*h); m = pfsync_encap(sc, m); if (m == NULL) goto leave; pfsync_sendout(sc, m); leave: refcnt_rele_wake(&sc->sc_refs); } int pfsync_state_in_use(struct pf_state *st) { struct pfsync_softc *sc; int rv = 0; smr_read_enter(); sc = SMR_PTR_GET(&pfsyncif); if (sc != NULL) { /* * pfsync bulk sends run inside * rw_enter_read(&pf_state_list.pfs_rwl), and this * code (pfsync_state_in_use) is only called from the * purge code inside * rw_enter_write(&pf_state_list.pfs_rwl). therefore, * those two sections are exclusive so we can safely * look at the bulk send pointers. */ /* rw_assert_wrlock(&pf_state_list.pfs_rwl); */ if (sc->sc_bulk_snd.snd_next == st || sc->sc_bulk_snd.snd_tail == st) rv = 1; } smr_read_leave(); return (rv); } int pfsync_defer(struct pf_state *st, struct mbuf *m) { struct pfsync_softc *sc; struct pfsync_slice *s; struct pfsync_deferral *pd; int sched = 0; int rv = 0; if (ISSET(st->state_flags, PFSTATE_NOSYNC) || ISSET(m->m_flags, M_BCAST|M_MCAST)) return (0); smr_read_enter(); sc = SMR_PTR_GET(&pfsyncif); if (sc == NULL || !sc->sc_defer) goto leave; pd = pool_get(&pfsync_deferrals_pool, M_NOWAIT); if (pd == NULL) { goto leave; } s = pfsync_slice_enter(sc, st); s->s_stat_defer_add++; pd->pd_st = pf_state_ref(st); pd->pd_m = m; pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC; m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; st->sync_defer = pd; sched = s->s_deferred++; TAILQ_INSERT_TAIL(&s->s_deferrals, pd, pd_entry); if (sched == 0) timeout_add_nsec(&s->s_deferrals_tmo, PFSYNC_DEFER_NSEC); else if (sched >= PFSYNC_DEFER_LIMIT) { s->s_stat_defer_overlimit++; timeout_del(&s->s_deferrals_tmo); task_add(s->s_softnet, &s->s_deferrals_task); } pfsync_slice_sched(s); pfsync_slice_leave(sc, s); rv = 1; leave: smr_read_leave(); return (rv); } static void pfsync_deferred(struct pfsync_softc *sc, struct pf_state *st) { struct pfsync_slice *s; struct pfsync_deferral *pd; s = pfsync_slice_enter(sc, st); pd = st->sync_defer; if (pd != NULL) { s->s_stat_defer_ack++; TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry); s->s_deferred--; st = pd->pd_st; st->sync_defer = NULL; } pfsync_slice_leave(sc, s); if (pd != NULL) pfsync_defer_output(pd); } static void pfsync_deferrals_tmo(void *arg) { struct pfsync_slice *s = arg; if (READ_ONCE(s->s_deferred) > 0) task_add(s->s_softnet, &s->s_deferrals_task); } static void pfsync_deferrals_task(void *arg) { struct pfsync_slice *s = arg; struct pfsync_deferral *pd; struct pf_state *st; uint64_t now, nsec = 0; struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds); now = getnsecuptime(); mtx_enter(&s->s_mtx); s->s_stat_defer_run++; /* maybe move this into the loop */ for (;;) { pd = TAILQ_FIRST(&s->s_deferrals); if (pd == NULL) break; if (s->s_deferred < PFSYNC_DEFER_LIMIT && now < pd->pd_deadline) { nsec = pd->pd_deadline - now; break; } TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry); s->s_deferred--; /* * detach the pd from the state. the pd still refers * to the state though. */ st = pd->pd_st; st->sync_defer = NULL; TAILQ_INSERT_TAIL(&pds, pd, pd_entry); } mtx_leave(&s->s_mtx); if (nsec > 0) { /* we were looking at a pd, but it wasn't old enough */ timeout_add_nsec(&s->s_deferrals_tmo, nsec); } if (TAILQ_EMPTY(&pds)) return; NET_LOCK(); while ((pd = TAILQ_FIRST(&pds)) != NULL) { TAILQ_REMOVE(&pds, pd, pd_entry); pfsync_defer_output(pd); } NET_UNLOCK(); } static void pfsync_defer_output(struct pfsync_deferral *pd) { struct pf_pdesc pdesc; struct pf_state *st = pd->pd_st; if (st->rt == PF_ROUTETO) { if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af, st->direction, NULL, pd->pd_m, NULL) != PF_PASS) return; switch (st->key[PF_SK_WIRE]->af) { case AF_INET: pf_route(&pdesc, st); break; #ifdef INET6 case AF_INET6: pf_route6(&pdesc, st); break; #endif /* INET6 */ default: unhandled_af(st->key[PF_SK_WIRE]->af); } pd->pd_m = pdesc.m; } else { switch (st->key[PF_SK_WIRE]->af) { case AF_INET: ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0); break; #ifdef INET6 case AF_INET6: ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL); break; #endif /* INET6 */ default: unhandled_af(st->key[PF_SK_WIRE]->af); } pd->pd_m = NULL; } pf_state_unref(st); m_freem(pd->pd_m); pool_put(&pfsync_deferrals_pool, pd); } struct pfsync_subh_bus { struct pfsync_subheader subh; struct pfsync_bus bus; } __packed __aligned(4); static unsigned int pfsync_bulk_snd_bus(struct pfsync_softc *sc, struct mbuf *m, const unsigned int space, uint32_t endtime, uint8_t status) { struct pfsync_subh_bus *h; unsigned int nlen; nlen = m->m_len + sizeof(*h); if (space < nlen) return (0); h = (struct pfsync_subh_bus *)(mtod(m, caddr_t) + m->m_len); memset(h, 0, sizeof(*h)); h->subh.action = PFSYNC_ACT_BUS; h->subh.len = sizeof(h->bus) >> 2; h->subh.count = htons(1); h->bus.creatorid = pf_status.hostid; h->bus.endtime = htonl(endtime); h->bus.status = status; m->m_len = nlen; return (1); } static unsigned int pfsync_bulk_snd_states(struct pfsync_softc *sc, struct mbuf *m, const unsigned int space, unsigned int len) { struct pf_state *st; struct pfsync_state *sp; unsigned int nlen; unsigned int count = 0; st = sc->sc_bulk_snd.snd_next; for (;;) { nlen = len + sizeof(*sp); sp = (struct pfsync_state *)(mtod(m, caddr_t) + len); if (space < nlen) break; mtx_enter(&st->mtx); pf_state_export(sp, st); mtx_leave(&st->mtx); /* commit */ count++; m->m_len = len = nlen; if (st == sc->sc_bulk_snd.snd_tail) { if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0) { /* couldn't fit the BUS */ st = NULL; break; } /* this BUS is done */ pfsync_dprintf(sc, "bulk send done (%s)", __func__); sc->sc_bulk_snd.snd_again = 0; /* XXX */ sc->sc_bulk_snd.snd_next = NULL; sc->sc_bulk_snd.snd_tail = NULL; return (count); } st = TAILQ_NEXT(st, entry_list); } /* there's still work to do */ sc->sc_bulk_snd.snd_next = st; timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, PFSYNC_BULK_SND_IVAL_MS); return (count); } static unsigned int pfsync_bulk_snd_sub(struct pfsync_softc *sc, struct mbuf *m, const unsigned int space) { struct pfsync_subheader *subh; unsigned int count; unsigned int len, nlen; len = m->m_len; nlen = len + sizeof(*subh); if (nlen > space) return (0); subh = (struct pfsync_subheader *)(mtod(m, caddr_t) + len); /* * pfsync_bulk_snd_states only updates m->m_len after * filling in a state after the offset we gave it. */ count = pfsync_bulk_snd_states(sc, m, space, nlen); if (count == 0) return (0); subh->action = PFSYNC_ACT_UPD; subh->len = sizeof(struct pfsync_state) >> 2; subh->count = htons(count); return (count); } static void pfsync_bulk_snd_start(struct pfsync_softc *sc) { const unsigned int space = sc->sc_if.if_mtu - (sizeof(struct ip) + sizeof(struct pfsync_header)); struct mbuf *m; rw_enter_read(&pf_state_list.pfs_rwl); rw_enter_write(&sc->sc_bulk_snd.snd_lock); if (sc->sc_bulk_snd.snd_next != NULL) { sc->sc_bulk_snd.snd_again = 1; goto leave; } mtx_enter(&pf_state_list.pfs_mtx); sc->sc_bulk_snd.snd_next = TAILQ_FIRST(&pf_state_list.pfs_list); sc->sc_bulk_snd.snd_tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue); mtx_leave(&pf_state_list.pfs_mtx); m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) goto leave; MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu); if (!ISSET(m->m_flags, M_EXT)) { /* some error++ */ m_freem(m); /* drop */ goto leave; } m_align(m, space); m->m_len = 0; if (sc->sc_bulk_snd.snd_tail == NULL) { pfsync_dprintf(sc, "bulk send empty (%s)", __func__); /* list is empty */ if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0) panic("%s: mtu is too low", __func__); goto encap; } pfsync_dprintf(sc, "bulk send start (%s)", __func__); /* start a bulk update. */ if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_START) == 0) panic("%s: mtu is too low", __func__); /* fill it up with state updates. */ pfsync_bulk_snd_sub(sc, m, space); encap: m->m_pkthdr.len = m->m_len; m = pfsync_encap(sc, m); if (m == NULL) goto leave; pfsync_sendout(sc, m); leave: rw_exit_write(&sc->sc_bulk_snd.snd_lock); rw_exit_read(&pf_state_list.pfs_rwl); } static void pfsync_bulk_snd_tmo(void *arg) { struct pfsync_softc *sc = arg; const unsigned int space = sc->sc_if.if_mtu - (sizeof(struct ip) + sizeof(struct pfsync_header)); struct mbuf *m; m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) { /* some error++ */ /* retry later */ timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, PFSYNC_BULK_SND_IVAL_MS); return; } MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu); if (!ISSET(m->m_flags, M_EXT)) { /* some error++ */ m_freem(m); /* retry later */ timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, PFSYNC_BULK_SND_IVAL_MS); return; } m_align(m, space); m->m_len = 0; rw_enter_read(&pf_state_list.pfs_rwl); rw_enter_write(&sc->sc_bulk_snd.snd_lock); if (sc->sc_bulk_snd.snd_next == NULL) { /* there was no space in the previous packet for a BUS END */ if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0) panic("%s: mtu is too low", __func__); /* this bulk is done */ pfsync_dprintf(sc, "bulk send done (%s)", __func__); sc->sc_bulk_snd.snd_again = 0; /* XXX */ sc->sc_bulk_snd.snd_tail = NULL; } else { pfsync_dprintf(sc, "bulk send again (%s)", __func__); /* fill it up with state updates. */ pfsync_bulk_snd_sub(sc, m, space); } m->m_pkthdr.len = m->m_len; m = pfsync_encap(sc, m); rw_exit_write(&sc->sc_bulk_snd.snd_lock); rw_exit_read(&pf_state_list.pfs_rwl); if (m != NULL) { NET_LOCK(); pfsync_sendout(sc, m); NET_UNLOCK(); } } static void pfsync_update_state_req(struct pfsync_softc *sc, struct pf_state *st) { struct pfsync_slice *s = pfsync_slice_enter(sc, st); switch (st->sync_state) { case PFSYNC_S_UPD_C: case PFSYNC_S_IACK: pfsync_q_del(s, st); /* FALLTHROUGH */ case PFSYNC_S_NONE: pfsync_q_ins(s, st, PFSYNC_S_UPD); break; case PFSYNC_S_INS: case PFSYNC_S_UPD: case PFSYNC_S_DEL: /* we're already handling it */ break; default: panic("%s: state %p unexpected sync_state %d", __func__, st, st->sync_state); } pfsync_slice_sched(s); pfsync_slice_leave(sc, s); } #if defined(IPSEC) static void pfsync_out_tdb(struct tdb *tdb, void *buf) { struct pfsync_tdb *ut = buf; memset(ut, 0, sizeof(*ut)); ut->spi = tdb->tdb_spi; memcpy(&ut->dst, &tdb->tdb_dst, sizeof(ut->dst)); /* * When a failover happens, the master's rpl is probably above * what we see here (we may be up to a second late), so * increase it a bit for outbound tdbs to manage most such * situations. * * For now, just add an offset that is likely to be larger * than the number of packets we can see in one second. The RFC * just says the next packet must have a higher seq value. * * XXX What is a good algorithm for this? We could use * a rate-determined increase, but to know it, we would have * to extend struct tdb. * XXX pt->rpl can wrap over MAXINT, but if so the real tdb * will soon be replaced anyway. For now, just don't handle * this edge case. */ #define RPL_INCR 16384 ut->rpl = htobe64(tdb->tdb_rpl + (ISSET(tdb->tdb_flags, TDBF_PFSYNC_RPL) ? RPL_INCR : 0)); ut->cur_bytes = htobe64(tdb->tdb_cur_bytes); ut->sproto = tdb->tdb_sproto; ut->rdomain = htons(tdb->tdb_rdomain); } static struct pfsync_slice * pfsync_slice_enter_tdb(struct pfsync_softc *sc, const struct tdb *t) { /* * just use the first slice for all ipsec (for now) until * it's more obvious what property (eg, spi) we can distribute * tdbs over slices with. */ struct pfsync_slice *s = &sc->sc_slices[0]; if (!mtx_enter_try(&s->s_mtx)) { mtx_enter(&s->s_mtx); s->s_stat_contended++; } s->s_stat_locks++; return (s); } static void pfsync_tdb_ins(struct pfsync_slice *s, struct tdb *tdb) { size_t nlen = sizeof(struct pfsync_tdb); struct mbuf *m = NULL; KASSERT(s->s_len >= PFSYNC_MINPKT); MUTEX_ASSERT_LOCKED(&s->s_mtx); MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx); if (TAILQ_EMPTY(&s->s_tdb_q)) nlen += sizeof(struct pfsync_subheader); if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) { m = pfsync_slice_write(s); if (m != NULL) { s->s_stat_enqueue++; if (mq_enqueue(&s->s_sendq, m) == 0) task_add(s->s_softnet, &s->s_send); } nlen = sizeof(struct pfsync_subheader) + sizeof(struct pfsync_tdb); } s->s_len += nlen; TAILQ_INSERT_TAIL(&s->s_tdb_q, tdb, tdb_sync_entry); tdb->tdb_updates = 0; if (!timeout_pending(&s->s_tmo)) timeout_add_sec(&s->s_tmo, 1); } static void pfsync_tdb_del(struct pfsync_slice *s, struct tdb *tdb) { MUTEX_ASSERT_LOCKED(&s->s_mtx); MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx); TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry); s->s_len -= sizeof(struct pfsync_tdb); if (TAILQ_EMPTY(&s->s_tdb_q)) s->s_len -= sizeof(struct pfsync_subheader); } /* * the reference that pfsync has to a tdb is accounted for by the * TDBF_PFSYNC flag, not by tdb_ref/tdb_unref. tdb_delete_tdb() is * called after all other references to a tdb are dropped (with * tdb_unref) as part of the tdb_free(). * * tdb_free() needs to wait for pfsync to let go of the tdb though, * which would be best handled by a reference count, but tdb_free * needs the NET_LOCK which pfsync is already fighting with. instead * use the TDBF_PFSYNC_SNAPPED flag to coordinate the pfsync write/drop * with tdb_free. */ void pfsync_update_tdb(struct tdb *tdb, int output) { struct pfsync_softc *sc; MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx); smr_read_enter(); sc = SMR_PTR_GET(&pfsyncif); if (sc != NULL) { struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb); /* TDBF_PFSYNC is only changed while the slice mtx is held */ if (!ISSET(tdb->tdb_flags, TDBF_PFSYNC)) { mtx_enter(&tdb->tdb_mtx); SET(tdb->tdb_flags, TDBF_PFSYNC); mtx_leave(&tdb->tdb_mtx); pfsync_tdb_ins(s, tdb); } else if (++tdb->tdb_updates >= sc->sc_maxupdates) pfsync_slice_sched(s); /* XXX no sync timestamp on tdbs to check */ pfsync_slice_leave(sc, s); } smr_read_leave(); } void pfsync_delete_tdb(struct tdb *tdb) { struct pfsync_softc *sc; MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx); smr_read_enter(); sc = SMR_PTR_GET(&pfsyncif); if (sc != NULL) { struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb); /* TDBF_PFSYNC is only changed while the slice mtx is held */ if (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) { pfsync_tdb_del(s, tdb); mtx_enter(&tdb->tdb_mtx); CLR(tdb->tdb_flags, TDBF_PFSYNC); mtx_leave(&tdb->tdb_mtx); } pfsync_slice_leave(sc, s); } smr_read_leave(); /* * handle pfsync_slice_drop being called from pfsync_down * and the smr/slice access above won't work. */ mtx_enter(&tdb->tdb_mtx); SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); /* like a thanos snap */ while (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) { msleep_nsec(&tdb->tdb_updates, &tdb->tdb_mtx, PWAIT, "tdbfree", INFSLP); } mtx_leave(&tdb->tdb_mtx); } #endif /* defined(IPSEC) */ struct pfsync_act { void (*in)(struct pfsync_softc *, const caddr_t, unsigned int, unsigned int); size_t len; }; static void pfsync_in_clr(struct pfsync_softc *, const caddr_t, unsigned int, unsigned int); static void pfsync_in_iack(struct pfsync_softc *, const caddr_t, unsigned int, unsigned int); static void pfsync_in_upd_c(struct pfsync_softc *, const caddr_t, unsigned int, unsigned int); static void pfsync_in_ureq(struct pfsync_softc *, const caddr_t, unsigned int, unsigned int); static void pfsync_in_del(struct pfsync_softc *, const caddr_t, unsigned int, unsigned int); static void pfsync_in_del_c(struct pfsync_softc *, const caddr_t, unsigned int, unsigned int); static void pfsync_in_bus(struct pfsync_softc *, const caddr_t, unsigned int, unsigned int); static void pfsync_in_tdb(struct pfsync_softc *, const caddr_t, unsigned int, unsigned int); static void pfsync_in_ins(struct pfsync_softc *, const caddr_t, unsigned int, unsigned int); static void pfsync_in_upd(struct pfsync_softc *, const caddr_t, unsigned int, unsigned int); static const struct pfsync_act pfsync_acts[] = { [PFSYNC_ACT_CLR] = { pfsync_in_clr, sizeof(struct pfsync_clr) }, [PFSYNC_ACT_INS_ACK] = { pfsync_in_iack, sizeof(struct pfsync_ins_ack) }, [PFSYNC_ACT_UPD_C] = { pfsync_in_upd_c, sizeof(struct pfsync_upd_c) }, [PFSYNC_ACT_UPD_REQ] = { pfsync_in_ureq, sizeof(struct pfsync_upd_req) }, [PFSYNC_ACT_DEL] = { pfsync_in_del, sizeof(struct pfsync_state) }, [PFSYNC_ACT_DEL_C] = { pfsync_in_del_c, sizeof(struct pfsync_del_c) }, [PFSYNC_ACT_BUS] = { pfsync_in_bus, sizeof(struct pfsync_bus) }, [PFSYNC_ACT_INS] = { pfsync_in_ins, sizeof(struct pfsync_state) }, [PFSYNC_ACT_UPD] = { pfsync_in_upd, sizeof(struct pfsync_state) }, [PFSYNC_ACT_TDB] = { pfsync_in_tdb, sizeof(struct pfsync_tdb) }, }; static void pfsync_in_skip(struct pfsync_softc *sc, const caddr_t buf, unsigned int mlen, unsigned int count) { /* nop */ } static struct mbuf * pfsync_input(struct mbuf *m, uint8_t ttl, unsigned int hlen) { struct pfsync_softc *sc; struct pfsync_header *ph; struct pfsync_subheader *subh; unsigned int len; void (*in)(struct pfsync_softc *, const caddr_t, unsigned int, unsigned int); pfsyncstat_inc(pfsyncs_ipackets); if (!pf_status.running) return (m); /* * pfsyncif is only set if it is up and running correctly. */ smr_read_enter(); sc = SMR_PTR_GET(&pfsyncif); if (sc == NULL) goto leave; if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) { pfsyncstat_inc(pfsyncs_badif); goto leave; } /* verify that the IP TTL is 255. */ if (ttl != PFSYNC_DFLTTL) { pfsyncstat_inc(pfsyncs_badttl); goto leave; } m_adj(m, hlen); if (m->m_pkthdr.len < sizeof(*ph)) { pfsyncstat_inc(pfsyncs_hdrops); goto leave; } if (m->m_len < sizeof(*ph)) { m = m_pullup(m, sizeof(*ph)); if (m == NULL) goto leave; } ph = mtod(m, struct pfsync_header *); if (ph->version != PFSYNC_VERSION) { pfsyncstat_inc(pfsyncs_badver); goto leave; } len = ntohs(ph->len); if (m->m_pkthdr.len < len) { pfsyncstat_inc(pfsyncs_badlen); goto leave; } if (m->m_pkthdr.len > len) m->m_pkthdr.len = len; /* ok, it's serious now */ refcnt_take(&sc->sc_refs); smr_read_leave(); counters_pkt(sc->sc_if.if_counters, ifc_ipackets, ifc_ibytes, len); m_adj(m, sizeof(*ph)); while (m->m_pkthdr.len >= sizeof(*subh)) { unsigned int action, mlen, count; if (m->m_len < sizeof(*subh)) { m = m_pullup(m, sizeof(*subh)); if (m == NULL) goto rele; } subh = mtod(m, struct pfsync_subheader *); action = subh->action; mlen = subh->len << 2; count = ntohs(subh->count); if (action >= PFSYNC_ACT_MAX || action >= nitems(pfsync_acts) || mlen < pfsync_acts[subh->action].len) { /* * subheaders are always followed by at least one * message, so if the peer is new * enough to tell us how big its messages are then we * know enough to skip them. */ if (count == 0 || mlen == 0) { pfsyncstat_inc(pfsyncs_badact); goto rele; } in = pfsync_in_skip; } else { in = pfsync_acts[action].in; if (in == NULL) in = pfsync_in_skip; } m_adj(m, sizeof(*subh)); len = mlen * count; if (len > m->m_pkthdr.len) { pfsyncstat_inc(pfsyncs_badlen); goto rele; } if (m->m_len < len) { m = m_pullup(m, len); if (m == NULL) goto rele; } (*in)(sc, mtod(m, caddr_t), mlen, count); m_adj(m, len); } rele: refcnt_rele_wake(&sc->sc_refs); return (m); leave: smr_read_leave(); return (m); } static void pfsync_in_clr(struct pfsync_softc *sc, const caddr_t buf, unsigned int mlen, unsigned int count) { const struct pfsync_clr *clr; struct pf_state *head, *tail, *st, *next; struct pfi_kif *kif; uint32_t creatorid; unsigned int i; rw_enter_read(&pf_state_list.pfs_rwl); /* get a view of the state list */ mtx_enter(&pf_state_list.pfs_mtx); head = TAILQ_FIRST(&pf_state_list.pfs_list); tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue); mtx_leave(&pf_state_list.pfs_mtx); PF_LOCK(); for (i = 0; i < count; i++) { clr = (struct pfsync_clr *)(buf + i * mlen); creatorid = clr->creatorid; if (clr->ifname[0] == '\0') kif = NULL; else { kif = pfi_kif_find(clr->ifname); if (kif == NULL) continue; } st = NULL; next = head; PF_STATE_ENTER_WRITE(); while (st != tail) { st = next; next = TAILQ_NEXT(st, entry_list); if (creatorid != st->creatorid) continue; if (kif != NULL && kif != st->kif) continue; mtx_enter(&st->mtx); SET(st->state_flags, PFSTATE_NOSYNC); mtx_leave(&st->mtx); pf_remove_state(st); } PF_STATE_EXIT_WRITE(); } PF_UNLOCK(); rw_exit_read(&pf_state_list.pfs_rwl); } static void pfsync_in_ins(struct pfsync_softc *sc, const caddr_t buf, unsigned int mlen, unsigned int count) { const struct pfsync_state *sp; sa_family_t af1, af2; unsigned int i; PF_LOCK(); for (i = 0; i < count; i++) { sp = (struct pfsync_state *)(buf + mlen * i); af1 = sp->key[0].af; af2 = sp->key[1].af; /* check for invalid values */ if (sp->timeout >= PFTM_MAX || sp->src.state > PF_TCPS_PROXY_DST || sp->dst.state > PF_TCPS_PROXY_DST || sp->direction > PF_OUT || (((af1 || af2) && ((af1 != AF_INET && af1 != AF_INET6) || (af2 != AF_INET && af2 != AF_INET6))) || (sp->af != AF_INET && sp->af != AF_INET6))) { pfsyncstat_inc(pfsyncs_badval); continue; } if (pf_state_import(sp, PFSYNC_SI_PFSYNC) == ENOMEM) { /* drop out, but process the rest of the actions */ break; } } PF_UNLOCK(); } static void pfsync_in_iack(struct pfsync_softc *sc, const caddr_t buf, unsigned int mlen, unsigned int count) { const struct pfsync_ins_ack *ia; struct pf_state_cmp id_key; struct pf_state *st; unsigned int i; for (i = 0; i < count; i++) { ia = (struct pfsync_ins_ack *)(buf + mlen * i); id_key.id = ia->id; id_key.creatorid = ia->creatorid; PF_STATE_ENTER_READ(); st = pf_find_state_byid(&id_key); pf_state_ref(st); PF_STATE_EXIT_READ(); if (st == NULL) continue; if (READ_ONCE(st->sync_defer) != NULL) pfsync_deferred(sc, st); pf_state_unref(st); } } static int pfsync_upd_tcp(struct pf_state *st, const struct pfsync_state_peer *src, const struct pfsync_state_peer *dst) { int sync = 0; /* * The state should never go backwards except * for syn-proxy states. Neither should the * sequence window slide backwards. */ if ((st->src.state > src->state && (st->src.state < PF_TCPS_PROXY_SRC || src->state >= PF_TCPS_PROXY_SRC)) || (st->src.state == src->state && SEQ_GT(st->src.seqlo, ntohl(src->seqlo)))) sync++; else pf_state_peer_ntoh(src, &st->src); if ((st->dst.state > dst->state) || (st->dst.state >= TCPS_SYN_SENT && SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo)))) sync++; else pf_state_peer_ntoh(dst, &st->dst); return (sync); } static void pfsync_in_updates(struct pfsync_softc *sc, struct pf_state *st, const struct pfsync_state_peer *src, const struct pfsync_state_peer *dst, uint8_t timeout) { struct pf_state_scrub *sscrub = NULL; struct pf_state_scrub *dscrub = NULL; int sync; if (src->scrub.scrub_flag && st->src.scrub == NULL) { sscrub = pf_state_scrub_get(); if (sscrub == NULL) { /* inc error? */ goto out; } } if (dst->scrub.scrub_flag && st->dst.scrub == NULL) { dscrub = pf_state_scrub_get(); if (dscrub == NULL) { /* inc error? */ goto out; } } if (READ_ONCE(st->sync_defer) != NULL) pfsync_deferred(sc, st); mtx_enter(&st->mtx); /* attach the scrub memory if needed */ if (sscrub != NULL && st->src.scrub == NULL) { st->src.scrub = sscrub; sscrub = NULL; } if (dscrub != NULL && st->dst.scrub == NULL) { st->dst.scrub = dscrub; dscrub = NULL; } if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) sync = pfsync_upd_tcp(st, src, dst); else { sync = 0; /* * Non-TCP protocol state machine always go * forwards */ if (st->src.state > src->state) sync++; else pf_state_peer_ntoh(src, &st->src); if (st->dst.state > dst->state) sync++; else pf_state_peer_ntoh(dst, &st->dst); } st->pfsync_time = getuptime(); if (sync < 2) { st->expire = st->pfsync_time; st->timeout = timeout; } mtx_leave(&st->mtx); if (sync) { pfsyncstat_inc(pfsyncs_stale); pfsync_update_state(st); } out: if (sscrub != NULL) pf_state_scrub_put(sscrub); if (dscrub != NULL) pf_state_scrub_put(dscrub); } static void pfsync_in_upd(struct pfsync_softc *sc, const caddr_t buf, unsigned int mlen, unsigned int count) { const struct pfsync_state *sp; struct pf_state_cmp id_key; struct pf_state *st; int error; unsigned int i; for (i = 0; i < count; i++) { sp = (struct pfsync_state *)(buf + mlen * i); /* check for invalid values */ if (sp->timeout >= PFTM_MAX || sp->src.state > PF_TCPS_PROXY_DST || sp->dst.state > PF_TCPS_PROXY_DST) { pfsyncstat_inc(pfsyncs_badval); continue; } id_key.id = sp->id; id_key.creatorid = sp->creatorid; PF_STATE_ENTER_READ(); st = pf_find_state_byid(&id_key); pf_state_ref(st); PF_STATE_EXIT_READ(); if (st == NULL) { /* insert the update */ PF_LOCK(); error = pf_state_import(sp, PFSYNC_SI_PFSYNC); if (error) pfsyncstat_inc(pfsyncs_badstate); PF_UNLOCK(); continue; } pfsync_in_updates(sc, st, &sp->src, &sp->dst, sp->timeout); pf_state_unref(st); } } static struct mbuf * pfsync_upd_req_init(struct pfsync_softc *sc, unsigned int count) { struct mbuf *m; unsigned int mlen; m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) { pfsyncstat_inc(pfsyncs_onomem); return (NULL); } mlen = max_linkhdr + sizeof(sc->sc_template) + sizeof(struct pfsync_header) + sizeof(struct pfsync_subheader) + sizeof(struct pfsync_upd_req) * count; if (mlen > MHLEN) { MCLGETL(m, M_DONTWAIT, mlen); if (!ISSET(m->m_flags, M_EXT)) { m_freem(m); return (NULL); } } m_align(m, 0); m->m_len = 0; return (m); } static void pfsync_in_upd_c(struct pfsync_softc *sc, const caddr_t buf, unsigned int mlen, unsigned int count) { const struct pfsync_upd_c *up; struct pf_state_cmp id_key; struct pf_state *st; unsigned int i; struct mbuf *m = NULL; unsigned int rcount = 0; for (i = 0; i < count; i++) { up = (struct pfsync_upd_c *)(buf + mlen * i); /* check for invalid values */ if (up->timeout >= PFTM_MAX || up->src.state > PF_TCPS_PROXY_DST || up->dst.state > PF_TCPS_PROXY_DST) { pfsyncstat_inc(pfsyncs_badval); continue; } id_key.id = up->id; id_key.creatorid = up->creatorid; PF_STATE_ENTER_READ(); st = pf_find_state_byid(&id_key); pf_state_ref(st); PF_STATE_EXIT_READ(); if (st == NULL) { /* We don't have this state. Ask for it. */ struct pfsync_upd_req *ur; if (m == NULL) { m = pfsync_upd_req_init(sc, count); if (m == NULL) { pfsyncstat_inc(pfsyncs_onomem); continue; } } m = m_prepend(m, sizeof(*ur), M_DONTWAIT); if (m == NULL) { pfsyncstat_inc(pfsyncs_onomem); continue; } ur = mtod(m, struct pfsync_upd_req *); ur->id = up->id; ur->creatorid = up->creatorid; rcount++; continue; } pfsync_in_updates(sc, st, &up->src, &up->dst, up->timeout); pf_state_unref(st); } if (m != NULL) { struct pfsync_subheader *subh; m = m_prepend(m, sizeof(*subh), M_DONTWAIT); if (m == NULL) { pfsyncstat_inc(pfsyncs_onomem); return; } subh = mtod(m, struct pfsync_subheader *); subh->action = PFSYNC_ACT_UPD_REQ; subh->len = sizeof(struct pfsync_upd_req) >> 2; subh->count = htons(rcount); m = pfsync_encap(sc, m); if (m == NULL) { pfsyncstat_inc(pfsyncs_onomem); return; } pfsync_sendout(sc, m); } } static void pfsync_in_ureq(struct pfsync_softc *sc, const caddr_t buf, unsigned int mlen, unsigned int count) { const struct pfsync_upd_req *ur; struct pf_state_cmp id_key; struct pf_state *st; unsigned int i; for (i = 0; i < count; i++) { ur = (struct pfsync_upd_req *)(buf + mlen * i); id_key.id = ur->id; id_key.creatorid = ur->creatorid; if (id_key.id == 0 && id_key.creatorid == 0) { pfsync_bulk_snd_start(sc); continue; } PF_STATE_ENTER_READ(); st = pf_find_state_byid(&id_key); if (st != NULL && st->timeout < PFTM_MAX && !ISSET(st->state_flags, PFSTATE_NOSYNC)) pf_state_ref(st); else st = NULL; PF_STATE_EXIT_READ(); if (st == NULL) { pfsyncstat_inc(pfsyncs_badstate); continue; } pfsync_update_state_req(sc, st); pf_state_unref(st); } } static void pfsync_in_del(struct pfsync_softc *sc, const caddr_t buf, unsigned int mlen, unsigned int count) { const struct pfsync_state *sp; struct pf_state_cmp id_key; struct pf_state *st; unsigned int i; PF_LOCK(); PF_STATE_ENTER_WRITE(); for (i = 0; i < count; i++) { sp = (struct pfsync_state *)(buf + mlen * i); id_key.id = sp->id; id_key.creatorid = sp->creatorid; st = pf_find_state_byid(&id_key); if (st == NULL) { pfsyncstat_inc(pfsyncs_badstate); continue; } mtx_enter(&st->mtx); SET(st->state_flags, PFSTATE_NOSYNC); mtx_leave(&st->mtx); pf_remove_state(st); } PF_STATE_EXIT_WRITE(); PF_UNLOCK(); } static void pfsync_in_del_c(struct pfsync_softc *sc, const caddr_t buf, unsigned int mlen, unsigned int count) { const struct pfsync_del_c *sp; struct pf_state_cmp id_key; struct pf_state *st; unsigned int i; PF_LOCK(); PF_STATE_ENTER_WRITE(); for (i = 0; i < count; i++) { sp = (struct pfsync_del_c *)(buf + mlen * i); id_key.id = sp->id; id_key.creatorid = sp->creatorid; st = pf_find_state_byid(&id_key); if (st == NULL) { pfsyncstat_inc(pfsyncs_badstate); continue; } mtx_enter(&st->mtx); SET(st->state_flags, PFSTATE_NOSYNC); mtx_leave(&st->mtx); pf_remove_state(st); } PF_STATE_EXIT_WRITE(); PF_UNLOCK(); } static void pfsync_in_bus(struct pfsync_softc *sc, const caddr_t buf, unsigned int len, unsigned int count) { const struct pfsync_bus *bus = (struct pfsync_bus *)buf; switch (bus->status) { case PFSYNC_BUS_START: pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_START); break; case PFSYNC_BUS_END: pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_END); break; } } #if defined(IPSEC) /* Update an in-kernel tdb. Silently fail if no tdb is found. */ static void pfsync_update_net_tdb(const struct pfsync_tdb *pt) { struct tdb *tdb; NET_ASSERT_LOCKED(); /* check for invalid values */ if (ntohl(pt->spi) <= SPI_RESERVED_MAX || (pt->dst.sa.sa_family != AF_INET && pt->dst.sa.sa_family != AF_INET6)) goto bad; tdb = gettdb(ntohs(pt->rdomain), pt->spi, (union sockaddr_union *)&pt->dst, pt->sproto); if (tdb) { uint64_t rpl = betoh64(pt->rpl); uint64_t cur_bytes = betoh64(pt->cur_bytes); /* Neither replay nor byte counter should ever decrease. */ mtx_enter(&tdb->tdb_mtx); if (rpl >= tdb->tdb_rpl && cur_bytes >= tdb->tdb_cur_bytes) { tdb->tdb_rpl = rpl; tdb->tdb_cur_bytes = cur_bytes; } mtx_leave(&tdb->tdb_mtx); tdb_unref(tdb); } return; bad: DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: " "invalid value"); pfsyncstat_inc(pfsyncs_badstate); return; } #endif static void pfsync_in_tdb(struct pfsync_softc *sc, const caddr_t buf, unsigned int len, unsigned int count) { #if defined(IPSEC) const struct pfsync_tdb *tp; unsigned int i; for (i = 0; i < count; i++) { tp = (const struct pfsync_tdb *)(buf + len * i); pfsync_update_net_tdb(tp); } #endif } int pfsync_input4(struct mbuf **mp, int *offp, int proto, int af) { struct mbuf *m = *mp; struct ip *ip; ip = mtod(m, struct ip *); m = pfsync_input(m, ip->ip_ttl, ip->ip_hl << 2); m_freem(m); *mp = NULL; return (IPPROTO_DONE); } int pfsync_sysctl_pfsyncstat(void *oldp, size_t *oldlenp, void *newp) { struct pfsyncstats pfsyncstat; CTASSERT(sizeof(pfsyncstat) == (pfsyncs_ncounters * sizeof(uint64_t))); memset(&pfsyncstat, 0, sizeof pfsyncstat); counters_read(pfsynccounters, (uint64_t *)&pfsyncstat, pfsyncs_ncounters, NULL); return (sysctl_rdstruct(oldp, oldlenp, newp, &pfsyncstat, sizeof(pfsyncstat))); } int pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { /* All sysctl names at this level are terminal. */ if (namelen != 1) return (ENOTDIR); switch (name[0]) { case PFSYNCCTL_STATS: return (pfsync_sysctl_pfsyncstat(oldp, oldlenp, newp)); default: return (ENOPROTOOPT); } }