/* $OpenBSD: tcp_usrreq.c,v 1.232 2024/11/08 15:46:55 bluhm Exp $ */ /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* * Locks used to protect global variables in this file: * I immutable after creation */ #ifndef TCP_SENDSPACE #define TCP_SENDSPACE 1024*16 #endif u_int tcp_sendspace = TCP_SENDSPACE; /* [I] */ #ifndef TCP_RECVSPACE #define TCP_RECVSPACE 1024*16 #endif u_int tcp_recvspace = TCP_RECVSPACE; /* [I] */ u_int tcp_autorcvbuf_inc = 16 * 1024; /* [I] */ const struct pr_usrreqs tcp_usrreqs = { .pru_attach = tcp_attach, .pru_detach = tcp_detach, .pru_bind = tcp_bind, .pru_listen = tcp_listen, .pru_connect = tcp_connect, .pru_accept = tcp_accept, .pru_disconnect = tcp_disconnect, .pru_shutdown = tcp_shutdown, .pru_rcvd = tcp_rcvd, .pru_send = tcp_send, .pru_abort = tcp_abort, .pru_sense = tcp_sense, .pru_rcvoob = tcp_rcvoob, .pru_sendoob = tcp_sendoob, .pru_control = in_control, .pru_sockaddr = tcp_sockaddr, .pru_peeraddr = tcp_peeraddr, }; #ifdef INET6 const struct pr_usrreqs tcp6_usrreqs = { .pru_attach = tcp_attach, .pru_detach = tcp_detach, .pru_bind = tcp_bind, .pru_listen = tcp_listen, .pru_connect = tcp_connect, .pru_accept = tcp_accept, .pru_disconnect = tcp_disconnect, .pru_shutdown = tcp_shutdown, .pru_rcvd = tcp_rcvd, .pru_send = tcp_send, .pru_abort = tcp_abort, .pru_sense = tcp_sense, .pru_rcvoob = tcp_rcvoob, .pru_sendoob = tcp_sendoob, .pru_control = in6_control, .pru_sockaddr = tcp_sockaddr, .pru_peeraddr = tcp_peeraddr, }; #endif const struct sysctl_bounded_args tcpctl_vars[] = { { TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 }, { TCPCTL_SACK, &tcp_do_sack, 0, 1 }, { TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 }, { TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 }, { TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 }, #ifdef TCP_ECN { TCPCTL_ECN, &tcp_do_ecn, 0, 1 }, #endif { 
TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 }, { TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX }, { TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 }, { TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 }, { TCPCTL_TSO, &tcp_do_tso, 0, 1 }, }; struct inpcbtable tcbtable; #ifdef INET6 struct inpcbtable tcb6table; #endif int tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *); int tcp_ident(void *, size_t *, void *, size_t, int); static inline int tcp_sogetpcb(struct socket *, struct inpcb **, struct tcpcb **); static inline int tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp) { struct inpcb *inp; struct tcpcb *tp; /* * When a TCP is attached to a socket, then there will be * a (struct inpcb) pointed at by the socket, and this * structure will point at a subsidiary (struct tcpcb). */ if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) { if (so->so_error) return so->so_error; return EINVAL; } *rinp = inp; *rtp = tp; return 0; } /* * Export internal TCP state information via a struct tcp_info without * leaking any sensitive information. Sequence numbers are reported * relative to the initial sequence number. 
*/ int tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m) { struct proc *p = curproc; struct tcp_info *ti; u_int t = 1000; /* msec => usec */ uint64_t now; if (sizeof(*ti) > MLEN) { MCLGETL(m, M_WAITOK, sizeof(*ti)); if (!ISSET(m->m_flags, M_EXT)) return ENOMEM; } ti = mtod(m, struct tcp_info *); m->m_len = sizeof(*ti); memset(ti, 0, sizeof(*ti)); now = tcp_now(); ti->tcpi_state = tp->t_state; if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tp->t_flags & TF_SACK_PERMIT) ti->tcpi_options |= TCPI_OPT_SACK; if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { ti->tcpi_options |= TCPI_OPT_WSCALE; ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = tp->rcv_scale; } #ifdef TCP_ECN if (tp->t_flags & TF_ECN_PERMIT) ti->tcpi_options |= TCPI_OPT_ECN; #endif ti->tcpi_rto = tp->t_rxtcur * t; ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_peermss; ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t; ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t; ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t; ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t; ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >> (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >> (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT); ti->tcpi_snd_ssthresh = tp->snd_ssthresh; ti->tcpi_snd_cwnd = tp->snd_cwnd; ti->tcpi_rcv_space = tp->rcv_wnd; /* * Provide only minimal information for unprivileged processes. */ if (suser(p) != 0) return 0; /* FreeBSD-specific extension fields for tcp_info. 
*/ ti->tcpi_snd_wnd = tp->snd_wnd; ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss; ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs; /* missing tcpi_toe_tid */ ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; ti->tcpi_rcv_ooopack = tp->t_rcvoopack; ti->tcpi_snd_zerowin = tp->t_sndzerowin; /* OpenBSD extensions */ ti->tcpi_rttmin = tp->t_rttmin * t; ti->tcpi_max_sndwnd = tp->max_sndwnd; ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs; ti->tcpi_rcv_up = tp->rcv_up - tp->irs; ti->tcpi_snd_una = tp->snd_una - tp->iss; ti->tcpi_snd_up = tp->snd_up - tp->iss; ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss; ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss; ti->tcpi_snd_max = tp->snd_max - tp->iss; ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */ ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t; ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt; ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t; ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc; ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat; ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat; ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat; ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc; ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat; ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat; ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat; return 0; } int tcp_ctloutput(int op, struct socket *so, int level, int optname, struct mbuf *m) { int error = 0; struct inpcb *inp; struct tcpcb *tp; int i; inp = sotoinpcb(so); if (inp == NULL) return (ECONNRESET); if (level != IPPROTO_TCP) { #ifdef INET6 if (ISSET(inp->inp_flags, INP_IPV6)) error = ip6_ctloutput(op, so, level, optname, m); else #endif error = ip_ctloutput(op, so, level, optname, m); return (error); } tp = intotcpcb(inp); switch (op) { case PRCO_SETOPT: switch (optname) { case TCP_NODELAY: if (m == NULL || m->m_len < sizeof (int)) error = EINVAL; else if (*mtod(m, int *)) tp->t_flags |= TF_NODELAY; else tp->t_flags &= ~TF_NODELAY; break; case TCP_NOPUSH: if (m == NULL || m->m_len < sizeof (int)) error = EINVAL; else if (*mtod(m, int *)) 
tp->t_flags |= TF_NOPUSH; else if (tp->t_flags & TF_NOPUSH) { tp->t_flags &= ~TF_NOPUSH; if (TCPS_HAVEESTABLISHED(tp->t_state)) error = tcp_output(tp); } break; case TCP_MAXSEG: if (m == NULL || m->m_len < sizeof (int)) { error = EINVAL; break; } i = *mtod(m, int *); if (i > 0 && i <= tp->t_maxseg) tp->t_maxseg = i; else error = EINVAL; break; case TCP_SACK_ENABLE: if (m == NULL || m->m_len < sizeof (int)) { error = EINVAL; break; } if (TCPS_HAVEESTABLISHED(tp->t_state)) { error = EPERM; break; } if (tp->t_flags & TF_SIGNATURE) { error = EPERM; break; } if (*mtod(m, int *)) tp->sack_enable = 1; else tp->sack_enable = 0; break; #ifdef TCP_SIGNATURE case TCP_MD5SIG: if (m == NULL || m->m_len < sizeof (int)) { error = EINVAL; break; } if (TCPS_HAVEESTABLISHED(tp->t_state)) { error = EPERM; break; } if (*mtod(m, int *)) { tp->t_flags |= TF_SIGNATURE; tp->sack_enable = 0; } else tp->t_flags &= ~TF_SIGNATURE; break; #endif /* TCP_SIGNATURE */ default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case TCP_NODELAY: m->m_len = sizeof(int); *mtod(m, int *) = tp->t_flags & TF_NODELAY; break; case TCP_NOPUSH: m->m_len = sizeof(int); *mtod(m, int *) = tp->t_flags & TF_NOPUSH; break; case TCP_MAXSEG: m->m_len = sizeof(int); *mtod(m, int *) = tp->t_maxseg; break; case TCP_SACK_ENABLE: m->m_len = sizeof(int); *mtod(m, int *) = tp->sack_enable; break; case TCP_INFO: error = tcp_fill_info(tp, so, m); break; #ifdef TCP_SIGNATURE case TCP_MD5SIG: m->m_len = sizeof(int); *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; break; #endif default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Attach TCP protocol to socket, allocating * internet protocol control block, tcp control block, * buffer space, and entering LISTEN state to accept connections. 
*/ int tcp_attach(struct socket *so, int proto, int wait) { struct inpcbtable *table; struct tcpcb *tp; struct inpcb *inp; int error; if (so->so_pcb) return EISCONN; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 || sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) || sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) { error = soreserve(so, tcp_sendspace, tcp_recvspace); if (error) return (error); } NET_ASSERT_LOCKED(); #ifdef INET6 if (so->so_proto->pr_domain->dom_family == PF_INET6) table = &tcb6table; else #endif table = &tcbtable; error = in_pcballoc(so, table, wait); if (error) return (error); inp = sotoinpcb(so); tp = tcp_newtcpcb(inp, wait); if (tp == NULL) { unsigned int nofd = so->so_state & SS_NOFDREF; /* XXX */ so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ in_pcbdetach(inp); so->so_state |= nofd; return (ENOBUFS); } tp->t_state = TCPS_CLOSED; #ifdef INET6 if (ISSET(inp->inp_flags, INP_IPV6)) tp->pf = PF_INET6; else #endif tp->pf = PF_INET; if ((so->so_options & SO_LINGER) && so->so_linger == 0) so->so_linger = TCP_LINGERTIME; if (so->so_options & SO_DEBUG) tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0); return (0); } int tcp_detach(struct socket *so) { struct inpcb *inp; struct tcpcb *otp = NULL, *tp; int error; short ostate; soassertlocked(so); if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); if (so->so_options & SO_DEBUG) { otp = tp; ostate = tp->t_state; } /* * Detach the TCP protocol from the socket. * If the protocol state is non-embryonic, then can't * do this directly: have to initiate a PRU_DISCONNECT, * which may finish later; embryonic TCB's can just * be discarded here. */ tp = tcp_dodisconnect(tp); if (otp) tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0); return (0); } /* * Give the socket an address. 
*/ int tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p) { struct inpcb *inp; struct tcpcb *tp; int error; short ostate; soassertlocked(so); if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); if (so->so_options & SO_DEBUG) ostate = tp->t_state; error = in_pcbbind(inp, nam, p); if (so->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0); return (error); } /* * Prepare to accept connections. */ int tcp_listen(struct socket *so) { struct inpcb *inp; struct tcpcb *tp, *otp = NULL; int error; short ostate; soassertlocked(so); if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); if (so->so_options & SO_DEBUG) { otp = tp; ostate = tp->t_state; } if (inp->inp_lport == 0) if ((error = in_pcbbind(inp, NULL, curproc))) goto out; /* * If the in_pcbbind() above is called, the tp->pf * should still be whatever it was before. */ tp->t_state = TCPS_LISTEN; out: if (otp) tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0); return (error); } /* * Initiate connection to peer. * Create a template for use in transmissions on this connection. * Enter SYN_SENT state, and mark socket as connecting. * Start keep-alive timer, and seed output sequence space. * Send initial segment on connection. 
*/ int tcp_connect(struct socket *so, struct mbuf *nam) { struct inpcb *inp; struct tcpcb *tp, *otp = NULL; int error; short ostate; soassertlocked(so); if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); if (so->so_options & SO_DEBUG) { otp = tp; ostate = tp->t_state; } #ifdef INET6 if (ISSET(inp->inp_flags, INP_IPV6)) { struct sockaddr_in6 *sin6; if ((error = in6_nam2sin6(nam, &sin6))) goto out; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { error = EINVAL; goto out; } } else #endif { struct sockaddr_in *sin; if ((error = in_nam2sin(nam, &sin))) goto out; if ((sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(sin->sin_addr.s_addr) || in_broadcast(sin->sin_addr, inp->inp_rtableid)) { error = EINVAL; goto out; } } error = in_pcbconnect(inp, nam); if (error) goto out; tp->t_template = tcp_template(tp); if (tp->t_template == 0) { in_pcbunset_faddr(inp); in_pcbdisconnect(inp); error = ENOBUFS; goto out; } so->so_state |= SS_CONNECTOUT; /* Compute window scaling to request. */ tcp_rscale(tp, sb_max); soisconnecting(so); tcpstat_inc(tcps_connattempt); tp->t_state = TCPS_SYN_SENT; TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); tcp_set_iss_tsm(tp); tcp_sendseqinit(tp); tp->snd_last = tp->snd_una; error = tcp_output(tp); out: if (otp) tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0); return (error); } /* * Accept a connection. Essentially all the work is done at higher * levels; just return the address of the peer, storing through addr. */ int tcp_accept(struct socket *so, struct mbuf *nam) { struct inpcb *inp; struct tcpcb *tp; int error; soassertlocked(so); if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); in_setpeeraddr(inp, nam); if (so->so_options & SO_DEBUG) tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_ACCEPT, 0); return (0); } /* * Initiate disconnect from peer. 
* If connection never passed embryonic stage, just drop; * else if don't need to let data drain, then can just drop anyways, * else have to begin TCP shutdown process: mark socket disconnecting, * drain unread data, state switch to reflect user close, and * send segment (e.g. FIN) to peer. Socket will be really disconnected * when peer sends FIN and acks ours. * * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. */ int tcp_disconnect(struct socket *so) { struct inpcb *inp; struct tcpcb *tp, *otp = NULL; int error; short ostate; soassertlocked(so); if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); if (so->so_options & SO_DEBUG) { otp = tp; ostate = tp->t_state; } tp = tcp_dodisconnect(tp); if (otp) tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0); return (0); } /* * Mark the connection as being incapable of further output. */ int tcp_shutdown(struct socket *so) { struct inpcb *inp; struct tcpcb *tp, *otp = NULL; int error; short ostate; soassertlocked(so); if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); if (so->so_options & SO_DEBUG) { otp = tp; ostate = tp->t_state; } if (so->so_snd.sb_state & SS_CANTSENDMORE) goto out; socantsendmore(so); tp = tcp_usrclosed(tp); if (tp) error = tcp_output(tp); out: if (otp) tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0); return (error); } /* * After a receive, possibly send window update to peer. */ void tcp_rcvd(struct socket *so) { struct inpcb *inp; struct tcpcb *tp; short ostate; soassertlocked(so); if (tcp_sogetpcb(so, &inp, &tp)) return; if (so->so_options & SO_DEBUG) ostate = tp->t_state; /* * soreceive() calls this function when a user receives * ancillary data on a listening socket. We don't call * tcp_output in such a case, since there is no header * template for a listening socket and hence the kernel * will panic. 
*/ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) (void) tcp_output(tp); if (so->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0); } /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. */ int tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam, struct mbuf *control) { struct inpcb *inp; struct tcpcb *tp; int error; short ostate; soassertlocked(so); if (control && control->m_len) { error = EINVAL; goto out; } if ((error = tcp_sogetpcb(so, &inp, &tp))) goto out; if (so->so_options & SO_DEBUG) ostate = tp->t_state; sbappendstream(so, &so->so_snd, m); m = NULL; error = tcp_output(tp); if (so->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0); out: m_freem(control); m_freem(m); return (error); } /* * Abort the TCP. */ void tcp_abort(struct socket *so) { struct inpcb *inp; struct tcpcb *tp, *otp = NULL; short ostate; soassertlocked(so); if (tcp_sogetpcb(so, &inp, &tp)) return; if (so->so_options & SO_DEBUG) { otp = tp; ostate = tp->t_state; } tp = tcp_drop(tp, ECONNABORTED); if (otp) tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0); } int tcp_sense(struct socket *so, struct stat *ub) { struct inpcb *inp; struct tcpcb *tp; int error; soassertlocked(so); if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); ub->st_blksize = so->so_snd.sb_hiwat; if (so->so_options & SO_DEBUG) tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0); return (0); } int tcp_rcvoob(struct socket *so, struct mbuf *m, int flags) { struct inpcb *inp; struct tcpcb *tp; int error; soassertlocked(so); if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); if ((so->so_oobmark == 0 && (so->so_rcv.sb_state & SS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || tp->t_oobflags & TCPOOB_HADDATA) { error = EINVAL; goto out; } if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { error = EWOULDBLOCK; goto out; } m->m_len = 1; *mtod(m, caddr_t) = tp->t_iobc; 
if ((flags & MSG_PEEK) == 0) tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); out: if (so->so_options & SO_DEBUG) tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0); return (error); } int tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam, struct mbuf *control) { struct inpcb *inp; struct tcpcb *tp; int error; short ostate; soassertlocked(so); if (control && control->m_len) { error = EINVAL; goto release; } if ((error = tcp_sogetpcb(so, &inp, &tp))) goto release; if (so->so_options & SO_DEBUG) ostate = tp->t_state; if (sbspace(so, &so->so_snd) < -512) { error = ENOBUFS; goto out; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ sbappendstream(so, &so->so_snd, m); m = NULL; tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_force = 1; error = tcp_output(tp); tp->t_force = 0; out: if (so->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0); release: m_freem(control); m_freem(m); return (error); } int tcp_sockaddr(struct socket *so, struct mbuf *nam) { struct inpcb *inp; struct tcpcb *tp; int error; soassertlocked(so); if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); in_setsockaddr(inp, nam); if (so->so_options & SO_DEBUG) tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SOCKADDR, 0); return (0); } int tcp_peeraddr(struct socket *so, struct mbuf *nam) { struct inpcb *inp; struct tcpcb *tp; int error; soassertlocked(so); if ((error = tcp_sogetpcb(so, &inp, &tp))) return (error); in_setpeeraddr(inp, nam); if (so->so_options & SO_DEBUG) tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_PEERADDR, 0); return (0); } /* * Initiate (or continue) disconnect. * If embryonic state, just send reset (once). * If in ``let data drain'' option and linger null, just drop. 
* Otherwise (hard), mark socket disconnecting and drop * current input data; switch states based on user close, and * send segment to peer (with FIN). */ struct tcpcb * tcp_dodisconnect(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) tp = tcp_close(tp); else if ((so->so_options & SO_LINGER) && so->so_linger == 0) tp = tcp_drop(tp, 0); else { soisdisconnecting(so); sbflush(so, &so->so_rcv); tp = tcp_usrclosed(tp); if (tp) (void) tcp_output(tp); } return (tp); } /* * User issued close, and wish to trail through shutdown states: * if never received SYN, just forget it. If got a SYN from peer, * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. * If already got a FIN from peer, then almost done; go to LAST_ACK * state. In all other cases, have already sent FIN to peer (e.g. * after PRU_SHUTDOWN), and just have to play tedious game waiting * for peer to send FIN or not respond to keep-alives, etc. * We can let the user exit from the close as soon as the FIN is acked. */ struct tcpcb * tcp_usrclosed(struct tcpcb *tp) { switch (tp->t_state) { case TCPS_CLOSED: case TCPS_LISTEN: case TCPS_SYN_SENT: tp->t_state = TCPS_CLOSED; tp = tcp_close(tp); break; case TCPS_SYN_RECEIVED: case TCPS_ESTABLISHED: tp->t_state = TCPS_FIN_WAIT_1; break; case TCPS_CLOSE_WAIT: tp->t_state = TCPS_LAST_ACK; break; } if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { soisdisconnected(tp->t_inpcb->inp_socket); /* * If we are in FIN_WAIT_2, we arrived here because the * application did a shutdown of the send side. Like the * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after * a full close, we start a timer to make sure sockets are * not left in FIN_WAIT_2 forever. */ if (tp->t_state == TCPS_FIN_WAIT_2) TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); } return (tp); } /* * Look up a socket for ident or tcpdrop, ... 
*/
/*
 * Backend of the TCPCTL_IDENT (dodrop == 0) and TCPCTL_DROP (dodrop != 0)
 * sysctls.  The caller passes a struct tcp_ident_mapping naming the
 * foreign and local address of a connection.
 *
 * Drop:  the mapping is read from newp; the connection is dropped with
 *        ECONNABORTED unless its socket has SO_ACCEPTCONN set.  Returns
 *        ESRCH when no such connection exists.
 * Ident: the mapping is read from oldp, ruid/euid are filled in and the
 *        structure is copied back out to oldp.
 *
 * Returns 0 or an errno value.  Must be called with the net lock held.
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	NET_ASSERT_LOCKED();

	/*
	 * Validate the sysctl buffers and fetch the request.  A drop is
	 * a pure write (input in newp), an ident lookup uses oldp for
	 * both input and output.
	 */
	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}

	/*
	 * Interpret the addresses according to the family of the foreign
	 * address; any other family is rejected here.
	 */
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	/* Look for a PCB matching the full foreign/local address pair. */
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcblookup(&tcb6table, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcblookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		/* Sockets with SO_ACCEPTCONN set are never dropped. */
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		in_pcbunref(inp);
		return (error);
	}

	/* No exact match: retry against sockets listening on the port. */
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcb6table,
			    &l6, lin6->sin6_port, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
			break;
		}
	}

	/*
	 * User ids are only reported for sockets flagged SS_CONNECTOUT
	 * (set by tcp_connect()); everything else yields -1.
	 */
	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	in_pcbunref(inp);
	return (error);
}

/*
 * Export the TCP statistics as a struct tcpstat (TCPCTL_STATS).
 *
 * The counters are read into a local array and copied field by field,
 * then the syn cache fields are overwritten with values sampled from
 * the active syn cache set under syn_cache_mtx.  The result is handed
 * to sysctl_rdstruct(), whose return value is returned.
 */
int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

/*
 * Copy the next counter into the named field.  Fields receive the
 * counters strictly in the order listed below -- presumably this order
 * mirrors the tcps_* counter enumeration; verify when adding a counter.
 */
#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters), NULL);
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
	ASSIGN(tcps_rcvnosec);
	ASSIGN(tcps_rcvshort);
	ASSIGN(tcps_rcvduppack);
	ASSIGN(tcps_rcvdupbyte);
	ASSIGN(tcps_rcvpartduppack);
	ASSIGN(tcps_rcvpartdupbyte);
	ASSIGN(tcps_rcvoopack);
	ASSIGN(tcps_rcvoobyte);
	ASSIGN(tcps_rcvpackafterwin);
	ASSIGN(tcps_rcvbyteafterwin);
	ASSIGN(tcps_rcvafterclose);
	ASSIGN(tcps_rcvwinprobe);
	ASSIGN(tcps_rcvdupack);
	ASSIGN(tcps_rcvacktoomuch);
	ASSIGN(tcps_rcvacktooold);
	ASSIGN(tcps_rcvackpack);
	ASSIGN(tcps_rcvackbyte);
	ASSIGN(tcps_rcvwinupd);
	ASSIGN(tcps_pawsdrop);
	ASSIGN(tcps_predack);
	ASSIGN(tcps_preddat);
	ASSIGN(tcps_pcbhashmiss);
	ASSIGN(tcps_noport);
	ASSIGN(tcps_badsyn);
	ASSIGN(tcps_dropsyn);
	ASSIGN(tcps_rcvbadsig);
	ASSIGN(tcps_rcvgoodsig);
	ASSIGN(tcps_inswcsum);
	ASSIGN(tcps_outswcsum);
	ASSIGN(tcps_ecn_accepts);
	ASSIGN(tcps_ecn_rcvece);
	ASSIGN(tcps_ecn_rcvcwr);
	ASSIGN(tcps_ecn_rcvce);
	ASSIGN(tcps_ecn_sndect);
	ASSIGN(tcps_ecn_sndece);
	ASSIGN(tcps_ecn_sndcwr);
	ASSIGN(tcps_cwr_ecn);
	ASSIGN(tcps_cwr_frecovery);
	ASSIGN(tcps_cwr_timeout);
	ASSIGN(tcps_sc_added);
	ASSIGN(tcps_sc_completed);
	ASSIGN(tcps_sc_timed_out);
	ASSIGN(tcps_sc_overflowed);
	ASSIGN(tcps_sc_reset);
	ASSIGN(tcps_sc_unreach);
	ASSIGN(tcps_sc_bucketoverflow);
	ASSIGN(tcps_sc_aborted);
	ASSIGN(tcps_sc_dupesyn);
	ASSIGN(tcps_sc_dropped);
	ASSIGN(tcps_sc_collisions);
	ASSIGN(tcps_sc_retransmitted);
	ASSIGN(tcps_sc_seedrandom);
	ASSIGN(tcps_sc_hash_size);
	ASSIGN(tcps_sc_entry_count);
	ASSIGN(tcps_sc_entry_limit);
	ASSIGN(tcps_sc_bucket_maxlen);
	ASSIGN(tcps_sc_bucket_limit);
	ASSIGN(tcps_sc_uses_left);
	ASSIGN(tcps_conndrained);
	ASSIGN(tcps_sack_recovery_episode);
	ASSIGN(tcps_sack_rexmits);
	ASSIGN(tcps_sack_rexmit_bytes);
	ASSIGN(tcps_sack_rcv_opts);
	ASSIGN(tcps_sack_snd_opts);
	ASSIGN(tcps_sack_drop_opts);
	ASSIGN(tcps_outswtso);
	ASSIGN(tcps_outhwtso);
	ASSIGN(tcps_outpkttso);
	ASSIGN(tcps_outbadtso);
	ASSIGN(tcps_inswlro);
	ASSIGN(tcps_inhwlro);
	ASSIGN(tcps_inpktlro);
	ASSIGN(tcps_inbadlro);

#undef ASSIGN

	/*
	 * The syn cache figures assigned from the counters above are
	 * replaced by the current values of the active cache set.
	 */
	mtx_enter(&syn_cache_mtx);
	set = &tcp_syn_cache[tcp_syn_cache_active];
	tcpstat.tcps_sc_hash_size = set->scs_size;
	tcpstat.tcps_sc_entry_count = set->scs_count;
	tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
	tcpstat.tcps_sc_bucket_maxlen = 0;
	/* Report the length of the longest hash bucket. */
	for (i = 0; i < set->scs_size; i++) {
		if (tcpstat.tcps_sc_bucket_maxlen <
		    set->scs_buckethead[i].sch_length)
			tcpstat.tcps_sc_bucket_maxlen =
				set->scs_buckethead[i].sch_length;
	}
	tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
	tcpstat.tcps_sc_uses_left = set->scs_use;
	mtx_leave(&syn_cache_mtx);

	return (sysctl_rdstruct(oldp, oldlenp, newp,
	    &tcpstat, sizeof(tcpstat)));
}

/*
 * Sysctl for tcp variables.
*/
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int rv, val;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	/*
	 * The keep-alive timers are kept in TCP_TIME() units; they are
	 * converted with TCP_TIME(1) on the way in and out.
	 */
	case TCPCTL_KEEPINITTIME:
		NET_LOCK();
		val = tcptv_keep_init / TCP_TIME(1);
		rv = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &val,
		    1, 3 * (TCPTV_KEEP_INIT / TCP_TIME(1)));
		if (rv == 0)
			tcptv_keep_init = TCP_TIME(val);
		NET_UNLOCK();
		break;

	case TCPCTL_KEEPIDLE:
		NET_LOCK();
		val = tcp_keepidle / TCP_TIME(1);
		rv = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &val,
		    1, 5 * (TCPTV_KEEP_IDLE / TCP_TIME(1)));
		if (rv == 0)
			tcp_keepidle = TCP_TIME(val);
		NET_UNLOCK();
		break;

	case TCPCTL_KEEPINTVL:
		NET_LOCK();
		val = tcp_keepintvl / TCP_TIME(1);
		rv = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &val,
		    1, 3 * (TCPTV_KEEPINTVL / TCP_TIME(1)));
		if (rv == 0)
			tcp_keepintvl = TCP_TIME(val);
		NET_UNLOCK();
		break;

	case TCPCTL_BADDYNAMIC:
		NET_LOCK();
		rv = sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
		NET_UNLOCK();
		break;

	case TCPCTL_ROOTONLY:
		/* Read-only once the securelevel has been raised. */
		if (newp && securelevel > 0)
			return (EPERM);
		NET_LOCK();
		rv = sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
		NET_UNLOCK();
		break;

	case TCPCTL_IDENT:
		NET_LOCK();
		rv = tcp_ident(oldp, oldlenp, newp, newlen, 0);
		NET_UNLOCK();
		break;

	case TCPCTL_DROP:
		NET_LOCK();
		rv = tcp_ident(oldp, oldlenp, newp, newlen, 1);
		NET_UNLOCK();
		break;

	case TCPCTL_REASS_LIMIT:
		NET_LOCK();
		val = tcp_reass_limit;
		rv = sysctl_int(oldp, oldlenp, newp, newlen, &val);
		if (rv == 0 && val != tcp_reass_limit) {
			/* Only remember the limit if the pool took it. */
			rv = pool_sethardlimit(&tcpqe_pool, val, NULL, 0);
			if (rv == 0)
				tcp_reass_limit = val;
		}
		NET_UNLOCK();
		break;

	case TCPCTL_SACKHOLE_LIMIT:
		NET_LOCK();
		val = tcp_sackhole_limit;
		rv = sysctl_int(oldp, oldlenp, newp, newlen, &val);
		if (rv == 0 && val != tcp_sackhole_limit) {
			/* Only remember the limit if the pool took it. */
			rv = pool_sethardlimit(&sackhl_pool, val, NULL, 0);
			if (rv == 0)
				tcp_sackhole_limit = val;
		}
		NET_UNLOCK();
		break;

	case TCPCTL_STATS:
		rv = tcp_sysctl_tcpstat(oldp, oldlenp, newp);
		break;

	case TCPCTL_SYN_USE_LIMIT:
		NET_LOCK();
		rv = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &tcp_syn_use_limit, 0, INT_MAX);
		if (rv == 0 && newp != NULL) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache.  Also update the value in active cache.
			 */
			mtx_enter(&syn_cache_mtx);
			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
			mtx_leave(&syn_cache_mtx);
		}
		NET_UNLOCK();
		break;

	case TCPCTL_SYN_HASH_SIZE:
		NET_LOCK();
		val = tcp_syn_hash_size;
		rv = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &val, 1, 100000);
		if (rv == 0 && val != tcp_syn_hash_size) {
			/*
			 * If global hash size has been changed,
			 * switch sets as soon as possible.  Then
			 * the actual hash array will be reallocated.
			 */
			mtx_enter(&syn_cache_mtx);
			if (tcp_syn_cache[0].scs_size != val)
				tcp_syn_cache[0].scs_use = 0;
			if (tcp_syn_cache[1].scs_size != val)
				tcp_syn_cache[1].scs_use = 0;
			tcp_syn_hash_size = val;
			mtx_leave(&syn_cache_mtx);
		}
		NET_UNLOCK();
		break;

	default:
		/* Everything else is a bounded integer from tcpctl_vars. */
		NET_LOCK();
		rv = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
		    name, namelen, oldp, oldlenp, newp, newlen);
		NET_UNLOCK();
		break;
	}

	return (rv);
}

/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit. The buffer will scale with the congestion window, if the
 * receiver stops acking data the window will shrink and therefore
 * the buffer size will shrink as well.
 * In low memory situation try to shrink the buffer to the initial size
 * disabling the send buffer scaling as long as the situation persists.
*/
void
tcp_update_sndspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	u_long nmax = sb->sb_hiwat;

	if (!sbchecklowmem()) {
		if (sb->sb_wat == tcp_sendspace) {
			/* automatic scaling: grow by the unacked amount */
			nmax = MIN(sb_max, sb->sb_wat + tp->snd_max -
			    tp->snd_una);
		} else {
			/* size was requested by the user, do not auto-scale */
			nmax = sb->sb_wat;
		}
	} else {
		/* memory is short: never go above tcp_sendspace */
		if (tcp_sendspace < nmax)
			nmax = tcp_sendspace;
	}

	/* poll(2) semantics: a socket that is writable now must stay so */
	if (sbspace(so, sb) >= sb->sb_lowat) {
		if (nmax < sb->sb_cc + sb->sb_lowat)
			nmax = sb->sb_cc + sb->sb_lowat;
		/* this mirrors the calculation done by sbreserve() */
		if (nmax * 8 < sb->sb_mbcnt + sb->sb_lowat)
			nmax = (sb->sb_mbcnt + sb->sb_lowat + 7) / 8;
	}

	/* align the result to a multiple of the segment size */
	nmax = roundup(nmax, tp->t_maxseg);

	/* nothing to do when the high-water mark would not move */
	if (nmax == sb->sb_hiwat)
		return;

	sbreserve(so, sb, nmax);
}

/*
 * Scale the recv buffer by looking at how much data was transferred in
 * one approximated RTT. If more than a big part of the recv buffer was
 * transferred during that time we increase the buffer by a constant.
 * In low memory situation try to shrink the buffer to the initial size.
*/
void
tcp_update_rcvspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	/* start from the current receive high-water mark */
	u_long nmax = so->so_rcv.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_recvspace < nmax)
			nmax = tcp_recvspace;
	} else if (so->so_rcv.sb_wat != tcp_recvspace) {
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_rcv.sb_wat;
	} else {
		/*
		 * automatic buffer scaling: grow by tcp_autorcvbuf_inc,
		 * capped at sb_max, once rfbuf_cnt exceeds 7/8 of the
		 * current high-water mark
		 */
		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
			    tcp_autorcvbuf_inc);
	}

	/*
	 * A readable socket must be preserved because of poll(2) semantics.
	 * Readability is judged against the receive buffer's low-water
	 * mark, so that same mark (not the send buffer's) is the floor
	 * for the new receive buffer size.
	 */
	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
	    nmax < so->so_rcv.sb_lowat)
		nmax = so->so_rcv.sb_lowat;

	/* unchanged target: skip the rounding and the reservation */
	if (nmax == so->so_rcv.sb_hiwat)
		return;

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);
	sbreserve(so, &so->so_rcv, nmax);
}