summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarkus Friedl <markus@cvs.openbsd.org>2004-01-06 17:38:14 +0000
committerMarkus Friedl <markus@cvs.openbsd.org>2004-01-06 17:38:14 +0000
commit401fa3b54b2101e3c2f24105fd68e172efc83eeb (patch)
tree93bf90bf09f2a13ae52bc0d676d9cc2291f6dcac
parent7eaa724b45d0826b73a65838668a14035d70761c (diff)
import netbsd's version of David Borman's syncache code
http://www.kohala.com/start/borman.97jun06.txt; ok deraadt@, henning@
-rw-r--r--sys/netinet/tcp_input.c1674
-rw-r--r--sys/netinet/tcp_subr.c42
-rw-r--r--sys/netinet/tcp_usrreq.c8
-rw-r--r--sys/netinet/tcp_var.h128
-rw-r--r--usr.bin/netstat/inet.c20
5 files changed, 1475 insertions, 397 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index a9d90cc0f1d..58f4e19a1ca 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_input.c,v 1.137 2003/12/21 14:50:04 markus Exp $ */
+/* $OpenBSD: tcp_input.c,v 1.138 2004/01/06 17:38:12 markus Exp $ */
/* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */
/*
@@ -77,6 +77,8 @@
#include <sys/socketvar.h>
#include <sys/kernel.h>
+#include <dev/rndvar.h>
+
#include <net/if.h>
#include <net/route.h>
@@ -304,49 +306,6 @@ present:
return (flags);
}
-/*
- * First check for a port-specific bomb. We do not want to drop half-opens
- * for other ports if this is the only port being bombed. We only check
- * the bottom 40 half open connections, to avoid wasting too much time.
- *
- * Or, otherwise it is more likely a generic syn bomb, so delete the oldest
- * half-open connection.
- */
-void
-tcpdropoldhalfopen(avoidtp, port)
- struct tcpcb *avoidtp;
- u_int16_t port;
-{
- struct inpcb *inp;
- struct tcpcb *tp;
- int ncheck = 40;
- int s;
-
- s = splnet();
- CIRCLEQ_FOREACH_REVERSE(inp, &tcbtable.inpt_queue, inp_queue) {
- if ((tp = (struct tcpcb *)inp->inp_ppcb) &&
- tp != avoidtp &&
- tp->t_state == TCPS_SYN_RECEIVED &&
- port == inp->inp_lport) {
- tcp_close(tp);
- goto done;
- }
- if (--ncheck)
- break;
- }
-
- CIRCLEQ_FOREACH_REVERSE(inp, &tcbtable.inpt_queue, inp_queue) {
- if ((tp = (struct tcpcb *)inp->inp_ppcb) &&
- tp != avoidtp &&
- tp->t_state == TCPS_SYN_RECEIVED) {
- tcp_close(tp);
- goto done;
- }
- }
-done:
- splx(s);
-}
-
#ifdef INET6
int
tcp6_input(mp, offp, proto)
@@ -403,17 +362,13 @@ tcp_input(struct mbuf *m, ...)
int todrop, acked, ourfinisacked, needoutput = 0;
int hdroptlen = 0;
short ostate = 0;
- struct in_addr laddr;
- int dropsocket = 0;
int iss = 0;
u_long tiwin;
- u_int32_t ts_val, ts_ecr;
- int ts_present = 0;
+ struct tcp_opt_info opti;
int iphlen;
va_list ap;
struct tcphdr *th;
#ifdef INET6
- struct in6_addr laddr6;
struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
#ifdef IPSEC
@@ -433,6 +388,9 @@ tcp_input(struct mbuf *m, ...)
tcpstat.tcps_rcvtotal++;
+ opti.ts_present = 0;
+ opti.maxseg = 0;
+
/*
* RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
* See below for AF specific multicast.
@@ -646,9 +604,9 @@ tcp_input(struct mbuf *m, ...)
optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
*(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
(th->th_flags & TH_SYN) == 0) {
- ts_present = 1;
- ts_val = ntohl(*(u_int32_t *)(optp + 4));
- ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
+ opti.ts_present = 1;
+ opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
+ opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
optp = NULL; /* we've parsed the options */
}
}
@@ -720,6 +678,42 @@ findpcb:
so = inp->inp_socket;
if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
+ union syn_cache_sa src;
+ union syn_cache_sa dst;
+
+ bzero(&src, sizeof(src));
+ bzero(&dst, sizeof(dst));
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ src.sin.sin_len = sizeof(struct sockaddr_in);
+ src.sin.sin_family = AF_INET;
+ src.sin.sin_addr = ip->ip_src;
+ src.sin.sin_port = th->th_sport;
+
+ dst.sin.sin_len = sizeof(struct sockaddr_in);
+ dst.sin.sin_family = AF_INET;
+ dst.sin.sin_addr = ip->ip_dst;
+ dst.sin.sin_port = th->th_dport;
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ src.sin6.sin6_len = sizeof(struct sockaddr_in6);
+ src.sin6.sin6_family = AF_INET6;
+ src.sin6.sin6_addr = ip6->ip6_src;
+ src.sin6.sin6_port = th->th_sport;
+
+ dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
+ dst.sin6.sin6_family = AF_INET6;
+ dst.sin6.sin6_addr = ip6->ip6_dst;
+ dst.sin6.sin6_port = th->th_dport;
+ break;
+#endif /* INET6 */
+ default:
+ goto badsyn; /*sanity*/
+ }
+
if (so->so_options & SO_DEBUG) {
ostate = tp->t_state;
switch (af) {
@@ -734,150 +728,160 @@ findpcb:
}
}
if (so->so_options & SO_ACCEPTCONN) {
- struct socket *so1;
+ if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
+ if (tiflags & TH_RST) {
+ syn_cache_reset(&src.sa, &dst.sa, th);
+ } else if ((tiflags & (TH_ACK|TH_SYN)) ==
+ (TH_ACK|TH_SYN)) {
+ /*
+ * Received a SYN,ACK. This should
+ * never happen while we are in
+ * LISTEN. Send an RST.
+ */
+ goto badsyn;
+ } else if (tiflags & TH_ACK) {
+ so = syn_cache_get(&src.sa, &dst.sa,
+ th, iphlen, tlen, so, m);
+ if (so == NULL) {
+ /*
+ * We don't have a SYN for
+ * this ACK; send an RST.
+ */
+ goto badsyn;
+ } else if (so ==
+ (struct socket *)(-1)) {
+ /*
+ * We were unable to create
+ * the connection. If the
+ * 3-way handshake was
+ * completed, and RST has
+ * been sent to the peer.
+ * Since the mbuf might be
+ * in use for the reply,
+ * do not free it.
+ */
+ m = NULL;
+ } else {
+ /*
+ * We have created a
+ * full-blown connection.
+ */
+ tp = NULL;
+ inp = (struct inpcb *)so->so_pcb;
+ tp = intotcpcb(inp);
+ if (tp == NULL)
+ goto badsyn; /*XXX*/
+ /*
+ * Compute proper scaling
+ * value from buffer space
+ */
+ tcp_rscale(tp, so->so_rcv.sb_hiwat);
+ goto after_listen;
+ }
+ } else {
+ /*
+ * None of RST, SYN or ACK was set.
+ * This is an invalid packet for a
+ * TCB in LISTEN state. Send a RST.
+ */
+ goto badsyn;
+ }
+ } else {
+ /*
+ * Received a SYN.
+ */
#ifdef INET6
- /*
- * If deprecated address is forbidden,
- * we do not accept SYN to deprecated interface
- * address to prevent any new inbound connection from
- * getting established. So drop the SYN packet.
- * When we do not accept SYN, we send a TCP RST,
- * with deprecated source address (instead of dropping
- * it). We compromise it as it is much better for peer
- * to send a RST, and RST will be the final packet
- * for the exchange.
- *
- * If we do not forbid deprecated addresses, we accept
- * the SYN packet. RFC2462 does not suggest dropping
- * SYN in this case.
- * If we decipher RFC2462 5.5.4, it says like this:
- * 1. use of deprecated addr with existing
- * communication is okay - "SHOULD continue to be
- * used"
- * 2. use of it with new communication:
- * (2a) "SHOULD NOT be used if alternate address
- * with sufficient scope is available"
- * (2b) nothing mentioned otherwise.
- * Here we fall into (2b) case as we have no choice in
- * our source address selection - we must obey the peer.
- *
- * The wording in RFC2462 is confusing, and there are
- * multiple description text for deprecated address
- * handling - worse, they are not exactly the same.
- * I believe 5.5.4 is the best one, so we follow 5.5.4.
- */
- if (ip6 && !ip6_use_deprecated) {
- struct in6_ifaddr *ia6;
-
- if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif, &ip6->ip6_dst)) &&
- (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
- tp = NULL;
- goto dropwithreset;
+ /*
+ * If deprecated address is forbidden,
+ * we do not accept SYN to deprecated interface
+ * address to prevent any new inbound connection from
+ * getting established. So drop the SYN packet.
+ * When we do not accept SYN, we send a TCP RST,
+ * with deprecated source address (instead of dropping
+ * it). We compromise it as it is much better for peer
+ * to send a RST, and RST will be the final packet
+ * for the exchange.
+ *
+ * If we do not forbid deprecated addresses, we accept
+ * the SYN packet. RFC2462 does not suggest dropping
+ * SYN in this case.
+ * If we decipher RFC2462 5.5.4, it says like this:
+ * 1. use of deprecated addr with existing
+ * communication is okay - "SHOULD continue to be
+ * used"
+ * 2. use of it with new communication:
+ * (2a) "SHOULD NOT be used if alternate address
+ * with sufficient scope is available"
+ * (2b) nothing mentioned otherwise.
+ * Here we fall into (2b) case as we have no choice in
+ * our source address selection - we must obey the peer.
+ *
+ * The wording in RFC2462 is confusing, and there are
+ * multiple description text for deprecated address
+ * handling - worse, they are not exactly the same.
+ * I believe 5.5.4 is the best one, so we follow 5.5.4.
+ */
+ if (ip6 && !ip6_use_deprecated) {
+ struct in6_ifaddr *ia6;
+
+ if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
+ &ip6->ip6_dst)) &&
+ (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
+ tp = NULL;
+ goto dropwithreset;
+ }
}
- }
#endif
- so1 = sonewconn(so, 0);
- if (so1 == NULL) {
- tcpdropoldhalfopen(tp, th->th_dport);
- so1 = sonewconn(so, 0);
- if (so1 == NULL)
- goto drop;
- }
- so = so1;
- /*
- * This is ugly, but ....
- *
- * Mark socket as temporary until we're
- * committed to keeping it. The code at
- * ``drop'' and ``dropwithreset'' check the
- * flag dropsocket to see if the temporary
- * socket created here should be discarded.
- * We mark the socket as discardable until
- * we're committed to it below in TCPS_LISTEN.
- */
- dropsocket++;
-#ifdef IPSEC
- /*
- * We need to copy the required security levels
- * from the old pcb. Ditto for any other
- * IPsec-related information.
- */
- {
- struct inpcb *newinp = (struct inpcb *)so->so_pcb;
- bcopy(inp->inp_seclevel, newinp->inp_seclevel,
- sizeof(inp->inp_seclevel));
- newinp->inp_secrequire = inp->inp_secrequire;
- if (inp->inp_ipo != NULL) {
- newinp->inp_ipo = inp->inp_ipo;
- inp->inp_ipo->ipo_ref_count++;
- }
- if (inp->inp_ipsec_remotecred != NULL) {
- newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred;
- inp->inp_ipsec_remotecred->ref_count++;
- }
- if (inp->inp_ipsec_remoteauth != NULL) {
- newinp->inp_ipsec_remoteauth
- = inp->inp_ipsec_remoteauth;
- inp->inp_ipsec_remoteauth->ref_count++;
- }
- }
-#endif /* IPSEC */
+ /*
+ * LISTEN socket received a SYN
+ * from itself? This can't possibly
+ * be valid; drop the packet.
+ */
+ if (th->th_dport == th->th_sport) {
+ switch (af) {
#ifdef INET6
- /*
- * inp still has the OLD in_pcb stuff, set the
- * v6-related flags on the new guy, too. This is
- * done particularly for the case where an AF_INET6
- * socket is bound only to a port, and a v4 connection
- * comes in on that port.
- * we also copy the flowinfo from the original pcb
- * to the new one.
- */
- {
- int flags = inp->inp_flags;
- struct inpcb *oldinpcb = inp;
-
- inp = (struct inpcb *)so->so_pcb;
- inp->inp_flags |= (flags & INP_IPV6);
- if ((inp->inp_flags & INP_IPV6) != 0) {
- inp->inp_ipv6.ip6_hlim =
- oldinpcb->inp_ipv6.ip6_hlim;
- }
- }
-#else /* INET6 */
- inp = (struct inpcb *)so->so_pcb;
+ case AF_INET6:
+ if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
+ &ip6->ip6_dst)) {
+ tcpstat.tcps_badsyn++;
+ goto drop;
+ }
+ break;
#endif /* INET6 */
- inp->inp_lport = th->th_dport;
- switch (af) {
-#ifdef INET6
- case AF_INET6:
- inp->inp_laddr6 = ip6->ip6_dst;
+ case AF_INET:
+ if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
+ tcpstat.tcps_badsyn++;
+ goto drop;
+ }
+ break;
+ }
+ }
- /*inp->inp_options = ip6_srcroute();*/ /* soon. */
/*
- * still need to tweak outbound options
- * processing to include this mbuf in
- * the right place and put the correct
- * NextHdr values in the right places.
- * XXX rja
+ * SYN looks ok; create compressed TCP
+ * state for it.
*/
- break;
-#endif /* INET6 */
- case AF_INET:
- inp->inp_laddr = ip->ip_dst;
- inp->inp_options = ip_srcroute();
- break;
+ if (so->so_qlen <= so->so_qlimit &&
+ syn_cache_add(&src.sa, &dst.sa, th, tlen,
+ so, m, optp, optlen, &opti))
+ m = NULL;
}
- in_pcbrehash(inp);
- tp = intotcpcb(inp);
- tp->t_state = TCPS_LISTEN;
-
- /* Compute proper scaling value from buffer space */
- tcp_rscale(tp, so->so_rcv.sb_hiwat);
+ goto drop;
}
}
+after_listen:
+#ifdef DIAGNOSTIC
+ /*
+ * Should not happen now that all embryonic connections
+ * are handled with compressed state.
+ */
+ if (tp->t_state == TCPS_LISTEN)
+ panic("tcp_input: TCPS_LISTEN");
+#endif
+
#ifdef IPSEC
/* Find most recent IPsec tag */
mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
@@ -946,12 +950,10 @@ findpcb:
#endif /* TCP_SACK */
/*
- * Process options if not in LISTEN state,
- * else do it below (after getting remote address).
+ * Process options.
*/
- if (optp && tp->t_state != TCPS_LISTEN)
- tcp_dooptions(tp, optp, optlen, th,
- &ts_present, &ts_val, &ts_ecr);
+ if (optp)
+ tcp_dooptions(tp, optp, optlen, th, &opti);
#ifdef TCP_SACK
if (!tp->sack_disable) {
@@ -986,7 +988,7 @@ findpcb:
#else
(tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
#endif
- (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) &&
+ (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
th->th_seq == tp->rcv_nxt &&
tiwin && tiwin == tp->snd_wnd &&
tp->snd_nxt == tp->snd_max) {
@@ -996,9 +998,9 @@ findpcb:
* record the timestamp.
* Fix from Braden, see Stevens p. 870
*/
- if (ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
tp->ts_recent_age = tcp_now;
- tp->ts_recent = ts_val;
+ tp->ts_recent = opti.ts_val;
}
if (tlen == 0) {
@@ -1010,8 +1012,8 @@ findpcb:
* this is a pure ack for outstanding data.
*/
++tcpstat.tcps_predack;
- if (ts_present)
- tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
+ if (opti.ts_present)
+ tcp_xmit_timer(tp, tcp_now-opti.ts_ecr+1);
else if (tp->t_rtttime &&
SEQ_GT(th->th_ack, tp->t_rtseq))
tcp_xmit_timer(tp,
@@ -1117,170 +1119,6 @@ findpcb:
switch (tp->t_state) {
/*
- * If the state is LISTEN then ignore segment if it contains an RST.
- * If the segment contains an ACK then it is bad and send a RST.
- * If it does not contain a SYN then it is not interesting; drop it.
- * If it is from this socket, drop it, it must be forged.
- * Don't bother responding if the destination was a broadcast.
- * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
- * tp->iss, and send a segment:
- * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
- * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
- * Fill in remote peer address fields if not previously specified.
- * Enter SYN_RECEIVED state, and process any other fields of this
- * segment in this state.
- */
- case TCPS_LISTEN: {
- struct mbuf *am;
- struct sockaddr_in *sin;
-#ifdef INET6
- struct sockaddr_in6 *sin6;
-#endif /* INET6 */
-
- if (tiflags & TH_RST)
- goto drop;
- if (tiflags & TH_ACK)
- goto dropwithreset;
- if ((tiflags & TH_SYN) == 0)
- goto drop;
- if (th->th_dport == th->th_sport) {
- switch (af) {
-#ifdef INET6
- case AF_INET6:
- if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
- &ip6->ip6_dst))
- goto drop;
- break;
-#endif /* INET6 */
- case AF_INET:
- if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
- goto drop;
- break;
- }
- }
-
- am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
- if (am == NULL)
- goto drop;
- switch (af) {
-#ifdef INET6
- case AF_INET6:
- /*
- * This is probably the place to set the tp->pf value.
- * (Don't forget to do it in the v4 code as well!)
- *
- * Also, remember to blank out things like flowlabel, or
- * set flowlabel for accepted sockets in v6.
- *
- * FURTHERMORE, this is PROBABLY the place where the
- * whole business of key munging is set up for passive
- * connections.
- */
- am->m_len = sizeof(struct sockaddr_in6);
- sin6 = mtod(am, struct sockaddr_in6 *);
- bzero(sin6, sizeof(*sin6));
- sin6->sin6_family = AF_INET6;
- sin6->sin6_len = sizeof(struct sockaddr_in6);
- sin6->sin6_addr = ip6->ip6_src;
- sin6->sin6_port = th->th_sport;
- sin6->sin6_flowinfo =
- ip6->ip6_flow & IPV6_FLOWINFO_MASK;
- laddr6 = inp->inp_laddr6;
- if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6))
- inp->inp_laddr6 = ip6->ip6_dst;
- /* This is a good optimization. */
- if (in6_pcbconnect(inp, am)) {
- inp->inp_laddr6 = laddr6;
- (void) m_free(am);
- goto drop;
- }
- break;
-#endif
- case AF_INET:
- /* drop IPv4 packet to AF_INET6 socket */
- if (inp->inp_flags & INP_IPV6) {
- (void) m_free(am);
- goto drop;
- }
- am->m_len = sizeof(struct sockaddr_in);
- sin = mtod(am, struct sockaddr_in *);
- bzero(sin, sizeof(*sin));
- sin->sin_family = AF_INET;
- sin->sin_len = sizeof(*sin);
- sin->sin_addr = ip->ip_src;
- sin->sin_port = th->th_sport;
- bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
- laddr = inp->inp_laddr;
- if (inp->inp_laddr.s_addr == INADDR_ANY)
- inp->inp_laddr = ip->ip_dst;
- if (in_pcbconnect(inp, am)) {
- inp->inp_laddr = laddr;
- (void) m_free(am);
- goto drop;
- }
- break;
- }
- (void) m_free(am);
- tp->t_template = tcp_template(tp);
- if (tp->t_template == 0) {
- tp = tcp_drop(tp, ENOBUFS);
- dropsocket = 0; /* socket is already gone */
- goto drop;
- }
- if (optp)
- tcp_dooptions(tp, optp, optlen, th,
- &ts_present, &ts_val, &ts_ecr);
-#ifdef TCP_SACK
- /*
- * If peer did not send a SACK_PERMITTED option (i.e., if
- * tcp_dooptions() did not set TF_SACK_PERMIT), set
- * sack_disable to 1 if it is currently 0.
- */
- if (!tp->sack_disable)
- if ((tp->t_flags & TF_SACK_PERMIT) == 0)
- tp->sack_disable = 1;
-#endif
-
- if (iss)
- tp->iss = iss;
- else {
-#ifdef TCP_COMPAT_42
- tcp_iss += TCP_ISSINCR/2;
- tp->iss = tcp_iss;
-#else /* TCP_COMPAT_42 */
- tp->iss = tcp_rndiss_next();
-#endif /* !TCP_COMPAT_42 */
- }
- tp->irs = th->th_seq;
- tcp_sendseqinit(tp);
-#if defined (TCP_SACK) || defined(TCP_ECN)
- tp->snd_last = tp->snd_una;
-#endif /* TCP_SACK */
-#if defined(TCP_SACK) && defined(TCP_FACK)
- tp->snd_fack = tp->snd_una;
- tp->retran_data = 0;
- tp->snd_awnd = 0;
-#endif /* TCP_FACK */
-#ifdef TCP_ECN
- /*
- * if both ECE and CWR flag bits are set, peer is ECN capable.
- */
- if (tcp_do_ecn &&
- (tiflags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
- tp->t_flags |= TF_ECN_PERMIT;
- tcpstat.tcps_ecn_accepts++;
- }
-#endif
- tcp_rcvseqinit(tp);
- tp->t_flags |= TF_ACKNOW;
- tp->t_state = TCPS_SYN_RECEIVED;
- TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
- dropsocket = 0; /* committed to socket */
- tcpstat.tcps_accepts++;
- goto trimthenstep6;
- }
-
- /*
* If the state is SYN_RECEIVED:
* if seg contains SYN/ACK, send an RST.
* if seg contains an ACK, but not for our SYN/ACK, send an RST
@@ -1334,6 +1172,7 @@ findpcb:
}
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->irs = th->th_seq;
+ tcp_mss(tp, opti.maxseg);
tcp_rcvseqinit(tp);
tp->t_flags |= TF_ACKNOW;
#ifdef TCP_SACK
@@ -1394,7 +1233,9 @@ findpcb:
} else
tp->t_state = TCPS_SYN_RECEIVED;
+#if 0
trimthenstep6:
+#endif
/*
* Advance th->th_seq to correspond to first data byte.
* If data, trim to stay within window,
@@ -1422,10 +1263,10 @@ trimthenstep6:
* drop leading data (and SYN); if nothing left, just ack.
*
* RFC 1323 PAWS: If we have a timestamp reply on this segment
- * and it's less than ts_recent, drop it.
+ * and it's less than tp->ts_recent, drop it.
*/
- if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
- TSTMP_LT(ts_val, tp->ts_recent)) {
+ if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
+ TSTMP_LT(opti.ts_val, tp->ts_recent)) {
/* Check to see if ts_recent is over 24 days old. */
if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
@@ -1547,10 +1388,10 @@ trimthenstep6:
* record its timestamp.
* Fix from Braden, see Stevens p. 870
*/
- if (ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) &&
+ if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
tp->ts_recent_age = tcp_now;
- tp->ts_recent = ts_val;
+ tp->ts_recent = opti.ts_val;
}
/*
@@ -1919,8 +1760,8 @@ trimthenstep6:
* timer backoff (cf., Phil Karn's retransmit alg.).
* Recompute the initial retransmit timer.
*/
- if (ts_present)
- tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
+ if (opti.ts_present)
+ tcp_xmit_timer(tp, tcp_now-opti.ts_ecr+1);
else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
@@ -2251,6 +2092,14 @@ dodata: /* XXX */
}
return;
+badsyn:
+ /*
+ * Received a bad SYN. Increment counters and dropwithreset.
+ */
+ tcpstat.tcps_badsyn++;
+ tp = NULL;
+ goto dropwithreset;
+
dropafterack:
/*
* Generate an ACK dropping incoming segment if it occupies
@@ -2294,9 +2143,6 @@ dropwithreset:
tcp_respond(tp, mtod(m, caddr_t), m, th->th_seq + tlen,
(tcp_seq)0, TH_RST|TH_ACK);
}
- /* destroy temporarily created socket */
- if (dropsocket)
- (void) soabort(so);
return;
drop:
@@ -2319,21 +2165,17 @@ drop:
}
m_freem(m);
- /* destroy temporarily created socket */
- if (dropsocket)
- (void) soabort(so);
return;
#ifndef TUBA_INCLUDE
}
void
-tcp_dooptions(tp, cp, cnt, th, ts_present, ts_val, ts_ecr)
+tcp_dooptions(tp, cp, cnt, th, oi)
struct tcpcb *tp;
u_char *cp;
int cnt;
struct tcphdr *th;
- int *ts_present;
- u_int32_t *ts_val, *ts_ecr;
+ struct tcp_opt_info *oi;
{
u_int16_t mss = 0;
int opt, optlen;
@@ -2363,6 +2205,7 @@ tcp_dooptions(tp, cp, cnt, th, ts_present, ts_val, ts_ecr)
continue;
bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
NTOHS(mss);
+ oi->maxseg = mss;
break;
case TCPOPT_WINDOW:
@@ -2377,11 +2220,11 @@ tcp_dooptions(tp, cp, cnt, th, ts_present, ts_val, ts_ecr)
case TCPOPT_TIMESTAMP:
if (optlen != TCPOLEN_TIMESTAMP)
continue;
- *ts_present = 1;
- bcopy((char *)cp + 2, (char *) ts_val, sizeof(*ts_val));
- NTOHL(*ts_val);
- bcopy((char *)cp + 6, (char *) ts_ecr, sizeof(*ts_ecr));
- NTOHL(*ts_ecr);
+ oi->ts_present = 1;
+ bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
+ NTOHL(oi->ts_val);
+ bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
+ NTOHL(oi->ts_ecr);
/*
* A timestamp received in a SYN makes
@@ -2389,7 +2232,7 @@ tcp_dooptions(tp, cp, cnt, th, ts_present, ts_val, ts_ecr)
*/
if (th->th_flags & TH_SYN) {
tp->t_flags |= TF_RCVD_TSTMP;
- tp->ts_recent = *ts_val;
+ tp->ts_recent = oi->ts_val;
tp->ts_recent_age = tcp_now;
}
break;
@@ -2409,13 +2252,6 @@ tcp_dooptions(tp, cp, cnt, th, ts_present, ts_val, ts_ecr)
#endif
}
}
- /* Update t_maxopd and t_maxseg after all options are processed */
- if (th->th_flags & TH_SYN) {
- (void) tcp_mss(tp, mss); /* sets t_maxseg */
-
- if (mss)
- tcp_mss_update(tp);
- }
}
#if defined(TCP_SACK)
@@ -3245,3 +3081,1071 @@ tcp_newreno(tp, th)
return 0;
}
#endif /* TCP_SACK */
+
+/*
+ * Compute the MSS to advertise for a connection of address family `af'
+ * whose packets arrive on interface `ifp'.
+ *
+ * The result is the interface MTU minus the IP (or IPv6) header and the
+ * TCP header, but never less than tcp_mssdflt.  When `ifp' is NULL, the
+ * address family is not handled here, or the MTU is too small to hold the
+ * headers, tcp_mssdflt is returned.
+ *
+ * The arithmetic is done in a wide type on purpose: with a 16-bit
+ * accumulator, a missing interface (MTU taken as 0) made the header
+ * subtraction wrap around to a value close to 65535.
+ */
+static int
+tcp_mss_adv(struct ifnet *ifp, int af)
+{
+	u_long mtu = 0;
+	int hdrlen;
+	int mss;
+
+	switch (af) {
+	case AF_INET:
+		if (ifp != NULL)
+			mtu = ifp->if_mtu;
+		hdrlen = sizeof(struct ip);
+		break;
+#ifdef INET6
+	case AF_INET6:
+		if (ifp != NULL)
+			mtu = IN6_LINKMTU(ifp);
+		hdrlen = sizeof(struct ip6_hdr);
+		break;
+#endif
+	default:
+		/* Unknown family: there is no header size to subtract. */
+		return (tcp_mssdflt);
+	}
+	hdrlen += sizeof(struct tcphdr);
+
+	/* The value previously lived in 16 bits; keep that upper bound. */
+	if (mtu > 0xffff)
+		mtu = 0xffff;
+
+	/* Avoid going negative when the MTU cannot even hold the headers. */
+	if (mtu > (u_long)hdrlen)
+		mss = (int)(mtu - (u_long)hdrlen);
+	else
+		mss = 0;
+
+	return (mss > tcp_mssdflt ? mss : tcp_mssdflt);
+}
+
+/*
+ * TCP compressed state engine. Currently used to hold compressed
+ * state for SYN_RECEIVED.
+ */
+
+/*
+ * syn_cache_count is the number of entries currently held in the cache
+ * (incremented in syn_cache_insert(), decremented by SYN_CACHE_RM).
+ * syn_hash1 and syn_hash2 are the secrets mixed into the bucket hash;
+ * syn_cache_insert() re-seeds them from arc4random() whenever the
+ * cache is empty.
+ */
+u_long syn_cache_count;
+u_int32_t syn_hash1, syn_hash2;
+
+/*
+ * SYN_HASH(sa, sp, dp): hash for an IPv4 peer.  `sa' points to a struct
+ * with an s_addr member, `sp' and `dp' are the source and destination
+ * ports.  The address is XORed with syn_hash1 and multiplied by the
+ * 32-bit value (dp << 16) + sp XORed with syn_hash2.
+ *
+ * SYN_HASH6(sa, sp, dp): same construction for IPv6, using only the
+ * first and last 32-bit words of the address.  Unlike SYN_HASH, the
+ * result is masked to 31 bits.
+ *
+ * SYN_HASHALL(hash, src, dst): store the hash for the sockaddr pair
+ * (src, dst) in `hash'.  Without INET6 the pair is always treated as
+ * sockaddr_in.  With INET6 the macro dispatches on (src)->sa_family and
+ * yields 0 for any other family.  Only the source address and the two
+ * ports take part in the hash; the destination address does not.
+ *
+ * All of these are macros: arguments are evaluated more than once.
+ */
+#define SYN_HASH(sa, sp, dp) \
+ ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
+ ((u_int32_t)(sp)))^syn_hash2)))
+#ifndef INET6
+#define SYN_HASHALL(hash, src, dst) \
+do { \
+ hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
+ ((struct sockaddr_in *)(src))->sin_port, \
+ ((struct sockaddr_in *)(dst))->sin_port); \
+} while (/*CONSTCOND*/ 0)
+#else
+#define SYN_HASH6(sa, sp, dp) \
+ ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
+ (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
+ & 0x7fffffff)
+
+#define SYN_HASHALL(hash, src, dst) \
+do { \
+ switch ((src)->sa_family) { \
+ case AF_INET: \
+ hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
+ ((struct sockaddr_in *)(src))->sin_port, \
+ ((struct sockaddr_in *)(dst))->sin_port); \
+ break; \
+ case AF_INET6: \
+ hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \
+ ((struct sockaddr_in6 *)(src))->sin6_port, \
+ ((struct sockaddr_in6 *)(dst))->sin6_port); \
+ break; \
+ default: \
+ hash = 0; \
+ } \
+} while (/*CONSTCOND*/0)
+#endif /* INET6 */
+
+/*
+ * SYN_CACHE_RM(sc): unlink entry `sc' from the cache without freeing it.
+ * The entry is taken off its hash bucket queue, its back pointer to the
+ * owning tcpcb is cleared, it is removed from that tcpcb's list of
+ * entries, the bucket length and the global entry count are decremented,
+ * and its pending timeout is cancelled.  Releasing the memory is a
+ * separate step (see SYN_CACHE_PUT).
+ *
+ * `sc' is evaluated several times and must not be NULL.
+ */
+#define SYN_CACHE_RM(sc) \
+do { \
+ TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket, \
+ (sc), sc_bucketq); \
+ (sc)->sc_tp = NULL; \
+ LIST_REMOVE((sc), sc_tpq); \
+ tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \
+ timeout_del(&(sc)->sc_timer); \
+ syn_cache_count--; \
+} while (/*CONSTCOND*/0)
+
+/*
+ * SYN_CACHE_PUT(sc): release an entry that is no longer linked into the
+ * cache.  Frees the saved IP options mbuf if there is one, drops the
+ * cached IPv4 route if one is held, and returns the entry to
+ * syn_cache_pool.  `sc' must not be used afterwards.
+ */
+#define SYN_CACHE_PUT(sc) \
+do { \
+ if ((sc)->sc_ipopts) \
+ (void) m_free((sc)->sc_ipopts); \
+ if ((sc)->sc_route4.ro_rt != NULL) \
+ RTFREE((sc)->sc_route4.ro_rt); \
+ pool_put(&syn_cache_pool, (sc)); \
+} while (/*CONSTCOND*/0)
+
+struct pool syn_cache_pool;
+
+/*
+ * We don't estimate RTT with SYNs, so each packet starts with the default
+ * RTT and each timer step has a fixed timeout value.
+ *
+ * SYN_CACHE_TIMER_ARM(sc): (re)start the retransmit timer of entry `sc'.
+ * The interval sc_rxtcur is derived from TCPTV_SRTTDFLT scaled by the
+ * tcp_backoff[] factor for the current sc_rxtshift, with TCPTV_MIN and
+ * TCPTV_REXMTMAX passed to TCPT_RANGESET as bounds.  The timeout is bound
+ * to syn_cache_timer() the first time it is armed, then scheduled after
+ * sc_rxtcur * (hz / PR_SLOWHZ) ticks.
+ *
+ * SYN_CACHE_TIMESTAMP(sc): expands to tcp_now; the argument is unused.
+ */
+#define SYN_CACHE_TIMER_ARM(sc) \
+do { \
+ TCPT_RANGESET((sc)->sc_rxtcur, \
+ TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
+ TCPTV_REXMTMAX); \
+ if (!timeout_initialized(&(sc)->sc_timer)) \
+ timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc)); \
+ timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \
+} while (/*CONSTCOND*/0)
+
+#define SYN_CACHE_TIMESTAMP(sc) tcp_now
+
+/*
+ * One-time setup of the SYN cache: empty every hash bucket queue and
+ * create the pool from which cache entries are allocated.
+ */
+void
+syn_cache_init()
+{
+	int idx = 0;
+
+	/* Start with every bucket queue empty. */
+	while (idx < tcp_syn_cache_size) {
+		TAILQ_INIT(&tcp_syn_cache[idx].sch_bucket);
+		idx++;
+	}
+
+	/* Backing allocator for struct syn_cache entries. */
+	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
+	    "synpl", NULL);
+}
+
+/*
+ * Insert entry `sc' into the SYN cache and link it onto the entry list
+ * of the listening tcpcb `tp'.
+ *
+ * The bucket is chosen from the hash of the entry's source/destination
+ * sockaddrs.  If that bucket is already at tcp_syn_bucket_limit, its
+ * oldest entry is evicted; otherwise, if the cache as a whole is at
+ * tcp_syn_cache_limit, the oldest entry of the first non-empty bucket
+ * found (starting with our own) is evicted.  The new entry's retransmit
+ * timer is then armed and the entry is appended to the bucket.
+ */
+void
+syn_cache_insert(sc, tp)
+	struct syn_cache *sc;
+	struct tcpcb *tp;
+{
+	struct syn_cache_head *scp;
+	struct syn_cache *sc2;
+	int s;
+
+	/*
+	 * If there are no entries in the hash table, reinitialize
+	 * the hash secrets.
+	 */
+	if (syn_cache_count == 0) {
+		syn_hash1 = arc4random();
+		syn_hash2 = arc4random();
+	}
+
+	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
+	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
+	scp = &tcp_syn_cache[sc->sc_bucketidx];
+
+	/*
+	 * Make sure that we don't overflow the per-bucket
+	 * limit or the total cache size limit.
+	 */
+	s = splsoftnet();
+	if (scp->sch_length >= tcp_syn_bucket_limit) {
+		tcpstat.tcps_sc_bucketoverflow++;
+		/*
+		 * The bucket is full.  Toss the oldest element in the
+		 * bucket.  This will be the first entry in the bucket.
+		 */
+		sc2 = TAILQ_FIRST(&scp->sch_bucket);
+#ifdef DIAGNOSTIC
+		/*
+		 * This should never happen; we should always find an
+		 * entry in our bucket.
+		 */
+		if (sc2 == NULL)
+			panic("syn_cache_insert: bucketoverflow: impossible");
+#endif
+		SYN_CACHE_RM(sc2);
+		SYN_CACHE_PUT(sc2);
+	} else if (syn_cache_count >= tcp_syn_cache_limit) {
+		struct syn_cache_head *scp2, *sce;
+
+		tcpstat.tcps_sc_overflowed++;
+		/*
+		 * The cache is full.  Toss the oldest entry in the
+		 * first non-empty bucket we can find.
+		 *
+		 * XXX We would really like to toss the oldest
+		 * entry in the cache, but we hope that this
+		 * condition doesn't happen very often.
+		 */
+		scp2 = scp;
+		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
+			sce = &tcp_syn_cache[tcp_syn_cache_size];
+			/*
+			 * Visit the other buckets in order, wrapping at
+			 * the end of the table.  The wrap is applied
+			 * before the "back at our own bucket" test, so
+			 * the scan stops after one full lap even when
+			 * our bucket is the first one in the table.
+			 */
+			do {
+				if (++scp2 >= sce)
+					scp2 = &tcp_syn_cache[0];
+			} while (scp2 != scp &&
+			    TAILQ_EMPTY(&scp2->sch_bucket));
+#ifdef DIAGNOSTIC
+			/*
+			 * This should never happen; we should always find a
+			 * non-empty bucket.
+			 */
+			if (scp2 == scp)
+				panic("syn_cache_insert: cacheoverflow: "
+				    "impossible");
+#endif
+		}
+		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
+		/* Nothing to evict if every bucket turned out empty. */
+		if (sc2 != NULL) {
+			SYN_CACHE_RM(sc2);
+			SYN_CACHE_PUT(sc2);
+		}
+	}
+
+	/*
+	 * Initialize the entry's timer.
+	 */
+	sc->sc_rxttot = 0;
+	sc->sc_rxtshift = 0;
+	SYN_CACHE_TIMER_ARM(sc);
+
+	/* Link it from tcpcb entry */
+	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
+
+	/* Put it into the bucket. */
+	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
+	scp->sch_length++;
+	syn_cache_count++;
+
+	tcpstat.tcps_sc_added++;
+	splx(s);
+}
+
+/*
+ * Timeout handler for a single SYN cache entry (`arg' is the entry).
+ * Either retransmits the SYN,ACK and re-arms the timer with the next
+ * back-off step, or expires the entry when it has been retransmitted
+ * the maximum number of times or has been queued for at least
+ * tcptv_keep_init.
+ */
+void
+syn_cache_timer(void *arg)
+{
+	struct syn_cache *entry = arg;
+	int expired;
+	int s;
+
+	s = splsoftnet();
+
+	/* Too many retransmissions already? */
+	expired = __predict_false(entry->sc_rxtshift == TCP_MAXRXTSHIFT);
+
+	if (!expired) {
+		/*
+		 * Account for the interval that just elapsed and check
+		 * the total against what the keep-alive timer would allow.
+		 */
+		entry->sc_rxttot += entry->sc_rxtcur;
+		expired = (entry->sc_rxttot >= tcptv_keep_init);
+	}
+
+	if (expired) {
+		tcpstat.tcps_sc_timed_out++;
+		SYN_CACHE_RM(entry);
+		SYN_CACHE_PUT(entry);
+	} else {
+		tcpstat.tcps_sc_retransmitted++;
+		(void) syn_cache_respond(entry, NULL);
+
+		/* Move to the next back-off step and reschedule. */
+		entry->sc_rxtshift++;
+		SYN_CACHE_TIMER_ARM(entry);
+	}
+
+	splx(s);
+}
+
+/*
+ * Discard every SYN cache entry that hangs off tcpcb `tp'.  Once the
+ * tcpcb goes away those entries could never be turned into connections,
+ * so there is no point in keeping them.
+ */
+void
+syn_cache_cleanup(tp)
+	struct tcpcb *tp;
+{
+	struct syn_cache *entry;
+	int s;
+
+	s = splsoftnet();
+
+	/*
+	 * SYN_CACHE_RM unlinks the entry from tp->t_sc, so the list head
+	 * always names the next entry still to be released.
+	 */
+	while ((entry = LIST_FIRST(&tp->t_sc)) != NULL) {
+#ifdef DIAGNOSTIC
+		if (entry->sc_tp != tp)
+			panic("invalid sc_tp in syn_cache_cleanup");
+#endif
+		SYN_CACHE_RM(entry);
+		SYN_CACHE_PUT(entry);
+	}
+
+	/* just for safety */
+	LIST_INIT(&tp->t_sc);
+
+	splx(s);
+}
+
+/*
+ * Look up the SYN cache entry for the connection identified by the
+ * sockaddr pair (src, dst).
+ *
+ * *headp is always set to the hash bucket the pair maps to.  Returns
+ * the matching entry, or NULL if the bucket holds none.
+ */
+struct syn_cache *
+syn_cache_lookup(src, dst, headp)
+	struct sockaddr *src;
+	struct sockaddr *dst;
+	struct syn_cache_head **headp;
+{
+	struct syn_cache *cur;
+	struct syn_cache *match = NULL;
+	struct syn_cache_head *bucket;
+	u_int32_t hash;
+	int s;
+
+	SYN_HASHALL(hash, src, dst);
+
+	bucket = &tcp_syn_cache[hash % tcp_syn_cache_size];
+	*headp = bucket;
+
+	s = splsoftnet();
+	cur = TAILQ_FIRST(&bucket->sch_bucket);
+	while (cur != NULL) {
+		/* Cheap hash comparison first, then the full addresses. */
+		if (cur->sc_hash == hash &&
+		    bcmp(&cur->sc_src, src, src->sa_len) == 0 &&
+		    bcmp(&cur->sc_dst, dst, dst->sa_len) == 0) {
+			match = cur;
+			break;
+		}
+		cur = TAILQ_NEXT(cur, sc_bucketq);
+	}
+	splx(s);
+
+	return (match);
+}
+
+/*
+ * This function gets called when we receive an ACK for a
+ * socket in the LISTEN state. We look up the connection
+ * in the syn cache, and if it's there, we pull it out of
+ * the cache and turn it into a full-blown connection in
+ * the SYN-RECEIVED state.
+ *
+ * The return values may not be immediately obvious, and their effects
+ * can be subtle, so here they are:
+ *
+ * NULL SYN was not found in cache; caller should drop the
+ * packet and send an RST.
+ *
+ * -1 We were unable to create the new connection, and are
+ * aborting it. An ACK,RST is being sent to the peer
+ * (unless we got screwy sequence numbers; see below),
+ * because the 3-way handshake has been completed. Caller
+ * should not free the mbuf, since we may be using it. If
+ * we are not, we will free it.
+ *
+ * Otherwise, the return value is a pointer to the new socket
+ * associated with the connection.
+ */
+/*
+ * Arguments:
+ *	src, dst	peer and local addresses of the segment; used for the
+ *			cache lookup and to fill in the new PCB.
+ *	th		TCP header of the received segment.
+ *	hlen		not used by this function.
+ *	tlen		added to th_seq to form the sequence number of the
+ *			RST sent when the connection cannot be created.
+ *	so		the listening socket.
+ *	m		the received packet; handed to syn_cache_respond() or
+ *			tcp_respond(), or freed, on the failure paths.
+ */
+struct socket *
+syn_cache_get(src, dst, th, hlen, tlen, so, m)
+	struct sockaddr *src;
+	struct sockaddr *dst;
+	struct tcphdr *th;
+	unsigned int hlen, tlen;
+	struct socket *so;
+	struct mbuf *m;
+{
+	struct syn_cache *sc;
+	struct syn_cache_head *scp;
+	struct inpcb *inp = NULL;
+	struct tcpcb *tp = NULL;
+	struct mbuf *am;
+	int s;
+	struct socket *oso;
+
+	s = splsoftnet();
+	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
+		splx(s);
+		return (NULL);
+	}
+
+	/*
+	 * Verify the sequence and ack numbers.  Try getting the correct
+	 * response again.
+	 */
+	if ((th->th_ack != sc->sc_iss + 1) ||
+	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
+	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
+		(void) syn_cache_respond(sc, m);
+		splx(s);
+		return ((struct socket *)(-1));
+	}
+
+	/* Remove this cache entry */
+	SYN_CACHE_RM(sc);
+	splx(s);
+
+	/*
+	 * Ok, create the full blown connection, and set things up
+	 * as they would have been set up if we had created the
+	 * connection when the SYN arrived.  If we can't create
+	 * the connection, abort it.
+	 */
+	oso = so;
+	so = sonewconn(so, SS_ISCONNECTED);
+	if (so == NULL)
+		goto resetandabort;
+
+	inp = sotoinpcb(oso);
+#ifdef IPSEC
+	/*
+	 * We need to copy the required security levels
+	 * from the old pcb. Ditto for any other
+	 * IPsec-related information.
+	 */
+	{
+		struct inpcb *newinp = (struct inpcb *)so->so_pcb;
+		bcopy(inp->inp_seclevel, newinp->inp_seclevel,
+		    sizeof(inp->inp_seclevel));
+		newinp->inp_secrequire = inp->inp_secrequire;
+		if (inp->inp_ipo != NULL) {
+			newinp->inp_ipo = inp->inp_ipo;
+			inp->inp_ipo->ipo_ref_count++;
+		}
+		if (inp->inp_ipsec_remotecred != NULL) {
+			newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred;
+			inp->inp_ipsec_remotecred->ref_count++;
+		}
+		if (inp->inp_ipsec_remoteauth != NULL) {
+			newinp->inp_ipsec_remoteauth
+			    = inp->inp_ipsec_remoteauth;
+			inp->inp_ipsec_remoteauth->ref_count++;
+		}
+	}
+#endif /* IPSEC */
+#ifdef INET6
+	/*
+	 * inp still has the OLD in_pcb stuff, set the
+	 * v6-related flags on the new guy, too.   This is
+	 * done particularly for the case where an AF_INET6
+	 * socket is bound only to a port, and a v4 connection
+	 * comes in on that port.
+	 * we also copy the flowinfo from the original pcb
+	 * to the new one.
+	 */
+	{
+		int flags = inp->inp_flags;
+		struct inpcb *oldinpcb = inp;
+
+		inp = (struct inpcb *)so->so_pcb;
+		inp->inp_flags |= (flags & INP_IPV6);
+		if ((inp->inp_flags & INP_IPV6) != 0) {
+			inp->inp_ipv6.ip6_hlim =
+			    oldinpcb->inp_ipv6.ip6_hlim;
+		}
+	}
+#else /* INET6 */
+	inp = (struct inpcb *)so->so_pcb;
+#endif /* INET6 */
+
+	/* From here on inp is the PCB of the newly created socket. */
+	inp->inp_lport = th->th_dport;
+	switch (src->sa_family) {
+#ifdef INET6
+	case AF_INET6:
+		inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr;
+
+		/*inp->inp_options = ip6_srcroute();*/ /* soon. */
+		/*
+		 * still need to tweak outbound options
+		 * processing to include this mbuf in
+		 * the right place and put the correct
+		 * NextHdr values in the right places.
+		 * XXX rja
+		 */
+		break;
+#endif /* INET6 */
+	case AF_INET:
+
+		inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
+		inp->inp_options = ip_srcroute();
+		if (inp->inp_options == NULL) {
+			/* No source route: take over the cached IP options. */
+			inp->inp_options = sc->sc_ipopts;
+			sc->sc_ipopts = NULL;
+		}
+		break;
+	}
+	in_pcbrehash(inp);
+
+	/*
+	 * Give the new socket our cached route reference.
+	 *
+	 * The member to copy is chosen by address family.  The previous
+	 * test, "if (inp)", was always true at this point (inp has already
+	 * been dereferenced above), so the INET6 branch could never run
+	 * and an IPv6 entry was copied through the IPv4 route member.
+	 */
+	if (src->sa_family == AF_INET)
+		inp->inp_route = sc->sc_route4;		/* struct assignment */
+#ifdef INET6
+	else
+		inp->inp_route6 = sc->sc_route6;	/* struct assignment */
+#endif
+	/*
+	 * The reference now belongs to the PCB.  sc_route4 and sc_route6
+	 * are members of one union in struct syn_cache.
+	 */
+	sc->sc_route4.ro_rt = NULL;
+
+	am = m_get(M_DONTWAIT, MT_SONAME);	/* XXX */
+	if (am == NULL)
+		goto resetandabort;
+	am->m_len = src->sa_len;
+	bcopy(src, mtod(am, caddr_t), src->sa_len);
+
+	switch (src->sa_family) {
+	case AF_INET:
+		/* drop IPv4 packet to AF_INET6 socket */
+		if (inp->inp_flags & INP_IPV6) {
+			(void) m_free(am);
+			goto resetandabort;
+		}
+		if (in_pcbconnect(inp, am)) {
+			(void) m_free(am);
+			goto resetandabort;
+		}
+		break;
+#ifdef INET6
+	case AF_INET6:
+		if (in6_pcbconnect(inp, am)) {
+			(void) m_free(am);
+			goto resetandabort;
+		}
+		break;
+#endif
+	}
+	(void) m_free(am);
+
+	tp = intotcpcb(inp);
+	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
+	/* A scale value of 15 marks "window scaling not negotiated". */
+	if (sc->sc_request_r_scale != 15) {
+		tp->requested_s_scale = sc->sc_requested_s_scale;
+		tp->request_r_scale = sc->sc_request_r_scale;
+		tp->snd_scale = sc->sc_requested_s_scale;
+		tp->rcv_scale = sc->sc_request_r_scale;
+		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
+	}
+	if (sc->sc_flags & SCF_TIMESTAMP)
+		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
+
+	tp->t_template = tcp_template(tp);
+	if (tp->t_template == 0) {
+		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
+		so = NULL;
+		m_freem(m);
+		goto abort;
+	}
+#ifdef TCP_SACK
+	tp->sack_disable = (sc->sc_flags & SCF_SACK_PERMIT) ? 0 : 1;
+#endif
+
+	tp->iss = sc->sc_iss;
+	tp->irs = sc->sc_irs;
+	tcp_sendseqinit(tp);
+#if defined (TCP_SACK) || defined(TCP_ECN)
+	tp->snd_last = tp->snd_una;
+#endif /* TCP_SACK */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+	tp->snd_fack = tp->snd_una;
+	tp->retran_data = 0;
+	tp->snd_awnd = 0;
+#endif /* TCP_FACK */
+#ifdef TCP_ECN
+	if (sc->sc_flags & SCF_ECN_PERMIT) {
+		tp->t_flags |= TF_ECN_PERMIT;
+		tcpstat.tcps_ecn_accepts++;
+	}
+#endif
+#ifdef TCP_SACK
+	if (sc->sc_flags & SCF_SACK_PERMIT)
+		tp->t_flags |= TF_SACK_PERMIT;
+#endif
+	tcp_rcvseqinit(tp);
+	tp->t_state = TCPS_SYN_RECEIVED;
+	tp->t_rcvtime = tcp_now;
+	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
+	tcpstat.tcps_accepts++;
+
+	tcp_mss(tp, sc->sc_peermaxseg);
+
+#if 0
+	/*
+	 * XXX
+	 * Initialize the initial congestion window.  If we
+	 * had to retransmit the SYN,ACK, we must initialize cwnd
+	 * to 1 segment (i.e. the Loss Window).
+	 */
+#endif
+	tp->snd_wl1 = sc->sc_irs;
+	tp->rcv_up = sc->sc_irs + 1;
+
+	/*
+	 * This is what would have happened in tcp_output() when
+	 * the SYN,ACK was sent.
+	 */
+	tp->snd_up = tp->snd_una;
+	tp->snd_max = tp->snd_nxt = tp->iss+1;
+	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
+	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
+		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
+	tp->last_ack_sent = tp->rcv_nxt;
+
+	tcpstat.tcps_sc_completed++;
+	SYN_CACHE_PUT(sc);
+	return (so);
+
+resetandabort:
+	/* Answer with ACK,RST; tcp_respond() is handed the mbuf m. */
+	(void) tcp_respond(NULL, mtod(m, caddr_t), m,
+	    th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
+abort:
+	if (so != NULL)
+		(void) soabort(so);
+	SYN_CACHE_PUT(sc);
+	tcpstat.tcps_sc_aborted++;
+	return ((struct socket *)(-1));
+}
+
+/*
+ * A RST arrived for which no connection exists.  Look for a matching
+ * entry in the syn cache and, if the RST's sequence number is
+ * sc_irs or sc_irs + 1, remove and release that entry.
+ */
+
+void
+syn_cache_reset(src, dst, th)
+	struct sockaddr *src;
+	struct sockaddr *dst;
+	struct tcphdr *th;
+{
+	struct syn_cache_head *scp;
+	struct syn_cache *sc;
+	int s;
+
+	s = splsoftnet();
+
+	sc = syn_cache_lookup(src, dst, &scp);
+	if (sc == NULL)
+		goto out;
+
+	/* Ignore a RST whose sequence number is outside the window. */
+	if (SEQ_LT(th->th_seq, sc->sc_irs) ||
+	    SEQ_GT(th->th_seq, sc->sc_irs+1))
+		goto out;
+
+	/* Unlink at raised priority, then account and release. */
+	SYN_CACHE_RM(sc);
+	splx(s);
+	tcpstat.tcps_sc_reset++;
+	SYN_CACHE_PUT(sc);
+	return;
+
+out:
+	splx(s);
+}
+
+/*
+ * Handle an unreachable-type error reported for a connection that may
+ * still be sitting in the syn cache.  The first qualifying error only
+ * marks the entry; the entry is removed once it is already marked and
+ * the SYN,ACK has been retransmitted at least 3 times.
+ */
+void
+syn_cache_unreach(src, dst, th)
+	struct sockaddr *src;
+	struct sockaddr *dst;
+	struct tcphdr *th;
+{
+	struct syn_cache_head *scp;
+	struct syn_cache *sc;
+	int s;
+
+	s = splsoftnet();
+
+	sc = syn_cache_lookup(src, dst, &scp);
+	if (sc == NULL)
+		goto out;
+
+	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
+	if (ntohl (th->th_seq) != sc->sc_iss)
+		goto out;
+
+	/*
+	 * Only remove the entry when this is at least the second error
+	 * and we have retransmitted 3 times; until then just remember
+	 * that an error was seen.  This keeps a short network outage
+	 * from nuking an otherwise good entry.
+	 *
+	 * See tcp_notify().
+	 */
+	if (!(sc->sc_flags & SCF_UNREACH) || sc->sc_rxtshift < 3) {
+		sc->sc_flags |= SCF_UNREACH;
+		goto out;
+	}
+
+	SYN_CACHE_RM(sc);
+	splx(s);
+	tcpstat.tcps_sc_unreach++;
+	SYN_CACHE_PUT(sc);
+	return;
+
+out:
+	splx(s);
+}
+
+/*
+ * Given a LISTEN socket and an inbound SYN request, add
+ * this to the syn cache, and send back a segment:
+ *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
+ * to the source.
+ *
+ * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
+ * Doing so would require that we hold onto the data and deliver it
+ * to the application.  However, if we are the target of a SYN-flood
+ * DoS attack, an attacker could send data which would eventually
+ * consume all available buffer space if it were ACKed.  By not ACKing
+ * the data, we avoid this DoS scenario.
+ *
+ * Arguments:
+ *	src, dst	peer and local addresses of the SYN.
+ *	th		TCP header of the SYN.
+ *	hlen		not used by this function.
+ *	so		the listening socket.
+ *	m		the received packet; passed to syn_cache_respond(),
+ *			which frees it.
+ *	optp, optlen	TCP options of the SYN (optp may be NULL).
+ *	oi		option info filled in by tcp_dooptions().
+ *
+ * Returns 0 if no cache entry could be allocated (m is left untouched),
+ * 1 in every other case (m has been passed on to syn_cache_respond()).
+ */
+
+int
+syn_cache_add(src, dst, th, hlen, so, m, optp, optlen, oi)
+	struct sockaddr *src;
+	struct sockaddr *dst;
+	struct tcphdr *th;
+	unsigned int hlen;
+	struct socket *so;
+	struct mbuf *m;
+	u_char *optp;
+	int optlen;
+	struct tcp_opt_info *oi;
+{
+	struct tcpcb tb, *tp;
+	long win;
+	struct syn_cache *sc;
+	struct syn_cache_head *scp;
+	struct mbuf *ipopts;
+
+	tp = sototcpcb(so);
+
+	/*
+	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
+	 *
+	 * Note this check is performed in tcp_input() very early on.
+	 */
+
+	/*
+	 * Initialize some local state.
+	 */
+	win = sbspace(&so->so_rcv);
+	if (win > TCP_MAXWIN)
+		win = TCP_MAXWIN;
+
+	switch (src->sa_family) {
+#ifdef INET
+	case AF_INET:
+		/*
+		 * Remember the IP options, if any.
+		 */
+		ipopts = ip_srcroute();
+		break;
+#endif
+	default:
+		ipopts = NULL;
+	}
+
+	/*
+	 * tb is a scratch tcpcb that only collects the results of option
+	 * processing.  Zero it first: tb.ts_recent, tb.sack_disable and
+	 * tb.requested_s_scale are read below even when the SYN carried
+	 * no options, and would otherwise be uninitialized stack data.
+	 * This also leaves tb.t_flags == 0 in the no-options case.
+	 */
+	bzero(&tb, sizeof(tb));
+	if (optp) {
+		tb.t_inpcb = tp->t_inpcb;	/* XXX */
+		tb.pf = tp->pf;
+#ifdef TCP_SACK
+		tb.sack_disable = tcp_do_sack ? 0 : 1;
+#endif
+		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
+		tcp_dooptions(&tb, optp, optlen, th, oi);
+
+		/*
+		 * Update t_maxopd and t_maxseg after all options are
+		 * processed.
+		 * NOTE(review): tp is the listening socket's tcpcb, not
+		 * the scratch tb -- confirm that updating it is intended.
+		 */
+		(void) tcp_mss(tp, oi->maxseg);	/* sets t_maxseg */
+		if (oi->maxseg)
+			tcp_mss_update(tp);
+	}
+
+	/*
+	 * See if we already have an entry for this connection.
+	 * If we do, resend the SYN,ACK.  We do not count this
+	 * as a retransmission (XXX though maybe we should).
+	 */
+	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
+		tcpstat.tcps_sc_dupesyn++;
+		if (ipopts) {
+			/*
+			 * If we were remembering a previous source route,
+			 * forget it and use the new one we've been given.
+			 */
+			if (sc->sc_ipopts)
+				(void) m_free(sc->sc_ipopts);
+			sc->sc_ipopts = ipopts;
+		}
+		sc->sc_timestamp = tb.ts_recent;
+		if (syn_cache_respond(sc, m) == 0) {
+			tcpstat.tcps_sndacks++;
+			tcpstat.tcps_sndtotal++;
+		}
+		return (1);
+	}
+
+	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
+	if (sc == NULL) {
+		if (ipopts)
+			(void) m_free(ipopts);
+		return (0);
+	}
+
+	/*
+	 * Fill in the cache, and put the necessary IP and TCP
+	 * options into the reply.
+	 */
+	bzero(sc, sizeof(struct syn_cache));
+	bzero(&sc->sc_timer, sizeof(sc->sc_timer));
+	bcopy(src, &sc->sc_src, src->sa_len);
+	bcopy(dst, &sc->sc_dst, dst->sa_len);
+	sc->sc_flags = 0;
+	sc->sc_ipopts = ipopts;
+	sc->sc_irs = th->th_seq;
+
+#ifdef TCP_COMPAT_42
+	tcp_iss += TCP_ISSINCR/2;
+	sc->sc_iss = tcp_iss;
+#else
+	sc->sc_iss = tcp_rndiss_next();
+#endif
+	sc->sc_peermaxseg = oi->maxseg;
+	sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ?
+	    m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family);
+	sc->sc_win = win;
+	sc->sc_timestamp = tb.ts_recent;
+	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
+	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
+		sc->sc_flags |= SCF_TIMESTAMP;
+	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+		sc->sc_requested_s_scale = tb.requested_s_scale;
+		/* Smallest shift that lets the window cover sb_hiwat. */
+		sc->sc_request_r_scale = 0;
+		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
+		    TCP_MAXWIN << sc->sc_request_r_scale <
+		    so->so_rcv.sb_hiwat)
+			sc->sc_request_r_scale++;
+	} else {
+		/* 15 marks "window scaling not in use" for this entry. */
+		sc->sc_requested_s_scale = 15;
+		sc->sc_request_r_scale = 15;
+	}
+#ifdef TCP_ECN
+	/*
+	 * if both ECE and CWR flag bits are set, peer is ECN capable.
+	 */
+	if (tcp_do_ecn &&
+	    (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
+		sc->sc_flags |= SCF_ECN_PERMIT;
+#endif
+#ifdef TCP_SACK
+	/*
+	 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
+	 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
+	 */
+	if (!tb.sack_disable && (tb.t_flags & TF_SACK_PERMIT))
+		sc->sc_flags |= SCF_SACK_PERMIT;
+#endif
+	sc->sc_tp = tp;
+	/* Only keep the entry if the SYN,ACK could actually be sent. */
+	if (syn_cache_respond(sc, m) == 0) {
+		syn_cache_insert(sc, tp);
+		tcpstat.tcps_sndacks++;
+		tcpstat.tcps_sndtotal++;
+	} else {
+		SYN_CACHE_PUT(sc);
+		tcpstat.tcps_sc_dropped++;
+	}
+	return (1);
+}
+
+/*
+ * Build and transmit the SYN,ACK for syn cache entry sc.
+ *
+ * m, if not NULL, is freed; the reply is built from scratch in a newly
+ * allocated mbuf.  Returns 0 on success, EAFNOSUPPORT for an address
+ * family that is not handled, ENOBUFS if no mbuf could be obtained, or
+ * the error returned by ip_output()/ip6_output().
+ */
+int
+syn_cache_respond(sc, m)
+	struct syn_cache *sc;
+	struct mbuf *m;
+{
+	struct route *ro;
+	u_int8_t *optp;
+	int optlen, error;
+	u_int16_t tlen;
+	struct ip *ip = NULL;
+#ifdef INET6
+	struct ip6_hdr *ip6 = NULL;
+#endif
+	struct tcpcb *tp;
+	struct tcphdr *th;
+	u_int hlen;
+	struct socket *so;
+
+	switch (sc->sc_src.sa.sa_family) {
+	case AF_INET:
+		hlen = sizeof(struct ip);
+		ro = &sc->sc_route4;
+		break;
+#ifdef INET6
+	case AF_INET6:
+		hlen = sizeof(struct ip6_hdr);
+		ro = (struct route *)&sc->sc_route6;
+		break;
+#endif
+	default:
+		if (m)
+			m_freem(m);
+		return (EAFNOSUPPORT);
+	}
+
+	/*
+	 * Compute the size of the TCP options: MSS (always), window
+	 * scale (15 means "not in use"), SACK permitted, timestamps.
+	 */
+	optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
+	    ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
+	    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
+
+	tlen = hlen + sizeof(struct tcphdr) + optlen;
+
+	/*
+	 * Create the IP+TCP header from scratch.
+	 */
+	if (m)
+		m_freem(m);
+#ifdef DIAGNOSTIC
+	if (max_linkhdr + tlen > MCLBYTES)
+		return (ENOBUFS);
+#endif
+	MGETHDR(m, M_DONTWAIT, MT_DATA);
+	/*
+	 * The data pointer is advanced by max_linkhdr below, so the room
+	 * needed in the mbuf is max_linkhdr + tlen, not just tlen.
+	 */
+	if (m && max_linkhdr + tlen > MHLEN) {
+		MCLGET(m, M_DONTWAIT);
+		if ((m->m_flags & M_EXT) == 0) {
+			m_freem(m);
+			m = NULL;
+		}
+	}
+	if (m == NULL)
+		return (ENOBUFS);
+
+	/* Fixup the mbuf. */
+	m->m_data += max_linkhdr;
+	m->m_len = m->m_pkthdr.len = tlen;
+	if (sc->sc_tp) {
+		tp = sc->sc_tp;
+		so = tp->t_inpcb->inp_socket;
+	} else
+		so = NULL;
+	m->m_pkthdr.rcvif = NULL;
+	memset(mtod(m, u_char *), 0, tlen);
+
+	/* The reply goes back to the peer: swap source and destination. */
+	switch (sc->sc_src.sa.sa_family) {
+	case AF_INET:
+		ip = mtod(m, struct ip *);
+		ip->ip_dst = sc->sc_src.sin.sin_addr;
+		ip->ip_src = sc->sc_dst.sin.sin_addr;
+		ip->ip_p = IPPROTO_TCP;
+		th = (struct tcphdr *)(ip + 1);
+		th->th_dport = sc->sc_src.sin.sin_port;
+		th->th_sport = sc->sc_dst.sin.sin_port;
+		break;
+#ifdef INET6
+	case AF_INET6:
+		ip6 = mtod(m, struct ip6_hdr *);
+		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
+		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
+		ip6->ip6_nxt = IPPROTO_TCP;
+		/* ip6_plen will be updated in ip6_output() */
+		th = (struct tcphdr *)(ip6 + 1);
+		th->th_dport = sc->sc_src.sin6.sin6_port;
+		th->th_sport = sc->sc_dst.sin6.sin6_port;
+		break;
+#endif
+	default:
+		/* Not reached: other families returned EAFNOSUPPORT above. */
+		th = NULL;
+	}
+
+	th->th_seq = htonl(sc->sc_iss);
+	th->th_ack = htonl(sc->sc_irs + 1);
+	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
+	th->th_flags = TH_SYN|TH_ACK;
+#ifdef TCP_ECN
+	/* Set ECE for SYN-ACK if peer supports ECN. */
+	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
+		th->th_flags |= TH_ECE;
+#endif
+	th->th_win = htons(sc->sc_win);
+	/* th_sum already 0 */
+	/* th_urp already 0 */
+
+	/* Tack on the TCP options. */
+	optp = (u_int8_t *)(th + 1);
+	*optp++ = TCPOPT_MAXSEG;
+	*optp++ = 4;
+	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
+	*optp++ = sc->sc_ourmaxseg & 0xff;
+
+#ifdef TCP_SACK
+	/* Include SACK_PERMIT_HDR option if peer has already done so. */
+	if (sc->sc_flags & SCF_SACK_PERMIT) {
+		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
+		optp += 4;
+	}
+#endif
+
+	if (sc->sc_request_r_scale != 15) {
+		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
+		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
+		    sc->sc_request_r_scale);
+		optp += 4;
+	}
+
+	if (sc->sc_flags & SCF_TIMESTAMP) {
+		u_int32_t *lp = (u_int32_t *)(optp);
+		/* Form timestamp option as shown in appendix A of RFC 1323. */
+		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
+		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
+		*lp   = htonl(sc->sc_timestamp);
+		optp += TCPOLEN_TSTAMP_APPA;
+	}
+
+#ifdef TCP_SIGNATURE
+	/* XXX */
+#endif
+
+	/*
+	 * Compute the packet's checksum.  The length field temporarily
+	 * holds the TCP length (tlen - hlen) while the sum is taken over
+	 * the otherwise zeroed IP header plus the TCP segment.
+	 */
+	switch (sc->sc_src.sa.sa_family) {
+	case AF_INET:
+		ip->ip_len = htons(tlen - hlen);
+		th->th_sum = 0;
+		th->th_sum = in_cksum(m, tlen);
+		break;
+#ifdef INET6
+	case AF_INET6:
+		ip6->ip6_plen = htons(tlen - hlen);
+		th->th_sum = 0;
+		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
+		break;
+#endif
+	}
+
+	/*
+	 * Fill in some straggling IP bits.  The length fields are stored
+	 * in network byte order (htons) here.
+	 */
+	switch (sc->sc_src.sa.sa_family) {
+#ifdef INET
+	case AF_INET:
+		ip->ip_len = htons(tlen);
+		ip->ip_ttl = ip_defttl;
+		/* XXX tos? */
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
+		ip6->ip6_vfc |= IPV6_VERSION;
+		ip6->ip6_plen = htons(tlen - hlen);
+		/* ip6_hlim will be initialized afterwards */
+		/* XXX flowlabel? */
+		break;
+#endif
+	}
+
+#if 0
+	/* XXX use IPsec policy on listening socket, on SYN ACK */
+	tp = sc->sc_tp;
+#endif
+
+	switch (sc->sc_src.sa.sa_family) {
+#ifdef INET
+	case AF_INET:
+		error = ip_output(m, sc->sc_ipopts, ro,
+		    (ip_mtudisc ? IP_MTUDISC : 0),
+		    (struct ip_moptions *)NULL, so);
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		ip6->ip6_hlim = in6_selecthlim(NULL,
+		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);
+
+		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
+		    (struct ip6_moptions *)0, NULL);
+		break;
+#endif
+	default:
+		error = EAFNOSUPPORT;
+		break;
+	}
+	return (error);
+}
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 0a7d7d841cc..74203f61dce 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_subr.c,v 1.71 2003/12/10 07:22:43 itojun Exp $ */
+/* $OpenBSD: tcp_subr.c,v 1.72 2004/01/06 17:38:13 markus Exp $ */
/* $NetBSD: tcp_subr.c,v 1.22 1996/02/13 23:44:00 christos Exp $ */
/*
@@ -139,6 +139,14 @@ u_int32_t tcp_now;
#endif
int tcbhashsize = TCBHASHSIZE;
+/* syn hash parameters */
+/* Number of hash buckets in tcp_syn_cache[]. */
+#define TCP_SYN_HASH_SIZE 293
+/* Per-bucket size from which the default limits below are derived. */
+#define TCP_SYN_BUCKET_SIZE 35
+int tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
+/* Both limits can be read and set through tcp_sysctl(). */
+int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
+int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
+/* The syn cache hash table itself, one list head per bucket. */
+struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];
+
#ifdef INET6
extern int ip6_defhlim;
#endif /* INET6 */
@@ -185,6 +193,9 @@ tcp_init()
icmp6_mtudisc_callback_register(tcp6_mtudisc_callback);
#endif /* INET6 */
+ /* Initialize the compressed state engine. */
+ syn_cache_init();
+
/* Initialize timer state. */
tcp_timer_init();
}
@@ -673,6 +684,7 @@ tcp_close(struct tcpcb *tp)
tcp_canceltimers(tp);
TCP_CLEAR_DELACK(tp);
+ syn_cache_cleanup(tp);
#ifdef TCP_SACK
/* Free SACK holes. */
@@ -852,8 +864,14 @@ tcp6_ctlinput(cmd, sa, d)
return;
}
- (void) in6_pcbnotify(&tcbtable, sa, th.th_dport,
- (struct sockaddr *)sa6_src, th.th_sport, cmd, NULL, notify);
+ if (in6_pcbnotify(&tcbtable, sa, th.th_dport,
+ (struct sockaddr *)sa6_src, th.th_sport, cmd, NULL, notify) == 0 &&
+ syn_cache_count &&
+ (inet6ctlerrmap[cmd] == EHOSTUNREACH ||
+ inet6ctlerrmap[cmd] == ENETUNREACH ||
+ inet6ctlerrmap[cmd] == EHOSTDOWN))
+ syn_cache_unreach((struct sockaddr *)sa6_src,
+ sa, &th);
} else {
(void) in6_pcbnotify(&tcbtable, sa, 0,
(struct sockaddr *)sa6_src, 0, cmd, NULL, notify);
@@ -912,8 +930,22 @@ tcp_ctlinput(cmd, sa, v)
if (ip) {
th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
- in_pcbnotify(&tcbtable, sa, th->th_dport, ip->ip_src,
- th->th_sport, errno, notify);
+ if (in_pcbnotify(&tcbtable, sa, th->th_dport, ip->ip_src,
+ th->th_sport, errno, notify) == 0 &&
+ syn_cache_count &&
+ (inetctlerrmap[cmd] == EHOSTUNREACH ||
+ inetctlerrmap[cmd] == ENETUNREACH ||
+ inetctlerrmap[cmd] == EHOSTDOWN)) {
+ struct sockaddr_in sin;
+
+ bzero(&sin, sizeof(sin));
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_port = th->th_sport;
+ sin.sin_addr = ip->ip_src;
+ syn_cache_unreach((struct sockaddr *)&sin,
+ sa, th);
+ }
} else
in_pcbnotifyall(&tcbtable, sa, errno, notify);
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index cf5ac6fd09a..c1a167a37a5 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_usrreq.c,v 1.74 2003/12/10 07:22:43 itojun Exp $ */
+/* $OpenBSD: tcp_usrreq.c,v 1.75 2004/01/06 17:38:13 markus Exp $ */
/* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */
/*
@@ -925,6 +925,12 @@ tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
return (sysctl_int(oldp, oldlenp, newp, newlen,
&tcp_do_ecn));
#endif
+ case TCPCTL_SYN_CACHE_LIMIT:
+ return (sysctl_int(oldp, oldlenp, newp, newlen,
+ &tcp_syn_cache_limit));
+ case TCPCTL_SYN_BUCKET_LIMIT:
+ return (sysctl_int(oldp, oldlenp, newp, newlen,
+ &tcp_syn_bucket_limit));
default:
return (ENOPROTOOPT);
}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 3acccd04282..43dc045ec9c 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_var.h,v 1.51 2003/06/09 07:40:25 itojun Exp $ */
+/* $OpenBSD: tcp_var.h,v 1.52 2004/01/06 17:38:13 markus Exp $ */
/* $NetBSD: tcp_var.h,v 1.17 1996/02/13 23:44:24 christos Exp $ */
/*
@@ -82,6 +82,7 @@ struct tcpcb {
#define TF_SEND_CWR 0x00020000 /* send CWR in next seg */
#define TF_DISABLE_ECN 0x00040000 /* disable ECN for this connection */
#endif
+#define TF_DEAD 0x00080000 /* dead and to-be-released */
struct mbuf *t_template; /* skeletal packet for transmit */
struct inpcb *t_inpcb; /* back pointer to internet pcb */
@@ -171,6 +172,9 @@ struct tcpcb {
u_int32_t ts_recent_age; /* when last updated */
tcp_seq last_ack_sent;
+/* pointer for syn cache entries*/
+ LIST_HEAD(, syn_cache) t_sc; /* list of entries by this tcb */
+
/* TUBA stuff */
caddr_t t_tuba_pcb; /* next level down pcb for TCP over z */
@@ -208,6 +212,80 @@ do { \
#endif /* _KERNEL */
/*
+ * Handy way of passing around TCP option info.
+ */
+/* Passed to tcp_dooptions() in place of separate out-parameters. */
+struct tcp_opt_info {
+ /* NOTE(review): the ts_* fields presumably describe the timestamp option -- confirm against tcp_dooptions() */
+ int ts_present;
+ u_int32_t ts_val;
+ u_int32_t ts_ecr;
+ u_int16_t maxseg; /* saved as sc_peermaxseg by syn_cache_add() */
+};
+
+#ifdef _KERNEL
+
+/*
+ * Data for the TCP compressed state engine.
+ */
+/*
+ * Address storage for one end of a cached connection.  The sa member
+ * gives access to sa_family; sin and sin6 give the per-family view.
+ */
+union syn_cache_sa {
+ struct sockaddr sa;
+ struct sockaddr_in sin;
+#if 1 /*def INET6*/
+ struct sockaddr_in6 sin6;
+#endif
+};
+
+/*
+ * One syn cache entry: the state remembered about a received SYN until
+ * the handshake completes (syn_cache_get()) or the entry is dropped.
+ */
+struct syn_cache {
+ TAILQ_ENTRY(syn_cache) sc_bucketq; /* link on bucket list */
+ struct timeout sc_timer; /* rexmt timer */
+ union { /* cached route */
+ struct route route4;
+#ifdef INET6
+ struct route_in6 route6;
+#endif
+ } sc_route_u;
+#define sc_route4 sc_route_u.route4
+#ifdef INET6
+#define sc_route6 sc_route_u.route6
+#endif
+ long sc_win; /* advertised window */
+ int sc_bucketidx; /* our bucket index */
+ u_int32_t sc_hash;
+ u_int32_t sc_timestamp; /* timestamp from SYN */
+#if 0
+ u_int32_t sc_timebase; /* our local timebase */
+#endif
+ union syn_cache_sa sc_src; /* peer: source address of the SYN */
+ union syn_cache_sa sc_dst; /* local: destination address of the SYN */
+ tcp_seq sc_irs; /* th_seq of the received SYN */
+ tcp_seq sc_iss; /* sequence number sent in our SYN,ACK */
+ u_int sc_rxtcur; /* current rxt timeout */
+ u_int sc_rxttot; /* total time spent on queues */
+ u_short sc_rxtshift; /* for computing backoff */
+ u_short sc_flags;
+
+#define SCF_UNREACH 0x0001 /* we've had an unreach error */
+#define SCF_TIMESTAMP 0x0002 /* peer will do timestamps */
+#define SCF_DEAD 0x0004 /* this entry to be released */
+#define SCF_SACK_PERMIT 0x0008 /* permit sack */
+#define SCF_ECN_PERMIT 0x0010 /* permit ecn */
+
+ struct mbuf *sc_ipopts; /* IP options */
+ u_int16_t sc_peermaxseg;
+ u_int16_t sc_ourmaxseg;
+ /* window scale shifts; syn_cache_add() stores 15 in both when scaling is not in use */
+ u_int8_t sc_request_r_scale : 4,
+ sc_requested_s_scale : 4;
+
+ struct tcpcb *sc_tp; /* tcb for listening socket */
+ LIST_ENTRY(syn_cache) sc_tpq; /* list of entries by same tp */
+};
+
+/* One hash bucket of the syn cache; tcp_syn_cache[] is an array of these. */
+struct syn_cache_head {
+ TAILQ_HEAD(, syn_cache) sch_bucket; /* bucket entries */
+ u_short sch_length; /* # entries in bucket */
+};
+#endif /* _KERNEL */
+
+/*
* The smoothed round-trip time and estimated variance
* are stored as fixed point numbers scaled by the values below.
* For convenience, these scales are also used in smoothing the average
@@ -319,6 +397,20 @@ struct tcpstat {
u_int32_t tcps_cwr_ecn; /* # of cwnd reduced by ecn */
u_int32_t tcps_cwr_frecovery; /* # of cwnd reduced by fastrecovery */
u_int32_t tcps_cwr_timeout; /* # of cwnd reduced by timeout */
+
+ /* These statistics deal with the SYN cache. */
+ u_int64_t tcps_sc_added; /* # of entries added */
+ u_int64_t tcps_sc_completed; /* # of connections completed */
+ u_int64_t tcps_sc_timed_out; /* # of entries timed out */
+ u_int64_t tcps_sc_overflowed; /* # dropped due to overflow */
+ u_int64_t tcps_sc_reset; /* # dropped due to RST */
+ u_int64_t tcps_sc_unreach; /* # dropped due to ICMP unreach */
+ u_int64_t tcps_sc_bucketoverflow;/* # dropped due to bucket overflow */
+ u_int64_t tcps_sc_aborted; /* # of entries aborted (no mem) */
+ u_int64_t tcps_sc_dupesyn; /* # of duplicate SYNs received */
+ u_int64_t tcps_sc_dropped; /* # of SYNs dropped (no route/mem) */
+ u_int64_t tcps_sc_collisions; /* # of hash collisions */
+ u_int64_t tcps_sc_retransmitted;/* # of retransmissions */
};
/*
@@ -339,7 +431,9 @@ struct tcpstat {
#define TCPCTL_RSTPPSLIMIT 12 /* RST pps limit */
#define TCPCTL_ACK_ON_PUSH 13 /* ACK immediately on PUSH */
#define TCPCTL_ECN 14 /* RFC3168 ECN */
-#define TCPCTL_MAXID 15
+#define TCPCTL_SYN_CACHE_LIMIT 15 /* max size of comp. state engine */
+#define TCPCTL_SYN_BUCKET_LIMIT 16 /* max size of hash bucket */
+#define TCPCTL_MAXID 17
#define TCPCTL_NAMES { \
{ 0, 0 }, \
@@ -357,6 +451,8 @@ struct tcpstat {
{ "rstppslimit", CTLTYPE_INT }, \
{ "ackonpush", CTLTYPE_INT }, \
{ "ecn", CTLTYPE_INT }, \
+ { "syn_cache_limit", CTLTYPE_INT }, \
+ { "syn_bucket_limit", CTLTYPE_INT }, \
}
struct tcp_ident_mapping {
@@ -377,6 +473,13 @@ extern struct pool sackhl_pool;
#endif
extern int tcp_do_ecn; /* RFC3168 ECN enabled/disabled? */
+extern int tcp_syn_cache_limit; /* max entries for compressed state engine */
+extern int tcp_syn_bucket_limit;/* max entries per hash bucket */
+
+extern int tcp_syn_cache_size;
+extern struct syn_cache_head tcp_syn_cache[];
+extern u_long syn_cache_count;
+
int tcp_attach(struct socket *);
void tcp_canceltimers(struct tcpcb *);
struct tcpcb *
@@ -391,7 +494,7 @@ struct tcpcb *
struct tcpcb *
tcp_drop(struct tcpcb *, int);
void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
- int *, u_int32_t *, u_int32_t *);
+ struct tcp_opt_info *);
void tcp_drain(void);
void tcp_init(void);
#if defined(INET6) && !defined(TCP6)
@@ -456,5 +559,24 @@ void tcp_rndiss_init(void);
tcp_seq tcp_rndiss_next(void);
u_int16_t
tcp_rndiss_encrypt(u_int16_t);
+
+/* SYN cache (TCP compressed state engine) interface. */
+int syn_cache_add(struct sockaddr *, struct sockaddr *,
+ struct tcphdr *, unsigned int, struct socket *,
+ struct mbuf *, u_char *, int, struct tcp_opt_info *);
+void syn_cache_unreach(struct sockaddr *, struct sockaddr *,
+ struct tcphdr *);
+struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
+ struct tcphdr *, unsigned int, unsigned int,
+ struct socket *so, struct mbuf *);
+void syn_cache_init(void);
+void syn_cache_insert(struct syn_cache *, struct tcpcb *);
+struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
+ struct syn_cache_head **);
+void syn_cache_reset(struct sockaddr *, struct sockaddr *,
+ struct tcphdr *);
+int syn_cache_respond(struct syn_cache *, struct mbuf *);
+void syn_cache_timer(void *);
+void syn_cache_cleanup(struct tcpcb *);
+
#endif /* _KERNEL */
#endif /* _NETINET_TCP_VAR_H_ */
diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c
index 85b5bdfac77..54e82dfd221 100644
--- a/usr.bin/netstat/inet.c
+++ b/usr.bin/netstat/inet.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: inet.c,v 1.77 2003/12/27 23:17:34 mcbride Exp $ */
+/* $OpenBSD: inet.c,v 1.78 2004/01/06 17:38:12 markus Exp $ */
/* $NetBSD: inet.c,v 1.14 1995/10/03 21:42:37 thorpej Exp $ */
/*
@@ -34,7 +34,7 @@
#if 0
static char sccsid[] = "from: @(#)inet.c 8.4 (Berkeley) 4/20/94";
#else
-static char *rcsid = "$OpenBSD: inet.c,v 1.77 2003/12/27 23:17:34 mcbride Exp $";
+static char *rcsid = "$OpenBSD: inet.c,v 1.78 2004/01/06 17:38:12 markus Exp $";
#endif
#endif /* not lint */
@@ -321,7 +321,6 @@ tcp_stats(u_long off, char *name)
p(tcps_predack, "\t%u correct ACK header prediction%s\n");
p(tcps_preddat, "\t%u correct data packet header prediction%s\n");
p3(tcps_pcbhashmiss, "\t%u PCB cache miss%s\n");
- p(tcps_badsyn, "\t%u SYN packet%s received with same src/dst address/port\n");
p(tcps_ecn_accepts, "\t%u ECN connection%s accepted\n");
p(tcps_ecn_rcvece, "\t\t%u ECE packet%s received\n");
@@ -334,6 +333,21 @@ tcp_stats(u_long off, char *name)
p1(tcps_cwr_timeout, "\t\t\tcwr by timeout: %u\n");
p1(tcps_cwr_ecn, "\t\t\tcwr by ecn: %u\n");
+ p(tcps_badsyn, "\t%u bad connection attempt%s\n");
+ p1(tcps_sc_added, "\t%qd SYN cache entries added\n");
+ p(tcps_sc_collisions, "\t\t%qd hash collision%s\n");
+ p1(tcps_sc_completed, "\t\t%qd completed\n");
+ p1(tcps_sc_aborted, "\t\t%qd aborted (no space to build PCB)\n");
+ p1(tcps_sc_timed_out, "\t\t%qd timed out\n");
+ p1(tcps_sc_overflowed, "\t\t%qd dropped due to overflow\n");
+ p1(tcps_sc_bucketoverflow, "\t\t%qd dropped due to bucket overflow\n");
+ p1(tcps_sc_reset, "\t\t%qd dropped due to RST\n");
+ p1(tcps_sc_unreach, "\t\t%qd dropped due to ICMP unreachable\n");
+ p(tcps_sc_retransmitted, "\t%qd SYN,ACK%s retransmitted\n");
+ p(tcps_sc_dupesyn, "\t%qd duplicate SYN%s received for entries "
+ "already in the cache\n");
+ p(tcps_sc_dropped, "\t%qd SYN%s dropped (no route or no space)\n");
+
#undef p
#undef p1
#undef p2