summaryrefslogtreecommitdiff
path: root/sys/netinet/tcp_input.c
diff options
context:
space:
mode:
author Kenjiro Cho <kjc@cvs.openbsd.org> 2002-05-16 14:10:52 +0000
committer Kenjiro Cho <kjc@cvs.openbsd.org> 2002-05-16 14:10:52 +0000
commit b0681ff14398e9fc13c6ca3bdfb5f07c3f8fd97d (patch)
tree c244cb486630be17846439dbc754ab939b097fd9 /sys/netinet/tcp_input.c
parent 4839e29c232ec84c87ca5908c81c5493f0a2fc10 (diff)
bring in ECN support from KAME.
It consists of:
 - ECN support in TCP
 - tunnel-egress and fragment-reassembly rules in layer 3, so that congestion information is not lost at tunnel egress and fragment reassembly
To enable ECN in TCP, build a kernel with TCP_ECN, and then turn it on with "sysctl -w net.inet.tcp.ecn=1". ok deraadt@
Diffstat (limited to 'sys/netinet/tcp_input.c')
-rw-r--r-- sys/netinet/tcp_input.c 132
1 files changed, 128 insertions, 4 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 847eacf5947..fd775bf73ee 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_input.c,v 1.110 2002/03/19 14:58:54 itojun Exp $ */
+/* $OpenBSD: tcp_input.c,v 1.111 2002/05/16 14:10:51 kjc Exp $ */
/* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */
/*
@@ -142,6 +142,18 @@ do { \
#define ND6_HINT(tp)
#endif
+#ifdef TCP_ECN
+/*
+ * ECN (Explicit Congestion Notification) support based on RFC3168
+ * implementation note:
+ * snd_last is used to track a recovery phase.
+ * when cwnd is reduced, snd_last is set to snd_max.
+ * while snd_last > snd_una, the sender is in a recovery phase and
+ * its cwnd should not be reduced again.
+ * snd_last follows snd_una when not in a recovery phase.
+ */
+#endif
+
/*
* Macro to compute ACK transmission behavior. Delay the ACK unless
* we have already delayed an ACK (must send an ACK every two segments).
@@ -419,6 +431,9 @@ tcp_input(struct mbuf *m, ...)
int error, s;
#endif /* IPSEC */
int af;
+#ifdef TCP_ECN
+ u_char iptos;
+#endif
va_start(ap, m);
iphlen = va_arg(ap, int);
@@ -515,6 +530,10 @@ tcp_input(struct mbuf *m, ...)
#endif
ti = mtod(m, struct tcpiphdr *);
+#ifdef TCP_ECN
+ /* save ip_tos before clearing it for checksum */
+ iptos = ip->ip_tos;
+#endif
/*
* Checksum extended TCP header and data.
*/
@@ -542,6 +561,9 @@ tcp_input(struct mbuf *m, ...)
case AF_INET6:
ipv6 = mtod(m, struct ip6_hdr *);
tlen = m->m_pkthdr.len - iphlen;
+#ifdef TCP_ECN
+ iptos = (ntohl(ipv6->ip6_flow) >> 20) & 0xff;
+#endif
/* Be proactive about malicious use of IPv4 mapped address */
if (IN6_IS_ADDR_V4MAPPED(&ipv6->ip6_src) ||
@@ -896,6 +918,13 @@ findpcb:
tp->rcv_lastend = th->th_seq + tlen;
}
#endif /* TCP_SACK */
+#ifdef TCP_ECN
+ /* if congestion experienced, set ECE bit in subsequent packets. */
+ if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
+ tp->t_flags |= TF_RCVD_CE;
+ tcpstat.tcps_ecn_rcvce++;
+ }
+#endif
/*
* Header prediction: check for the two common cases
* of a uni-directional data xfer. If the packet has
@@ -911,7 +940,11 @@ findpcb:
* the socket buffer and note that we need a delayed ack.
*/
if (tp->t_state == TCPS_ESTABLISHED &&
+#ifdef TCP_ECN
+ (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
+#else
(tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
+#endif
(!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) &&
th->th_seq == tp->rcv_nxt &&
tiwin && tiwin == tp->snd_wnd &&
@@ -948,12 +981,15 @@ findpcb:
ND6_HINT(tp);
sbdrop(&so->so_snd, acked);
tp->snd_una = th->th_ack;
-#if defined(TCP_SACK)
+#if defined(TCP_SACK) || defined(TCP_ECN)
/*
* We want snd_last to track snd_una so
* as to avoid sequence wraparound problems
* for very large transfers.
*/
+#ifdef TCP_ECN
+ if (SEQ_GT(tp->snd_una, tp->snd_last))
+#endif
tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
@@ -1188,7 +1224,7 @@ findpcb:
}
tp->irs = th->th_seq;
tcp_sendseqinit(tp);
-#if defined (TCP_SACK)
+#if defined (TCP_SACK) || defined(TCP_ECN)
tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
@@ -1196,6 +1232,16 @@ findpcb:
tp->retran_data = 0;
tp->snd_awnd = 0;
#endif /* TCP_FACK */
+#ifdef TCP_ECN
+ /*
+ * if both ECE and CWR flag bits are set, peer is ECN capable.
+ */
+ if (tcp_do_ecn &&
+ (tiflags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
+ tp->t_flags |= TF_ECN_PERMIT;
+ tcpstat.tcps_ecn_accepts++;
+ }
+#endif
tcp_rcvseqinit(tp);
tp->t_flags |= TF_ACKNOW;
tp->t_state = TCPS_SYN_RECEIVED;
@@ -1241,6 +1287,11 @@ findpcb:
SEQ_GT(th->th_ack, tp->snd_max)))
goto dropwithreset;
if (tiflags & TH_RST) {
+#ifdef TCP_ECN
+ /* if ECN is enabled, fall back to non-ecn at rexmit */
+ if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
+ goto drop;
+#endif
if (tiflags & TH_ACK)
tp = tcp_drop(tp, ECONNREFUSED);
goto drop;
@@ -1266,6 +1317,24 @@ findpcb:
if ((tp->t_flags & TF_SACK_PERMIT) == 0)
tp->sack_disable = 1;
#endif
+#ifdef TCP_ECN
+ /*
+ * if ECE is set but CWR is not set for SYN-ACK, or
+ * both ECE and CWR are set for simultaneous open,
+ * peer is ECN capable.
+ */
+ if (tcp_do_ecn) {
+ if ((tiflags & (TH_ACK|TH_ECE|TH_CWR))
+ == (TH_ACK|TH_ECE) ||
+ (tiflags & (TH_ACK|TH_ECE|TH_CWR))
+ == (TH_ECE|TH_CWR)) {
+ tp->t_flags |= TF_ECN_PERMIT;
+ tiflags &= ~(TH_ECE|TH_CWR);
+ tcpstat.tcps_ecn_accepts++;
+ }
+ }
+#endif
+
if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
tcpstat.tcps_connects++;
soisconnected(so);
@@ -1471,6 +1540,11 @@ trimthenstep6:
switch (tp->t_state) {
case TCPS_SYN_RECEIVED:
+#ifdef TCP_ECN
+ /* if ECN is enabled, fall back to non-ecn at rexmit */
+ if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
+ goto drop;
+#endif
so->so_error = ECONNREFUSED;
goto close;
@@ -1551,6 +1625,39 @@ trimthenstep6:
case TCPS_CLOSING:
case TCPS_LAST_ACK:
case TCPS_TIME_WAIT:
+#ifdef TCP_ECN
+ /*
+ * if we receive ECE and are not already in recovery phase,
+ * reduce cwnd by half but don't slow-start.
+ * advance snd_last to snd_max not to reduce cwnd again
+ * until all outstanding packets are acked.
+ */
+ if (tcp_do_ecn && (tiflags & TH_ECE)) {
+ if ((tp->t_flags & TF_ECN_PERMIT) &&
+ SEQ_GEQ(tp->snd_una, tp->snd_last)) {
+ u_int win;
+
+ win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
+ if (win > 1) {
+ tp->snd_ssthresh = win / 2 * tp->t_maxseg;
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->snd_last = tp->snd_max;
+ tp->t_flags |= TF_SEND_CWR;
+ tcpstat.tcps_cwr_ecn++;
+ }
+ }
+ tcpstat.tcps_ecn_rcvece++;
+ }
+ /*
+ * if we receive CWR, we know that the peer has reduced
+ * its congestion window. stop sending ecn-echo.
+ */
+ if ((tiflags & TH_CWR)) {
+ tp->t_flags &= ~TF_RCVD_CE;
+ tcpstat.tcps_ecn_rcvcwr++;
+ }
+#endif /* TCP_ECN */
+
if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
/*
* Duplicate/old ACK processing.
@@ -1621,7 +1728,7 @@ trimthenstep6:
ulmin(tp->snd_wnd, tp->snd_cwnd) /
2 / tp->t_maxseg;
-#if defined(TCP_SACK)
+#if defined(TCP_SACK) || defined(TCP_ECN)
if (SEQ_LT(th->th_ack, tp->snd_last)){
/*
* False fast retx after
@@ -1641,6 +1748,12 @@ trimthenstep6:
if (!tp->sack_disable) {
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
+#ifdef TCP_ECN
+ tp->t_flags |= TF_SEND_CWR;
+#endif
+#if 1 /* TCP_ECN */
+ tcpstat.tcps_cwr_frecovery++;
+#endif
tcpstat.tcps_sndrexmitfast++;
#if defined(TCP_SACK) && defined(TCP_FACK)
tp->t_dupacks = tcprexmtthresh;
@@ -1666,6 +1779,12 @@ trimthenstep6:
tp->t_rtttime = 0;
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_maxseg;
+#ifdef TCP_ECN
+ tp->t_flags |= TF_SEND_CWR;
+#endif
+#if 1 /* TCP_ECN */
+ tcpstat.tcps_cwr_frecovery++;
+#endif
tcpstat.tcps_sndrexmitfast++;
(void) tcp_output(tp);
@@ -1818,6 +1937,11 @@ trimthenstep6:
if (sb_notify(&so->so_snd))
sowwakeup(so);
tp->snd_una = th->th_ack;
+#ifdef TCP_ECN
+ /* sync snd_last with snd_una */
+ if (SEQ_GT(tp->snd_una, tp->snd_last))
+ tp->snd_last = tp->snd_una;
+#endif
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)