author    Niels Provos <provos@cvs.openbsd.org>    1998-11-17 19:23:04 +0000
committer Niels Provos <provos@cvs.openbsd.org>    1998-11-17 19:23:04 +0000
commit    f4056acce40a0cafaefbc4d11482cc581a495726 (patch)
tree      18b3206061d7da9c999130a9b9449317fa5c48db
parent    b6b557f3649354e658028fd32cc7555b7f400c72 (diff)
NewReno, SACK and FACK support for TCP, adapted from code for BSDI
by Hari Balakrishnan (hari@lcs.mit.edu), Tom Henderson (tomh@cs.berkeley.edu), and Venkat Padmanabhan (padmanab@cs.berkeley.edu) as part of the Daedalus research group at the University of California (http://daedalus.cs.berkeley.edu). [I was able to do this during time spent at the Center for Information Technology Integration (citi.umich.edu).]
-rw-r--r--   sys/netinet/tcp.h          20
-rw-r--r--   sys/netinet/tcp_debug.c     2
-rw-r--r--   sys/netinet/tcp_input.c   705
-rw-r--r--   sys/netinet/tcp_output.c  253
-rw-r--r--   sys/netinet/tcp_subr.c     28
-rw-r--r--   sys/netinet/tcp_timer.c    25
-rw-r--r--   sys/netinet/tcp_usrreq.c   30
-rw-r--r--   sys/netinet/tcp_var.h      69
8 files changed, 1098 insertions, 34 deletions
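
Background for the diff below: the sender learns about out-of-order arrivals through the TCP SACK option (RFC 2018), which carries up to TCP_MAX_SACK start/end pairs of sequence ranges the receiver already holds. The following standalone sketch is illustration only and not part of this commit; the constant values are copied from the tcp.h hunk, and TCPOPT_NOP is the standard value 1. It builds such an option in a buffer the way tcp_output() stuffs it: two NOPs for alignment, then kind and length, then the blocks in network byte order.

/*
 * Illustration only: lay out a SACK option (RFC 2018) with two blocks.
 * Constant values mirror the ones added to tcp.h in this commit.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define TCPOPT_NOP      1
#define TCPOPT_SACK     5
#define TCPOLEN_SACK    8               /* 2 * sizeof(tcp_seq) */

int
main(void)
{
        uint32_t blocks[2][2] = { { 1000, 1500 }, { 2000, 2500 } };
        unsigned char opt[2 + 2 + 2 * TCPOLEN_SACK];
        int i, n = 2;

        opt[0] = TCPOPT_NOP;            /* padding for 32-bit alignment */
        opt[1] = TCPOPT_NOP;
        opt[2] = TCPOPT_SACK;           /* option kind */
        opt[3] = 2 + n * TCPOLEN_SACK;  /* option length: 2 + 8 per block */
        for (i = 0; i < n; i++) {
                uint32_t s = htonl(blocks[i][0]), e = htonl(blocks[i][1]);

                memcpy(&opt[4 + i * TCPOLEN_SACK], &s, sizeof(s));
                memcpy(&opt[4 + i * TCPOLEN_SACK + 4], &e, sizeof(e));
        }
        for (i = 0; i < (int)sizeof(opt); i++)
                printf("%02x%s", opt[i], (i % 4 == 3) ? "\n" : " ");
        return 0;
}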
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 57069c1a682..39c001c3962 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp.h,v 1.2 1997/02/24 14:06:44 niklas Exp $ */
+/* $OpenBSD: tcp.h,v 1.3 1998/11/17 19:23:00 provos Exp $ */
/* $NetBSD: tcp.h,v 1.8 1995/04/17 05:32:58 cgd Exp $ */
/*
@@ -75,6 +75,7 @@ struct tcphdr {
#define TCPOPT_SACK_PERMITTED 4 /* Experimental */
#define TCPOLEN_SACK_PERMITTED 2
#define TCPOPT_SACK 5 /* Experimental */
+#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */
#define TCPOPT_TIMESTAMP 8
#define TCPOLEN_TIMESTAMP 10
#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */
@@ -82,6 +83,20 @@ struct tcphdr {
#define TCPOPT_TSTAMP_HDR \
(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)
+#ifdef TCP_SACK
+/* Option definitions */
+#define TCPOPT_SACK_PERMIT_HDR \
+(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED)
+#define TCPOPT_SACK_HDR (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8)
+/* Miscellaneous constants */
+#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at sender side */
+#define TCP_MAX_SACK 3 /* MAX # SACKs sent in any segment */
+#endif /* TCP_SACK */
+
+#if defined(TCP_SACK) || defined(TCP_NEWRENO)
+#define TCP_MAXBURST 4 /* Max # packets after leaving Fast Rxmit */
+#endif
+
/*
* Default maximum segment size for TCP.
* With an IP MSS of 576, this is 536,
@@ -99,3 +114,6 @@ struct tcphdr {
*/
#define TCP_NODELAY 0x01 /* don't delay send to coalesce packets */
#define TCP_MAXSEG 0x02 /* set maximum segment size */
+#ifdef TCP_SACK
+#define TCP_SACK_DISABLE 0x300 /* disable SACKs(if enabled by deflt.)*/
+#endif /* TCP_SACK */
diff --git a/sys/netinet/tcp_debug.c b/sys/netinet/tcp_debug.c
index 17e038acf8d..6adc013193e 100644
--- a/sys/netinet/tcp_debug.c
+++ b/sys/netinet/tcp_debug.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_debug.c,v 1.2 1996/03/03 22:30:44 niklas Exp $ */
+/* $OpenBSD: tcp_debug.c,v 1.3 1998/11/17 19:23:01 provos Exp $ */
/* $NetBSD: tcp_debug.c,v 1.10 1996/02/13 23:43:36 christos Exp $ */
/*
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index cacdcc1b9e8..72f19aafa92 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_input.c,v 1.20 1998/10/28 21:34:32 provos Exp $ */
+/* $OpenBSD: tcp_input.c,v 1.21 1998/11/17 19:23:01 provos Exp $ */
/* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */
/*
@@ -609,6 +609,11 @@ findpcb:
if (tp->t_state != TCPS_SYN_RECEIVED)
tp->t_timer[TCPT_KEEP] = tcp_keepidle;
+#ifdef TCP_SACK
+ if (!tp->sack_disable)
+ tcp_del_sackholes(tp, ti); /* Delete stale SACK holes */
+#endif /* TCP_SACK */
+
/*
* Process options if not in LISTEN state,
* else do it below (after getting remote address).
@@ -617,6 +622,12 @@ findpcb:
tcp_dooptions(tp, optp, optlen, ti,
&ts_present, &ts_val, &ts_ecr);
+#ifdef TCP_SACK
+ if (!tp->sack_disable) {
+ tp->rcv_laststart = ti->ti_seq; /* last rec'vd segment*/
+ tp->rcv_lastend = ti->ti_seq + ti->ti_len;
+ }
+#endif /* TCP_SACK */
/*
* Header prediction: check for the two common cases
* of a uni-directional data xfer. If the packet has
@@ -652,7 +663,7 @@ findpcb:
if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
tp->snd_cwnd >= tp->snd_wnd &&
- tp->t_dupacks < tcprexmtthresh) {
+ tp->t_dupacks == 0) {
/*
* this is a pure ack for outstanding data.
*/
@@ -667,6 +678,10 @@ findpcb:
tcpstat.tcps_rcvackbyte += acked;
sbdrop(&so->so_snd, acked);
tp->snd_una = ti->ti_ack;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tp->snd_fack = tp->snd_una;
+ tp->retran_data = 0;
+#endif /* TCP_FACK */
m_freem(m);
/*
@@ -697,6 +712,11 @@ findpcb:
* with nothing on the reassembly queue and
* we have enough buffer space to take it.
*/
+#ifdef TCP_SACK
+ /* Clean receiver SACK report if present */
+ if (!tp->sack_disable && tp->rcv_numsacks)
+ tcp_clean_sackreport(tp);
+#endif /* TCP_SACK */
++tcpstat.tcps_preddat;
tp->rcv_nxt += ti->ti_len;
tcpstat.tcps_rcvpack++;
@@ -822,6 +842,17 @@ findpcb:
if (optp)
tcp_dooptions(tp, optp, optlen, ti,
&ts_present, &ts_val, &ts_ecr);
+#ifdef TCP_SACK
+ /*
+ * If peer did not send a SACK_PERMITTED option (i.e., if
+ * tcp_dooptions() did not set TF_SACK_PERMIT), set
+ * sack_disable to 1 if it is currently 0.
+ */
+ if (!tp->sack_disable)
+ if ((tp->t_flags & TF_SACK_PERMIT) == 0)
+ tp->sack_disable = 1;
+#endif
+
if (iss)
tp->iss = iss;
else
@@ -833,6 +864,14 @@ findpcb:
#endif /* !TCP_COMPAT_42 */
tp->irs = ti->ti_seq;
tcp_sendseqinit(tp);
+#if defined (TCP_SACK) || defined (TCP_NEWRENO)
+ tp->snd_last = tp->snd_una;
+#endif /* TCP_SACK || TCP_NEWRENO */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tp->snd_fack = tp->snd_una;
+ tp->retran_data = 0;
+ tp->snd_awnd = 0;
+#endif /* TCP_FACK */
tcp_rcvseqinit(tp);
tp->t_flags |= TF_ACKNOW;
tp->t_state = TCPS_SYN_RECEIVED;
@@ -893,6 +932,16 @@ findpcb:
tp->irs = ti->ti_seq;
tcp_rcvseqinit(tp);
tp->t_flags |= TF_ACKNOW;
+#ifdef TCP_SACK
+ /*
+ * If we've sent a SACK_PERMITTED option, and the peer
+ * also replied with one, then TF_SACK_PERMIT should have
+ * been set in tcp_dooptions(). If it was not, disable SACKs.
+ */
+ if (!tp->sack_disable)
+ if ((tp->t_flags & TF_SACK_PERMIT) == 0)
+ tp->sack_disable = 1;
+#endif
if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
tcpstat.tcps_connects++;
soisconnected(so);
@@ -911,6 +960,15 @@ findpcb:
*/
if (tp->t_rtt)
tcp_xmit_timer(tp, tp->t_rtt);
+ /*
+ * Since new data was acked (the SYN), open the
+ * congestion window by one MSS. We do this
+ * here, because we won't go through the normal
+ * ACK processing below. And since this is the
+ * start of the connection, we know we are in
+ * the exponential phase of slow-start.
+ */
+ tp->snd_cwnd += tp->t_maxseg;
} else
tp->t_state = TCPS_SYN_RECEIVED;
@@ -1169,7 +1227,31 @@ trimthenstep6:
case TCPS_TIME_WAIT:
if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
- if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
+ /*
+ * Duplicate/old ACK processing.
+ * Increments t_dupacks:
+ * Pure duplicate (same seq/ack/window, no data)
+ * Doesn't affect t_dupacks:
+ * Data packets.
+ * Normal window updates (window opens)
+ * Resets t_dupacks:
+ * New data ACKed.
+ * Window shrinks
+ * Old ACK
+ */
+ if (ti->ti_len)
+ break;
+ /*
+ * If we get an old ACK, there is probably packet
+ * reordering going on. Be conservative and reset
+ * t_dupacks so that we are less aggressive in
+ * doing a fast retransmit.
+ */
+ if (ti->ti_ack != tp->snd_una) {
+ tp->t_dupacks = 0;
+ break;
+ }
+ if (tiwin == tp->snd_wnd) {
tcpstat.tcps_rcvdupack++;
/*
* If we have outstanding data (other than
@@ -1195,45 +1277,186 @@ trimthenstep6:
* to keep a constant cwnd packets in the
* network.
*/
- if (tp->t_timer[TCPT_REXMT] == 0 ||
- ti->ti_ack != tp->snd_una)
+ if (tp->t_timer[TCPT_REXMT] == 0)
tp->t_dupacks = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /*
+ * In FACK, can enter fast rec. if the receiver
+ * reports a reass. queue longer than 3 segs.
+ */
+ else if (++tp->t_dupacks == tcprexmtthresh ||
+ ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
+ tp->t_maxseg + tp->snd_una)) &&
+ SEQ_GT(tp->snd_una, tp->snd_last))) {
+#else
else if (++tp->t_dupacks == tcprexmtthresh) {
+#endif /* TCP_FACK */
tcp_seq onxt = tp->snd_nxt;
u_int win =
min(tp->snd_wnd, tp->snd_cwnd) / 2 /
tp->t_maxseg;
+#if defined(TCP_SACK) || defined(TCP_NEWRENO)
+ if (SEQ_LT(ti->ti_ack, tp->snd_last)){
+ /*
+ * False fast retx after
+ * timeout. Do not cut window.
+ */
+ tp->snd_cwnd += tp->t_maxseg;
+ tp->t_dupacks = 0;
+ (void) tcp_output(tp);
+ goto drop;
+ }
+#endif
if (win < 2)
win = 2;
tp->snd_ssthresh = win * tp->t_maxseg;
+#if defined(TCP_SACK) || defined(TCP_NEWRENO)
+ tp->snd_last = tp->snd_max;
+#endif
+#ifdef TCP_SACK
+ if (!tp->sack_disable) {
+ tp->t_timer[TCPT_REXMT] = 0;
+ tp->t_rtt = 0;
+ tcpstat.tcps_sndrexmitfast++;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ (void) tcp_output(tp);
+ /*
+ * During FR, snd_cwnd is held
+ * constant for FACK.
+ */
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->t_dupacks = tcprexmtthresh;
+#else
+ /*
+ * tcp_output() will send
+ * oldest SACK-eligible rtx.
+ */
+ (void) tcp_output(tp);
+ tp->snd_cwnd = tp->snd_ssthresh+
+ tp->t_maxseg * tp->t_dupacks;
+#endif /* TCP_FACK */
+ /*
+ * It is possible for
+ * tcp_output to fail to send
+ * a segment. If so, make
+ * sure that the REXMT timer is set.
+ */
+ if (SEQ_GT(tp->snd_max,
+ tp->snd_una) &&
+ tp->t_timer[TCPT_REXMT] == 0 &&
+ tp->t_timer[TCPT_PERSIST] == 0)
+ tp->t_timer[TCPT_REXMT] =
+ tp->t_rxtcur;
+ goto drop;
+ }
+#endif /* TCP_SACK */
tp->t_timer[TCPT_REXMT] = 0;
tp->t_rtt = 0;
tp->snd_nxt = ti->ti_ack;
tp->snd_cwnd = tp->t_maxseg;
+ tcpstat.tcps_sndrexmitfast++;
(void) tcp_output(tp);
+
tp->snd_cwnd = tp->snd_ssthresh +
tp->t_maxseg * tp->t_dupacks;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
goto drop;
} else if (tp->t_dupacks > tcprexmtthresh) {
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /*
+ * while (awnd < cwnd)
+ * sendsomething();
+ */
+ if (!tp->sack_disable) {
+ if (tp->snd_awnd < tp->snd_cwnd)
+ tcp_output(tp);
+ goto drop;
+ }
+#endif /* TCP_FACK */
tp->snd_cwnd += tp->t_maxseg;
(void) tcp_output(tp);
goto drop;
}
- } else
+ } else if (tiwin < tp->snd_wnd) {
+ /*
+ * The window was retracted! Previous dup
+ * ACKs may have been due to packets arriving
+ * after the shrunken window, not a missing
+ * packet, so play it safe and reset t_dupacks
+ */
tp->t_dupacks = 0;
+ }
break;
}
/*
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
+#ifdef TCP_NEWRENO
+ if (tp->t_dupacks >= tcprexmtthresh && !tcp_newreno(tp, ti)) {
+ /* Out of fast recovery */
+ tp->snd_cwnd = tp->snd_ssthresh;
+ /*
+ * Window inflation should have left us with approx.
+ * snd_ssthresh outstanding data. But in case we
+ * would be inclined to send a burst, better to do
+ * it via the slow start mechanism.
+ */
+ if (tcp_seq_subtract(tp->snd_max, ti->ti_ack) <
+ tp->snd_ssthresh)
+ tp->snd_cwnd = tcp_seq_subtract(tp->snd_max,
+ ti->ti_ack) + tp->t_maxseg;
+ tp->t_dupacks = 0;
+ }
+#elif defined(TCP_SACK)
+ if (!tp->sack_disable) {
+ if (tp->t_dupacks >= tcprexmtthresh) {
+ /* Check for a partial ACK */
+ if (tcp_sack_partialack(tp, ti)) {
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /* Force call to tcp_output */
+ if (tp->snd_awnd < tp->snd_cwnd)
+ needoutput = 1;
+#else
+ tp->snd_cwnd += tp->t_maxseg;
+ needoutput = 1;
+#endif /* TCP_FACK */
+ } else {
+ /* Out of fast recovery */
+ tp->snd_cwnd = tp->snd_ssthresh;
+ if (tcp_seq_subtract(tp->snd_max,
+ ti->ti_ack) < tp->snd_ssthresh)
+ tp->snd_cwnd =
+ tcp_seq_subtract(tp->snd_max,
+ ti->ti_ack) + tp->t_maxseg;
+ tp->t_dupacks = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ if (SEQ_GT(ti->ti_ack, tp->snd_fack))
+ tp->snd_fack = ti->ti_ack;
+#endif /* TCP_FACK */
+ }
+ }
+ } else {
+ if (tp->t_dupacks >= tcprexmtthresh &&
+ !tcp_newreno(tp, ti)) {
+ /* Out of fast recovery */
+ tp->snd_cwnd = tp->snd_ssthresh;
+ if (tcp_seq_subtract(tp->snd_max, ti->ti_ack) <
+ tp->snd_ssthresh)
+ tp->snd_cwnd =
+ tcp_seq_subtract(tp->snd_max,
+ ti->ti_ack) + tp->t_maxseg;
+ tp->t_dupacks = 0;
+ }
+ }
+#else /* else neither TCP_NEWRENO nor TCP_SACK */
if (tp->t_dupacks >= tcprexmtthresh &&
tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_dupacks = 0;
+#endif
if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
tcpstat.tcps_rcvacktoomuch++;
goto dropafterack;
@@ -1272,9 +1495,7 @@ trimthenstep6:
* If the window gives us less than ssthresh packets
* in flight, open exponentially (maxseg per packet).
* Otherwise open linearly: maxseg per window
- * (maxseg^2 / cwnd per packet), plus a constant
- * fraction of a packet (maxseg/8) to help larger windows
- * open quickly enough.
+ * (maxseg^2 / cwnd per packet).
*/
{
register u_int cw = tp->snd_cwnd;
@@ -1282,6 +1503,9 @@ trimthenstep6:
if (cw > tp->snd_ssthresh)
incr = incr * incr / cw;
+#if defined (TCP_NEWRENO) || defined (TCP_SACK)
+ if (SEQ_GEQ(ti->ti_ack, tp->snd_last))
+#endif
tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
}
if (acked > so->so_snd.sb_cc) {
@@ -1298,6 +1522,10 @@ trimthenstep6:
tp->snd_una = ti->ti_ack;
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
+#if defined (TCP_SACK) && defined (TCP_FACK)
+ if (SEQ_GT(tp->snd_una, tp->snd_fack))
+ tp->snd_fack = tp->snd_una;
+#endif
switch (tp->t_state) {
@@ -1454,6 +1682,10 @@ dodata: /* XXX */
if ((ti->ti_len || (tiflags & TH_FIN)) &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
TCP_REASS(tp, ti, m, so, tiflags);
+#ifdef TCP_SACK
+ if (!tp->sack_disable)
+ tcp_update_sack_list(tp);
+#endif
/*
* Note the amount of data that peer has sent into
* our window, in order to estimate the sender's
@@ -1519,8 +1751,20 @@ dodata: /* XXX */
/*
* Return any desired output.
*/
- if (needoutput || (tp->t_flags & TF_ACKNOW))
+ if (needoutput || (tp->t_flags & TF_ACKNOW)) {
(void) tcp_output(tp);
+#ifdef TCP_SACK
+ /*
+ * In SACK, it is possible for tcp_output() to fail to send a segment
+ * after the retransmission timer has been turned off. Make sure that
+ * the retransmission timer is set if we are in fast recovery.
+ */
+ if (needoutput && SEQ_GT(tp->snd_max, tp->snd_una) &&
+ tp->t_timer[TCPT_REXMT] == 0 &&
+ tp->t_timer[TCPT_PERSIST] == 0)
+ tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
+#endif
+ }
return;
dropafterack:
@@ -1636,6 +1880,20 @@ tcp_dooptions(tp, cp, cnt, ti, ts_present, ts_val, ts_ecr)
tp->ts_recent_age = tcp_now;
}
break;
+
+#ifdef TCP_SACK
+ case TCPOPT_SACK_PERMITTED:
+ if (tp->sack_disable || optlen!=TCPOLEN_SACK_PERMITTED)
+ continue;
+ if (ti->ti_flags & TH_SYN)
+ /* MUST only be set on SYN */
+ tp->t_flags |= TF_SACK_PERMIT;
+ break;
+ case TCPOPT_SACK:
+ if (tcp_sack_option(tp, ti, cp, optlen))
+ continue;
+ break;
+#endif
}
}
/* Update t_maxopd and t_maxseg after all options are processed */
@@ -1643,6 +1901,395 @@ tcp_dooptions(tp, cp, cnt, ti, ts_present, ts_val, ts_ecr)
(void) tcp_mss(tp, mss); /* sets t_maxseg */
}
+#if defined(TCP_SACK) || defined(TCP_NEWRENO)
+u_long
+tcp_seq_subtract(a, b)
+ u_long a, b;
+{
+ return ((long)(a - b));
+}
+#endif
+
+
+#ifdef TCP_SACK
+/*
+ * This function is called upon receipt of new valid data (while not in header
+ * prediction mode), and it updates the ordered list of sacks.
+ */
+void
+tcp_update_sack_list(tp)
+ struct tcpcb *tp;
+{
+ /*
+ * First reported block MUST be the most recent one. Subsequent
+ * blocks SHOULD be in the order in which they arrived at the
+ * receiver. These two conditions make the implementation fully
+ * compliant with RFC 2018.
+ */
+ int i, j = 0, count = 0, lastpos = -1;
+ struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
+
+ /* First clean up current list of sacks */
+ for (i = 0; i < tp->rcv_numsacks; i++) {
+ sack = tp->sackblks[i];
+ if (sack.start == 0 && sack.end == 0) {
+ count++; /* count = number of blocks to be discarded */
+ continue;
+ }
+ if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
+ tp->sackblks[i].start = tp->sackblks[i].end = 0;
+ count++;
+ } else {
+ temp[j].start = tp->sackblks[i].start;
+ temp[j++].end = tp->sackblks[i].end;
+ }
+ }
+ tp->rcv_numsacks -= count;
+ if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
+ tcp_clean_sackreport(tp);
+ if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) {
+ /* ==> need first sack block */
+ tp->sackblks[0].start = tp->rcv_laststart;
+ tp->sackblks[0].end = tp->rcv_lastend;
+ tp->rcv_numsacks = 1;
+ }
+ return;
+ }
+ /* Otherwise, sack blocks are already present. */
+ for (i = 0; i < tp->rcv_numsacks; i++)
+ tp->sackblks[i] = temp[i]; /* first copy back sack list */
+ if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend))
+ return; /* sack list remains unchanged */
+ /*
+ * From here, segment just received should be (part of) the 1st sack.
+ * Go through list, possibly coalescing sack block entries.
+ */
+ firstsack.start = tp->rcv_laststart;
+ firstsack.end = tp->rcv_lastend;
+ for (i = 0; i < tp->rcv_numsacks; i++) {
+ sack = tp->sackblks[i];
+ if (SEQ_LT(sack.end, firstsack.start) ||
+ SEQ_GT(sack.start, firstsack.end))
+ continue; /* no overlap */
+ if (sack.start == firstsack.start && sack.end == firstsack.end){
+ /*
+ * identical block; delete it here since we will
+ * move it to the front of the list.
+ */
+ tp->sackblks[i].start = tp->sackblks[i].end = 0;
+ lastpos = i; /* last posn with a zero entry */
+ continue;
+ }
+ if (SEQ_LEQ(sack.start, firstsack.start))
+ firstsack.start = sack.start; /* merge blocks */
+ if (SEQ_GEQ(sack.end, firstsack.end))
+ firstsack.end = sack.end; /* merge blocks */
+ tp->sackblks[i].start = tp->sackblks[i].end = 0;
+ lastpos = i; /* last posn with a zero entry */
+ }
+ if (lastpos != -1) { /* at least one merge */
+ for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
+ sack = tp->sackblks[i];
+ if (sack.start == 0 && sack.end == 0)
+ continue;
+ temp[j++] = sack;
+ }
+ tp->rcv_numsacks = j; /* including first blk (added later) */
+ for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
+ tp->sackblks[i] = temp[i];
+ } else { /* no merges -- shift sacks by 1 */
+ if (tp->rcv_numsacks < MAX_SACK_BLKS)
+ tp->rcv_numsacks++;
+ for (i = tp->rcv_numsacks-1; i > 0; i--)
+ tp->sackblks[i] = tp->sackblks[i-1];
+ }
+ tp->sackblks[0] = firstsack;
+ return;
+}
+
+/*
+ * Process the TCP SACK option. Returns 1 if tcp_dooptions() should just
+ * continue (option ignored), and 0 if the option was fine. tp->snd_holes is
+ * an ordered list of holes (oldest to newest, in terms of the sequence space).
+ */
+int
+tcp_sack_option(tp, ti, cp, optlen)
+ struct tcpcb *tp;
+ struct tcpiphdr *ti;
+ u_char *cp;
+ int optlen;
+{
+ int tmp_olen;
+ u_char *tmp_cp;
+ struct sackhole *cur, *p, *temp;
+
+ if (tp->sack_disable)
+ return 1;
+
+ /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
+ if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
+ return 1;
+ tmp_cp = cp + 2;
+ tmp_olen = optlen - 2;
+ if (tp->snd_numholes < 0)
+ tp->snd_numholes = 0;
+ if (tp->t_maxseg == 0)
+ panic("tcp_sack_option"); /* Should never happen */
+ while (tmp_olen > 0) {
+ struct sackblk sack;
+
+ bcopy((char *) tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
+ NTOHL(sack.start);
+ bcopy((char *) tmp_cp + sizeof(tcp_seq),
+ (char *) &(sack.end), sizeof(tcp_seq));
+ NTOHL(sack.end);
+ tmp_olen -= TCPOLEN_SACK;
+ tmp_cp += TCPOLEN_SACK;
+ if (SEQ_LEQ(sack.end, sack.start))
+ continue; /* bad SACK fields */
+ if (SEQ_LEQ(sack.end, tp->snd_una))
+ continue; /* old block */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /* Updates snd_fack. */
+ if (SEQ_GEQ(sack.end, tp->snd_fack))
+ tp->snd_fack = sack.end;
+#endif /* TCP_FACK */
+ if (tp->snd_holes == 0) { /* first hole */
+ tp->snd_holes = (struct sackhole *)
+ malloc(sizeof(struct sackhole), M_PCB, M_NOWAIT);
+ cur = tp->snd_holes;
+ cur->start = ti->ti_ack;
+ cur->end = sack.start;
+ cur->rxmit = cur->start;
+ cur->next = 0;
+ tp->snd_numholes = 1;
+ tp->rcv_lastsack = sack.end;
+ /*
+ * dups is at least one. If more data has been
+ * SACKed, it can be greater than one.
+ */
+ cur->dups = min(tcprexmtthresh,
+ ((sack.end - cur->end)/tp->t_maxseg));
+ if (cur->dups < 1)
+ cur->dups = 1;
+ continue; /* with next sack block */
+ }
+ /* Go thru list of holes: p = previous, cur = current */
+ p = cur = tp->snd_holes;
+ while (cur) {
+ if (SEQ_LEQ(sack.end, cur->start))
+ /* SACKs data before the current hole */
+ break; /* no use going through more holes */
+ if (SEQ_GEQ(sack.start, cur->end)) {
+ /* SACKs data beyond the current hole */
+ cur->dups++;
+ if ( ((sack.end - cur->end)/tp->t_maxseg) >=
+ tcprexmtthresh)
+ cur->dups = tcprexmtthresh;
+ p = cur;
+ cur = cur->next;
+ continue;
+ }
+ if (SEQ_LEQ(sack.start, cur->start)) {
+ /* Data acks at least the beginning of hole */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ if (SEQ_GT(sack.end, cur->rxmit))
+ tp->retran_data -=
+ tcp_seq_subtract(cur->rxmit,
+ cur->start);
+ else
+ tp->retran_data -=
+ tcp_seq_subtract(sack.end,
+ cur->start);
+#endif /* TCP_FACK */
+ if (SEQ_GEQ(sack.end,cur->end)){
+ /* Acks entire hole, so delete hole */
+ if (p != cur) {
+ p->next = cur->next;
+ free(cur, M_PCB);
+ cur = p->next;
+ } else {
+ cur=cur->next;
+ free(p, M_PCB);
+ p = cur;
+ tp->snd_holes = p;
+ }
+ tp->snd_numholes--;
+ continue;
+ }
+ /* otherwise, move start of hole forward */
+ cur->start = sack.end;
+ cur->rxmit = max (cur->rxmit, cur->start);
+ p = cur;
+ cur = cur->next;
+ continue;
+ }
+ /* move end of hole backward */
+ if (SEQ_GEQ(sack.end, cur->end)) {
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ if (SEQ_GT(cur->rxmit, sack.start))
+ tp->retran_data -=
+ tcp_seq_subtract(cur->rxmit,
+ sack.start);
+#endif /* TCP_FACK */
+ cur->end = sack.start;
+ cur->rxmit = min (cur->rxmit, cur->end);
+ cur->dups++;
+ if ( ((sack.end - cur->end)/tp->t_maxseg) >=
+ tcprexmtthresh)
+ cur->dups = tcprexmtthresh;
+ p = cur;
+ cur = cur->next;
+ continue;
+ }
+ if (SEQ_LT(cur->start, sack.start) &&
+ SEQ_GT(cur->end, sack.end)) {
+ /*
+ * ACKs some data in middle of a hole; need to
+ * split current hole
+ */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ if (SEQ_GT(cur->rxmit, sack.end))
+ tp->retran_data -=
+ tcp_seq_subtract(sack.end,
+ sack.start);
+ else if (SEQ_GT(cur->rxmit, sack.start))
+ tp->retran_data -=
+ tcp_seq_subtract(cur->rxmit,
+ sack.start);
+#endif /* TCP_FACK */
+ temp = (struct sackhole *)malloc(sizeof(*temp),
+ M_PCB,M_NOWAIT);
+ temp->next = cur->next;
+ temp->start = sack.end;
+ temp->end = cur->end;
+ temp->dups = cur->dups;
+ temp->rxmit = max (cur->rxmit, temp->start);
+ cur->end = sack.start;
+ cur->rxmit = min (cur->rxmit, cur->end);
+ cur->dups++;
+ if ( ((sack.end - cur->end)/tp->t_maxseg) >=
+ tcprexmtthresh)
+ cur->dups = tcprexmtthresh;
+ cur->next = temp;
+ p = temp;
+ cur = p->next;
+ tp->snd_numholes++;
+ }
+ }
+ /* At this point, p points to the last hole on the list */
+ if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
+ /*
+ * Need to append new hole at end.
+ * Last hole is p (and it's not NULL).
+ */
+ temp = (struct sackhole *) malloc(sizeof(*temp),
+ M_PCB, M_NOWAIT);
+ temp->start = tp->rcv_lastsack;
+ temp->end = sack.start;
+ temp->dups = min(tcprexmtthresh,
+ ((sack.end - sack.start)/tp->t_maxseg));
+ if (temp->dups < 1)
+ temp->dups = 1;
+ temp->rxmit = temp->start;
+ temp->next = 0;
+ p->next = temp;
+ tp->rcv_lastsack = sack.end;
+ tp->snd_numholes++;
+ }
+ }
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /*
+ * Update retran_data, snd_fack, and snd_awnd. Go through the list of
+ * holes. Increment retran_data by (hole->rxmit - hole->start).
+ * snd_fack gets the highest value of hole->end.
+ */
+ tp->retran_data = 0;
+ cur = tp->snd_holes;
+ while (cur) {
+ tp->retran_data += cur->rxmit - cur->start;
+ cur = cur->next;
+ }
+ tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
+ tp->retran_data;
+#endif /* TCP_FACK */
+
+ return 0;
+}
+
+/*
+ * Delete stale (i.e., cumulatively ack'd) holes. A hole is deleted only if
+ * it is completely acked; otherwise, tcp_sack_option(), called from
+ * tcp_dooptions(), will fix up the hole.
+ */
+void
+tcp_del_sackholes(tp, ti)
+ struct tcpcb *tp;
+ struct tcpiphdr *ti;
+{
+ if (!tp->sack_disable && tp->t_state != TCPS_LISTEN) {
+ /* max because this could be an older ack just arrived */
+ tcp_seq lastack = max(ti->ti_ack, tp->snd_una);
+ struct sackhole *cur = tp->snd_holes;
+ struct sackhole *prev = cur;
+ while (cur)
+ if (SEQ_LEQ(cur->end, lastack)) {
+ cur = cur->next;
+ free(prev, M_PCB);
+ prev = cur;
+ tp->snd_numholes--;
+ } else if (SEQ_LT(cur->start, lastack)) {
+ cur->start = lastack;
+ break;
+ } else
+ break;
+ tp->snd_holes = cur;
+ }
+}
+
+/*
+ * Delete all receiver-side SACK information.
+ */
+void
+tcp_clean_sackreport(tp)
+ struct tcpcb *tp;
+{
+ int i;
+
+ tp->rcv_numsacks = 0;
+ for (i = 0; i < MAX_SACK_BLKS; i++)
+ tp->sackblks[i].start = tp->sackblks[i].end=0;
+
+}
+
+/*
+ * Checks for partial ack. If partial ack arrives, turn off retransmission
+ * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
+ * If the ack advances at least to tp->snd_last, return 0.
+ */
+int
+tcp_sack_partialack(tp, ti)
+ struct tcpcb *tp;
+ struct tcpiphdr *ti;
+{
+ if (SEQ_LT(ti->ti_ack, tp->snd_last)) {
+ /* Turn off retx. timer (will start again next segment) */
+ tp->t_timer[TCPT_REXMT] = 0;
+ tp->t_rtt = 0;
+#ifndef TCP_FACK
+ /*
+ * Partial window deflation. This statement relies on the
+ * fact that tp->snd_una has not been updated yet. In FACK
+ * hold snd_cwnd constant during fast recovery.
+ */
+ tp->snd_cwnd -= (ti->ti_ack - tp->snd_una - tp->t_maxseg);
+#endif
+ return 1;
+ }
+ return 0;
+}
+#endif /* TCP_SACK */
+
/*
* Pull out of band byte out of a segment so
* it doesn't appear in the user's data queue.
@@ -1784,7 +2431,6 @@ tcp_mss(tp, offer)
u_long bufsize;
struct inpcb *inp;
struct socket *so;
- extern int tcp_mssdflt;
inp = tp->t_inpcb;
ro = &inp->inp_route;
@@ -1919,3 +2565,40 @@ tcp_mss(tp, offer)
return (mss);
}
#endif /* TUBA_INCLUDE */
+
+#if defined(TCP_NEWRENO) || defined (TCP_SACK)
+/*
+ * Checks for partial ack. If partial ack arrives, force the retransmission
+ * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
+ * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to
+ * be started again. If the ack advances at least to tp->snd_last, return 0.
+ */
+int
+tcp_newreno(tp, ti)
+struct tcpcb *tp;
+struct tcpiphdr *ti;
+{
+ if (SEQ_LT(ti->ti_ack, tp->snd_last)) {
+ tcp_seq onxt = tp->snd_nxt;
+ tcp_seq ouna = tp->snd_una; /* snd_una not yet updated */
+ u_long ocwnd = tp->snd_cwnd;
+ tp->t_timer[TCPT_REXMT] = 0;
+ tp->t_rtt = 0;
+ tp->snd_nxt = ti->ti_ack;
+ tp->snd_cwnd = tp->t_maxseg;
+ tp->snd_una = ti->ti_ack;
+ (void) tcp_output(tp);
+ tp->snd_cwnd = ocwnd;
+ tp->snd_una = ouna;
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+ /*
+ * Partial window deflation. Relies on fact that tp->snd_una
+ * not updated yet.
+ */
+ tp->snd_cwnd -= (ti->ti_ack - tp->snd_una - tp->t_maxseg);
+ return 1;
+ }
+ return 0;
+}
+#endif /* TCP_NEWRENO || TCP_SACK */
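
The tcp_seq_subtract() helper added above leans on unsigned modular arithmetic so that distances come out right even after the 32-bit sequence space wraps. A small standalone sketch of the same idea (assumption: 32-bit sequence numbers; the values are made up for illustration):

#include <stdio.h>
#include <stdint.h>

/* Unsigned subtraction wraps mod 2^32, like tcp_seq_subtract(). */
static uint32_t
seq_sub(uint32_t a, uint32_t b)
{
        return (a - b);
}

int
main(void)
{
        uint32_t snd_max = 0x00000100;  /* highest seq sent, wrapped past 0 */
        uint32_t ti_ack  = 0xffffff00;  /* cumulative ACK just before wrap */

        /* 0x200 bytes are still outstanding despite the wrap. */
        printf("outstanding = %u\n", (unsigned)seq_sub(snd_max, ti_ack));
        return 0;
}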
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index e73e39b6696..de8eed77369 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_output.c,v 1.11 1998/10/28 21:34:33 provos Exp $ */
+/* $OpenBSD: tcp_output.c,v 1.12 1998/11/17 19:23:02 provos Exp $ */
/* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */
/*
@@ -70,8 +70,88 @@
extern struct mbuf *m_copypack();
#endif
+#ifdef TCP_SACK
+extern int tcprexmtthresh;
+#endif
+#ifdef TCP_SACK
+#define MAX_TCPOPTLEN 40 /* need 40 at least for 3 SACKs + TIMESTAMP */
+#else
#define MAX_TCPOPTLEN 32 /* max # bytes that go in options */
+#endif
+
+#ifdef TCP_SACK
+#ifdef TCP_SACK_DEBUG
+void
+tcp_print_holes(tp)
+struct tcpcb *tp;
+{
+ struct sackhole *p = tp->snd_holes;
+ if (p == 0)
+ return;
+ printf("Hole report: start--end dups rxmit\n");
+ while (p) {
+ printf("%x--%x d %d r %x\n", p->start, p->end, p->dups,
+ p->rxmit);
+ p = p->next;
+ }
+ printf("\n");
+}
+#endif /* TCP_SACK_DEBUG */
+
+/*
+ * Returns pointer to a sackhole if there are any pending retransmissions;
+ * NULL otherwise.
+ */
+struct sackhole *
+tcp_sack_output(tp)
+register struct tcpcb *tp;
+{
+ struct sackhole *p;
+ if (tp->sack_disable)
+ return 0;
+ p = tp->snd_holes;
+ while (p) {
+ if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) {
+ if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
+ p = p->next;
+ continue;
+ }
+#ifdef TCP_SACK_DEBUG
+ if (p)
+ tcp_print_holes(tp);
+#endif
+ return p;
+ }
+ p = p->next;
+ }
+ return 0;
+}
+
+/*
+ * After a timeout, the SACK list may be rebuilt. This SACK information
+ * should be used to avoid retransmitting SACKed data. This function
+ * traverses the SACK list to see if snd_nxt should be moved forward.
+ */
+void
+tcp_sack_adjust(tp)
+ struct tcpcb *tp;
+{
+ int i;
+
+ for (i = 0; i < tp->rcv_numsacks; i++) {
+ if (SEQ_LT(tp->snd_nxt, tp->sackblks[i].start))
+ break;
+ if (SEQ_LEQ(tp->sackblks[i].end, tp->snd_nxt))
+ continue;
+ if (tp->sackblks[i].start == 0 && tp->sackblks[i].end == 0)
+ continue;
+ /* snd_nxt must be in middle of block of SACKed data */
+ tp->snd_nxt = tp->sackblks[i].end;
+ break;
+ }
+}
+#endif /* TCP_SACK */
/*
* Tcp output routine: figure out what should be sent and send it.
@@ -88,6 +168,13 @@ tcp_output(tp)
u_char opt[MAX_TCPOPTLEN];
unsigned int optlen, hdrlen;
int idle, sendalot;
+#ifdef TCP_SACK
+ int i, sack_rxmit = 0;
+ struct sackhole *p;
+#endif
+#if defined(TCP_SACK) || defined(TCP_NEWRENO)
+ int maxburst = TCP_MAXBURST;
+#endif
/*
* Determine length of data that should be transmitted,
@@ -105,6 +192,15 @@ tcp_output(tp)
tp->snd_cwnd = tp->t_maxseg;
again:
sendalot = 0;
+#ifdef TCP_SACK
+ /*
+ * If we've recently taken a timeout, snd_max will be greater than
+ * snd_nxt. There may be SACK information that allows us to avoid
+ * resending already delivered data. Adjust snd_nxt accordingly.
+ */
+ if (!tp->sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max))
+ tcp_sack_adjust(tp);
+#endif
off = tp->snd_nxt - tp->snd_una;
win = min(tp->snd_wnd, tp->snd_cwnd);
@@ -115,6 +211,32 @@ again:
* and timer expired, we will send what we can
* and go to transmit state.
*/
+
+#ifdef TCP_SACK
+ /*
+ * Send any SACK-generated retransmissions. If we're explicitly trying
+ * to send out new data (when sendalot is 1), bypass this function.
+ * If we retransmit in fast recovery mode, decrement snd_cwnd, since
+ * we're replacing a (future) new transmission with a retransmission
+ * now, and we previously incremented snd_cwnd in tcp_input().
+ */
+ if (!tp->sack_disable && !sendalot) {
+ if ((p = tcp_sack_output(tp))) {
+ off = p->rxmit - tp->snd_una;
+ sack_rxmit = 1;
+#if 0
+ /* Coalesce holes into a single retransmission */
+#endif
+ len = min(tp->t_maxseg, p->end - p->rxmit);
+#ifndef TCP_FACK
+ /* in FACK, hold snd_cwnd constant during recovery */
+ if (SEQ_LT(tp->snd_una, tp->snd_last))
+ tp->snd_cwnd -= tp->t_maxseg;
+#endif
+ }
+ }
+#endif /* TCP_SACK */
+
if (tp->t_force) {
if (win == 0) {
/*
@@ -142,11 +264,24 @@ again:
}
}
- if (win < so->so_snd.sb_cc) {
- len = win - off;
- flags &= ~TH_FIN;
- } else
- len = so->so_snd.sb_cc - off;
+#ifdef TCP_SACK
+ if (!sack_rxmit) {
+#endif
+ len = min(so->so_snd.sb_cc, win) - off;
+
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /*
+ * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), and
+ * amount of outstanding data (snd_awnd) is >= snd_cwnd, then
+ * do not send data (like zero window conditions)
+ */
+ if (!tp->sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) &&
+ (tp->snd_awnd >= tp->snd_cwnd))
+ len = 0;
+#endif /* TCP_FACK */
+#ifdef TCP_SACK
+ }
+#endif
if (len < 0) {
/*
@@ -154,25 +289,23 @@ again:
* but we haven't been called to retransmit,
* len will be -1. Otherwise, window shrank
* after we sent into it. If window shrank to 0,
- * calcel pending retransmit, pull snd_nxt back
- * to (closed) window, and set the persist timer
- * if it isn't already running. If the window
- * didn't close completely, just wait for an ACK.
+ * cancel pending retransmit and pull snd_nxt
+ * back to (closed) window. We will enter persist
+ * state below. If the window didn't close completely,
+ * just wait for an ACK.
*/
len = 0;
if (win == 0) {
tp->t_timer[TCPT_REXMT] = 0;
- tp->t_rxtshift = 0;
tp->snd_nxt = tp->snd_una;
- if (tp->t_timer[TCPT_PERSIST] == 0)
- tcp_setpersist(tp);
}
}
if (len > tp->t_maxseg) {
len = tp->t_maxseg;
- flags &= ~TH_FIN;
sendalot = 1;
}
+ if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
+ flags &= ~TH_FIN;
win = sbspace(&so->so_rcv);
@@ -198,6 +331,10 @@ again:
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max))
goto send;
+#ifdef TCP_SACK
+ if (sack_rxmit)
+ goto send;
+#endif
}
/*
@@ -294,6 +431,20 @@ send:
mss = htons((u_int16_t) tcp_mss(tp, 0));
bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss));
optlen = 4;
+#ifdef TCP_SACK
+ /*
+ * If this is the first SYN of connection (not a SYN
+ * ACK), include SACK_PERMIT_HDR option. If this is a
+ * SYN ACK, include SACK_PERMIT_HDR option if peer has
+ * already done so.
+ */
+ if (!tp->sack_disable && ((flags & TH_ACK) == 0 ||
+ (tp->t_flags & TF_SACK_PERMIT))) {
+ *((u_long *) (opt + optlen)) =
+ htonl(TCPOPT_SACK_PERMIT_HDR);
+ optlen += 4;
+ }
+#endif
if ((tp->t_flags & TF_REQ_SCALE) &&
((flags & TH_ACK) == 0 ||
@@ -326,6 +477,34 @@ send:
optlen += TCPOLEN_TSTAMP_APPA;
}
+#ifdef TCP_SACK
+ /*
+ * Send SACKs if necessary. This should be the last option processed.
+ * Only as many SACKs are sent as are permitted by the maximum options
+ * size. No more than three SACKs are sent.
+ */
+ if (!tp->sack_disable && tp->t_state == TCPS_ESTABLISHED &&
+ (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
+ tp->rcv_numsacks) {
+ u_long *lp = (u_long *) (opt + optlen);
+ u_long *olp = lp++;
+ int count = 0; /* actual number of SACKs inserted */
+ int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;
+
+ maxsack = min(maxsack, TCP_MAX_SACK);
+ for (i=0; (i < tp->rcv_numsacks && count < maxsack); i++) {
+ struct sackblk sack = tp->sackblks[i];
+ if (sack.start == 0 && sack.end == 0)
+ continue;
+ *lp++ = htonl(sack.start);
+ *lp++ = htonl(sack.end);
+ count++;
+ }
+ *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
+ optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
+ }
+#endif /* TCP_SACK */
+
hdrlen += optlen;
/*
@@ -447,6 +626,23 @@ send:
ti->ti_seq = htonl(tp->snd_nxt);
else
ti->ti_seq = htonl(tp->snd_max);
+#ifdef TCP_SACK
+ if (sack_rxmit) {
+ /*
+ * If sendalot was turned on (due to option stuffing), turn it
+ * off. Properly set th_seq field. Advance the ret'x pointer
+ * by len.
+ */
+ if (sendalot)
+ sendalot = 0;
+ ti->ti_seq = htonl(p->rxmit);
+ p->rxmit += len;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tp->retran_data += len;
+#endif /* TCP_FACK */
+ }
+#endif /* TCP_SACK */
+
ti->ti_ack = htonl(tp->rcv_nxt);
if (optlen) {
bcopy((caddr_t)opt, (caddr_t)(ti + 1), optlen);
@@ -508,6 +704,13 @@ send:
tp->t_flags |= TF_SENTFIN;
}
}
+#ifdef TCP_SACK
+ if (!tp->sack_disable) {
+ if (sack_rxmit && (p->rxmit != tp->snd_nxt)) {
+ goto timer;
+ }
+ }
+#endif
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
@@ -530,6 +733,19 @@ send:
* Initialize shift counter which is used for backoff
* of retransmit time.
*/
+#ifdef TCP_SACK
+ timer:
+ if (!tp->sack_disable && sack_rxmit &&
+ tp->t_timer[TCPT_REXMT] == 0 &&
+ tp->snd_nxt != tp->snd_max) {
+ tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
+ if (tp->t_timer[TCPT_PERSIST]) {
+ tp->t_timer[TCPT_PERSIST] = 0;
+ tp->t_rxtshift = 0;
+ }
+ }
+#endif
+
if (tp->t_timer[TCPT_REXMT] == 0 &&
tp->snd_nxt != tp->snd_una) {
tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
@@ -571,6 +787,11 @@ send:
error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route,
so->so_options & SO_DONTROUTE);
#endif
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /* Update snd_awnd to reflect the new data that was sent. */
+ tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) +
+ tp->retran_data;
+#endif
}
if (error) {
out:
@@ -597,7 +818,11 @@ out:
tp->rcv_adv = tp->rcv_nxt + win;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
+#if defined(TCP_SACK) || defined(TCP_NEWRENO)
+ if (sendalot && --maxburst)
+#else
if (sendalot)
+#endif
goto again;
return (0);
}
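
The option-space arithmetic behind raising MAX_TCPOPTLEN to 40 and capping TCP_MAX_SACK at 3 can be checked in isolation: with RFC 1323 timestamps already occupying 12 bytes, a 4-byte SACK header (two NOPs, kind, length) plus three 8-byte blocks exactly fills 40 bytes. A small sketch reproducing the maxsack computation from the hunk above, illustration only:

#include <stdio.h>

#define MAX_TCPOPTLEN           40      /* room for 3 SACKs + TIMESTAMP */
#define TCPOLEN_SACK            8
#define TCPOLEN_TSTAMP_APPA     12
#define TCP_MAX_SACK            3

int
main(void)
{
        int optlen = TCPOLEN_TSTAMP_APPA;       /* timestamps already stuffed */
        int maxsack = (MAX_TCPOPTLEN - (optlen + 4)) / TCPOLEN_SACK;

        if (maxsack > TCP_MAX_SACK)
                maxsack = TCP_MAX_SACK;
        /* 12 (timestamp) + 4 (NOP,NOP,kind,len) + 3*8 = 40 bytes */
        printf("SACK blocks that fit: %d\n", maxsack);
        return 0;
}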
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 293d769f5d0..255f94e8519 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_subr.c,v 1.11 1998/10/28 21:34:33 provos Exp $ */
+/* $OpenBSD: tcp_subr.c,v 1.12 1998/11/17 19:23:02 provos Exp $ */
/* $NetBSD: tcp_subr.c,v 1.22 1996/02/13 23:44:00 christos Exp $ */
/*
@@ -77,10 +77,19 @@ int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
* used as the default).
*/
#ifndef TCP_DO_RFC1323
-#define TCP_DO_RFC1323 1
+#define TCP_DO_RFC1323 1
#endif
int tcp_do_rfc1323 = TCP_DO_RFC1323;
+#ifndef TCP_DO_SACK
+#ifdef TCP_SACK
+#define TCP_DO_SACK 1
+#else
+#define TCP_DO_SACK 0
+#endif
+#endif
+int tcp_do_sack = TCP_DO_SACK; /* RFC 2018 selective ACKs */
+
#ifndef TCBHASHSIZE
#define TCBHASHSIZE 128
#endif
@@ -237,6 +246,9 @@ tcp_newtcpcb(inp)
LIST_INIT(&tp->segq);
tp->t_maxseg = tp->t_maxopd = tcp_mssdflt;
+#ifdef TCP_SACK
+ tp->sack_disable = tcp_do_sack ? 0 : 1;
+#endif
tp->t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
tp->t_inpcb = inp;
/*
@@ -293,6 +305,9 @@ tcp_close(tp)
register struct ipqent *qe;
struct inpcb *inp = tp->t_inpcb;
struct socket *so = inp->inp_socket;
+#ifdef TCP_SACK
+ struct sackhole *p, *q;
+#endif
#ifdef RTV_RTT
register struct rtentry *rt;
@@ -369,6 +384,15 @@ tcp_close(tp)
m_freem(qe->ipqe_m);
FREE(qe, M_IPQ);
}
+#ifdef TCP_SACK
+ /* Free SACK holes. */
+ q = p = tp->snd_holes;
+ while (p != 0) {
+ q = p->next;
+ free(p, M_PCB);
+ p = q;
+ }
+#endif
if (tp->t_template)
(void) m_free(dtom(tp->t_template));
free(tp, M_PCB);
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 357fa300db2..53f4ef0e130 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_timer.c,v 1.8 1997/08/26 20:02:34 deraadt Exp $ */
+/* $OpenBSD: tcp_timer.c,v 1.9 1998/11/17 19:23:02 provos Exp $ */
/* $NetBSD: tcp_timer.c,v 1.14 1996/02/13 23:44:09 christos Exp $ */
/*
@@ -178,6 +178,26 @@ tcp_timers(tp, timer)
int timer;
{
register int rexmt;
+#ifdef TCP_SACK
+ struct sackhole *p, *q;
+ /*
+ * Free SACK holes for 2MSL and REXMT timers.
+ */
+ if (timer == TCPT_2MSL || timer == TCPT_REXMT) {
+ q = p = tp->snd_holes;
+ while (p != 0) {
+ q = p->next;
+ free(p, M_PCB);
+ p = q;
+ }
+ tp->snd_holes = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tp->snd_fack = tp->snd_una;
+ tp->retran_data = 0;
+ tp->snd_awnd = 0;
+#endif /* TCP_FACK */
+ }
+#endif /* TCP_SACK */
switch (timer) {
@@ -227,6 +247,9 @@ tcp_timers(tp, timer)
tp->t_srtt = 0;
}
tp->snd_nxt = tp->snd_una;
+#if defined (TCP_NEWRENO) || defined (TCP_SACK)
+ tp->snd_last = tp->snd_una;
+#endif
/*
* If timing a segment in this window, stop the timer.
*/
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 880b21ec5e0..7ed5d330fc4 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_usrreq.c,v 1.28 1998/06/27 02:42:41 deraadt Exp $ */
+/* $OpenBSD: tcp_usrreq.c,v 1.29 1998/11/17 19:23:02 provos Exp $ */
/* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */
/*
@@ -241,6 +241,14 @@ tcp_usrreq(so, req, m, nam, control)
tcp_iss += arc4random() % (TCP_ISSINCR / 2) + 1;
#endif /* !TCP_COMPAT_42 */
tcp_sendseqinit(tp);
+#if defined(TCP_SACK) || defined(TCP_NEWRENO)
+ tp->snd_last = tp->snd_una;
+#endif
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tp->snd_fack = tp->snd_una;
+ tp->retran_data = 0;
+ tp->snd_awnd = 0;
+#endif
error = tcp_output(tp);
break;
@@ -435,6 +443,12 @@ tcp_ctloutput(op, so, level, optname, mp)
error = EINVAL;
break;
+#ifdef TCP_SACK
+ case TCP_SACK_DISABLE:
+ i = *mtod(m, int *);
+ tp->sack_disable = i;
+ break;
+#endif
default:
error = ENOPROTOOPT;
break;
@@ -454,6 +468,11 @@ tcp_ctloutput(op, so, level, optname, mp)
case TCP_MAXSEG:
*mtod(m, int *) = tp->t_maxseg;
break;
+#ifdef TCP_SACK
+ case TCP_SACK_DISABLE:
+ *mtod(m, int *) = tp->sack_disable;
+ break;
+#endif
default:
error = ENOPROTOOPT;
break;
@@ -654,7 +673,14 @@ tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
case TCPCTL_RFC1323:
return (sysctl_int(oldp, oldlenp, newp, newlen,
&tcp_do_rfc1323));
-
+#ifdef TCP_SACK
+ case TCPCTL_SACK:
+ return (sysctl_int(oldp, oldlenp, newp, newlen,
+ &tcp_do_sack));
+#endif
+ case TCPCTL_MSSDFLT:
+ return (sysctl_int(oldp, oldlenp, newp, newlen,
+ &tcp_mssdflt));
case TCPCTL_KEEPINITTIME:
return (sysctl_int(oldp, oldlenp, newp, newlen,
&tcptv_keep_init));
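
For applications, the visible knobs added here are the per-connection TCP_SACK_DISABLE socket option and the new sysctl exported as TCPCTL_SACK ("sack" in the names table in tcp_var.h below). A hypothetical userland use of the socket option might look as follows; it is a sketch only and has an effect only on a kernel compiled with options TCP_SACK:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <unistd.h>

#ifndef TCP_SACK_DISABLE
#define TCP_SACK_DISABLE 0x300          /* value added to tcp.h above */
#endif

int
main(void)
{
        int s = socket(AF_INET, SOCK_STREAM, 0);
        int on = 1;

        if (s < 0) {
                perror("socket");
                return 1;
        }
        /* Turn SACK off for this connection even if enabled by default. */
        if (setsockopt(s, IPPROTO_TCP, TCP_SACK_DISABLE, &on, sizeof(on)) < 0)
                perror("setsockopt(TCP_SACK_DISABLE)");
        close(s);
        return 0;
}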
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 3dd17d12a51..a8c59db5987 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_var.h,v 1.12 1998/10/28 21:34:33 provos Exp $ */
+/* $OpenBSD: tcp_var.h,v 1.13 1998/11/17 19:23:03 provos Exp $ */
/* $NetBSD: tcp_var.h,v 1.17 1996/02/13 23:44:24 christos Exp $ */
/*
@@ -36,6 +36,23 @@
* @(#)tcp_var.h 8.3 (Berkeley) 4/10/94
*/
+#ifdef TCP_SACK
+struct sackblk
+{
+ tcp_seq start; /* start seq no. of sack block */
+ tcp_seq end; /* end seq no. */
+};
+
+struct sackhole
+{
+ tcp_seq start; /* start seq no. of hole */
+ tcp_seq end; /* end seq no. */
+ int dups; /* number of dup(s)acks for this hole */
+ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */
+ struct sackhole *next; /* next in list */
+};
+#endif
+
/*
* Kernel variables for tcp.
*/
@@ -78,11 +95,33 @@ struct tcpcb {
tcp_seq snd_wl2; /* window update seg ack number */
tcp_seq iss; /* initial send sequence number */
u_long snd_wnd; /* send window */
+#ifdef TCP_SACK
+ int sack_disable; /* disable SACK for this connection */
+ int snd_numholes; /* number of holes seen by sender */
+ struct sackhole *snd_holes; /* linked list of holes (sorted) */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tcp_seq snd_fack; /* for FACK congestion control */
+ u_long snd_awnd; /* snd_nxt - snd_fack + */
+ /* retransmitted data */
+ int retran_data; /* amount of outstanding retx. data */
+#endif /* TCP_FACK */
+#endif /* TCP_SACK */
+#if defined(TCP_SACK) || defined(TCP_NEWRENO)
+ tcp_seq snd_last; /* for use in fast recovery */
+#endif
/* receive sequence variables */
u_long rcv_wnd; /* receive window */
tcp_seq rcv_nxt; /* receive next */
tcp_seq rcv_up; /* receive urgent pointer */
tcp_seq irs; /* initial receive sequence number */
+#ifdef TCP_SACK
+ tcp_seq rcv_laststart; /* start of last segment recd. */
+ tcp_seq rcv_lastend; /* end of ... */
+ tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/
+ int rcv_numsacks; /* # distinct sack blks present */
+ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
+#endif
+
/*
* Additional variables for this implementation.
*/
@@ -193,6 +232,7 @@ struct tcpstat {
u_quad_t tcps_sndbyte; /* data bytes sent */
u_long tcps_sndrexmitpack; /* data packets retransmitted */
u_quad_t tcps_sndrexmitbyte; /* data bytes retransmitted */
+ u_quad_t tcps_sndrexmitfast; /* Fast retransmits */
u_long tcps_sndacks; /* ack-only packets sent */
u_long tcps_sndprobe; /* window probes sent */
u_long tcps_sndurg; /* packets sent with URG only */
@@ -243,7 +283,9 @@ struct tcpstat {
#define TCPCTL_RECVSPACE 7 /* receive buffer space */
#define TCPCTL_SENDSPACE 8 /* send buffer space */
#define TCPCTL_IDENT 9 /* get connection owner */
-#define TCPCTL_MAXID 10
+#define TCPCTL_SACK 10 /* selective acknowledgement, rfc 2018 */
+#define TCPCTL_MSSDFLT 11 /* Default maximum segment size */
+#define TCPCTL_MAXID 12
#define TCPCTL_NAMES { \
{ 0, 0 }, \
@@ -256,6 +298,8 @@ struct tcpstat {
{ "recvspace", CTLTYPE_INT }, \
{ "sendspace", CTLTYPE_INT }, \
{ "ident", CTLTYPE_STRUCT }, \
+ { "sack", CTLTYPE_INT }, \
+ { "mssdflt", CTLTYPE_INT }, \
}
struct tcp_ident_mapping {
@@ -268,6 +312,10 @@ struct inpcbtable tcbtable; /* head of queue of active tcpcb's */
struct tcpstat tcpstat; /* tcp statistics */
u_int32_t tcp_now; /* for RFC 1323 timestamps */
extern int tcp_do_rfc1323; /* enabled/disabled? */
+extern int tcp_mssdflt; /* default maximum segment size */
+#ifdef TCP_SACK
+extern int tcp_do_sack; /* SACK enabled/disabled */
+#endif
int tcp_attach __P((struct socket *));
void tcp_canceltimers __P((struct tcpcb *));
@@ -310,4 +358,21 @@ int tcp_usrreq __P((struct socket *,
int, struct mbuf *, struct mbuf *, struct mbuf *));
void tcp_xmit_timer __P((struct tcpcb *, int));
void tcpdropoldhalfopen __P((struct tcpcb *, u_int16_t));
+#ifdef TCP_SACK
+int tcp_sack_option __P((struct tcpcb *,struct tcpiphdr *,u_char *,int));
+void tcp_update_sack_list __P((struct tcpcb *tp));
+void tcp_del_sackholes __P((struct tcpcb *, struct tcpiphdr *));
+void tcp_clean_sackreport __P((struct tcpcb *tp));
+void tcp_sack_adjust __P((struct tcpcb *tp));
+struct sackhole * tcp_sack_output __P((struct tcpcb *tp));
+int tcp_sack_partialack __P((struct tcpcb *, struct tcpiphdr *));
+#ifdef DEBUG
+void tcp_print_holes __P((struct tcpcb *tp));
#endif
+#endif /* TCP_SACK */
+#if defined(TCP_NEWRENO) || defined(TCP_SACK)
+int tcp_newreno __P((struct tcpcb *, struct tcpiphdr *));
+u_long tcp_seq_subtract __P((u_long, u_long ));
+#endif /* TCP_NEWRENO || TCP_SACK */
+
+#endif /* KERNEL */
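
To summarize the sender-side data structure introduced in this header: tp->snd_holes is a singly linked "scoreboard" of sequence ranges the peer has not yet SACKed, and tcp_sack_output() walks it for the oldest hole that has seen enough duplicate reports and still has unretransmitted data. A userland mock of that selection rule follows; it is illustration only, plain < stands in for the wrap-safe SEQ_LT macro, and the numbers are invented.

#include <stdio.h>
#include <stdint.h>

struct sackhole {
        uint32_t start, end;            /* un-SACKed range */
        int dups;                       /* dup (s)acks seen for this hole */
        uint32_t rxmit;                 /* next seq in hole to retransmit */
        struct sackhole *next;
};

/* Mirror of the tcp_sack_output() selection rule, simplified. */
static struct sackhole *
pick_retransmit(struct sackhole *p, int rexmtthresh)
{
        for (; p != NULL; p = p->next)
                if (p->dups >= rexmtthresh && p->rxmit < p->end)
                        return p;       /* oldest SACK-eligible hole */
        return NULL;
}

int
main(void)
{
        struct sackhole h2 = { 3000, 3500, 3, 3000, NULL };
        struct sackhole h1 = { 1000, 1500, 3, 1500, &h2 };  /* fully rexmitted */
        struct sackhole *p = pick_retransmit(&h1, 3);

        if (p)
                printf("retransmit %u..%u\n", (unsigned)p->rxmit, (unsigned)p->end);
        else
                printf("nothing SACK-eligible yet\n");
        return 0;
}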