| author | Niels Provos <provos@cvs.openbsd.org> | 1998-11-17 19:23:04 +0000 |
|---|---|---|
| committer | Niels Provos <provos@cvs.openbsd.org> | 1998-11-17 19:23:04 +0000 |
| commit | f4056acce40a0cafaefbc4d11482cc581a495726 (patch) | |
| tree | 18b3206061d7da9c999130a9b9449317fa5c48db | |
| parent | b6b557f3649354e658028fd32cc7555b7f400c72 (diff) | |
NewReno, SACK and FACK support for TCP, adapted from code for BSDI
by Hari Balakrishnan (hari@lcs.mit.edu), Tom Henderson (tomh@cs.berkeley.edu)
and Venkat Padmanabhan (padmanab@cs.berkeley.edu) as part of the
Daedalus research group at the University of California,
(http://daedalus.cs.berkeley.edu). [I was able to do this on time spent
at the Center for Information Technology Integration (citi.umich.edu)]
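
The tcp.h and tcp_usrreq.c hunks below add a per-connection escape hatch for the new SACK code: a TCP_SACK_DISABLE socket option (value 0x300), handled in tcp_ctloutput(). As a rough illustration only (not part of this commit), here is a minimal userland sketch of how a program on a kernel built with `option TCP_SACK` might turn SACK off for a single socket before connecting; the fallback #define simply reuses the value from the diff.

```c
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <err.h>

/*
 * TCP_SACK_DISABLE is only exposed by <netinet/tcp.h> when TCP_SACK is
 * defined, so fall back to the value introduced by this diff (0x300).
 */
#ifndef TCP_SACK_DISABLE
#define TCP_SACK_DISABLE 0x300
#endif

int
main(void)
{
	int s, disable = 1;

	if ((s = socket(AF_INET, SOCK_STREAM, 0)) == -1)
		err(1, "socket");

	/*
	 * Setting the option before connect(2) sets tp->sack_disable in
	 * tcp_ctloutput(), so no SACK_PERMITTED option goes out on the SYN.
	 */
	if (setsockopt(s, IPPROTO_TCP, TCP_SACK_DISABLE, &disable,
	    sizeof(disable)) == -1)
		err(1, "setsockopt(TCP_SACK_DISABLE)");

	/* ... connect(2) and use the socket as usual ... */
	return (0);
}
```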
-rw-r--r-- | sys/netinet/tcp.h        |  20
-rw-r--r-- | sys/netinet/tcp_debug.c  |   2
-rw-r--r-- | sys/netinet/tcp_input.c  | 705
-rw-r--r-- | sys/netinet/tcp_output.c | 253
-rw-r--r-- | sys/netinet/tcp_subr.c   |  28
-rw-r--r-- | sys/netinet/tcp_timer.c  |  25
-rw-r--r-- | sys/netinet/tcp_usrreq.c |  30
-rw-r--r-- | sys/netinet/tcp_var.h    |  69
8 files changed, 1098 insertions, 34 deletions
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 57069c1a682..39c001c3962 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp.h,v 1.2 1997/02/24 14:06:44 niklas Exp $ */ +/* $OpenBSD: tcp.h,v 1.3 1998/11/17 19:23:00 provos Exp $ */ /* $NetBSD: tcp.h,v 1.8 1995/04/17 05:32:58 cgd Exp $ */ /* @@ -75,6 +75,7 @@ struct tcphdr { #define TCPOPT_SACK_PERMITTED 4 /* Experimental */ #define TCPOLEN_SACK_PERMITTED 2 #define TCPOPT_SACK 5 /* Experimental */ +#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ #define TCPOPT_TIMESTAMP 8 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ @@ -82,6 +83,20 @@ struct tcphdr { #define TCPOPT_TSTAMP_HDR \ (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP) +#ifdef TCP_SACK +/* Option definitions */ +#define TCPOPT_SACK_PERMIT_HDR \ +(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED) +#define TCPOPT_SACK_HDR (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8) +/* Miscellaneous constants */ +#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at sender side */ +#define TCP_MAX_SACK 3 /* MAX # SACKs sent in any segment */ +#endif /* TCP_SACK */ + +#if defined(TCP_SACK) || defined(TCP_NEWRENO) +#define TCP_MAXBURST 4 /* Max # packets after leaving Fast Rxmit */ +#endif + /* * Default maximum segment size for TCP. * With an IP MSS of 576, this is 536, @@ -99,3 +114,6 @@ struct tcphdr { */ #define TCP_NODELAY 0x01 /* don't delay send to coalesce packets */ #define TCP_MAXSEG 0x02 /* set maximum segment size */ +#ifdef TCP_SACK +#define TCP_SACK_DISABLE 0x300 /* disable SACKs(if enabled by deflt.)*/ +#endif /* TCP_SACK */ diff --git a/sys/netinet/tcp_debug.c b/sys/netinet/tcp_debug.c index 17e038acf8d..6adc013193e 100644 --- a/sys/netinet/tcp_debug.c +++ b/sys/netinet/tcp_debug.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_debug.c,v 1.2 1996/03/03 22:30:44 niklas Exp $ */ +/* $OpenBSD: tcp_debug.c,v 1.3 1998/11/17 19:23:01 provos Exp $ */ /* $NetBSD: tcp_debug.c,v 1.10 1996/02/13 23:43:36 christos Exp $ */ /* diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index cacdcc1b9e8..72f19aafa92 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_input.c,v 1.20 1998/10/28 21:34:32 provos Exp $ */ +/* $OpenBSD: tcp_input.c,v 1.21 1998/11/17 19:23:01 provos Exp $ */ /* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */ /* @@ -609,6 +609,11 @@ findpcb: if (tp->t_state != TCPS_SYN_RECEIVED) tp->t_timer[TCPT_KEEP] = tcp_keepidle; +#ifdef TCP_SACK + if (!tp->sack_disable) + tcp_del_sackholes(tp, ti); /* Delete stale SACK holes */ +#endif /* TCP_SACK */ + /* * Process options if not in LISTEN state, * else do it below (after getting remote address). @@ -617,6 +622,12 @@ findpcb: tcp_dooptions(tp, optp, optlen, ti, &ts_present, &ts_val, &ts_ecr); +#ifdef TCP_SACK + if (!tp->sack_disable) { + tp->rcv_laststart = ti->ti_seq; /* last rec'vd segment*/ + tp->rcv_lastend = ti->ti_seq + ti->ti_len; + } +#endif /* TCP_SACK */ /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has @@ -652,7 +663,7 @@ findpcb: if (SEQ_GT(ti->ti_ack, tp->snd_una) && SEQ_LEQ(ti->ti_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && - tp->t_dupacks < tcprexmtthresh) { + tp->t_dupacks == 0) { /* * this is a pure ack for outstanding data. 
*/ @@ -667,6 +678,10 @@ findpcb: tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); tp->snd_una = ti->ti_ack; +#if defined(TCP_SACK) && defined(TCP_FACK) + tp->snd_fack = tp->snd_una; + tp->retran_data = 0; +#endif /* TCP_FACK */ m_freem(m); /* @@ -697,6 +712,11 @@ findpcb: * with nothing on the reassembly queue and * we have enough buffer space to take it. */ +#ifdef TCP_SACK + /* Clean receiver SACK report if present */ + if (!tp->sack_disable && tp->rcv_numsacks) + tcp_clean_sackreport(tp); +#endif /* TCP_SACK */ ++tcpstat.tcps_preddat; tp->rcv_nxt += ti->ti_len; tcpstat.tcps_rcvpack++; @@ -822,6 +842,17 @@ findpcb: if (optp) tcp_dooptions(tp, optp, optlen, ti, &ts_present, &ts_val, &ts_ecr); +#ifdef TCP_SACK + /* + * If peer did not send a SACK_PERMITTED option (i.e., if + * tcp_dooptions() did not set TF_SACK_PERMIT), set + * sack_disable to 1 if it is currently 0. + */ + if (!tp->sack_disable) + if ((tp->t_flags & TF_SACK_PERMIT) == 0) + tp->sack_disable = 1; +#endif + if (iss) tp->iss = iss; else @@ -833,6 +864,14 @@ findpcb: #endif /* !TCP_COMPAT_42 */ tp->irs = ti->ti_seq; tcp_sendseqinit(tp); +#if defined (TCP_SACK) || defined (TCP_NEWRENO) + tp->snd_last = tp->snd_una; +#endif /* TCP_SACK || TCP_NEWRENO */ +#if defined(TCP_SACK) && defined(TCP_FACK) + tp->snd_fack = tp->snd_una; + tp->retran_data = 0; + tp->snd_awnd = 0; +#endif /* TCP_FACK */ tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; tp->t_state = TCPS_SYN_RECEIVED; @@ -893,6 +932,16 @@ findpcb: tp->irs = ti->ti_seq; tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; +#ifdef TCP_SACK + /* + * If we've sent a SACK_PERMITTED option, and the peer + * also replied with one, then TF_SACK_PERMIT should have + * been set in tcp_dooptions(). If it was not, disable SACKs. + */ + if (!tp->sack_disable) + if ((tp->t_flags & TF_SACK_PERMIT) == 0) + tp->sack_disable = 1; +#endif if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { tcpstat.tcps_connects++; soisconnected(so); @@ -911,6 +960,15 @@ findpcb: */ if (tp->t_rtt) tcp_xmit_timer(tp, tp->t_rtt); + /* + * Since new data was acked (the SYN), open the + * congestion window by one MSS. We do this + * here, because we won't go through the normal + * ACK processing below. And since this is the + * start of the connection, we know we are in + * the exponential phase of slow-start. + */ + tp->snd_cwnd += tp->t_maxseg; } else tp->t_state = TCPS_SYN_RECEIVED; @@ -1169,7 +1227,31 @@ trimthenstep6: case TCPS_TIME_WAIT: if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { - if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { + /* + * Duplicate/old ACK processing. + * Increments t_dupacks: + * Pure duplicate (same seq/ack/window, no data) + * Doesn't affect t_dupacks: + * Data packets. + * Normal window updates (window opens) + * Resets t_dupacks: + * New data ACKed. + * Window shrinks + * Old ACK + */ + if (ti->ti_len) + break; + /* + * If we get an old ACK, there is probably packet + * reordering going on. Be conservative and reset + * t_dupacks so that we are less agressive in + * doing a fast retransmit. + */ + if (ti->ti_ack != tp->snd_una) { + tp->t_dupacks = 0; + break; + } + if (tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than @@ -1195,45 +1277,186 @@ trimthenstep6: * to keep a constant cwnd packets in the * network. */ - if (tp->t_timer[TCPT_REXMT] == 0 || - ti->ti_ack != tp->snd_una) + if (tp->t_timer[TCPT_REXMT] == 0) tp->t_dupacks = 0; +#if defined(TCP_SACK) && defined(TCP_FACK) + /* + * In FACK, can enter fast rec. 
if the receiver + * reports a reass. queue longer than 3 segs. + */ + else if (++tp->t_dupacks == tcprexmtthresh || + ((SEQ_GT(tp->snd_fack, tcprexmtthresh * + tp->t_maxseg + tp->snd_una)) && + SEQ_GT(tp->snd_una, tp->snd_last))) { +#else else if (++tp->t_dupacks == tcprexmtthresh) { +#endif /* TCP_FACK */ tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; +#if defined(TCP_SACK) || defined(TCP_NEWRENO) + if (SEQ_LT(ti->ti_ack, tp->snd_last)){ + /* + * False fast retx after + * timeout. Do not cut window. + */ + tp->snd_cwnd += tp->t_maxseg; + tp->t_dupacks = 0; + (void) tcp_output(tp); + goto drop; + } +#endif if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; +#if defined(TCP_SACK) || defined(TCP_NEWRENO) + tp->snd_last = tp->snd_max; +#endif +#ifdef TCP_SACK + if (!tp->sack_disable) { + tp->t_timer[TCPT_REXMT] = 0; + tp->t_rtt = 0; + tcpstat.tcps_sndrexmitfast++; +#if defined(TCP_SACK) && defined(TCP_FACK) + (void) tcp_output(tp); + /* + * During FR, snd_cwnd is held + * constant for FACK. + */ + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = tcprexmtthresh; +#else + /* + * tcp_output() will send + * oldest SACK-eligible rtx. + */ + (void) tcp_output(tp); + tp->snd_cwnd = tp->snd_ssthresh+ + tp->t_maxseg * tp->t_dupacks; +#endif /* TCP_FACK */ + /* + * It is possible for + * tcp_output to fail to send + * a segment. If so, make + * sure that REMXT timer is set. + */ + if (SEQ_GT(tp->snd_max, + tp->snd_una) && + tp->t_timer[TCPT_REXMT] == 0 && + tp->t_timer[TCPT_PERSIST] == 0) + tp->t_timer[TCPT_REXMT] = + tp->t_rxtcur; + goto drop; + } +#endif /* TCP_SACK */ tp->t_timer[TCPT_REXMT] = 0; tp->t_rtt = 0; tp->snd_nxt = ti->ti_ack; tp->snd_cwnd = tp->t_maxseg; + tcpstat.tcps_sndrexmitfast++; (void) tcp_output(tp); + tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tp->t_dupacks > tcprexmtthresh) { +#if defined(TCP_SACK) && defined(TCP_FACK) + /* + * while (awnd < cwnd) + * sendsomething(); + */ + if (!tp->sack_disable) { + if (tp->snd_awnd < tp->snd_cwnd) + tcp_output(tp); + goto drop; + } +#endif /* TCP_FACK */ tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } - } else + } else if (tiwin < tp->snd_wnd) { + /* + * The window was retracted! Previous dup + * ACKs may have been due to packets arriving + * after the shrunken window, not a missing + * packet, so play it safe and reset t_dupacks + */ tp->t_dupacks = 0; + } break; } /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ +#ifdef TCP_NEWRENO + if (tp->t_dupacks >= tcprexmtthresh && !tcp_newreno(tp, ti)) { + /* Out of fast recovery */ + tp->snd_cwnd = tp->snd_ssthresh; + /* + * Window inflation should have left us with approx. + * snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do + * it via the slow start mechanism. 
+ */ + if (tcp_seq_subtract(tp->snd_max, ti->ti_ack) < + tp->snd_ssthresh) + tp->snd_cwnd = tcp_seq_subtract(tp->snd_max, + ti->ti_ack) + tp->t_maxseg; + tp->t_dupacks = 0; + } +#elif defined(TCP_SACK) + if (!tp->sack_disable) { + if (tp->t_dupacks >= tcprexmtthresh) { + /* Check for a partial ACK */ + if (tcp_sack_partialack(tp, ti)) { +#if defined(TCP_SACK) && defined(TCP_FACK) + /* Force call to tcp_output */ + if (tp->snd_awnd < tp->snd_cwnd) + needoutput = 1; +#else + tp->snd_cwnd += tp->t_maxseg; + needoutput = 1; +#endif /* TCP_FACK */ + } else { + /* Out of fast recovery */ + tp->snd_cwnd = tp->snd_ssthresh; + if (tcp_seq_subtract(tp->snd_max, + ti->ti_ack) < tp->snd_ssthresh) + tp->snd_cwnd = + tcp_seq_subtract(tp->snd_max, + ti->ti_ack) + tp->t_maxseg; + tp->t_dupacks = 0; +#if defined(TCP_SACK) && defined(TCP_FACK) + if (SEQ_GT(ti->ti_ack, tp->snd_fack)) + tp->snd_fack = ti->ti_ack; +#endif /* TCP_FACK */ + } + } + } else { + if (tp->t_dupacks >= tcprexmtthresh && + !tcp_newreno(tp, ti)) { + /* Out of fast recovery */ + tp->snd_cwnd = tp->snd_ssthresh; + if (tcp_seq_subtract(tp->snd_max, ti->ti_ack) < + tp->snd_ssthresh) + tp->snd_cwnd = + tcp_seq_subtract(tp->snd_max, + ti->ti_ack) + tp->t_maxseg; + tp->t_dupacks = 0; + } + } +#else /* else neither TCP_NEWRENO nor TCP_SACK */ if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; +#endif if (SEQ_GT(ti->ti_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; @@ -1272,9 +1495,7 @@ trimthenstep6: * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window - * (maxseg^2 / cwnd per packet), plus a constant - * fraction of a packet (maxseg/8) to help larger windows - * open quickly enough. + * (maxseg^2 / cwnd per packet). */ { register u_int cw = tp->snd_cwnd; @@ -1282,6 +1503,9 @@ trimthenstep6: if (cw > tp->snd_ssthresh) incr = incr * incr / cw; +#if defined (TCP_NEWRENO) || defined (TCP_SACK) + if (SEQ_GEQ(ti->ti_ack, tp->snd_last)) +#endif tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale); } if (acked > so->so_snd.sb_cc) { @@ -1298,6 +1522,10 @@ trimthenstep6: tp->snd_una = ti->ti_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; +#if defined (TCP_SACK) && defined (TCP_FACK) + if (SEQ_GT(tp->snd_una, tp->snd_fack)) + tp->snd_fack = tp->snd_una; +#endif switch (tp->t_state) { @@ -1454,6 +1682,10 @@ dodata: /* XXX */ if ((ti->ti_len || (tiflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { TCP_REASS(tp, ti, m, so, tiflags); +#ifdef TCP_SACK + if (!tp->sack_disable) + tcp_update_sack_list(tp); +#endif /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's @@ -1519,8 +1751,20 @@ dodata: /* XXX */ /* * Return any desired output. */ - if (needoutput || (tp->t_flags & TF_ACKNOW)) + if (needoutput || (tp->t_flags & TF_ACKNOW)) { (void) tcp_output(tp); +#ifdef TCP_SACK + /* + * In SACK, it is possible for tcp_output() to fail to send a segment + * after the retransmission timer has been turned off. Make sure that + * the retransmission timer is set if we are in fast recovery. 
+ */ + if (needoutput && SEQ_GT(tp->snd_max, tp->snd_una) && + tp->t_timer[TCPT_REXMT] == 0 && + tp->t_timer[TCPT_PERSIST] == 0) + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; +#endif + } return; dropafterack: @@ -1636,6 +1880,20 @@ tcp_dooptions(tp, cp, cnt, ti, ts_present, ts_val, ts_ecr) tp->ts_recent_age = tcp_now; } break; + +#ifdef TCP_SACK + case TCPOPT_SACK_PERMITTED: + if (tp->sack_disable || optlen!=TCPOLEN_SACK_PERMITTED) + continue; + if (ti->ti_flags & TH_SYN) + /* MUST only be set on SYN */ + tp->t_flags |= TF_SACK_PERMIT; + break; + case TCPOPT_SACK: + if (tcp_sack_option(tp, ti, cp, optlen)) + continue; + break; +#endif } } /* Update t_maxopd and t_maxseg after all options are processed */ @@ -1643,6 +1901,395 @@ tcp_dooptions(tp, cp, cnt, ti, ts_present, ts_val, ts_ecr) (void) tcp_mss(tp, mss); /* sets t_maxseg */ } +#if defined(TCP_SACK) || defined(TCP_NEWRENO) +u_long +tcp_seq_subtract(a, b) + u_long a, b; +{ + return ((long)(a - b)); +} +#endif + + +#ifdef TCP_SACK +/* + * This function is called upon receipt of new valid data (while not in header + * prediction mode), and it updates the ordered list of sacks. + */ +void +tcp_update_sack_list(tp) + struct tcpcb *tp; +{ + /* + * First reported block MUST be the most recent one. Subsequent + * blocks SHOULD be in the order in which they arrived at the + * receiver. These two conditions make the implementation fully + * compliant with RFC 2018. + */ + int i, j = 0, count = 0, lastpos = -1; + struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; + + /* First clean up current list of sacks */ + for (i = 0; i < tp->rcv_numsacks; i++) { + sack = tp->sackblks[i]; + if (sack.start == 0 && sack.end == 0) { + count++; /* count = number of blocks to be discarded */ + continue; + } + if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { + tp->sackblks[i].start = tp->sackblks[i].end = 0; + count++; + } else { + temp[j].start = tp->sackblks[i].start; + temp[j++].end = tp->sackblks[i].end; + } + } + tp->rcv_numsacks -= count; + if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ + tcp_clean_sackreport(tp); + if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) { + /* ==> need first sack block */ + tp->sackblks[0].start = tp->rcv_laststart; + tp->sackblks[0].end = tp->rcv_lastend; + tp->rcv_numsacks = 1; + } + return; + } + /* Otherwise, sack blocks are already present. */ + for (i = 0; i < tp->rcv_numsacks; i++) + tp->sackblks[i] = temp[i]; /* first copy back sack list */ + if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend)) + return; /* sack list remains unchanged */ + /* + * From here, segment just received should be (part of) the 1st sack. + * Go through list, possibly coalescing sack block entries. + */ + firstsack.start = tp->rcv_laststart; + firstsack.end = tp->rcv_lastend; + for (i = 0; i < tp->rcv_numsacks; i++) { + sack = tp->sackblks[i]; + if (SEQ_LT(sack.end, firstsack.start) || + SEQ_GT(sack.start, firstsack.end)) + continue; /* no overlap */ + if (sack.start == firstsack.start && sack.end == firstsack.end){ + /* + * identical block; delete it here since we will + * move it to the front of the list. 
+ */ + tp->sackblks[i].start = tp->sackblks[i].end = 0; + lastpos = i; /* last posn with a zero entry */ + continue; + } + if (SEQ_LEQ(sack.start, firstsack.start)) + firstsack.start = sack.start; /* merge blocks */ + if (SEQ_GEQ(sack.end, firstsack.end)) + firstsack.end = sack.end; /* merge blocks */ + tp->sackblks[i].start = tp->sackblks[i].end = 0; + lastpos = i; /* last posn with a zero entry */ + } + if (lastpos != -1) { /* at least one merge */ + for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { + sack = tp->sackblks[i]; + if (sack.start == 0 && sack.end == 0) + continue; + temp[j++] = sack; + } + tp->rcv_numsacks = j; /* including first blk (added later) */ + for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ + tp->sackblks[i] = temp[i]; + } else { /* no merges -- shift sacks by 1 */ + if (tp->rcv_numsacks < MAX_SACK_BLKS) + tp->rcv_numsacks++; + for (i = tp->rcv_numsacks-1; i > 0; i--) + tp->sackblks[i] = tp->sackblks[i-1]; + } + tp->sackblks[0] = firstsack; + return; +} + +/* + * Process the TCP SACK option. Returns 1 if tcp_dooptions() should continue, + * and 0 otherwise, if the option was fine. tp->snd_holes is an ordered list + * of holes (oldest to newest, in terms of the sequence space). + */ +int +tcp_sack_option(tp, ti, cp, optlen) + struct tcpcb *tp; + struct tcpiphdr *ti; + u_char *cp; + int optlen; +{ + int tmp_olen; + u_char *tmp_cp; + struct sackhole *cur, *p, *temp; + + if (tp->sack_disable) + return 1; + + /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ + if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) + return 1; + tmp_cp = cp + 2; + tmp_olen = optlen - 2; + if (tp->snd_numholes < 0) + tp->snd_numholes = 0; + if (tp->t_maxseg == 0) + panic("tcp_sack_option"); /* Should never happen */ + while (tmp_olen > 0) { + struct sackblk sack; + + bcopy((char *) tmp_cp, (char *) &(sack.start), sizeof(tcp_seq)); + NTOHL(sack.start); + bcopy((char *) tmp_cp + sizeof(tcp_seq), + (char *) &(sack.end), sizeof(tcp_seq)); + NTOHL(sack.end); + tmp_olen -= TCPOLEN_SACK; + tmp_cp += TCPOLEN_SACK; + if (SEQ_LEQ(sack.end, sack.start)) + continue; /* bad SACK fields */ + if (SEQ_LEQ(sack.end, tp->snd_una)) + continue; /* old block */ +#if defined(TCP_SACK) && defined(TCP_FACK) + /* Updates snd_fack. */ + if (SEQ_GEQ(sack.end, tp->snd_fack)) + tp->snd_fack = sack.end; +#endif /* TCP_FACK */ + if (tp->snd_holes == 0) { /* first hole */ + tp->snd_holes = (struct sackhole *) + malloc(sizeof(struct sackhole), M_PCB, M_NOWAIT); + cur = tp->snd_holes; + cur->start = ti->ti_ack; + cur->end = sack.start; + cur->rxmit = cur->start; + cur->next = 0; + tp->snd_numholes = 1; + tp->rcv_lastsack = sack.end; + /* + * dups is at least one. If more data has been + * SACKed, it can be greater than one. 
+ */ + cur->dups = min(tcprexmtthresh, + ((sack.end - cur->end)/tp->t_maxseg)); + if (cur->dups < 1) + cur->dups = 1; + continue; /* with next sack block */ + } + /* Go thru list of holes: p = previous, cur = current */ + p = cur = tp->snd_holes; + while (cur) { + if (SEQ_LEQ(sack.end, cur->start)) + /* SACKs data before the current hole */ + break; /* no use going through more holes */ + if (SEQ_GEQ(sack.start, cur->end)) { + /* SACKs data beyond the current hole */ + cur->dups++; + if ( ((sack.end - cur->end)/tp->t_maxseg) >= + tcprexmtthresh) + cur->dups = tcprexmtthresh; + p = cur; + cur = cur->next; + continue; + } + if (SEQ_LEQ(sack.start, cur->start)) { + /* Data acks at least the beginning of hole */ +#if defined(TCP_SACK) && defined(TCP_FACK) + if (SEQ_GT(sack.end, cur->rxmit)) + tp->retran_data -= + tcp_seq_subtract(cur->rxmit, + cur->start); + else + tp->retran_data -= + tcp_seq_subtract(sack.end, + cur->start); +#endif /* TCP_FACK */ + if (SEQ_GEQ(sack.end,cur->end)){ + /* Acks entire hole, so delete hole */ + if (p != cur) { + p->next = cur->next; + free(cur, M_PCB); + cur = p->next; + } else { + cur=cur->next; + free(p, M_PCB); + p = cur; + tp->snd_holes = p; + } + tp->snd_numholes--; + continue; + } + /* otherwise, move start of hole forward */ + cur->start = sack.end; + cur->rxmit = max (cur->rxmit, cur->start); + p = cur; + cur = cur->next; + continue; + } + /* move end of hole backward */ + if (SEQ_GEQ(sack.end, cur->end)) { +#if defined(TCP_SACK) && defined(TCP_FACK) + if (SEQ_GT(cur->rxmit, sack.start)) + tp->retran_data -= + tcp_seq_subtract(cur->rxmit, + sack.start); +#endif /* TCP_FACK */ + cur->end = sack.start; + cur->rxmit = min (cur->rxmit, cur->end); + cur->dups++; + if ( ((sack.end - cur->end)/tp->t_maxseg) >= + tcprexmtthresh) + cur->dups = tcprexmtthresh; + p = cur; + cur = cur->next; + continue; + } + if (SEQ_LT(cur->start, sack.start) && + SEQ_GT(cur->end, sack.end)) { + /* + * ACKs some data in middle of a hole; need to + * split current hole + */ +#if defined(TCP_SACK) && defined(TCP_FACK) + if (SEQ_GT(cur->rxmit, sack.end)) + tp->retran_data -= + tcp_seq_subtract(sack.end, + sack.start); + else if (SEQ_GT(cur->rxmit, sack.start)) + tp->retran_data -= + tcp_seq_subtract(cur->rxmit, + sack.start); +#endif /* TCP_FACK */ + temp = (struct sackhole *)malloc(sizeof(*temp), + M_PCB,M_NOWAIT); + temp->next = cur->next; + temp->start = sack.end; + temp->end = cur->end; + temp->dups = cur->dups; + temp->rxmit = max (cur->rxmit, temp->start); + cur->end = sack.start; + cur->rxmit = min (cur->rxmit, cur->end); + cur->dups++; + if ( ((sack.end - cur->end)/tp->t_maxseg) >= + tcprexmtthresh) + cur->dups = tcprexmtthresh; + cur->next = temp; + p = temp; + cur = p->next; + tp->snd_numholes++; + } + } + /* At this point, p points to the last hole on the list */ + if (SEQ_LT(tp->rcv_lastsack, sack.start)) { + /* + * Need to append new hole at end. + * Last hole is p (and it's not NULL). + */ + temp = (struct sackhole *) malloc(sizeof(*temp), + M_PCB, M_NOWAIT); + temp->start = tp->rcv_lastsack; + temp->end = sack.start; + temp->dups = min(tcprexmtthresh, + ((sack.end - sack.start)/tp->t_maxseg)); + if (temp->dups < 1) + temp->dups = 1; + temp->rxmit = temp->start; + temp->next = 0; + p->next = temp; + tp->rcv_lastsack = sack.end; + tp->snd_numholes++; + } + } +#if defined(TCP_SACK) && defined(TCP_FACK) + /* + * Update retran_data, snd_fack, and snd_awnd. Go through the list of + * holes. Increment retran_data by (hole->rxmit - hole->start). 
+ * snd_fack gets the highest value of hole->end. + */ + tp->retran_data = 0; + cur = tp->snd_holes; + while (cur) { + tp->retran_data += cur->rxmit - cur->start; + cur = cur->next; + } + tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + + tp->retran_data; +#endif /* TCP_FACK */ + + return 0; +} + +/* + * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if + * it is completely acked; otherwise, tcp_sack_option(), called from + * tcp_dooptions(), will fix up the hole. + */ +void +tcp_del_sackholes(tp, ti) + struct tcpcb *tp; + struct tcpiphdr *ti; +{ + if (!tp->sack_disable && tp->t_state != TCPS_LISTEN) { + /* max because this could be an older ack just arrived */ + tcp_seq lastack = max(ti->ti_ack, tp->snd_una); + struct sackhole *cur = tp->snd_holes; + struct sackhole *prev = cur; + while (cur) + if (SEQ_LEQ(cur->end, lastack)) { + cur = cur->next; + free(prev, M_PCB); + prev = cur; + tp->snd_numholes--; + } else if (SEQ_LT(cur->start, lastack)) { + cur->start = lastack; + break; + } else + break; + tp->snd_holes = cur; + } +} + +/* + * Delete all receiver-side SACK information. + */ +void +tcp_clean_sackreport(tp) + struct tcpcb *tp; +{ + int i; + + tp->rcv_numsacks = 0; + for (i = 0; i < MAX_SACK_BLKS; i++) + tp->sackblks[i].start = tp->sackblks[i].end=0; + +} + +/* + * Checks for partial ack. If partial ack arrives, turn off retransmission + * timer, deflate the window, do not clear tp->t_dupacks, and return 1. + * If the ack advances at least to tp->snd_last, return 0. + */ +int +tcp_sack_partialack(tp, ti) + struct tcpcb *tp; + struct tcpiphdr *ti; +{ + if (SEQ_LT(ti->ti_ack, tp->snd_last)) { + /* Turn off retx. timer (will start again next segment) */ + tp->t_timer[TCPT_REXMT] = 0; + tp->t_rtt = 0; +#ifndef TCP_FACK + /* + * Partial window deflation. This statement relies on the + * fact that tp->snd_una has not been updated yet. In FACK + * hold snd_cwnd constant during fast recovery. + */ + tp->snd_cwnd -= (ti->ti_ack - tp->snd_una - tp->t_maxseg); +#endif + return 1; + } + return 0; +} +#endif TCP_SACK + /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. @@ -1784,7 +2431,6 @@ tcp_mss(tp, offer) u_long bufsize; struct inpcb *inp; struct socket *so; - extern int tcp_mssdflt; inp = tp->t_inpcb; ro = &inp->inp_route; @@ -1919,3 +2565,40 @@ tcp_mss(tp, offer) return (mss); } #endif /* TUBA_INCLUDE */ + +#if defined(TCP_NEWRENO) || defined (TCP_SACK) +/* + * Checks for partial ack. If partial ack arrives, force the retransmission + * of the next unacknowledged segment, do not clear tp->t_dupacks, and return + * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to + * be started again. If the ack advances at least to tp->snd_last, return 0. + */ +int +tcp_newreno(tp, ti) +struct tcpcb *tp; +struct tcpiphdr *ti; +{ + if (SEQ_LT(ti->ti_ack, tp->snd_last)) { + tcp_seq onxt = tp->snd_nxt; + tcp_seq ouna = tp->snd_una; /* snd_una not yet updated */ + u_long ocwnd = tp->snd_cwnd; + tp->t_timer[TCPT_REXMT] = 0; + tp->t_rtt = 0; + tp->snd_nxt = ti->ti_ack; + tp->snd_cwnd = tp->t_maxseg; + tp->snd_una = ti->ti_ack; + (void) tcp_output(tp); + tp->snd_cwnd = ocwnd; + tp->snd_una = ouna; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + /* + * Partial window deflation. Relies on fact that tp->snd_una + * not updated yet. 
+ */ + tp->snd_cwnd -= (ti->ti_ack - tp->snd_una - tp->t_maxseg); + return 1; + } + return 0; +} +#endif /* TCP_NEWRENO || TCP_SACK */ diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index e73e39b6696..de8eed77369 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_output.c,v 1.11 1998/10/28 21:34:33 provos Exp $ */ +/* $OpenBSD: tcp_output.c,v 1.12 1998/11/17 19:23:02 provos Exp $ */ /* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */ /* @@ -70,8 +70,88 @@ extern struct mbuf *m_copypack(); #endif +#ifdef TCP_SACK +extern int tcprexmtthresh; +#endif +#ifdef TCP_SACK +#define MAX_TCPOPTLEN 40 /* need 40 at least for 3 SACKs + TIMESTAMP */ +#else #define MAX_TCPOPTLEN 32 /* max # bytes that go in options */ +#endif + +#ifdef TCP_SACK +#ifdef TCP_SACK_DEBUG +void +tcp_print_holes(tp) +struct tcpcb *tp; +{ + struct sackhole *p = tp->snd_holes; + if (p == 0) + return; + printf("Hole report: start--end dups rxmit\n"); + while (p) { + printf("%x--%x d %d r %x\n", p->start, p->end, p->dups, + p->rxmit); + p = p->next; + } + printf("\n"); +} +#endif /* TCP_SACK_DEBUG */ + +/* + * Returns pointer to a sackhole if there are any pending retransmissions; + * NULL otherwise. + */ +struct sackhole * +tcp_sack_output(tp) +register struct tcpcb *tp; +{ + struct sackhole *p; + if (tp->sack_disable) + return 0; + p = tp->snd_holes; + while (p) { + if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) { + if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ + p = p->next; + continue; + } +#ifdef TCP_SACK_DEBUG + if (p) + tcp_print_holes(tp); +#endif + return p; + } + p = p->next; + } + return 0; +} + +/* + * After a timeout, the SACK list may be rebuilt. This SACK information + * should be used to avoid retransmitting SACKed data. This function + * traverses the SACK list to see if snd_nxt should be moved forward. + */ +void +tcp_sack_adjust(tp) + struct tcpcb *tp; +{ + int i; + + for (i = 0; i < tp->rcv_numsacks; i++) { + if (SEQ_LT(tp->snd_nxt, tp->sackblks[i].start)) + break; + if (SEQ_LEQ(tp->sackblks[i].end, tp->snd_nxt)) + continue; + if (tp->sackblks[i].start == 0 && tp->sackblks[i].end == 0) + continue; + /* snd_nxt must be in middle of block of SACKed data */ + tp->snd_nxt = tp->sackblks[i].end; + break; + } +} +#endif /* TCP_SACK */ /* * Tcp output routine: figure out what should be sent and send it. @@ -88,6 +168,13 @@ tcp_output(tp) u_char opt[MAX_TCPOPTLEN]; unsigned int optlen, hdrlen; int idle, sendalot; +#ifdef TCP_SACK + int i, sack_rxmit = 0; + struct sackhole *p; +#endif +#if defined(TCP_SACK) || defined(TCP_NEWRENO) + int maxburst = TCP_MAXBURST; +#endif /* * Determine length of data that should be transmitted, @@ -105,6 +192,15 @@ tcp_output(tp) tp->snd_cwnd = tp->t_maxseg; again: sendalot = 0; +#ifdef TCP_SACK + /* + * If we've recently taken a timeout, snd_max will be greater than + * snd_nxt. There may be SACK information that allows us to avoid + * resending already delivered data. Adjust snd_nxt accordingly. + */ + if (!tp->sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max)) + tcp_sack_adjust(tp); +#endif off = tp->snd_nxt - tp->snd_una; win = min(tp->snd_wnd, tp->snd_cwnd); @@ -115,6 +211,32 @@ again: * and timer expired, we will send what we can * and go to transmit state. */ + +#ifdef TCP_SACK + /* + * Send any SACK-generated retransmissions. If we're explicitly trying + * to send out new data (when sendalot is 1), bypass this function. 
+ * If we retransmit in fast recovery mode, decrement snd_cwnd, since + * we're replacing a (future) new transmission with a retransmission + * now, and we previously incremented snd_cwnd in tcp_input(). + */ + if (!tp->sack_disable && !sendalot) { + if ((p = tcp_sack_output(tp))) { + off = p->rxmit - tp->snd_una; + sack_rxmit = 1; +#if 0 + /* Coalesce holes into a single retransmission */ +#endif + len = min(tp->t_maxseg, p->end - p->rxmit); +#ifndef TCP_FACK + /* in FACK, hold snd_cwnd constant during recovery */ + if (SEQ_LT(tp->snd_una, tp->snd_last)) + tp->snd_cwnd -= tp->t_maxseg; +#endif + } + } +#endif /* TCP_SACK */ + if (tp->t_force) { if (win == 0) { /* @@ -142,11 +264,24 @@ again: } } - if (win < so->so_snd.sb_cc) { - len = win - off; - flags &= ~TH_FIN; - } else - len = so->so_snd.sb_cc - off; +#ifdef TCP_SACK + if (!sack_rxmit) { +#endif + len = min(so->so_snd.sb_cc, win) - off; + +#if defined(TCP_SACK) && defined(TCP_FACK) + /* + * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), and + * amount of outstanding data (snd_awnd) is >= snd_cwnd, then + * do not send data (like zero window conditions) + */ + if (!tp->sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) && + (tp->snd_awnd >= tp->snd_cwnd)) + len = 0; +#endif /* TCP_FACK */ +#ifdef TCP_SACK + } +#endif if (len < 0) { /* @@ -154,25 +289,23 @@ again: * but we haven't been called to retransmit, * len will be -1. Otherwise, window shrank * after we sent into it. If window shrank to 0, - * calcel pending retransmit, pull snd_nxt back - * to (closed) window, and set the persist timer - * if it isn't already running. If the window - * didn't close completely, just wait for an ACK. + * cancel pending retransmit and pull snd_nxt + * back to (closed) window. We will enter persist + * state below. If the window didn't close completely, + * just wait for an ACK. */ len = 0; if (win == 0) { tp->t_timer[TCPT_REXMT] = 0; - tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; - if (tp->t_timer[TCPT_PERSIST] == 0) - tcp_setpersist(tp); } } if (len > tp->t_maxseg) { len = tp->t_maxseg; - flags &= ~TH_FIN; sendalot = 1; } + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) + flags &= ~TH_FIN; win = sbspace(&so->so_rcv); @@ -198,6 +331,10 @@ again: goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) goto send; +#ifdef TCP_SACK + if (sack_rxmit) + goto send; +#endif } /* @@ -294,6 +431,20 @@ send: mss = htons((u_int16_t) tcp_mss(tp, 0)); bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss)); optlen = 4; +#ifdef TCP_SACK + /* + * If this is the first SYN of connection (not a SYN + * ACK), include SACK_PERMIT_HDR option. If this is a + * SYN ACK, include SACK_PERMIT_HDR option if peer has + * already done so. + */ + if (!tp->sack_disable && ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_SACK_PERMIT))) { + *((u_long *) (opt + optlen)) = + htonl(TCPOPT_SACK_PERMIT_HDR); + optlen += 4; + } +#endif if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || @@ -326,6 +477,34 @@ send: optlen += TCPOLEN_TSTAMP_APPA; } +#ifdef TCP_SACK + /* + * Send SACKs if necessary. This should be the last option processed. + * Only as many SACKs are sent as are permitted by the maximum options + * size. No more than three SACKs are sent. 
+ */ + if (!tp->sack_disable && tp->t_state == TCPS_ESTABLISHED && + (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT && + tp->rcv_numsacks) { + u_long *lp = (u_long *) (opt + optlen); + u_long *olp = lp++; + int count = 0; /* actual number of SACKs inserted */ + int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK; + + maxsack = min(maxsack, TCP_MAX_SACK); + for (i=0; (i < tp->rcv_numsacks && count < maxsack); i++) { + struct sackblk sack = tp->sackblks[i]; + if (sack.start == 0 && sack.end == 0) + continue; + *lp++ = htonl(sack.start); + *lp++ = htonl(sack.end); + count++; + } + *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2)); + optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */ + } +#endif /* TCP_SACK */ + hdrlen += optlen; /* @@ -447,6 +626,23 @@ send: ti->ti_seq = htonl(tp->snd_nxt); else ti->ti_seq = htonl(tp->snd_max); +#ifdef TCP_SACK + if (sack_rxmit) { + /* + * If sendalot was turned on (due to option stuffing), turn it + * off. Properly set th_seq field. Advance the ret'x pointer + * by len. + */ + if (sendalot) + sendalot = 0; + ti->ti_seq = htonl(p->rxmit); + p->rxmit += len; +#if defined(TCP_SACK) && defined(TCP_FACK) + tp->retran_data += len; +#endif /* TCP_FACK */ + } +#endif /* TCP_SACK */ + ti->ti_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy((caddr_t)opt, (caddr_t)(ti + 1), optlen); @@ -508,6 +704,13 @@ send: tp->t_flags |= TF_SENTFIN; } } +#ifdef TCP_SACK + if (!tp->sack_disable) { + if (sack_rxmit && (p->rxmit != tp->snd_nxt)) { + goto timer; + } + } +#endif tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; @@ -530,6 +733,19 @@ send: * Initialize shift counter which is used for backoff * of retransmit time. */ +#ifdef TCP_SACK + timer: + if (!tp->sack_disable && sack_rxmit && + tp->t_timer[TCPT_REXMT] == 0 && + tp->snd_nxt != tp->snd_max) { + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + if (tp->t_timer[TCPT_PERSIST]) { + tp->t_timer[TCPT_PERSIST] = 0; + tp->t_rxtshift = 0; + } + } +#endif + if (tp->t_timer[TCPT_REXMT] == 0 && tp->snd_nxt != tp->snd_una) { tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; @@ -571,6 +787,11 @@ send: error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route, so->so_options & SO_DONTROUTE); #endif +#if defined(TCP_SACK) && defined(TCP_FACK) + /* Update snd_awnd to reflect the new data that was sent. */ + tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) + + tp->retran_data; +#endif } if (error) { out: @@ -597,7 +818,11 @@ out: tp->rcv_adv = tp->rcv_nxt + win; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); +#if defined(TCP_SACK) || defined(TCP_NEWRENO) + if (sendalot && --maxburst) +#else if (sendalot) +#endif goto again; return (0); } diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 293d769f5d0..255f94e8519 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_subr.c,v 1.11 1998/10/28 21:34:33 provos Exp $ */ +/* $OpenBSD: tcp_subr.c,v 1.12 1998/11/17 19:23:02 provos Exp $ */ /* $NetBSD: tcp_subr.c,v 1.22 1996/02/13 23:44:00 christos Exp $ */ /* @@ -77,10 +77,19 @@ int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; * used as the default). 
*/ #ifndef TCP_DO_RFC1323 -#define TCP_DO_RFC1323 1 +#define TCP_DO_RFC1323 1 #endif int tcp_do_rfc1323 = TCP_DO_RFC1323; +#ifndef TCP_DO_SACK +#ifdef TCP_SACK +#define TCP_DO_SACK 1 +#else +#define TCP_DO_SACK 0 +#endif +#endif +int tcp_do_sack = TCP_DO_SACK; /* RFC 2018 selective ACKs */ + #ifndef TCBHASHSIZE #define TCBHASHSIZE 128 #endif @@ -237,6 +246,9 @@ tcp_newtcpcb(inp) LIST_INIT(&tp->segq); tp->t_maxseg = tp->t_maxopd = tcp_mssdflt; +#ifdef TCP_SACK + tp->sack_disable = tcp_do_sack ? 0 : 1; +#endif tp->t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; tp->t_inpcb = inp; /* @@ -293,6 +305,9 @@ tcp_close(tp) register struct ipqent *qe; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; +#ifdef TCP_SACK + struct sackhole *p, *q; +#endif #ifdef RTV_RTT register struct rtentry *rt; @@ -369,6 +384,15 @@ tcp_close(tp) m_freem(qe->ipqe_m); FREE(qe, M_IPQ); } +#ifdef TCP_SACK + /* Free SACK holes. */ + q = p = tp->snd_holes; + while (p != 0) { + q = p->next; + free(p, M_PCB); + p = q; + } +#endif if (tp->t_template) (void) m_free(dtom(tp->t_template)); free(tp, M_PCB); diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 357fa300db2..53f4ef0e130 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_timer.c,v 1.8 1997/08/26 20:02:34 deraadt Exp $ */ +/* $OpenBSD: tcp_timer.c,v 1.9 1998/11/17 19:23:02 provos Exp $ */ /* $NetBSD: tcp_timer.c,v 1.14 1996/02/13 23:44:09 christos Exp $ */ /* @@ -178,6 +178,26 @@ tcp_timers(tp, timer) int timer; { register int rexmt; +#ifdef TCP_SACK + struct sackhole *p, *q; + /* + * Free SACK holes for 2MSL and REXMT timers. + */ + if (timer == TCPT_2MSL || timer == TCPT_REXMT) { + q = p = tp->snd_holes; + while (p != 0) { + q = p->next; + free(p, M_PCB); + p = q; + } + tp->snd_holes = 0; +#if defined(TCP_SACK) && defined(TCP_FACK) + tp->snd_fack = tp->snd_una; + tp->retran_data = 0; + tp->snd_awnd = 0; +#endif /* TCP_FACK */ + } +#endif /* TCP_SACK */ switch (timer) { @@ -227,6 +247,9 @@ tcp_timers(tp, timer) tp->t_srtt = 0; } tp->snd_nxt = tp->snd_una; +#if defined (TCP_NEWRENO) || defined (TCP_SACK) + tp->snd_last = tp->snd_una; +#endif /* * If timing a segment in this window, stop the timer. 
*/ diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 880b21ec5e0..7ed5d330fc4 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_usrreq.c,v 1.28 1998/06/27 02:42:41 deraadt Exp $ */ +/* $OpenBSD: tcp_usrreq.c,v 1.29 1998/11/17 19:23:02 provos Exp $ */ /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ /* @@ -241,6 +241,14 @@ tcp_usrreq(so, req, m, nam, control) tcp_iss += arc4random() % (TCP_ISSINCR / 2) + 1; #endif /* !TCP_COMPAT_42 */ tcp_sendseqinit(tp); +#if defined(TCP_SACK) || defined(TCP_NEWRENO) + tp->snd_last = tp->snd_una; +#endif +#if defined(TCP_SACK) && defined(TCP_FACK) + tp->snd_fack = tp->snd_una; + tp->retran_data = 0; + tp->snd_awnd = 0; +#endif error = tcp_output(tp); break; @@ -435,6 +443,12 @@ tcp_ctloutput(op, so, level, optname, mp) error = EINVAL; break; +#ifdef TCP_SACK + case TCP_SACK_DISABLE: + i = *mtod(m, int *); + tp->sack_disable = i; + break; +#endif default: error = ENOPROTOOPT; break; @@ -454,6 +468,11 @@ tcp_ctloutput(op, so, level, optname, mp) case TCP_MAXSEG: *mtod(m, int *) = tp->t_maxseg; break; +#ifdef TCP_SACK + case TCP_SACK_DISABLE: + *mtod(m, int *) = tp->sack_disable; + break; +#endif default: error = ENOPROTOOPT; break; @@ -654,7 +673,14 @@ tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen) case TCPCTL_RFC1323: return (sysctl_int(oldp, oldlenp, newp, newlen, &tcp_do_rfc1323)); - +#ifdef TCP_SACK + case TCPCTL_SACK: + return (sysctl_int(oldp, oldlenp, newp, newlen, + &tcp_do_sack)); +#endif + case TCPCTL_MSSDFLT: + return (sysctl_int(oldp, oldlenp, newp, newlen, + &tcp_mssdflt)); case TCPCTL_KEEPINITTIME: return (sysctl_int(oldp, oldlenp, newp, newlen, &tcptv_keep_init)); diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 3dd17d12a51..a8c59db5987 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_var.h,v 1.12 1998/10/28 21:34:33 provos Exp $ */ +/* $OpenBSD: tcp_var.h,v 1.13 1998/11/17 19:23:03 provos Exp $ */ /* $NetBSD: tcp_var.h,v 1.17 1996/02/13 23:44:24 christos Exp $ */ /* @@ -36,6 +36,23 @@ * @(#)tcp_var.h 8.3 (Berkeley) 4/10/94 */ +#ifdef TCP_SACK +struct sackblk +{ + tcp_seq start; /* start seq no. of sack block */ + tcp_seq end; /* end seq no. */ +}; + +struct sackhole +{ + tcp_seq start; /* start seq no. of hole */ + tcp_seq end; /* end seq no. */ + int dups; /* number of dup(s)acks for this hole */ + tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ + struct sackhole *next; /* next in list */ +}; +#endif + /* * Kernel variables for tcp. */ @@ -78,11 +95,33 @@ struct tcpcb { tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq iss; /* initial send sequence number */ u_long snd_wnd; /* send window */ +#ifdef TCP_SACK + int sack_disable; /* disable SACK for this connection */ + int snd_numholes; /* number of holes seen by sender */ + struct sackhole *snd_holes; /* linked list of holes (sorted) */ +#if defined(TCP_SACK) && defined(TCP_FACK) + tcp_seq snd_fack; /* for FACK congestion control */ + u_long snd_awnd; /* snd_nxt - snd_fack + */ + /* retransmitted data */ + int retran_data; /* amount of outstanding retx. 
data */ +#endif /* TCP_FACK */ +#endif /* TCP_SACK */ +#if defined(TCP_SACK) || defined(TCP_NEWRENO) + tcp_seq snd_last; /* for use in fast recovery */ +#endif /* receive sequence variables */ u_long rcv_wnd; /* receive window */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_up; /* receive urgent pointer */ tcp_seq irs; /* initial receive sequence number */ +#ifdef TCP_SACK + tcp_seq rcv_laststart; /* start of last segment recd. */ + tcp_seq rcv_lastend; /* end of ... */ + tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/ + int rcv_numsacks; /* # distinct sack blks present */ + struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ +#endif + /* * Additional variables for this implementation. */ @@ -193,6 +232,7 @@ struct tcpstat { u_quad_t tcps_sndbyte; /* data bytes sent */ u_long tcps_sndrexmitpack; /* data packets retransmitted */ u_quad_t tcps_sndrexmitbyte; /* data bytes retransmitted */ + u_quad_t tcps_sndrexmitfast; /* Fast retransmits */ u_long tcps_sndacks; /* ack-only packets sent */ u_long tcps_sndprobe; /* window probes sent */ u_long tcps_sndurg; /* packets sent with URG only */ @@ -243,7 +283,9 @@ struct tcpstat { #define TCPCTL_RECVSPACE 7 /* receive buffer space */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_IDENT 9 /* get connection owner */ -#define TCPCTL_MAXID 10 +#define TCPCTL_SACK 10 /* selective acknowledgement, rfc 2018 */ +#define TCPCTL_MSSDFLT 11 /* Default maximum segment size */ +#define TCPCTL_MAXID 12 #define TCPCTL_NAMES { \ { 0, 0 }, \ @@ -256,6 +298,8 @@ struct tcpstat { { "recvspace", CTLTYPE_INT }, \ { "sendspace", CTLTYPE_INT }, \ { "ident", CTLTYPE_STRUCT }, \ + { "sack", CTLTYPE_INT }, \ + { "mssdflt", CTLTYPE_INT }, \ } struct tcp_ident_mapping { @@ -268,6 +312,10 @@ struct inpcbtable tcbtable; /* head of queue of active tcpcb's */ struct tcpstat tcpstat; /* tcp statistics */ u_int32_t tcp_now; /* for RFC 1323 timestamps */ extern int tcp_do_rfc1323; /* enabled/disabled? */ +extern int tcp_mssdflt; /* default maximum segment size */ +#ifdef TCP_SACK +extern int tcp_do_sack; /* SACK enabled/disabled */ +#endif int tcp_attach __P((struct socket *)); void tcp_canceltimers __P((struct tcpcb *)); @@ -310,4 +358,21 @@ int tcp_usrreq __P((struct socket *, int, struct mbuf *, struct mbuf *, struct mbuf *)); void tcp_xmit_timer __P((struct tcpcb *, int)); void tcpdropoldhalfopen __P((struct tcpcb *, u_int16_t)); +#ifdef TCP_SACK +int tcp_sack_option __P((struct tcpcb *,struct tcpiphdr *,u_char *,int)); +void tcp_update_sack_list __P((struct tcpcb *tp)); +void tcp_del_sackholes __P((struct tcpcb *, struct tcpiphdr *)); +void tcp_clean_sackreport __P((struct tcpcb *tp)); +void tcp_sack_adjust __P((struct tcpcb *tp)); +struct sackhole * tcp_sack_output __P((struct tcpcb *tp)); +int tcp_sack_partialack __P((struct tcpcb *, struct tcpiphdr *)); +#ifdef DEBUG +void tcp_print_holes __P((struct tcpcb *tp)); #endif +#endif /* TCP_SACK */ +#if defined(TCP_NEWRENO) || defined(TCP_SACK) +int tcp_newreno __P((struct tcpcb *, struct tcpiphdr *)); +u_long tcp_seq_subtract __P((u_long, u_long )); +#endif /* TCP_NEWRENO || TCP_SACK */ + +#endif /* KERNEL */ |
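
The tcp_usrreq.c and tcp_var.h hunks above also wire a new TCPCTL_SACK id (plus a "sack" entry in TCPCTL_NAMES) into tcp_sysctl(), backed by the global tcp_do_sack. The sketch below is only an illustration, assuming the usual net.inet.tcp mib layout of {CTL_NET, PF_INET, IPPROTO_TCP, ...}; it reads the system-wide knob from userland.

```c
#include <sys/param.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <err.h>
#include <stdio.h>

/* Sysctl id added to tcp_var.h by this commit; avoids kernel-only headers. */
#ifndef TCPCTL_SACK
#define TCPCTL_SACK 10
#endif

int
main(void)
{
	int mib[4] = { CTL_NET, PF_INET, IPPROTO_TCP, TCPCTL_SACK };
	int sack;
	size_t len = sizeof(sack);

	/* Reads tcp_do_sack, which should surface as net.inet.tcp.sack. */
	if (sysctl(mib, 4, &sack, &len, NULL, 0) == -1)
		err(1, "sysctl");
	printf("SACK is %s by default\n", sack ? "enabled" : "disabled");
	return (0);
}
```

Much of the new sender-side logic (tcp_seq_subtract(), the SEQ_LT/SEQ_GEQ checks over the sackhole list) relies on 32-bit modular sequence arithmetic. The small standalone sketch below, borrowing the same cast trick as tcp_seq_subtract() in the diff, illustrates why these comparisons stay correct across sequence-number wraparound; the variable names are reused here purely for the example.

```c
#include <stdio.h>
#include <stdint.h>

typedef uint32_t tcp_seq;	/* tcp_seq is a 32-bit unsigned value */

/* Same idea as tcp_seq_subtract() in tcp_input.c: modular difference. */
static unsigned long
seq_sub(tcp_seq a, tcp_seq b)
{
	return ((long)(a - b));
}

/* SEQ_LT-style comparison: signed view of the modular difference. */
#define SEQ_LT(a, b)	((int)((a) - (b)) < 0)

int
main(void)
{
	tcp_seq snd_una = 0xfffffff0U;	/* just below the wrap point */
	tcp_seq snd_max = 0x00000010U;	/* 0x20 bytes later, after the wrap */

	/* Prints 32 outstanding bytes and "1" despite the numeric wrap. */
	printf("outstanding = %lu bytes\n", seq_sub(snd_max, snd_una));
	printf("snd_una < snd_max: %d\n", SEQ_LT(snd_una, snd_max));
	return (0);
}
```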