diff options
author | Claudio Jeker <claudio@cvs.openbsd.org> | 2010-09-24 02:59:47 +0000 |
---|---|---|
committer | Claudio Jeker <claudio@cvs.openbsd.org> | 2010-09-24 02:59:47 +0000 |
commit | 31e2f59e8c8a6fa052489381e45a195e88ce638c (patch) | |
tree | 986bd8c580c587575ac30f491bb3ca9e441f0d75 /sys/netinet | |
parent | 1e5d893cfbeb506fb1db4168cecd4a2484185f27 (diff) |
TCP send and recv buffer scaling.
Send buffer is scaled by not accounting unacknowledged on the wire
data against the buffer limit. Receive buffer scaling is done similar
to FreeBSD -- measure the delay * bandwidth product and base the
buffer on that. The problem is that our RTT measurement is coarse
so it overshoots on low delay links. This does not matter that much
since the recvbuffer is almost always empty.
Add a back pressure mechanism to control the amount of memory
assigned to socketbuffers that kicks in when 80% of the cluster
pool is used.
Increases the download speed from 300kB/s to 4.4MB/s on ftp.eu.openbsd.org.
Based on work by markus@ and djm@.
OK dlg@, henning@, put it in deraadt@
Diffstat (limited to 'sys/netinet')
-rw-r--r-- | sys/netinet/tcp_input.c | 42 | ||||
-rw-r--r-- | sys/netinet/tcp_output.c | 9 | ||||
-rw-r--r-- | sys/netinet/tcp_usrreq.c | 74 | ||||
-rw-r--r-- | sys/netinet/tcp_var.h | 17 |
4 files changed, 130 insertions, 12 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index d923a3c6713..664687ee821 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_input.c,v 1.235 2010/07/20 15:36:03 matthew Exp $ */ +/* $OpenBSD: tcp_input.c,v 1.236 2010/09/24 02:59:45 claudio Exp $ */ /* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */ /* @@ -1091,6 +1091,7 @@ after_listen: else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); + tcp_update_sndspace(tp); if (sb_notify(&so->so_snd)) sowwakeup(so); if (so->so_snd.sb_cc) @@ -1122,6 +1123,16 @@ after_listen: if (so->so_state & SS_CANTRCVMORE) m_freem(m); else { + if (opti.ts_present && opti.ts_ecr) { + if (tp->rfbuf_ts < opti.ts_ecr && + opti.ts_ecr - tp->rfbuf_ts < hz) { + tcp_update_rcvspace(tp); + /* Start over with next RTT. */ + tp->rfbuf_cnt = 0; + tp->rfbuf_ts = 0; + } else + tp->rfbuf_cnt += tlen; + } m_adj(m, iphlen + off); sbappendstream(&so->so_rcv, m); } @@ -1152,6 +1163,10 @@ after_listen: tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } + /* Reset receive buffer auto scaling when not in bulk receive mode. */ + tp->rfbuf_cnt = 0; + tp->rfbuf_ts = 0; + switch (tp->t_state) { /* @@ -1861,6 +1876,8 @@ trimthenstep6: tp->snd_wnd -= acked; ourfinisacked = 0; } + + tcp_update_sndspace(tp); if (sb_notify(&so->so_snd)) sowwakeup(so); @@ -4063,9 +4080,28 @@ syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, (TF_RCVD_SCALE|TF_REQ_SCALE)) { sc->sc_requested_s_scale = tb.requested_s_scale; sc->sc_request_r_scale = 0; + /* + * Pick the smallest possible scaling factor that + * will still allow us to scale up to sb_max. + * + * We do this because there are broken firewalls that + * will corrupt the window scale option, leading to + * the other endpoint believing that our advertised + * window is unscaled. 
At scale factors larger than + * 5 the unscaled window will drop below 1500 bytes, + * leading to serious problems when traversing these + * broken firewalls. + * + * With the default sbmax of 256K, a scale factor + * of 3 will be chosen by this algorithm. Those who + * choose a larger sbmax should watch out + * for the compatiblity problems mentioned above. + * + * RFC1323: The Window field in a SYN (i.e., a <SYN> + * or <SYN,ACK>) segment itself is never scaled. + */ while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && - TCP_MAXWIN << sc->sc_request_r_scale < - so->so_rcv.sb_hiwat) + (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) sc->sc_request_r_scale++; } else { sc->sc_requested_s_scale = 15; diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index d58421cf3a5..08d93835a4a 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_output.c,v 1.91 2010/09/08 08:34:42 claudio Exp $ */ +/* $OpenBSD: tcp_output.c,v 1.92 2010/09/24 02:59:45 claudio Exp $ */ /* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */ /* @@ -593,6 +593,11 @@ send: *lp++ = htonl(tcp_now + tp->ts_modulate); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; + + /* Set receive buffer autosizing timestamp. */ + if (tp->rfbuf_ts == 0) + tp->rfbuf_ts = tcp_now; + } #ifdef TCP_SIGNATURE @@ -1030,6 +1035,8 @@ send: if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; + tcp_update_sndspace(tp); + /* * Trace. 
*/ diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 6c09f5b896d..c3676bcceaa 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_usrreq.c,v 1.102 2010/06/07 13:08:43 claudio Exp $ */ +/* $OpenBSD: tcp_usrreq.c,v 1.103 2010/09/24 02:59:45 claudio Exp $ */ /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ /* @@ -119,6 +119,7 @@ u_int tcp_sendspace = TCP_SENDSPACE; #define TCP_RECVSPACE 1024*16 #endif u_int tcp_recvspace = TCP_RECVSPACE; +u_int tcp_autorcvbuf_inc = 16 * 1024; int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS; @@ -313,7 +314,7 @@ tcp_usrreq(so, req, m, nam, control, p) so->so_state |= SS_CONNECTOUT; /* Compute window scaling to request. */ - tcp_rscale(tp, so->so_rcv.sb_hiwat); + tcp_rscale(tp, sb_max); soisconnecting(so); tcpstat.tcps_connattempt++; @@ -652,10 +653,19 @@ tcp_attach(so) int error; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { - error = soreserve(so, tcp_sendspace, tcp_recvspace); + /* if low on memory only allow smaller then default buffers */ + if (so->so_snd.sb_wat == 0 || + sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace)) + so->so_snd.sb_wat = tcp_sendspace; + if (so->so_rcv.sb_wat == 0 || + sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) + so->so_rcv.sb_wat = tcp_recvspace; + + error = soreserve(so, so->so_snd.sb_wat, so->so_rcv.sb_wat); if (error) return (error); } + error = in_pcballoc(so, &tcbtable); if (error) return (error); @@ -950,3 +960,61 @@ tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen) } /* NOTREACHED */ } + +/* + * Scale the send buffer so that inflight data is not accounted against + * the limit. The buffer will scale with the congestion window, if the + * the receiver stops acking data the window will shrink and therefor + * the buffer size will shrink as well. 
+ * In low memory situation shrink + */ +void +tcp_update_sndspace(struct tcpcb *tp) +{ + struct socket *so = tp->t_inpcb->inp_socket; + int nmax; + + if (sbchecklowmem()) + /* low on memory try to get rid of some */ + nmax = tcp_sendspace; + else if (so->so_snd.sb_wat != tcp_sendspace) + /* user requested buffer size, auto-scaling disabled */ + nmax = so->so_snd.sb_wat; + else + /* automatic buffer scaling */ + nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - + tp->snd_una); + + /* round to MSS boundary */ + nmax = roundup(nmax, tp->t_maxseg); + + if (nmax != so->so_snd.sb_hiwat) + sbreserve(&so->so_snd, nmax); +} + +void +tcp_update_rcvspace(struct tcpcb *tp) +{ + struct socket *so = tp->t_inpcb->inp_socket; + int nmax = so->so_rcv.sb_hiwat; + + if (sbchecklowmem()) + /* low on memory try to get rid of some */ + nmax = tcp_recvspace; + else if (so->so_rcv.sb_wat != tcp_recvspace) + /* user requested buffer size, auto-scaling disabled */ + nmax = so->so_rcv.sb_wat; + else { + /* automatic buffer scaling */ + if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) + nmax = MIN(sb_max, so->so_rcv.sb_hiwat + + tcp_autorcvbuf_inc); + } + + if (nmax == so->so_rcv.sb_hiwat) + return; + + /* round to MSS boundary */ + nmax = roundup(nmax, tp->t_maxseg); + sbreserve(&so->so_rcv, nmax); +} diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 1558747c0fd..dcca8ad53a9 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_var.h,v 1.95 2010/07/09 16:58:06 reyk Exp $ */ +/* $OpenBSD: tcp_var.h,v 1.96 2010/09/24 02:59:46 claudio Exp $ */ /* $NetBSD: tcp_var.h,v 1.17 1996/02/13 23:44:24 christos Exp $ */ /* @@ -151,6 +151,11 @@ struct tcpcb { * for slow start exponential to * linear switch */ + +/* auto-sizing variables */ + u_int rfbuf_cnt; /* recv buffer autoscaling byte count */ + u_int32_t rfbuf_ts; /* recv buffer autoscaling time stamp */ + u_short t_maxopd; /* mss plus options */ u_short t_peermss; /* peer's maximum 
segment size */ @@ -476,8 +481,8 @@ struct tcpstat { { "keepintvl", CTLTYPE_INT }, \ { "slowhz", CTLTYPE_INT }, \ { "baddynamic", CTLTYPE_STRUCT }, \ - { "recvspace", CTLTYPE_INT }, \ - { "sendspace", CTLTYPE_INT }, \ + { NULL, 0 }, \ + { NULL, 0 }, \ { "ident", CTLTYPE_STRUCT }, \ { "sack", CTLTYPE_INT }, \ { "mssdflt", CTLTYPE_INT }, \ @@ -501,8 +506,8 @@ struct tcpstat { &tcp_keepintvl, \ NULL, \ NULL, \ - &tcp_recvspace, \ - &tcp_sendspace, \ + NULL, \ + NULL, \ NULL, \ NULL, \ &tcp_mssdflt, \ @@ -590,6 +595,8 @@ void tcp_rscale(struct tcpcb *, u_long); void tcp_respond(struct tcpcb *, caddr_t, struct tcphdr *, tcp_seq, tcp_seq, int, u_int); void tcp_setpersist(struct tcpcb *); +void tcp_update_sndspace(struct tcpcb *); +void tcp_update_rcvspace(struct tcpcb *); void tcp_slowtimo(void); struct mbuf * tcp_template(struct tcpcb *); |