diff options
-rw-r--r-- | sys/kern/uipc_socket.c | 8 | ||||
-rw-r--r-- | sys/kern/uipc_socket2.c | 39 | ||||
-rw-r--r-- | sys/netinet/tcp_input.c | 42 | ||||
-rw-r--r-- | sys/netinet/tcp_output.c | 9 | ||||
-rw-r--r-- | sys/netinet/tcp_usrreq.c | 74 | ||||
-rw-r--r-- | sys/netinet/tcp_var.h | 17 | ||||
-rw-r--r-- | sys/sys/socketvar.h | 5 |
7 files changed, 160 insertions, 34 deletions
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 5c2e7448bb0..53648922f37 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uipc_socket.c,v 1.83 2010/07/03 04:44:51 guenther Exp $ */ +/* $OpenBSD: uipc_socket.c,v 1.84 2010/09/24 02:59:45 claudio Exp $ */ /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */ /* @@ -1036,19 +1036,21 @@ sosetopt(struct socket *so, int level, int optname, struct mbuf *m0) switch (optname) { case SO_SNDBUF: - if (sbcheckreserve(cnt, so->so_snd.sb_hiwat) || + if (sbcheckreserve(cnt, so->so_snd.sb_wat) || sbreserve(&so->so_snd, cnt)) { error = ENOBUFS; goto bad; } + so->so_snd.sb_wat = cnt; break; case SO_RCVBUF: - if (sbcheckreserve(cnt, so->so_rcv.sb_hiwat) || + if (sbcheckreserve(cnt, so->so_rcv.sb_wat) || sbreserve(&so->so_rcv, cnt)) { error = ENOBUFS; goto bad; } + so->so_rcv.sb_wat = cnt; break; case SO_SNDLOWAT: diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c index 45c335f8236..c1071fd3e19 100644 --- a/sys/kern/uipc_socket2.c +++ b/sys/kern/uipc_socket2.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uipc_socket2.c,v 1.50 2009/11/09 17:53:39 nicm Exp $ */ +/* $OpenBSD: uipc_socket2.c,v 1.51 2010/09/24 02:59:45 claudio Exp $ */ /* $NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $ */ /* @@ -53,6 +53,7 @@ u_long sb_max = SB_MAX; /* patchable */ extern struct pool mclpools[]; +extern struct pool mbpool; /* * Procedures to manipulate state flags of socket @@ -147,8 +148,6 @@ sonewconn(struct socket *head, int connstatus) { struct socket *so; int soqueue = connstatus ? 1 : 0; - extern u_long unpst_sendspace, unpst_recvspace; - u_long snd_sb_hiwat, rcv_sb_hiwat; splsoftassert(IPL_SOFTNET); @@ -175,17 +174,11 @@ sonewconn(struct socket *head, int connstatus) so->so_sigeuid = head->so_sigeuid; /* - * If we are tight on mbuf clusters, create the new socket - * with the minimum. Sorry, you lose. 
+ * Inherit watermarks but those may get clamped in low mem situations. */ - snd_sb_hiwat = head->so_snd.sb_hiwat; - if (sbcheckreserve(snd_sb_hiwat, unpst_sendspace)) - snd_sb_hiwat = unpst_sendspace; /* and udp? */ - rcv_sb_hiwat = head->so_rcv.sb_hiwat; - if (sbcheckreserve(rcv_sb_hiwat, unpst_recvspace)) - rcv_sb_hiwat = unpst_recvspace; /* and udp? */ - - (void) soreserve(so, snd_sb_hiwat, rcv_sb_hiwat); + so->so_snd.sb_wat = head->so_snd.sb_wat; + so->so_rcv.sb_wat = head->so_rcv.sb_wat; + soqinsque(head, so, soqueue); if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH, NULL, NULL, NULL, curproc)) { @@ -392,18 +385,30 @@ sbreserve(struct sockbuf *sb, u_long cc) } /* - * If over 50% of mbuf clusters in use, do not accept any - * greater than normal request. + * In low memory situation, do not accept any greater than normal request. */ int sbcheckreserve(u_long cnt, u_long defcnt) { - if (cnt > defcnt && - mclpools[0].pr_nout> mclpools[0].pr_hardlimit / 2) + if (cnt > defcnt && sbchecklowmem()) return (ENOBUFS); return (0); } +int +sbchecklowmem(void) +{ + static int sblowmem; + + if (mclpools[0].pr_nout < mclpools[0].pr_hardlimit * 60 / 100 || + mbpool.pr_nout < mbpool.pr_hardlimit * 60 / 100) + sblowmem = 0; + if (mclpools[0].pr_nout > mclpools[0].pr_hardlimit * 80 / 100 || + mbpool.pr_nout > mbpool.pr_hardlimit * 80 / 100) + sblowmem = 1; + return (sblowmem); +} + /* * Free mbufs held by a socket, and reserved mbuf space. 
*/ diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index d923a3c6713..664687ee821 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_input.c,v 1.235 2010/07/20 15:36:03 matthew Exp $ */ +/* $OpenBSD: tcp_input.c,v 1.236 2010/09/24 02:59:45 claudio Exp $ */ /* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */ /* @@ -1091,6 +1091,7 @@ after_listen: else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); + tcp_update_sndspace(tp); if (sb_notify(&so->so_snd)) sowwakeup(so); if (so->so_snd.sb_cc) @@ -1122,6 +1123,16 @@ after_listen: if (so->so_state & SS_CANTRCVMORE) m_freem(m); else { + if (opti.ts_present && opti.ts_ecr) { + if (tp->rfbuf_ts < opti.ts_ecr && + opti.ts_ecr - tp->rfbuf_ts < hz) { + tcp_update_rcvspace(tp); + /* Start over with next RTT. */ + tp->rfbuf_cnt = 0; + tp->rfbuf_ts = 0; + } else + tp->rfbuf_cnt += tlen; + } m_adj(m, iphlen + off); sbappendstream(&so->so_rcv, m); } @@ -1152,6 +1163,10 @@ after_listen: tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } + /* Reset receive buffer auto scaling when not in bulk receive mode. */ + tp->rfbuf_cnt = 0; + tp->rfbuf_ts = 0; + switch (tp->t_state) { /* @@ -1861,6 +1876,8 @@ trimthenstep6: tp->snd_wnd -= acked; ourfinisacked = 0; } + + tcp_update_sndspace(tp); if (sb_notify(&so->so_snd)) sowwakeup(so); @@ -4063,9 +4080,28 @@ syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, (TF_RCVD_SCALE|TF_REQ_SCALE)) { sc->sc_requested_s_scale = tb.requested_s_scale; sc->sc_request_r_scale = 0; + /* + * Pick the smallest possible scaling factor that + * will still allow us to scale up to sb_max. + * + * We do this because there are broken firewalls that + * will corrupt the window scale option, leading to + * the other endpoint believing that our advertised + * window is unscaled. 
At scale factors larger than + * 5 the unscaled window will drop below 1500 bytes, + * leading to serious problems when traversing these + * broken firewalls. + * + * With the default sb_max of 256K, a scale factor + * of 3 will be chosen by this algorithm. Those who + * choose a larger sb_max should watch out + * for the compatibility problems mentioned above. + * + * RFC1323: The Window field in a SYN (i.e., a <SYN> + * or <SYN,ACK>) segment itself is never scaled. + */ while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && - TCP_MAXWIN << sc->sc_request_r_scale < - so->so_rcv.sb_hiwat) + (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) sc->sc_request_r_scale++; } else { sc->sc_requested_s_scale = 15; diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index d58421cf3a5..08d93835a4a 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_output.c,v 1.91 2010/09/08 08:34:42 claudio Exp $ */ +/* $OpenBSD: tcp_output.c,v 1.92 2010/09/24 02:59:45 claudio Exp $ */ /* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */ /* @@ -593,6 +593,11 @@ send: *lp++ = htonl(tcp_now + tp->ts_modulate); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; + + /* Set receive buffer autosizing timestamp. */ + if (tp->rfbuf_ts == 0) + tp->rfbuf_ts = tcp_now; + } #ifdef TCP_SIGNATURE @@ -1030,6 +1035,8 @@ send: if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; + tcp_update_sndspace(tp); + /* * Trace. 
*/ diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 6c09f5b896d..c3676bcceaa 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_usrreq.c,v 1.102 2010/06/07 13:08:43 claudio Exp $ */ +/* $OpenBSD: tcp_usrreq.c,v 1.103 2010/09/24 02:59:45 claudio Exp $ */ /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ /* @@ -119,6 +119,7 @@ u_int tcp_sendspace = TCP_SENDSPACE; #define TCP_RECVSPACE 1024*16 #endif u_int tcp_recvspace = TCP_RECVSPACE; +u_int tcp_autorcvbuf_inc = 16 * 1024; int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS; @@ -313,7 +314,7 @@ tcp_usrreq(so, req, m, nam, control, p) so->so_state |= SS_CONNECTOUT; /* Compute window scaling to request. */ - tcp_rscale(tp, so->so_rcv.sb_hiwat); + tcp_rscale(tp, sb_max); soisconnecting(so); tcpstat.tcps_connattempt++; @@ -652,10 +653,19 @@ tcp_attach(so) int error; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { - error = soreserve(so, tcp_sendspace, tcp_recvspace); + /* if low on memory only allow smaller than default buffers */ + if (so->so_snd.sb_wat == 0 || + sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace)) + so->so_snd.sb_wat = tcp_sendspace; + if (so->so_rcv.sb_wat == 0 || + sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) + so->so_rcv.sb_wat = tcp_recvspace; + + error = soreserve(so, so->so_snd.sb_wat, so->so_rcv.sb_wat); if (error) return (error); } + error = in_pcballoc(so, &tcbtable); if (error) return (error); @@ -950,3 +960,61 @@ tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen) } /* NOTREACHED */ } + +/* + * Scale the send buffer so that inflight data is not accounted against + * the limit. The buffer will scale with the congestion window, if the + * receiver stops acking data the window will shrink and therefore + * the buffer size will shrink as well. 
+ * In low memory situation shrink + */ +void +tcp_update_sndspace(struct tcpcb *tp) +{ + struct socket *so = tp->t_inpcb->inp_socket; + int nmax; + + if (sbchecklowmem()) + /* low on memory try to get rid of some */ + nmax = tcp_sendspace; + else if (so->so_snd.sb_wat != tcp_sendspace) + /* user requested buffer size, auto-scaling disabled */ + nmax = so->so_snd.sb_wat; + else + /* automatic buffer scaling */ + nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - + tp->snd_una); + + /* round to MSS boundary */ + nmax = roundup(nmax, tp->t_maxseg); + + if (nmax != so->so_snd.sb_hiwat) + sbreserve(&so->so_snd, nmax); +} + +void +tcp_update_rcvspace(struct tcpcb *tp) +{ + struct socket *so = tp->t_inpcb->inp_socket; + int nmax = so->so_rcv.sb_hiwat; + + if (sbchecklowmem()) + /* low on memory try to get rid of some */ + nmax = tcp_recvspace; + else if (so->so_rcv.sb_wat != tcp_recvspace) + /* user requested buffer size, auto-scaling disabled */ + nmax = so->so_rcv.sb_wat; + else { + /* automatic buffer scaling */ + if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) + nmax = MIN(sb_max, so->so_rcv.sb_hiwat + + tcp_autorcvbuf_inc); + } + + if (nmax == so->so_rcv.sb_hiwat) + return; + + /* round to MSS boundary */ + nmax = roundup(nmax, tp->t_maxseg); + sbreserve(&so->so_rcv, nmax); +} diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 1558747c0fd..dcca8ad53a9 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_var.h,v 1.95 2010/07/09 16:58:06 reyk Exp $ */ +/* $OpenBSD: tcp_var.h,v 1.96 2010/09/24 02:59:46 claudio Exp $ */ /* $NetBSD: tcp_var.h,v 1.17 1996/02/13 23:44:24 christos Exp $ */ /* @@ -151,6 +151,11 @@ struct tcpcb { * for slow start exponential to * linear switch */ + +/* auto-sizing variables */ + u_int rfbuf_cnt; /* recv buffer autoscaling byte count */ + u_int32_t rfbuf_ts; /* recv buffer autoscaling time stamp */ + u_short t_maxopd; /* mss plus options */ u_short t_peermss; /* peer's maximum 
segment size */ @@ -476,8 +481,8 @@ struct tcpstat { { "keepintvl", CTLTYPE_INT }, \ { "slowhz", CTLTYPE_INT }, \ { "baddynamic", CTLTYPE_STRUCT }, \ - { "recvspace", CTLTYPE_INT }, \ - { "sendspace", CTLTYPE_INT }, \ + { NULL, 0 }, \ + { NULL, 0 }, \ { "ident", CTLTYPE_STRUCT }, \ { "sack", CTLTYPE_INT }, \ { "mssdflt", CTLTYPE_INT }, \ @@ -501,8 +506,8 @@ struct tcpstat { &tcp_keepintvl, \ NULL, \ NULL, \ - &tcp_recvspace, \ - &tcp_sendspace, \ + NULL, \ + NULL, \ NULL, \ NULL, \ &tcp_mssdflt, \ @@ -590,6 +595,8 @@ void tcp_rscale(struct tcpcb *, u_long); void tcp_respond(struct tcpcb *, caddr_t, struct tcphdr *, tcp_seq, tcp_seq, int, u_int); void tcp_setpersist(struct tcpcb *); +void tcp_update_sndspace(struct tcpcb *); +void tcp_update_rcvspace(struct tcpcb *); void tcp_slowtimo(void); struct mbuf * tcp_template(struct tcpcb *); diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index 14af61bfadb..81c6e17267b 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -1,4 +1,4 @@ -/* $OpenBSD: socketvar.h,v 1.46 2009/08/10 16:49:38 thib Exp $ */ +/* $OpenBSD: socketvar.h,v 1.47 2010/09/24 02:59:46 claudio Exp $ */ /* $NetBSD: socketvar.h,v 1.18 1996/02/09 18:25:38 christos Exp $ */ /*- @@ -82,6 +82,7 @@ struct socket { u_long sb_cc; /* actual chars in buffer */ u_long sb_datacc; /* data only chars in buffer */ u_long sb_hiwat; /* max actual char count */ + u_long sb_wat; /* default watermark */ u_long sb_mbcnt; /* chars of mbufs used */ u_long sb_mbmax; /* max chars of mbufs to use */ long sb_lowat; /* low water mark */ @@ -225,7 +226,6 @@ struct socket { #ifdef _KERNEL extern u_long sb_max; -struct socket *sonewconn(struct socket *head, int connstatus); extern struct pool socket_pool; @@ -268,6 +268,7 @@ void sbflush(struct sockbuf *sb); void sbinsertoob(struct sockbuf *sb, struct mbuf *m0); void sbrelease(struct sockbuf *sb); int sbcheckreserve(u_long cnt, u_long defcnt); +int sbchecklowmem(void); int sbreserve(struct sockbuf *sb, u_long cc); int 
sbwait(struct sockbuf *sb); int sb_lock(struct sockbuf *sb); |