From 886d2362a0e2964795f03a71c0e918b1b37b5214 Mon Sep 17 00:00:00 2001 From: David Gwynne Date: Mon, 24 Nov 2008 12:57:38 +0000 Subject: add several backend pools to allocate mbuf clusters of various sizes out of. currently limited to MCLBYTES (2048 bytes) and 4096 bytes until pools can allocate objects of sizes greater than PAGESIZE. this allows drivers to ask for "jumbo" packets to fill rx rings with. the second half of this change is per interface mbuf cluster allocator statistics. drivers can use the new interface (MCLGETI), which will use these stats to selectively fail allocations based on demand for mbufs. if the driver isn't rapidly consuming rx mbufs, we don't allow it to allocate many to put on its rx ring. drivers require modifications to take advantage of both the new allocation semantics and large clusters. this was written and developed with deraadt@ over the last two days ok deraadt@ claudio@ --- sys/kern/uipc_mbuf.c | 87 ++++++++++++++++++++++++++++++++++++------------- sys/kern/uipc_socket2.c | 8 ++--- sys/net/if.c | 55 ++++++++++++++++++++++++++++++- sys/net/if.h | 11 ++++++- sys/net/if_ethersubr.c | 4 ++- sys/sys/mbuf.h | 19 +++++++++-- sys/sys/param.h | 4 +-- 7 files changed, 154 insertions(+), 34 deletions(-) diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c index 7db45a81cff..c0c53fb447b 100644 --- a/sys/kern/uipc_mbuf.c +++ b/sys/kern/uipc_mbuf.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uipc_mbuf.c,v 1.94 2008/10/14 18:01:53 naddy Exp $ */ +/* $OpenBSD: uipc_mbuf.c,v 1.95 2008/11/24 12:57:37 dlg Exp $ */ /* $NetBSD: uipc_mbuf.c,v 1.15.4.1 1996/06/13 17:11:44 cgd Exp $ */ /* @@ -90,7 +90,22 @@ struct mbstat mbstat; /* mbuf stats */ struct pool mbpool; /* mbuf pool */ -struct pool mclpool; /* mbuf cluster pool */ + +/* mbuf cluster pools */ +struct mclsizes mclsizes[] = { + { MCLBYTES, 4, 1 }, /* must be at slot 0 */ + { 4 * 1024, 4, 2 }, +#ifdef notyet + /* pool allocator cannot cope with >PAGESIZE objects */ + { 8 * 1024, 4, 2 }, + { 9 * 1024, 4, 2 
}, + { 12 * 1024, 4, 2 }, + { 16 * 1024, 4, 2 }, + { 64 * 1024, 4, 2 } +#endif +}; +static char mclnames[MCLPOOLS][8]; +struct pool mclpools[MCLPOOLS]; int max_linkhdr; /* largest link-level header */ int max_protohdr; /* largest protocol header */ @@ -102,7 +117,7 @@ void nmbclust_update(void); const char *mclpool_warnmsg = - "WARNING: mclpool limit reached; increase kern.maxclusters"; + "WARNING: mclpools limit reached; increase kern.maxclusters"; /* * Initialize the mbuf allocator. @@ -110,30 +125,35 @@ const char *mclpool_warnmsg = void mbinit(void) { + int i; + pool_init(&mbpool, MSIZE, 0, 0, 0, "mbpl", NULL); - pool_init(&mclpool, MCLBYTES, 0, 0, 0, "mclpl", NULL); + pool_setlowat(&mbpool, mblowat); - nmbclust_update(); + for (i = 0; i < nitems(mclsizes); i++) { + snprintf(mclnames[i], sizeof(mclnames[0]), "mcl%dk", + mclsizes[i].size >> 10); + pool_init(&mclpools[i], mclsizes[i].size, 0, 0, 0, mclnames[i], + NULL); + pool_setlowat(&mclpools[i], mcllowat); + } - /* - * Set a low water mark for both mbufs and clusters. This should - * help ensure that they can be allocated in a memory starvation - * situation. This is important for e.g. diskless systems which - * must allocate mbufs in order for the pagedaemon to clean pages. - */ - pool_setlowat(&mbpool, mblowat); - pool_setlowat(&mclpool, mcllowat); + nmbclust_update(); } void nmbclust_update(void) { + int i; /* - * Set the hard limit on the mclpool to the number of + * Set the hard limit on the mclpools to the number of * mbuf clusters the kernel is to support. Log the limit * reached message max once a minute. 
*/ - (void)pool_sethardlimit(&mclpool, nmbclust, mclpool_warnmsg, 60); + for (i = 0; i < nitems(mclsizes); i++) { + (void)pool_sethardlimit(&mclpools[i], nmbclust, + mclpool_warnmsg, 60); + } pool_sethiwat(&mbpool, nmbclust); } @@ -244,20 +264,41 @@ m_getclr(int nowait, int type) } void -m_clget(struct mbuf *m, int how) +m_clget(struct mbuf *m, int how, struct ifnet *ifp, u_int pktlen) { + struct pool *mclp; + int pi; int s; + for (pi = 0; pi < nitems(mclpools); pi++) { + mclp = &mclpools[pi]; + if (pktlen <= mclp->pr_size) + break; + } + +#ifdef DIAGNOSTIC + if (mclp == NULL) + panic("m_clget: request for %d sized cluster", pktlen); +#endif + + if (ifp != NULL && m_cldrop(ifp, pi)) + return; + s = splvm(); - m->m_ext.ext_buf = - pool_get(&mclpool, how == M_WAIT ? PR_WAITOK : 0); + m->m_ext.ext_buf = pool_get(mclp, how == M_WAIT ? PR_WAITOK : 0); splx(s); if (m->m_ext.ext_buf != NULL) { m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT|M_CLUSTER; - m->m_ext.ext_size = MCLBYTES; + m->m_ext.ext_size = mclp->pr_size; m->m_ext.ext_free = NULL; m->m_ext.ext_arg = NULL; + + m->m_ext.ext_backend = pi; + m->m_ext.ext_ifp = ifp; + if (ifp != NULL) + m_clcount(ifp, pi); + MCLINITREFERENCE(m); } } @@ -278,9 +319,11 @@ m_free(struct mbuf *m) m->m_ext.ext_prevref; m->m_ext.ext_prevref->m_ext.ext_nextref = m->m_ext.ext_nextref; - } else if (m->m_flags & M_CLUSTER) - pool_put(&mclpool, m->m_ext.ext_buf); - else if (m->m_ext.ext_free) + } else if (m->m_flags & M_CLUSTER) { + m_cluncount(m); + pool_put(&mclpools[m->m_ext.ext_backend], + m->m_ext.ext_buf); + } else if (m->m_ext.ext_free) (*(m->m_ext.ext_free))(m->m_ext.ext_buf, m->m_ext.ext_size, m->m_ext.ext_arg); else diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c index 1cf4403f75d..647edfce827 100644 --- a/sys/kern/uipc_socket2.c +++ b/sys/kern/uipc_socket2.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uipc_socket2.c,v 1.44 2008/05/23 15:51:12 thib Exp $ */ +/* $OpenBSD: uipc_socket2.c,v 1.45 2008/11/24 12:57:37 dlg Exp $ */ /* 
$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $ */ /* @@ -58,7 +58,7 @@ const char netlck[] = "netlck"; u_long sb_max = SB_MAX; /* patchable */ -extern struct pool mclpool; +extern struct pool mclpools[]; /* * Procedures to manipulate state flags of socket @@ -158,7 +158,7 @@ sonewconn(struct socket *head, int connstatus) splassert(IPL_SOFTNET); - if (mclpool.pr_nout > mclpool.pr_hardlimit * 95 / 100) + if (mclpools[0].pr_nout > mclpools[0].pr_hardlimit * 95 / 100) return ((struct socket *)0); if (head->so_qlen + head->so_q0len > head->so_qlimit * 3) return ((struct socket *)0); @@ -407,7 +407,7 @@ int sbcheckreserve(u_long cnt, u_long defcnt) { if (cnt > defcnt && - mclpool.pr_nout> mclpool.pr_hardlimit / 2) + mclpools[0].pr_nout> mclpools[0].pr_hardlimit / 2) return (ENOBUFS); return (0); } diff --git a/sys/net/if.c b/sys/net/if.c index 8bfb97df149..db68e28ce8a 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -1,4 +1,4 @@ -/* $OpenBSD: if.c,v 1.176 2008/11/24 12:53:53 claudio Exp $ */ +/* $OpenBSD: if.c,v 1.177 2008/11/24 12:57:37 dlg Exp $ */ /* $NetBSD: if.c,v 1.35 1996/05/07 05:26:04 thorpej Exp $ */ /* @@ -147,6 +147,8 @@ struct if_clone *if_clone_lookup(const char *, int *); void if_congestion_clear(void *); int if_group_egress_build(void); +void m_clinitifp(struct ifnet *); + TAILQ_HEAD(, ifg_group) ifg_head; LIST_HEAD(, if_clone) if_cloners = LIST_HEAD_INITIALIZER(if_cloners); int if_cloners_count; @@ -455,6 +457,8 @@ if_attach(struct ifnet *ifp) TAILQ_INSERT_TAIL(&ifnet, ifp, if_list); #endif + m_clinitifp(ifp); + if_attachsetup(ifp); } @@ -1105,9 +1109,12 @@ if_up(struct ifnet *ifp) #ifdef INET6 in6_if_up(ifp); #endif + #ifndef SMALL_KERNEL rt_if_track(ifp); #endif + + m_clinitifp(ifp); } /* @@ -2008,3 +2015,49 @@ sysctl_ifq(int *name, u_int namelen, void *oldp, size_t *oldlenp, } /* NOTREACHED */ } + +void +m_clinitifp(struct ifnet *ifp) +{ + extern struct mclsizes mclsizes[]; + int i; + + /* Initialize high water marks for use of 
cluster pools */ + for (i = 0; i < MCLPOOLS; i++) + ifp->if_mclstat.mclpool[i].mcl_hwm = mclsizes[i].hwm; +} + +int +m_cldrop(struct ifnet *ifp, int pi) +{ + struct mclstat *mcls = &ifp->if_mclstat; + extern struct mclsizes mclsizes[]; + + if (mcls->mclpool[pi].mcl_alive <= 2 && ISSET(ifp->if_flags, IFF_UP)) { + /* About to run out, so increase the watermark */ + mcls->mclpool[pi].mcl_hwm += + mcls->mclpool[pi].mcl_hwm / mclsizes[pi].factor; + } else if (mcls->mclpool[pi].mcl_alive >= mcls->mclpool[pi].mcl_hwm) + return (1); /* No more packets given */ + + return (0); +} + +void +m_clcount(struct ifnet *ifp, int pi) +{ + ifp->if_mclstat.mclpool[pi].mcl_alive++; +} + +void +m_cluncount(struct mbuf *m) +{ + struct mbuf_ext *me = &m->m_ext; + + if (((m->m_flags & (M_EXT|M_CLUSTER)) != (M_EXT|M_CLUSTER)) || + (me->ext_ifp == NULL)) + return; + + me->ext_ifp->if_mclstat.mclpool[me->ext_backend].mcl_alive--; + me->ext_ifp = NULL; +} diff --git a/sys/net/if.h b/sys/net/if.h index c366caeb756..a9b8c02eb12 100644 --- a/sys/net/if.h +++ b/sys/net/if.h @@ -1,4 +1,4 @@ -/* $OpenBSD: if.h,v 1.95 2008/11/07 05:50:33 deraadt Exp $ */ +/* $OpenBSD: if.h,v 1.96 2008/11/24 12:57:37 dlg Exp $ */ /* $NetBSD: if.h,v 1.23 1996/05/07 02:40:27 thorpej Exp $ */ /* @@ -146,6 +146,13 @@ struct ifqueue { struct timeout *ifq_congestion; }; +struct mclstat { + struct { + u_short mcl_alive; + u_short mcl_hwm; + } mclpool[MCLPOOLS]; +}; + /* * Values for if_link_state. 
*/ @@ -224,6 +231,8 @@ struct ifnet { /* and the entries */ struct sockaddr_dl *if_sadl; /* pointer to our sockaddr_dl */ void *if_afdata[AF_MAX]; + + struct mclstat if_mclstat; /* mbuf cluster pool stats */ }; #define if_mtu if_data.ifi_mtu #define if_type if_data.ifi_type diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index 8adca1c9a9b..e68e442eff8 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -1,4 +1,4 @@ -/* $OpenBSD: if_ethersubr.c,v 1.128 2008/11/23 23:44:01 tedu Exp $ */ +/* $OpenBSD: if_ethersubr.c,v 1.129 2008/11/24 12:57:37 dlg Exp $ */ /* $NetBSD: if_ethersubr.c,v 1.19 1996/05/07 02:40:30 thorpej Exp $ */ /* @@ -519,6 +519,8 @@ ether_input(ifp0, eh, m) struct ether_header *eh_tmp; #endif + m_cluncount(m); + if (eh == NULL) { eh = mtod(m, struct ether_header *); m_adj(m, ETHER_HDR_LEN); diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index a5a51e3d55b..0b2ba7f0af6 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -1,4 +1,4 @@ -/* $OpenBSD: mbuf.h,v 1.108 2008/11/23 16:17:17 dlg Exp $ */ +/* $OpenBSD: mbuf.h,v 1.109 2008/11/24 12:57:37 dlg Exp $ */ /* $NetBSD: mbuf.h,v 1.19 1996/02/09 18:25:14 christos Exp $ */ /* @@ -109,6 +109,8 @@ struct mbuf_ext { void *ext_arg; /* argument for ext_free */ u_int ext_size; /* size of buffer, for ext_free */ int ext_type; + struct ifnet* ext_ifp; + int ext_backend; /* backend pool the storage came from */ struct mbuf *ext_nextref; struct mbuf *ext_prevref; #ifdef DEBUG @@ -296,7 +298,8 @@ struct mbuf { MCLINITREFERENCE(m); \ } while (/* CONSTCOND */ 0) -#define MCLGET(m, how) m_clget((m), (how)) +#define MCLGET(m, how) m_clget((m), (how), NULL, MCLBYTES) +#define MCLGETI(m, how, ifp, l) m_clget((m), (how), (ifp), (l)) /* * Reset the data pointer on an mbuf. 
@@ -438,6 +441,13 @@ struct mbstat { }; #ifdef _KERNEL + +struct mclsizes { + u_int size; + u_int hwm; + u_int factor; +}; + extern struct mbstat mbstat; extern int nmbclust; /* limit on the # of clusters */ extern int mblowat; /* mbuf low water mark */ @@ -465,7 +475,10 @@ struct mbuf *m_inject(struct mbuf *, int, int, int); struct mbuf *m_getptr(struct mbuf *, int, int *); int m_leadingspace(struct mbuf *); int m_trailingspace(struct mbuf *); -void m_clget(struct mbuf *, int); +void m_clget(struct mbuf *, int, struct ifnet *, u_int); +int m_cldrop(struct ifnet *, int); +void m_clcount(struct ifnet *, int); +void m_cluncount(struct mbuf *); void m_adj(struct mbuf *, int); void m_copyback(struct mbuf *, int, int, const void *); void m_freem(struct mbuf *); diff --git a/sys/sys/param.h b/sys/sys/param.h index 2a743dd0a9a..2cf7cb55c0c 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -1,4 +1,4 @@ -/* $OpenBSD: param.h,v 1.79 2008/11/23 12:43:37 deraadt Exp $ */ +/* $OpenBSD: param.h,v 1.80 2008/11/24 12:57:37 dlg Exp $ */ /* $NetBSD: param.h,v 1.23 1996/03/17 01:02:29 thorpej Exp $ */ /*- @@ -140,7 +140,7 @@ /* 2K cluster can hold Ether frame */ #define MCLBYTES (1 << MCLSHIFT) /* size of a m_buf cluster */ #define MCLOFSET (MCLBYTES - 1) - +#define MCLPOOLS 7 /* number of cluster pools */ /* * File system parameters and macros. -- cgit v1.2.3