/* $OpenBSD: ifq.h,v 1.42 2024/11/20 02:18:45 dlg Exp $ */
/*
* Copyright (c) 2015 David Gwynne <dlg@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifndef _NET_IFQ_H_
#define _NET_IFQ_H_
struct ifnet;
struct kstat;
struct ifq_ops;
struct ifqueue {
	struct ifnet		*ifq_if;
	struct taskq		*ifq_softnet;
	union {
		void			*_ifq_softc;
		/*
		 * a ring's sndq is found by looking it up in an array
		 * of pointers. by default we only have one sndq and the
		 * default drivers don't use ifq_softc, so we can borrow
		 * it for the map until we need to allocate a proper map.
		 */
		struct ifqueue		*_ifq_ifqs[1];
	} _ifq_ptr;
#define ifq_softc	_ifq_ptr._ifq_softc
#define ifq_ifqs	_ifq_ptr._ifq_ifqs

	/* mbuf handling */
	struct mutex		 ifq_mtx;
	const struct ifq_ops	*ifq_ops;
	void			*ifq_q;
	struct mbuf_list	 ifq_free;
	unsigned int		 ifq_len;
	unsigned int		 ifq_oactive;

	/* statistics */
	uint64_t		 ifq_packets;
	uint64_t		 ifq_bytes;
	uint64_t		 ifq_qdrops;
	uint64_t		 ifq_errors;
	uint64_t		 ifq_mcasts;
	uint32_t		 ifq_oactives;

	struct kstat		*ifq_kstat;

	/* work serialisation */
	struct mutex		 ifq_task_mtx;
	struct task_list	 ifq_task_list;
	void			*ifq_serializer;
	struct task		 ifq_bundle;

	/* work to be serialised */
	struct task		 ifq_start;
	struct task		 ifq_restart;

	/* properties */
	unsigned int		 ifq_maxlen;
	unsigned int		 ifq_idx;
};
struct ifiqueue {
	struct ifnet		*ifiq_if;
	struct taskq		*ifiq_softnet;
	union {
		void			*_ifiq_softc;
		struct ifiqueue		*_ifiq_ifiqs[1];
	} _ifiq_ptr;
#define ifiq_softc	_ifiq_ptr._ifiq_softc
#define ifiq_ifiqs	_ifiq_ptr._ifiq_ifiqs

	struct mutex		 ifiq_mtx;
	struct mbuf_list	 ifiq_ml;
	struct task		 ifiq_task;
	unsigned int		 ifiq_pressure;

	/* counters */
	uint64_t		 ifiq_packets;
	uint64_t		 ifiq_bytes;
	uint64_t		 ifiq_fdrops;
	uint64_t		 ifiq_qdrops;
	uint64_t		 ifiq_errors;
	uint64_t		 ifiq_mcasts;
	uint64_t		 ifiq_noproto;

	/* number of times a list of packets was put on ifiq_ml */
	uint64_t		 ifiq_enqueues;
	/* number of times a list of packets was pulled off ifiq_ml */
	uint64_t		 ifiq_dequeues;

	struct kstat		*ifiq_kstat;

	/* properties */
	unsigned int		 ifiq_idx;
};
#ifdef _KERNEL
#define IFQ_MAXLEN 256
/*
*
* Interface Send Queues
*
 * struct ifqueue sits between the network stack and a driver's
* transmission of packets. The high level view is that when the stack
* has finished generating a packet it hands it to a driver for
* transmission. It does this by queueing the packet on an ifqueue and
* notifying the driver to start transmission of the queued packets.
*
* A network device may have multiple contexts for the transmission
* of packets, ie, independent transmit rings. Such a network device,
* represented by a struct ifnet, would then have multiple ifqueue
* structures, each of which maps to an independent transmit ring.
*
* struct ifqueue also provides the point where conditioning of
* traffic (ie, priq and hfsc) is implemented, and provides some
* infrastructure to assist in the implementation of network drivers.
*
* = ifq API
*
* The ifq API provides functions for three distinct consumers:
*
* 1. The network stack
* 2. Traffic QoS/conditioning implementations
* 3. Network drivers
*
* == Network Stack API
*
* The network stack is responsible for initialising and destroying
* the ifqueue structures, changing the traffic conditioner on an
* interface, enqueuing packets for transmission, and notifying
* the driver to start transmission of a particular ifqueue.
*
* === ifq_init()
*
 * During if_attach(), the network stack calls ifq_init() to initialise
* the ifqueue structure. By default it configures the priq traffic
* conditioner.
*
* === ifq_destroy()
*
 * The network stack calls ifq_destroy() during if_detach() to tear down
* the ifqueue structure. It frees the traffic conditioner state, and
* frees any mbufs that were left queued.
*
* === ifq_attach()
*
* ifq_attach() is used to replace the current traffic conditioner on
* the ifqueue. All the pending mbufs are removed from the previous
 * conditioner and requeued on the new one.
*
* === ifq_idx()
*
* ifq_idx() selects a specific ifqueue from the current ifnet
* structure for use in the transmission of the mbuf.
*
* === ifq_enqueue()
*
* ifq_enqueue() attempts to fit an mbuf onto the ifqueue. The
* current traffic conditioner may drop a packet to make space on the
* queue.
*
* === ifq_start()
*
* Once a packet has been successfully queued with ifq_enqueue(),
* the network card is notified with a call to ifq_start().
* Calls to ifq_start() run in the ifqueue serialisation context,
* guaranteeing that only one instance of ifp->if_qstart() will be
* running on behalf of a specific ifqueue in the system at any point
* in time.
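 *
 * Putting the pieces together, the stack-side transmit path is
 * roughly the following sketch (modelled on what if_enqueue() does;
 * if_snd, if_nifqs, and if_ifqs are the usual struct ifnet members):
 *
 *	struct ifqueue *ifq = &ifp->if_snd;
 *	int error;
 *
 *	if (ifp->if_nifqs > 1)
 *		ifq = ifp->if_ifqs[ifq_idx(ifq, ifp->if_nifqs, m)];
 *
 *	error = ifq_enqueue(ifq, m);
 *	if (error)
 *		return (error);	// the conditioner dropped the mbuf
 *
 *	ifq_start(ifq);
 *	return (0);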
*
* == Traffic conditioners API
*
* The majority of interaction between struct ifqueue and a traffic
* conditioner occurs via the callbacks a traffic conditioner provides
* in an instance of struct ifq_ops.
*
 * In brief (see the definition of struct ifq_ops below; the priq
 * implementation is the authoritative reference for the semantics):
 *
 * - ifqop_idx maps an mbuf to one of the interface's transmit queues
 *   and returns that index.
 * - ifqop_enq offers an mbuf to the conditioner. It returns NULL on
 *   success, the same mbuf if it was rejected, or a different mbuf
 *   that was evicted to make room; the ifq layer frees whatever is
 *   returned and accounts for the drop.
 * - ifqop_deq_begin peeks at the next mbuf to transmit, optionally
 *   passing state to ifqop_deq_commit via the void ** argument.
 * - ifqop_deq_commit removes the mbuf returned by ifqop_deq_begin.
 * - ifqop_purge moves all queued mbufs onto the caller's mbuf_list so
 *   they can be freed outside the queue lock.
 * - ifqop_alloc and ifqop_free create and destroy the conditioner's
 *   private state, which the ifq layer keeps in ifq_q.
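 *
 * As an illustration only, a single-list FIFO conditioner (not code
 * from the tree; ifqop_idx, ifqop_purge, ifqop_alloc, and ifqop_free
 * are elided) could implement the enqueue and dequeue callbacks like
 * this, with an mbuf_list as its ifq_q state:
 *
 *	struct mbuf *
 *	fifo_enq(struct ifqueue *ifq, struct mbuf *m)
 *	{
 *		struct mbuf_list *ml = ifq->ifq_q;
 *
 *		if (ml_len(ml) >= ifq->ifq_maxlen)
 *			return (m);	// full: reject, the ifq layer frees m
 *
 *		ml_enqueue(ml, m);
 *		return (NULL);	// accepted, nothing was dropped
 *	}
 *
 *	struct mbuf *
 *	fifo_deq_begin(struct ifqueue *ifq, void **cookiep)
 *	{
 *		struct mbuf_list *ml = ifq->ifq_q;
 *
 *		// peek only; removal happens in fifo_deq_commit so
 *		// that ifq_deq_rollback() leaves the queue untouched
 *		return (MBUF_LIST_FIRST(ml));
 *	}
 *
 *	void
 *	fifo_deq_commit(struct ifqueue *ifq, struct mbuf *m, void *cookie)
 *	{
 *		struct mbuf_list *ml = ifq->ifq_q;
 *
 *		ml_dequeue(ml);	// removes m from the head of the list
 *	}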
*
* The ifqueue API implements the locking on behalf of the conditioning
* implementations so conditioners only have to reject or keep mbufs.
 * If something needs to inspect a conditioner's internals, the queue lock
* needs to be taken to allow for a consistent or safe view. The queue
* lock may be taken and released with ifq_q_enter() and ifq_q_leave().
*
* === ifq_q_enter()
*
 * Code wishing to access a conditioner's internals may take the queue
 * lock with ifq_q_enter(). The caller must pass a reference to the
 * conditioner's ifq_ops structure so the infrastructure can ensure the
 * caller is able to understand the internals. ifq_q_enter() returns
 * a pointer to the conditioner's internal structures, or NULL if the
 * ifq_ops did not match the current conditioner.
*
* === ifq_q_leave()
*
* The queue lock acquired with ifq_q_enter() is released with
* ifq_q_leave().
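 *
 * For example, code that wants a consistent view of priq state could
 * do the following (a sketch; struct priq is private to the priq
 * implementation and is shown only as the type ifq_q_enter() hands
 * back for ifq_priq_ops):
 *
 *	struct priq *pq;
 *
 *	pq = ifq_q_enter(ifq, ifq_priq_ops);
 *	if (pq == NULL)
 *		return;		// a different conditioner is attached
 *
 *	// inspect pq while the queue lock is held
 *
 *	ifq_q_leave(ifq, pq);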
*
* === ifq_mfreem() and ifq_mfreeml()
*
* A goal of the API is to avoid freeing an mbuf while mutexes are
* held. Because the ifq API manages the lock on behalf of the backend
* ifqops, the backend should not directly free mbufs. If a conditioner
* backend needs to drop a packet during the handling of ifqop_deq_begin,
 * the mbuf may be freed with ifq_mfreem(). This accounts for the drop,
* and schedules the free of the mbuf outside the hold of ifq_mtx.
* ifq_mfreeml() takes an mbuf list as an argument instead.
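 *
 * For example, a hypothetical conditioner that expires stale packets
 * while looking for the next one to transmit could do this in its
 * ifqop_deq_begin handler (head_of_queue, PACKET_EXPIRED, and
 * remove_from_queue are stand-ins for its internals):
 *
 *	while ((m = head_of_queue(q)) != NULL) {
 *		if (!PACKET_EXPIRED(q, m))
 *			break;
 *		remove_from_queue(q, m);
 *		ifq_mfreem(ifq, m);	// counted as a drop, freed later
 *	}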
*
*
* == Network Driver API
*
* The API used by network drivers is mostly documented in the
* ifq_dequeue(9) manpage except for ifq_serialize().
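 *
 * As a quick reference, the begin/commit/rollback dequeue pattern
 * from that manpage looks roughly like this (NO_SPACE and the ring
 * are driver-specific stand-ins):
 *
 *	m = ifq_deq_begin(ifq);
 *	if (m == NULL)
 *		return;
 *
 *	if (NO_SPACE(ring)) {
 *		ifq_deq_rollback(ifq, m);	// leave m on the queue
 *		ifq_set_oactive(ifq);
 *		return;
 *	}
 *
 *	ifq_deq_commit(ifq, m);	// m is now the driver's to transmit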
*
* === ifq_serialize()
*
* A driver may run arbitrary work in the ifqueue serialiser context
* via ifq_serialize(). The work to be done is represented by a task
 * that has been prepared with task_set().
*
* The work will be run in series with any other work dispatched by
* ifq_start(), ifq_restart(), or other ifq_serialize() calls.
*
* Because the work may be run on another CPU, the lifetime of the
* task and the work it represents can extend beyond the end of the
* call to ifq_serialize() that dispatched it.
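 *
 * For example, a driver could defer a hypothetical drv_flush() into
 * the serialiser context like this (sc_flush_task is an assumed
 * struct task in the driver's softc):
 *
 *	task_set(&sc->sc_flush_task, drv_flush, sc);
 *	ifq_serialize(ifq, &sc->sc_flush_task);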
*
*
* = ifqueue work serialisation
*
* ifqueues provide a mechanism to dispatch work to be run in a single
* context. Work in this mechanism is represented by task structures.
*
* The tasks are run in a context similar to a taskq serviced by a
* single kernel thread, except the work is run immediately by the
* first CPU that dispatches work. If a second CPU attempts to dispatch
 * additional tasks while the first is still running, they will be queued
* to be run by the first CPU. The second CPU will return immediately.
*
* = MP Safe Network Drivers
*
 * An MP safe network driver is one whose start routine can be
 * called by the network stack without holding the big kernel lock.
*
* == Attach
*
* A driver advertises its ability to run its start routine without
* the kernel lock by setting the IFXF_MPSAFE flag in ifp->if_xflags
* before calling if_attach(). Advertising an MPSAFE start routine
* also implies that the driver understands that a network card can
 * have multiple rings or transmit queues, and therefore provides an
* if_qstart function (which takes an ifqueue pointer) instead of an
* if_start function (which takes an ifnet pointer).
*
 * If the hardware supports multiple transmit rings, the driver
 * advertises this to the network stack with if_attach_queues()
 * after the call to if_attach(). if_attach_queues() allocates a struct
* ifqueue for each hardware ring, which can then be initialised by
* the driver with data for each ring.
*
 *	void	drv_start(struct ifqueue *);
 *
 *	void
 *	drv_attach()
 *	{
 *	...
 *		ifp->if_xflags = IFXF_MPSAFE;
 *		ifp->if_qstart = drv_start;
 *		if_attach(ifp);
 *
 *		if_attach_queues(ifp, DRV_NUM_TX_RINGS);
 *		for (i = 0; i < DRV_NUM_TX_RINGS; i++) {
 *			struct ifqueue *ifq = ifp->if_ifqs[i];
 *			struct drv_tx_ring *ring = &sc->sc_tx_rings[i];
 *
 *			ifq->ifq_softc = ring;
 *			ring->ifq = ifq;
 *		}
 *	}
*
* The network stack will then call ifp->if_qstart via ifq_start()
* to guarantee there is only one instance of that function running
* for each ifq in the system, and to serialise it with other work
* the driver may provide.
*
* == Initialise
*
* When the stack requests an interface be brought up (ie, drv_ioctl()
* is called to handle SIOCSIFFLAGS with IFF_UP set in ifp->if_flags)
* drivers should set IFF_RUNNING in ifp->if_flags, and then call
* ifq_clr_oactive() against each ifq.
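 *
 * For example (a sketch; drv_init_hw() and sc_num_queues stand in
 * for driver specifics):
 *
 *	void
 *	drv_up(struct drv_softc *sc)
 *	{
 *		struct ifnet *ifp = &sc->sc_if;
 *		int i;
 *
 *		drv_init_hw(sc);	// program the hardware
 *
 *		SET(ifp->if_flags, IFF_RUNNING);
 *		for (i = 0; i < sc->sc_num_queues; i++)
 *			ifq_clr_oactive(ifp->if_ifqs[i]);
 *	}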
*
* == if_start
*
* ifq_start() checks that IFF_RUNNING is set in ifp->if_flags, that
* ifq_is_oactive() does not return true, and that there are pending
* packets to transmit via a call to ifq_len(). Therefore, drivers are
* no longer responsible for doing this themselves.
*
 * A driver that should not transmit packets while its link is down
 * should use ifq_purge() to flush pending packets from the transmit
 * queue.
*
* Drivers for hardware should use the following pattern to transmit
* packets:
*
 *	void
 *	drv_start(struct ifqueue *ifq)
 *	{
 *		struct drv_tx_ring *ring = ifq->ifq_softc;
 *		struct ifnet *ifp = ifq->ifq_if;
 *		struct drv_softc *sc = ifp->if_softc;
 *		struct mbuf *m;
 *		int kick = 0;
 *
 *		if (NO_LINK) {
 *			ifq_purge(ifq);
 *			return;
 *		}
 *
 *		for (;;) {
 *			if (NO_SPACE(ring)) {
 *				ifq_set_oactive(ifq);
 *				break;
 *			}
 *
 *			m = ifq_dequeue(ifq);
 *			if (m == NULL)
 *				break;
 *
 *			if (drv_encap(sc, ring, m) != 0) { // map and fill ring
 *				m_freem(m);
 *				continue;
 *			}
 *
 *			if (ifp->if_bpf)
 *				bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
 *			kick = 1;
 *		}
 *
 *		if (kick)
 *			drv_kick(ring); // notify hw of new descriptors
 *	}
*
* == Transmission completion
*
* The following pattern should be used for transmit queue interrupt
* processing:
*
 *	void
 *	drv_txeof(struct drv_tx_ring *ring)
 *	{
 *		struct ifqueue *ifq = ring->ifq;
 *
 *		while (COMPLETED_PKTS(ring)) {
 *			// unmap packets, m_freem() the mbufs.
 *		}
 *
 *		if (ifq_is_oactive(ifq))
 *			ifq_restart(ifq);
 *	}
*
* == Stop
*
 * When an interface is brought down (ie, IFF_UP was cleared in
 * ifp->if_flags), the driver should clear IFF_RUNNING in ifp->if_flags
 * and guarantee its start routine is not running before freeing any
 * resources it uses:
*
 *	void
 *	drv_down(struct drv_softc *sc)
 *	{
 *		struct ifnet *ifp = &sc->sc_if;
 *		struct ifqueue *ifq;
 *		int i;
 *
 *		CLR(ifp->if_flags, IFF_RUNNING);
 *		DISABLE_INTERRUPTS();
 *
 *		for (i = 0; i < sc->sc_num_queues; i++) {
 *			ifq = ifp->if_ifqs[i];
 *			ifq_barrier(ifq);
 *		}
 *
 *		intr_barrier(sc->sc_ih);
 *
 *		FREE_RESOURCES();
 *
 *		for (i = 0; i < sc->sc_num_queues; i++) {
 *			ifq = ifp->if_ifqs[i];
 *			ifq_clr_oactive(ifq);
 *		}
 *	}
*
*/
struct ifq_ops {
	unsigned int		 (*ifqop_idx)(unsigned int,
				    const struct mbuf *);
	struct mbuf		*(*ifqop_enq)(struct ifqueue *, struct mbuf *);
	struct mbuf		*(*ifqop_deq_begin)(struct ifqueue *, void **);
	void			 (*ifqop_deq_commit)(struct ifqueue *,
				    struct mbuf *, void *);
	void			 (*ifqop_purge)(struct ifqueue *,
				    struct mbuf_list *);
	void			*(*ifqop_alloc)(unsigned int, void *);
	void			 (*ifqop_free)(unsigned int, void *);
};
extern const struct ifq_ops * const ifq_priq_ops;
/*
* Interface send queues.
*/
void		 ifq_init(struct ifqueue *, struct ifnet *, unsigned int);
void		 ifq_attach(struct ifqueue *, const struct ifq_ops *, void *);
void		 ifq_destroy(struct ifqueue *);
void		 ifq_add_data(struct ifqueue *, struct if_data *);
int		 ifq_enqueue(struct ifqueue *, struct mbuf *);
void		 ifq_start(struct ifqueue *);
struct mbuf	*ifq_deq_begin(struct ifqueue *);
void		 ifq_deq_commit(struct ifqueue *, struct mbuf *);
void		 ifq_deq_rollback(struct ifqueue *, struct mbuf *);
struct mbuf	*ifq_dequeue(struct ifqueue *);
int		 ifq_hdatalen(struct ifqueue *);
void		 ifq_init_maxlen(struct ifqueue *, unsigned int);
void		 ifq_mfreem(struct ifqueue *, struct mbuf *);
void		 ifq_mfreeml(struct ifqueue *, struct mbuf_list *);
unsigned int	 ifq_purge(struct ifqueue *);
void		*ifq_q_enter(struct ifqueue *, const struct ifq_ops *);
void		 ifq_q_leave(struct ifqueue *, void *);
void		 ifq_serialize(struct ifqueue *, struct task *);
void		 ifq_barrier(struct ifqueue *);
void		 ifq_set_oactive(struct ifqueue *);
void		 ifq_deq_set_oactive(struct ifqueue *);
int		 ifq_deq_sleep(struct ifqueue *, struct mbuf **, int, int,
		     const char *, volatile unsigned int *,
		     volatile unsigned int *);

#define ifq_len(_ifq)		READ_ONCE((_ifq)->ifq_len)
#define ifq_empty(_ifq)		(ifq_len(_ifq) == 0)
static inline int
ifq_is_priq(struct ifqueue *ifq)
{
	return (ifq->ifq_ops == ifq_priq_ops);
}

static inline void
ifq_clr_oactive(struct ifqueue *ifq)
{
	ifq->ifq_oactive = 0;
}

static inline unsigned int
ifq_is_oactive(struct ifqueue *ifq)
{
	return (ifq->ifq_oactive);
}

static inline void
ifq_restart(struct ifqueue *ifq)
{
	ifq_serialize(ifq, &ifq->ifq_restart);
}

static inline unsigned int
ifq_idx(struct ifqueue *ifq, unsigned int nifqs, const struct mbuf *m)
{
	return ((*ifq->ifq_ops->ifqop_idx)(nifqs, m));
}
/* ifiq */
void		 ifiq_init(struct ifiqueue *, struct ifnet *, unsigned int);
void		 ifiq_destroy(struct ifiqueue *);
int		 ifiq_input(struct ifiqueue *, struct mbuf_list *);
int		 ifiq_enqueue(struct ifiqueue *, struct mbuf *);
void		 ifiq_add_data(struct ifiqueue *, struct if_data *);
#define ifiq_len(_ifiq)		READ_ONCE(ml_len(&(_ifiq)->ifiq_ml))
#define ifiq_empty(_ifiq)	(ifiq_len(_ifiq) == 0)
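
/*
 * A sketch of receive-side usage (the drv_* names and ring structure
 * are hypothetical): the driver gathers completed rx packets onto an
 * mbuf_list and hands the whole list to the stack with ifiq_input().
 * A non-zero return indicates the queue is backing up and the driver
 * may want to slow the ring down.
 *
 *	void
 *	drv_rxeof(struct drv_rx_ring *ring)
 *	{
 *		struct ifiqueue *ifiq = ring->ifiq;
 *		struct mbuf_list ml = MBUF_LIST_INITIALIZER();
 *		struct mbuf *m;
 *
 *		while ((m = DRV_NEXT_RX_PKT(ring)) != NULL)
 *			ml_enqueue(&ml, m);
 *
 *		if (ifiq_input(ifiq, &ml))
 *			ring->slow_down = 1;
 *	}
 */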
#endif /* _KERNEL */
#endif /* _NET_IFQ_H_ */