summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Pieuchot <mpi@cvs.openbsd.org>2015-08-20 12:39:44 +0000
committerMartin Pieuchot <mpi@cvs.openbsd.org>2015-08-20 12:39:44 +0000
commit7f00a19d0cc487d4dc2da05315669e8e80b790cc (patch)
tree89789fe3193711c95900bbe78ff1a2f93fcbf5f1
parentdc40e77aa9206ff1be157fb2b74f764e4ab81062 (diff)
Import an alternative routing table backend based on Yoichi Hariguchi's
ART implementation. ART (Allotment Routing Table) is a multibit-trie algorithm invented by D. Knuth while reviewing Yoichi's SMART [0] (Smart Multi-Array Routing Table) paper. This implementation, unlike the one from the KAME project, supports variable stride lengths which makes it easier to adapt the consumed memory/speed trade-off. It also let you use a bigger first-level table, what other algorithms such as POPTRIE [1] need to implement separately. Adaptation to the OpenBSD kernel has been done with two different data structures. ART nodes and route entries are managed separately which makes the algorithm implementation free of any MULTIPATH logic. This implementation does not include Path Compression. [0] http://www.hariguchi.org/art/smart.pdf [1] http://conferences.sigcomm.org/sigcomm/2015/pdf/papers/p57.pdf ok dlg@, reyk@
-rw-r--r--sys/net/art.c789
-rw-r--r--sys/net/art.h61
-rw-r--r--sys/net/route.c4
-rw-r--r--sys/net/route.h9
-rw-r--r--sys/net/rtable.c456
-rw-r--r--sys/net/rtable.h19
6 files changed, 1334 insertions, 4 deletions
diff --git a/sys/net/art.c b/sys/net/art.c
new file mode 100644
index 00000000000..55193d13051
--- /dev/null
+++ b/sys/net/art.c
@@ -0,0 +1,789 @@
+/* $OpenBSD: art.c,v 1.1 2015/08/20 12:39:43 mpi Exp $ */
+
+/*
+ * Copyright (c) 2015 Martin Pieuchot
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Copyright (c) 2001 Yoichi Hariguchi. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Yoichi Hariguchi.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * THE AUTHOR DOES NOT GUARANTEE THAT THIS SOFTWARE DOES NOT INFRINGE
+ * ANY OTHERS' INTELLECTUAL PROPERTIES. IN NO EVENT SHALL THE AUTHOR
+ * BE LIABLE FOR ANY INFRINGEMENT OF ANY OTHERS' INTELLECTUAL
+ * PROPERTIES.
+ */
+
+/*
+ * Allotment Routing Table (ART).
+ *
+ * Yoichi Hariguchi paper can be found at:
+ * http://www.hariguchi.org/art/art.pdf
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+
+#include <net/art.h>
+
+#define ISLEAF(e) (((unsigned long)(e).node & 1) == 0)
+#define SUBTABLE(e) (((struct art_table *)((unsigned long)(e).child & ~1)))
+#define ASNODE(t) ((struct art_node *)((unsigned long)(t) | 1))
+
+/*
+ * Allotment Table.
+ */
+struct art_table {
+ struct art_table *at_parent; /* Parent table */
+ uint32_t at_index; /* Index in the parent table */
+ uint32_t at_minfringe; /* Index that fringe begins */
+ uint32_t at_level; /* Level of the table */
+ uint8_t at_bits; /* Stride length of the table */
+ uint8_t at_offset; /* Sum of parents' stride len */
+
+ /*
+ * Items stored in the heap are pointers to nodes, in the leaf
+ * case, or tables otherwise. One exception is index 0 which
+ * is a route counter.
+ */
+ union {
+ struct art_node *node;
+ struct art_table *child;
+ unsigned long count;
+ } *at_heap; /* Array of 2^(slen+1) items */
+};
+#define at_refcnt at_heap[0].count/* Refcounter (1 per different route) */
+#define at_default at_heap[1].node /* Default route (was in parent heap) */
+
+int art_bindex(struct art_table *, uint8_t *, int);
+void art_allot(struct art_table *at, int, struct art_node *,
+ struct art_node *);
+struct art_table *art_table_get(struct art_root *, struct art_table *,
+ int);
+struct art_table *art_table_put(struct art_root *, struct art_table *);
+struct art_node *art_table_insert(struct art_root *, struct art_table *,
+ int, struct art_node *);
+struct art_node *art_table_delete(struct art_root *, struct art_table *,
+ int, struct art_node *);
+void art_table_ref(struct art_root *, struct art_table *);
+int art_table_free(struct art_root *, struct art_table *);
+int art_table_walk(struct art_table *,
+ int (*f)(struct art_node *, void *), void *);
+
+/*
+ * Per routing table initialization API function.
+ */
+int
+art_attach(void **head, int off)
+{
+ struct art_root *ar;
+ int i;
+
+ if (*head)
+ return (1);
+ ar = malloc(sizeof(*ar), M_RTABLE, M_NOWAIT|M_ZERO);
+ if (ar == NULL)
+ return (0);
+
+ /* XXX using the offset is a hack. */
+ switch (off) {
+ case 32: /* AF_INET && AF_MPLS */
+ ar->ar_alen = 32;
+ ar->ar_nlvl = 3;
+ ar->ar_bits[0] = 16;
+ ar->ar_bits[1] = 8;
+ ar->ar_bits[2] = 8;
+ break;
+#ifdef INET6
+ case 64: /* AF_INET6 */
+ ar->ar_alen = 128;
+ ar->ar_nlvl = 16;
+ for (i = 0; i < ar->ar_nlvl; i++)
+ ar->ar_bits[i] = 8;
+ break;
+#endif /* INET6 */
+ default:
+ printf("%s: unknown offset %d\n", __func__, off);
+ free(ar, M_RTABLE, sizeof(*ar));
+ return (0);
+ }
+
+ ar->ar_off = off / 8;
+
+ ar->ar_root = art_table_get(ar, NULL, -1);
+ if (ar->ar_root == NULL) {
+ free(ar, M_RTABLE, sizeof(*ar));
+ return (0);
+ }
+
+ *head = ar;
+ return (1);
+}
+
+/*
+ * Return a pointer to the address (key). This is an heritage from the
+ * BSD radix tree needed to skip the non-address fields from the flavor
+ * of "struct sockaddr" used by this routing table.
+ */
+static inline uint8_t *
+art_satoaddr(struct art_root *at, struct sockaddr *sa)
+{
+ return (((uint8_t *)sa) + at->ar_off);
+}
+
+/*
+ * Return 1 if ``old'' and ``new`` are identical, 0 otherwise.
+ */
+static inline int
+art_check_duplicate(struct art_root *ar, struct art_node *old,
+ struct art_node *new)
+{
+ if (old == NULL || old->an_plen != new->an_plen)
+ return (0);
+
+ if (memcmp(old->an_dst, new->an_dst, new->an_dst->sa_len) == 0)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Return the base index of the part of ``addr'' and ``plen''
+ * corresponding to the range covered by the table ``at''.
+ *
+ * In other words, this function take the multi-level (complete)
+ * address ``addr'' and prefix length ``plen'' and return the
+ * single level base index for the table ``at''.
+ *
+ * For example with an address size of 32bit divided into four
+ * 8bit-long tables, there's a maximum of 4 base indexes if the
+ * prefix length is > 24.
+ */
+int
+art_bindex(struct art_table *at, uint8_t *addr, int plen)
+{
+ uint8_t boff, bend;
+ uint32_t k;
+
+ if (plen < at->at_offset || plen > (at->at_offset + at->at_bits))
+ return (-1);
+
+ /*
+ * We are only interested in the part of the prefix length
+ * corresponding to the range of this table.
+ */
+ plen -= at->at_offset;
+
+ /*
+ * Jump to the first byte of the address containing bits
+ * covered by this table.
+ */
+ addr += (at->at_offset / 8);
+
+ /* ``at'' covers the bit range between ``boff'' & ``bend''. */
+ boff = (at->at_offset % 8);
+ bend = (at->at_bits + boff);
+
+ KASSERT(bend <= 32);
+
+ if (bend > 24) {
+ k = (addr[0] & ((1 << (8 - boff)) - 1)) << (bend - 8);
+ k |= addr[1] << (bend - 16);
+ k |= addr[2] << (bend - 24);
+ k |= addr[3] >> (32 - bend);
+ } else if (bend > 16) {
+ k = (addr[0] & ((1 << (8 - boff)) - 1)) << (bend - 8);
+ k |= addr[1] << (bend - 16);
+ k |= addr[2] >> (24 - bend);
+ } else if (bend > 8) {
+ k = (addr[0] & ((1 << (8 - boff)) - 1)) << (bend - 8);
+ k |= addr[1] >> (16 - bend);
+ } else {
+ k = (addr[0] >> (8 - bend)) & ((1 << at->at_bits) - 1);
+ }
+
+ /*
+ * Single level base index formula:
+ */
+ return ((k >> (at->at_bits - plen)) + (1 << plen));
+}
+
+/*
+ * Single level lookup function.
+ *
+ * Return the fringe index of the part of ``addr''
+ * corresponding to the range covered by the table ``at''.
+ */
+static inline int
+art_findex(struct art_table *at, uint8_t *addr)
+{
+ return art_bindex(at, addr, (at->at_offset + at->at_bits));
+}
+
+/*
+ * (Non-perfect) lookup API function.
+ *
+ * Return the best existing match for a destination.
+ */
+struct art_node *
+art_match(struct art_root *ar, struct sockaddr *dst)
+{
+ struct art_table *at = ar->ar_root;
+ struct art_node *dflt = NULL;
+ uint8_t *addr;
+ int j;
+
+ addr = art_satoaddr(ar, dst);
+
+ /*
+ * Iterate until we find a leaf.
+ */
+ while (1) {
+ /*
+ * Rember the default route of this table in case
+ * we do not find a better matching route.
+ */
+ if (at->at_default != NULL)
+ dflt = at->at_default;
+
+ /* Do a single level route lookup. */
+ j = art_findex(at, addr);
+
+ /* If this is a leaf we're done. */
+ if (ISLEAF(at->at_heap[j]))
+ break;
+
+ at = SUBTABLE(at->at_heap[j]);
+ }
+
+ if (at->at_heap[j].node != NULL)
+ return (at->at_heap[j].node);
+
+ return (dflt);
+}
+
+/*
+ * Perfect lookup API function.
+ *
+ * Return a perfect match for a destination/prefix-length pair or NULL if
+ * it does not exist.
+ */
+struct art_node *
+art_lookup(struct art_root *ar, struct sockaddr *dst, int plen)
+{
+ struct art_table *at = ar->ar_root;
+ struct art_node *an;
+ uint8_t *addr;
+ int i, j;
+
+ KASSERT(plen >= 0 && plen <= ar->ar_alen);
+
+ addr = art_satoaddr(ar, dst);
+
+ /* Default route */
+ if (plen == 0)
+ return (at->at_default);
+
+ /*
+ * If the prefix length is smaller than the sum of
+ * the stride length at this level the entry must
+ * be in the current table.
+ */
+ while (plen > (at->at_offset + at->at_bits)) {
+ /* Do a single level route lookup. */
+ j = art_findex(at, addr);
+
+ /* A leaf is a match, but not a perfect one. */
+ if (ISLEAF(at->at_heap[j]))
+ return (NULL);
+
+ at = SUBTABLE(at->at_heap[j]);
+ }
+
+ i = art_bindex(at, addr, plen);
+ if (i == -1)
+ return (NULL);
+
+ if (!ISLEAF(at->at_heap[i]))
+ an = SUBTABLE(at->at_heap[i])->at_default;
+ else
+ an = at->at_heap[i].node;
+
+ /* Make sure we've got a perfect match. */
+ if (an == NULL || an->an_plen != plen ||
+ memcmp(an->an_dst, dst, dst->sa_len))
+ return (NULL);
+
+ return (an);
+}
+
+
+/*
+ * Insertion API function.
+ *
+ * Insert the given node or return an existing one if a node with the
+ * same destination/mask pair is already present.
+ */
+struct art_node *
+art_insert(struct art_root *ar, struct art_node *an)
+{
+ struct art_table *at = ar->ar_root;
+ uint8_t *addr;
+ int i, j, plen;
+
+ addr = art_satoaddr(ar, an->an_dst);
+ plen = an->an_plen;
+
+ KASSERT(plen >= 0 && plen <= ar->ar_alen);
+
+ /* Default route */
+ if (plen == 0) {
+ if (art_check_duplicate(ar, at->at_default, an))
+ return (at->at_default);
+
+ at->at_default = an;
+ return (an);
+ }
+
+ /*
+ * If the prefix length is smaller than the sum of
+ * the stride length at this level the entry must
+ * be in the current table.
+ */
+ while (plen > (at->at_offset + at->at_bits)) {
+ /* Do a single level route lookup. */
+ j = art_findex(at, addr);
+
+ /*
+ * If the node corresponding to the fringe index is
+ * a leaf we need to allocate a subtable. The route
+ * entry of this node will then become the default
+ * route of the subtable.
+ */
+ if (ISLEAF(at->at_heap[j])) {
+ struct art_table *child;
+
+ child = art_table_get(ar, at, j);
+ if (child == NULL)
+ return (NULL);
+
+ at->at_heap[j].node = ASNODE(child);
+ }
+
+ at = SUBTABLE(at->at_heap[j]);
+ }
+
+ i = art_bindex(at, addr, plen);
+ if (i == -1)
+ return (NULL);
+
+ return (art_table_insert(ar, at, i, an));
+}
+
+/*
+ * Single level insertion.
+ */
+struct art_node *
+art_table_insert(struct art_root *ar, struct art_table *at, int i,
+ struct art_node *an)
+{
+ struct art_node *prev;
+
+ if (!ISLEAF(at->at_heap[i]))
+ prev = SUBTABLE(at->at_heap[i])->at_default;
+ else
+ prev = at->at_heap[i].node;
+
+ if (art_check_duplicate(ar, prev, an))
+ return (prev);
+
+ art_table_ref(ar, at);
+
+ /*
+ * If the index `i' of the route that we are inserting is not
+ * a fringe index, we need to allot this new route pointer to
+ * all the corresponding fringe indices.
+ */
+ if (i < at->at_minfringe)
+ art_allot(at, i, prev, an);
+ else if (!ISLEAF(at->at_heap[i]))
+ SUBTABLE(at->at_heap[i])->at_default = an;
+ else
+ at->at_heap[i].node = an;
+
+ return (an);
+}
+
+
+/*
+ * Deletion API function.
+ */
+struct art_node *
+art_delete(struct art_root *ar, struct art_node *an)
+{
+ struct art_table *at = ar->ar_root;
+ struct art_node *dflt;
+ uint8_t *addr;
+ int i, j, plen;
+
+ addr = art_satoaddr(ar, an->an_dst);
+ plen = an->an_plen;
+
+ KASSERT(plen >= 0 && plen <= ar->ar_alen);
+
+ /* Default route */
+ if (plen == 0) {
+ dflt = at->at_default;
+ if (!art_check_duplicate(ar, dflt, an))
+ return (NULL);
+ at->at_default = NULL;
+ return (dflt);
+ }
+
+ /*
+ * If the prefix length is smaller than the sum of
+ * the stride length at this level the entry must
+ * be in the current table.
+ */
+ while (plen > (at->at_offset + at->at_bits)) {
+ /* Do a single level route lookup. */
+ j = art_findex(at, addr);
+
+ /* If this is a leaf, there is no route to delete. */
+ if (ISLEAF(at->at_heap[j]))
+ return (NULL);
+
+ at = SUBTABLE(at->at_heap[j]);
+ }
+
+ i = art_bindex(at, addr, plen);
+ if (i == -1)
+ return (NULL);
+
+ return (art_table_delete(ar, at, i, an));
+}
+
+/*
+ * Single level deletion.
+ */
+struct art_node *
+art_table_delete(struct art_root *ar, struct art_table *at, int i,
+ struct art_node *node)
+{
+ struct art_node *next;
+
+#ifdef DIAGNOSTIC
+ struct art_node *prev;
+
+ if (!ISLEAF(at->at_heap[i]))
+ prev = SUBTABLE(at->at_heap[i])->at_default;
+ else
+ prev = at->at_heap[i].node;
+
+ KASSERT(prev == node);
+#endif
+
+ /* We are removing an entry from this table. */
+ if (art_table_free(ar, at))
+ return (node);
+
+ /* Get the next most specific route for the index `i'. */
+ if ((i >> 1) > 1)
+ next = at->at_heap[i >> 1].node;
+ else
+ next = NULL;
+
+ /*
+ * If the index `i' of the route that we are removing is not
+ * a fringe index, we need to allot the next most specific
+ * route pointer to all the corresponding fringe indices.
+ */
+ if (i < at->at_minfringe)
+ art_allot(at, i, node, next);
+ else if (!ISLEAF(at->at_heap[i]))
+ SUBTABLE(at->at_heap[i])->at_default = next;
+ else
+ at->at_heap[i].node = next;
+
+ return (node);
+}
+
+void
+art_table_ref(struct art_root *ar, struct art_table *at)
+{
+ at->at_refcnt++;
+}
+
+int
+art_table_free(struct art_root *ar, struct art_table *at)
+{
+ at->at_refcnt--;
+
+ if (at->at_refcnt == 0) {
+ /*
+ * Garbage collect this table and all its parents
+ * that are empty.
+ */
+ do {
+ at = art_table_put(ar, at);
+ } while (at->at_refcnt == 0);
+
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * Iteration API function.
+ */
+int
+art_walk(struct art_root *ar, int (*f)(struct art_node *, void *), void *arg)
+{
+ struct art_table *at = ar->ar_root;
+ int error;
+
+ /*
+ * The default route should be processed here because the root
+ * table does not have a parent.
+ */
+ if (at->at_default != NULL) {
+ error = (*f)(at->at_default, arg);
+ if (error)
+ return (error);
+ }
+
+ return (art_table_walk(at, f, arg));
+}
+
+int
+art_table_walk(struct art_table *at,
+ int (*f)(struct art_node *, void *), void *arg)
+{
+ struct art_node *next, *an = NULL;
+ int i, j, error = 0;
+ uint32_t maxfringe = (at->at_minfringe << 1);
+
+ /*
+ * Iterate non-fringe nodes in the ``natural'' order.
+ */
+ for (j = 1; j < at->at_minfringe; j += 2) {
+ /*
+ * The default route (index 1) is processed by the
+ * parent table (where it belongs) otherwise it could
+ * be processed more than once.
+ */
+ for (i = max(j, 2); i < at->at_minfringe; i <<= 1) {
+ next = at->at_heap[i >> 1].node;
+ an = at->at_heap[i].node;
+ if ((an != NULL) && (an != next)) {
+ error = (*f)(an, arg);
+ if (error)
+ return (error);
+ }
+ }
+ }
+
+ /*
+ * Iterate fringe nodes.
+ */
+ for (i = at->at_minfringe; i < maxfringe; i++) {
+ next = at->at_heap[i >> 1].node;
+ if (!ISLEAF(at->at_heap[i]))
+ an = SUBTABLE(at->at_heap[i])->at_default;
+ else
+ an = at->at_heap[i].node;
+ if ((an != NULL) && (an != next)) {
+ error = (*f)(an, arg);
+ if (error)
+ return (error);
+ }
+
+ if (ISLEAF(at->at_heap[i]))
+ continue;
+
+ error = art_table_walk(SUBTABLE(at->at_heap[i]), f, arg);
+ if (error)
+ break;
+ }
+
+ return (error);
+}
+
+
+/*
+ * Create a table and use the given index to set its default route.
+ *
+ * Note: This function does not modify the root or the parent.
+ */
+struct art_table *
+art_table_get(struct art_root *ar, struct art_table *parent, int j)
+{
+ struct art_table *at;
+ size_t size;
+ uint32_t lvl;
+
+ KASSERT(j != 0 && j != 1);
+ KASSERT(parent != NULL || j == -1);
+
+ if (parent != NULL)
+ lvl = parent->at_level + 1;
+ else
+ lvl = 0;
+
+ KASSERT(lvl < ar->ar_nlvl);
+
+ size = sizeof(*at) + (1 << (ar->ar_bits[lvl] + 1)) * sizeof(void *);
+ at = malloc(size, M_RTABLE, M_NOWAIT|M_ZERO);
+ if (at == NULL)
+ return (NULL);
+
+ at->at_parent = parent;
+ at->at_index = j;
+ at->at_minfringe = (1 << ar->ar_bits[lvl]);
+ at->at_level = lvl;
+ at->at_bits = ar->ar_bits[lvl];
+ at->at_heap = (void *)(at + 1);
+
+ art_table_ref(ar, at);
+
+ if (parent != NULL) {
+ at->at_default = parent->at_heap[j].node;
+ at->at_offset = (parent->at_offset + parent->at_bits);
+ }
+
+ return (at);
+}
+
+
+/*
+ * Delete a table and use its index to restore its parent's default route.
+ *
+ * Note: Modify its parent to unlink the table from it.
+ */
+struct art_table *
+art_table_put(struct art_root *ar, struct art_table *at)
+{
+ struct art_table *parent = at->at_parent;
+ size_t size;
+ uint32_t lvl = at->at_level;
+ uint32_t j = at->at_index;
+
+ KASSERT(j != 0 && j != 1);
+ KASSERT(parent != NULL);
+ KASSERT(lvl == parent->at_level + 1);
+
+ /* Give the route back to its parent. */
+ parent->at_heap[j].node = at->at_default;
+ art_table_free(ar, parent);
+
+ size = sizeof(*at) + (1 << (ar->ar_bits[lvl] + 1)) * sizeof(void *);
+ free(at, M_RTABLE, size);
+
+ return (parent);
+}
+
+/*
+ * Substitute a node by another in the subtree whose root index is given.
+ *
+ * This function iterates on the table ``at'' at index ``i'' until no
+ * more ``old'' node can be replaced by ``new''.
+ *
+ * This function was originally written by Don Knuth in CWEB. The
+ * complicated ``goto''s are the result of expansion of the two
+ * following recursions:
+ *
+ * art_allot(at, i, old, new)
+ * {
+ * int k = i;
+ * if (at->at_heap[k] == old)
+ * at->at_heap[k] = new;
+ * if (k >= at->at_minfringe)
+ * return;
+ * k <<= 1;
+ * art_allot(at, k, old, new);
+ * k++;
+ * art_allot(at, k, old, new);
+ * }
+ */
+void
+art_allot(struct art_table *at, int i, struct art_node *old,
+ struct art_node *new)
+{
+ int k = i;
+
+ KASSERT(i < at->at_minfringe);
+
+again:
+ k <<= 1;
+ if (k < at->at_minfringe)
+ goto nonfringe;
+
+ /* Change fringe nodes. */
+ while (1) {
+ if (!ISLEAF(at->at_heap[k])) {
+ if (SUBTABLE(at->at_heap[k])->at_default == old) {
+ SUBTABLE(at->at_heap[k])->at_default = new;
+ }
+ } else if (at->at_heap[k].node == old) {
+ at->at_heap[k].node = new;
+ }
+ if (k % 2)
+ goto moveup;
+ k++;
+ }
+
+nonfringe:
+ if (at->at_heap[k].node == old)
+ goto again;
+moveon:
+ if (k % 2)
+ goto moveup;
+ k++;
+ goto nonfringe;
+moveup:
+ k >>= 1;
+ at->at_heap[k].node = new;
+
+ /* Change non-fringe node. */
+ if (k != i)
+ goto moveon;
+}
diff --git a/sys/net/art.h b/sys/net/art.h
new file mode 100644
index 00000000000..2cc33f57209
--- /dev/null
+++ b/sys/net/art.h
@@ -0,0 +1,61 @@
+/* $OpenBSD: art.h,v 1.1 2015/08/20 12:39:43 mpi Exp $ */
+
+/*
+ * Copyright (c) 2015 Martin Pieuchot
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _NET_ART_H_
+#define _NET_ART_H_
+
+#define ART_MAXLVL 16 /* We currently use 16 levels for IPv6. */
+
+/*
+ * Root of the ART tables, equivalent to the radix head.
+ */
+struct art_root {
+ struct art_table *ar_root; /* First table */
+ uint8_t ar_bits[ART_MAXLVL]; /* Per level stride */
+ uint8_t ar_nlvl; /* Number of levels */
+ uint8_t ar_alen; /* Address length in bits */
+ uint8_t ar_off; /* Offset of the key in bytes */
+ unsigned int ar_rtableid; /* ID of this routing table */
+};
+
+/*
+ * Forward declaration needed for the list of mpath routes
+ * attached to a single ART node.
+ */
+struct rtentry;
+
+/*
+ * A node is the internal representation of a route entry.
+ */
+struct art_node {
+ struct sockaddr *an_dst; /* Destination address (key) */
+ int an_plen; /* Prefix length */
+
+ LIST_HEAD(, rtentry) an_rtlist; /* Route related to this node */
+};
+
+void art_init(void);
+int art_attach(void **, int);
+struct art_node *art_insert(struct art_root *, struct art_node *);
+struct art_node *art_delete(struct art_root *, struct art_node *);
+struct art_node *art_match(struct art_root *, struct sockaddr *);
+struct art_node *art_lookup(struct art_root *, struct sockaddr *, int);
+int art_walk(struct art_root *,
+ int (*)(struct art_node *, void *), void *);
+
+#endif /* _NET_ART_H_ */
diff --git a/sys/net/route.c b/sys/net/route.c
index c8d75fee77e..56adb555ec9 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: route.c,v 1.223 2015/08/19 13:27:38 bluhm Exp $ */
+/* $OpenBSD: route.c,v 1.224 2015/08/20 12:39:43 mpi Exp $ */
/* $NetBSD: route.c,v 1.14 1996/02/13 22:00:46 christos Exp $ */
/*
@@ -1238,6 +1238,7 @@ rt_ifa_del(struct ifaddr *ifa, int flags, struct sockaddr *dst)
}
if ((rt = rtalloc(dst, 0, rtableid)) != NULL) {
rt->rt_refcnt--;
+#ifndef ART
/* try to find the right route */
while (rt && rt->rt_ifa != ifa)
rt = (struct rtentry *)
@@ -1248,6 +1249,7 @@ rt_ifa_del(struct ifaddr *ifa, int flags, struct sockaddr *dst)
return (flags & RTF_HOST ? EHOSTUNREACH
: ENETUNREACH);
}
+#endif
}
memset(&info, 0, sizeof(info));
diff --git a/sys/net/route.h b/sys/net/route.h
index 40bf7b6a6ad..5491b45ccab 100644
--- a/sys/net/route.h
+++ b/sys/net/route.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: route.h,v 1.109 2015/07/18 15:51:16 mpi Exp $ */
+/* $OpenBSD: route.h,v 1.110 2015/08/20 12:39:43 mpi Exp $ */
/* $NetBSD: route.h,v 1.9 1996/02/13 22:00:49 christos Exp $ */
/*
@@ -93,7 +93,14 @@ struct rt_metrics {
*/
struct rtentry {
+#ifndef ART
struct radix_node rt_nodes[2]; /* tree glue, and other values */
+#else
+ struct art_node *rt_node; /* ART entry */
+ struct sockaddr *rt_dest; /* destination */
+ struct sockaddr *rt_mask; /* mask (radix tree compat) */
+ LIST_ENTRY(rtentry) rt_next; /* Next multipath entry to our dst. */
+#endif
struct sockaddr *rt_gateway; /* value */
struct ifnet *rt_ifp; /* the answer: interface to use */
struct ifaddr *rt_ifa; /* the answer: interface addr to use */
diff --git a/sys/net/rtable.c b/sys/net/rtable.c
index 48206248094..599f74bec41 100644
--- a/sys/net/rtable.c
+++ b/sys/net/rtable.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: rtable.c,v 1.1 2015/07/18 15:51:16 mpi Exp $ */
+/* $OpenBSD: rtable.c,v 1.2 2015/08/20 12:39:43 mpi Exp $ */
/*
* Copyright (c) 2014-2015 Martin Pieuchot
@@ -19,10 +19,15 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/socket.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/queue.h>
#include <net/rtable.h>
#include <net/route.h>
+#ifndef ART
+
void
rtable_init(void)
{
@@ -201,4 +206,453 @@ rtable_mpath_reprio(struct rtentry *rt, uint8_t newprio)
rn_mpath_reprio(rn, newprio);
}
+#endif /* SMALL_KERNEL */
+
+#else /* ART */
+
+struct pool an_pool; /* pool for ART node structures */
+
+static inline int satoplen(struct art_root *, struct sockaddr *);
+
+void
+rtable_init(void)
+{
+ pool_init(&an_pool, sizeof(struct art_node), 0, 0, 0, "art node", NULL);
+}
+
+int
+rtable_attach(void **head, int off)
+{
+ return (art_attach(head, off));
+}
+
+struct rtentry *
+rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
+ struct sockaddr *mask)
+{
+ struct art_root *ar;
+ struct art_node *an;
+ int plen;
+
+ ar = rtable_get(rtableid, dst->sa_family);
+ if (ar == NULL)
+ return (NULL);
+
+ /* No need for a perfect match. */
+ if (mask == NULL) {
+ an = art_match(ar, dst);
+ } else {
+ plen = satoplen(ar, mask);
+ if (plen == -1)
+ return (NULL);
+ an = art_lookup(ar, dst, plen);
+ }
+
+ if (an == NULL)
+ return (NULL);
+
+ return (LIST_FIRST(&an->an_rtlist));
+}
+
+struct rtentry *
+rtable_match(unsigned int rtableid, struct sockaddr *dst)
+{
+ struct art_root *ar;
+ struct art_node *an;
+
+ ar = rtable_get(rtableid, dst->sa_family);
+ if (ar == NULL)
+ return (NULL);
+
+ an = art_match(ar, dst);
+ if (an == NULL)
+ return (NULL);
+
+ return (LIST_FIRST(&an->an_rtlist));
+}
+
+int
+rtable_insert(unsigned int rtableid, struct sockaddr *dst,
+ struct sockaddr *mask, uint8_t prio, struct rtentry *rt)
+{
+#ifndef SMALL_KERNEL
+ struct rtentry *mrt;
+#endif
+ struct art_root *ar;
+ struct art_node *an, *prev;
+ int plen = 0;
+
+ ar = rtable_get(rtableid, dst->sa_family);
+ if (ar == NULL)
+ return (EAFNOSUPPORT);
+
+ plen = satoplen(ar, mask);
+ if (plen == -1)
+ return (EINVAL);
+
+ an = pool_get(&an_pool, PR_NOWAIT | PR_ZERO);
+ if (an == NULL)
+ return (ENOBUFS);
+
+ an->an_dst = dst;
+ an->an_plen = plen;
+
+ prev = art_insert(ar, an);
+ if (prev == NULL) {
+ pool_put(&an_pool, an);
+ return (ESRCH);
+ }
+
+ if (prev == an) {
+ rt->rt_flags &= ~RTF_MPATH;
+ } else {
+ pool_put(&an_pool, an);
+#ifndef SMALL_KERNEL
+ an = prev;
+
+ mrt = LIST_FIRST(&an->an_rtlist);
+
+ KASSERT(mrt != NULL);
+ KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);
+
+ /*
+ * An ART node with the same destination/netmask already
+ * exists, MPATH conflict must have been already checked.
+ */
+ if (rt->rt_flags & RTF_MPATH) {
+ /*
+ * Only keep the RTF_MPATH flag if two routes have
+ * the same gateway.
+ */
+ rt->rt_flags &= ~RTF_MPATH;
+ LIST_FOREACH(mrt, &an->an_rtlist, rt_next) {
+ if (mrt->rt_priority == prio) {
+ mrt->rt_flags |= RTF_MPATH;
+ rt->rt_flags |= RTF_MPATH;
+ }
+ }
+ }
+#else
+ return (EEXIST);
+#endif /* SMALL_KERNEL */
+ }
+
+ rt->rt_node = an;
+ rt->rt_dest = dst;
+
+ /*
+ * XXX Allocating a sockaddr for the mask per node wastes a lot
+ * of memory, thankfully we'll get rid of that when rt_mask()
+ * will be no more.
+ */
+ if (mask != NULL) {
+ struct sockaddr *msk;
+
+ msk = malloc(dst->sa_len, M_RTABLE, M_NOWAIT | M_ZERO);
+ if (msk == NULL) {
+ pool_put(&an_pool, an);
+ return (ENOMEM);
+ }
+ memcpy(msk, mask, dst->sa_len);
+ rt->rt_mask = msk;
+ }
+
+#ifndef SMALL_KERNEL
+ if ((mrt = LIST_FIRST(&an->an_rtlist)) != NULL) {
+ /*
+ * Select the order of the MPATH routes.
+ */
+ while (LIST_NEXT(mrt, rt_next) != NULL) {
+ if (mrt->rt_priority > prio)
+ break;
+ mrt = LIST_NEXT(mrt, rt_next);
+ }
+
+ if (mrt->rt_priority > prio)
+ LIST_INSERT_BEFORE(mrt, rt, rt_next);
+ else
+ LIST_INSERT_AFTER(mrt, rt, rt_next);
+
+ return (0);
+ }
+#endif /* SMALL_KERNEL */
+
+ LIST_INSERT_HEAD(&an->an_rtlist, rt, rt_next);
+ return (0);
+}
+
+int
+rtable_delete(unsigned int rtableid, struct sockaddr *dst,
+ struct sockaddr *mask, uint8_t prio, struct rtentry *rt)
+{
+ struct art_root *ar;
+ struct art_node *an = rt->rt_node;
+
+ ar = rtable_get(rtableid, dst->sa_family);
+ if (ar == NULL)
+ return (EAFNOSUPPORT);
+
+#ifdef DIAGNOSTIC
+ if (memcmp(dst, an->an_dst, dst->sa_len)) {
+ printf("%s: destination do not match\n", __func__);
+ return (EINVAL);
+ }
+ if (mask != NULL && an->an_plen != satoplen(ar, mask)) {
+ printf("%s: mask do not match\n", __func__);
+ return (EINVAL);
+ }
+#endif
+
+ /*
+ * XXX Is it safe to free the mask now? Are we sure rt_mask()
+ * is only used when entries are in the table?
+ */
+ free(rt->rt_mask, M_RTABLE, 0);
+
+ /* Remove rt <-> ART glue. */
+ rt->rt_node = NULL;
+ rt->rt_mask = NULL;
+ LIST_REMOVE(rt, rt_next);
+ KASSERT(rt->rt_refcnt >= 0);
+
+#ifndef SMALL_KERNEL
+ if ((rt = LIST_FIRST(&an->an_rtlist)) != NULL) {
+ an->an_dst = rt->rt_dest;
+ if (LIST_NEXT(rt, rt_next) == NULL)
+ rt->rt_flags &= ~RTF_MPATH;
+ return (0);
+ }
+#endif /* SMALL_KERNEL */
+
+ if (art_delete(ar, an) == NULL)
+ return (ESRCH);
+
+ pool_put(&an_pool, an);
+
+ return (0);
+}
+
+int
+rtable_setid(void **p, unsigned int rtableid, sa_family_t af)
+{
+ struct art_root **ar = (struct art_root **)p;
+
+ if (ar == NULL || ar[af] == NULL)
+ return (EINVAL);
+
+ ar[af]->ar_rtableid = rtableid;
+
+ return (0);
+}
+
+struct rtable_walk_cookie {
+ int (*rwc_func)(struct rtentry *, void *, unsigned int);
+ void *rwc_arg;
+ unsigned int rwc_rid;
+};
+
+/*
+ * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
+ */
+int
+rtable_walk_helper(struct art_node *an, void *xrwc)
+{
+ struct rtable_walk_cookie *rwc = xrwc;
+ struct rtentry *rt, *nrt;
+ int error = 0;
+
+ LIST_FOREACH_SAFE(rt, &an->an_rtlist, rt_next, nrt) {
+ if ((error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid)))
+ break;
+ }
+
+ return (error);
+}
+
+int
+rtable_walk(unsigned int rtableid, sa_family_t af,
+ int (*func)(struct rtentry *, void *, unsigned int), void *arg)
+{
+ struct art_root *ar;
+ struct rtable_walk_cookie rwc;
+
+ ar = rtable_get(rtableid, af);
+ if (ar == NULL)
+ return (EAFNOSUPPORT);
+
+ rwc.rwc_func = func;
+ rwc.rwc_arg = arg;
+ rwc.rwc_rid = rtableid;
+
+ return art_walk(ar, rtable_walk_helper, &rwc);
+}
+
+#ifndef SMALL_KERNEL
+int
+rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
+{
+ return (1);
+}
+
+struct rtentry *
+rtable_mpath_match(unsigned int rtableid, struct rtentry *rt0,
+ struct sockaddr *gateway, uint8_t prio)
+{
+ struct art_node *an = rt0->rt_node;
+ struct rtentry *rt;
+
+ LIST_FOREACH(rt, &an->an_rtlist, rt_next) {
+ if (prio != RTP_ANY &&
+ (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
+ continue;
+
+ if (gateway == NULL)
+ return (rt);
+
+ if (rt->rt_gateway->sa_len == gateway->sa_len &&
+ memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
+ break;
+ }
+
+ return (rt);
+}
+
+int
+rtable_mpath_conflict(unsigned int rtableid, struct sockaddr *dst,
+ struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio, int mpathok)
+{
+ struct art_root *ar;
+ struct art_node *an;
+ struct rtentry *rt;
+ int plen;
+
+ ar = rtable_get(rtableid, dst->sa_family);
+ if (ar == NULL)
+ return (EAFNOSUPPORT);
+
+ plen = satoplen(ar, mask);
+ if (plen == -1)
+ return (EINVAL);
+ an = art_lookup(ar, dst, plen);
+ if (an == NULL)
+ return (0);
+
+ LIST_FOREACH(rt, &an->an_rtlist, rt_next) {
+ if (prio != RTP_ANY &&
+ (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
+ continue;
+
+ if (!mpathok)
+ return (EEXIST);
+
+ if (rt->rt_gateway->sa_len == gateway->sa_len &&
+ memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
+ return (EEXIST);
+ }
+
+
+ return (0);
+}
+
+struct rtentry *
+rtable_mpath_select(struct rtentry *rt, uint32_t *src)
+{
+ struct art_node *an = rt->rt_node;
+
+ /*
+ * XXX consider using ``src'' (8
+ */
+ return (LIST_FIRST(&an->an_rtlist));
+}
+
+void
+rtable_mpath_reprio(struct rtentry *rt, uint8_t newprio)
+{
+ /* XXX */
+}
+#endif /* SMALL_KERNEL */
+
+/*
+ * Return the prefix length of a mask.
+ */
+static inline int
+satoplen(struct art_root *ar, struct sockaddr *mask)
+{
+ uint8_t *ap, *ep;
+ int skip, mlen, plen = 0;
+
+ /* Host route */
+ if (mask == NULL)
+ return (ar->ar_alen);
+
+ mlen = mask->sa_len;
+
+ /* Default route */
+ if (mlen == 0)
+ return (0);
+
+ skip = ar->ar_off;
+
+ ap = (uint8_t *)((uint8_t *)mask) + skip;
+ ep = (uint8_t *)((uint8_t *)mask) + mlen;
+ if (ap > ep)
+ return (-1);
+
+ if (ap == ep)
+ return (0);
+
+ /* "Beauty" adapted from sbin/route/show.c ... */
+ while (ap < ep) {
+ switch (*ap) {
+ case 0xff:
+ plen += 8;
+ ap++;
+ break;
+ case 0xfe:
+ plen += 7;
+ ap++;
+ goto out;
+ case 0xfc:
+ plen += 6;
+ ap++;
+ goto out;
+ case 0xf8:
+ plen += 5;
+ ap++;
+ goto out;
+ case 0xf0:
+ plen += 4;
+ ap++;
+ goto out;
+ case 0xe0:
+ plen += 3;
+ ap++;
+ goto out;
+ case 0xc0:
+ plen += 2;
+ ap++;
+ goto out;
+ case 0x80:
+ plen += 1;
+ ap++;
+ goto out;
+ case 0x00:
+ goto out;
+ default:
+ /* Non contiguous mask. */
+ return (-1);
+ }
+
+ }
+
+out:
+#ifdef DIAGNOSTIC
+ for (; ap < ep; ap++) {
+ if (*ap != 0x00)
+ return (-1);
+ }
#endif
+
+ return (plen);
+}
+#endif /* ART */
diff --git a/sys/net/rtable.h b/sys/net/rtable.h
index 9a6a9c74b66..df07c538945 100644
--- a/sys/net/rtable.h
+++ b/sys/net/rtable.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: rtable.h,v 1.1 2015/07/18 15:51:16 mpi Exp $ */
+/* $OpenBSD: rtable.h,v 1.2 2015/08/20 12:39:43 mpi Exp $ */
/*
* Copyright (c) 2014-2015 Martin Pieuchot
@@ -19,6 +19,8 @@
#ifndef _NET_RTABLE_H_
#define _NET_RTABLE_H_
+#ifndef ART
+
/*
* Traditional BSD routing table implementation based on a radix tree.
*/
@@ -32,6 +34,21 @@
#define RT_ACTIVE(rt) ((rt)->rt_nodes[0].rn_flags & RNF_ACTIVE)
#define RT_ROOT(rt) ((rt)->rt_nodes[0].rn_flags & RNF_ROOT)
+#else /* ART */
+
+/*
+ * Newer routing table implementation based on ART (Allotment Routing
+ * Table).
+ */
+#include <net/art.h>
+
+#define rt_key(rt) ((rt)->rt_dest)
+#define rt_mask(rt) ((rt)->rt_mask)
+#define RT_ACTIVE(rt) ((rt)->rt_node != NULL)
+#define RT_ROOT(rt) (0)
+
+#endif /* ART */
+
void rtable_init(void);
int rtable_attach(void **, int);
struct rtentry *rtable_lookup(unsigned int, struct sockaddr *,