/* $OpenBSD: tsort.c,v 1.5 2001/03/26 22:53:33 espie Exp $ */ /* ex:ts=8 sw=4: */ /* * Copyright (c) 1999-2001 Marc Espie. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Marc Espie for the OpenBSD * Project. * * THIS SOFTWARE IS PROVIDED BY THE OPENBSD PROJECT AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBSD * PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include /* The complexity of topological sorting is O(e), where e is the * size of input. While reading input, vertices have to be identified, * thus add the complexity of e keys retrieval among v keys using * an appropriate data structure. This program uses open double hashing * for that purpose. See Knuth for the expected complexity of double * hashing (Brent variation should probably be used if v << e, as a user * option). * * The algorithm used for longest cycle reporting is accurate, but somewhat * expensive. It may need to build all free paths of the graph (a free * path is a path that never goes twice through the same node), whose * number can be as high as O(2^e). Usually, the number of free paths is * much smaller though. This program's author does not believe that a * significantly better worst-case complexity algorithm exists. * * In case of a hints file, the set of minimal nodes is maintained as a * heap. The resulting complexity is O(e+v log v) for the worst case. * The average should actually be near O(e). * * The simple topological sort algorithm detects cycles. This program * goes further, breaking cycles through the use of simple heuristics. * Each cycle break checks the whole set of nodes, hence if c cycles break * are needed, this is an extra cost of O(c v). * * Possible heuristics are as follows: * - break cycle at node with lowest number of predecessors (default case), * - break longest cycle at node with lowest number of predecessors, * - break cycle at next node from the hints file. * * Except for the hints file case, which sets an explicit constraint on * which cycle to break, those heuristics locally result in the smallest * number of broken edges. * * Those are admittedly greedy strategies, as is the selection of the next * node from the hints file amongst equivalent candidates that is used for * `stable' topological sorting. */ #ifdef __GNUC__ #define UNUSED __attribute__((unused)) #else #define UNUSED #endif struct node; /* The set of arcs from a given node is stored as a linked list. */ struct link { struct link *next; struct node *node; }; struct node { unsigned int refs; /* Number of arcs left, coming into this node . * Note that nodes with a null count can't * be part of cycles. */ struct link *arcs; /* List of forward arcs. */ unsigned int order; /* Order of nodes according to a hint file. */ /* Cycle detection algorithms build a free path of nodes. */ struct node *from; /* Previous node in the current path. */ unsigned int mark; /* Mark processed nodes in cycle discovery. */ struct link *traverse; /* Next link to traverse when backtracking. */ char k[1]; /* Name of this node. */ }; #define HASH_START 9 struct array { unsigned int entries; struct node **t; }; static void nodes_init __P((struct ohash *)); static struct node *node_lookup __P((struct ohash *, const char *, const char *)); static void usage __P((void)); static struct node *new_node __P((const char *, const char *)); static void read_pairs __P((FILE *, struct ohash *, int, const char *)); static void split_nodes __P((struct ohash *, struct array *, struct array *)); static void insert_arc __P((struct node *, struct node *)); #ifdef DEBUG static void dump_node __P((struct node *)); static void dump_array __P((struct array *)); static void dump_hash __P((struct ohash *)); #endif static void read_hints __P((FILE *, struct ohash *, const char *)); static struct node *find_smallest_node __P((struct array *)); static struct node *find_good_cycle_break __P((struct array *)); static void print_cycle __P((struct array *)); static int find_cycle_with __P((struct node *, struct array *)); static struct node *find_predecessor __P((struct array *, struct node *)); static unsigned int traverse_node __P((struct node *, unsigned int, struct array *)); static struct node *find_longest_cycle __P((struct array *, struct array *)); static void heap_down __P((struct array *, unsigned int)); static void heapify __P((struct array *)); static struct node *dequeue __P((struct array *)); static void enqueue __P((struct array *, struct node *)); #define erealloc(n, s) emem(realloc(n, s)) static void *hash_alloc __P((size_t, void *)); static void hash_free __P((void *, size_t, void *)); static void* entry_alloc __P((size_t, void *)); static void *emalloc __P((size_t)); static void *emem __P((void *)); #define DEBUG_TRAVERSE 0 static struct ohash_info node_info = { offsetof(struct node, k), NULL, hash_alloc, hash_free, entry_alloc }; int main __P((int, char *[])); /*** *** Memory handling. ***/ static void * emem(p) void *p; { if (p) return p; else errx(EX_SOFTWARE, "Memory exhausted"); } static void * hash_alloc(s, u) size_t s; void *u UNUSED; { return emem(calloc(s, 1)); } static void hash_free(p, s, u) void *p; size_t s UNUSED; void *u UNUSED; { free(p); } static void * entry_alloc(s, u) size_t s; void *u UNUSED; { return emalloc(s); } static void * emalloc(s) size_t s; { return emem(malloc(s)); } /*** *** Hash table. ***/ /* Inserting and finding nodes in the hash structure. * We handle interval strings for efficiency wrt fgetln. */ static struct node * new_node(start, end) const char *start; const char *end; { struct node *n; n = ohash_create_entry(&node_info, start, &end); n->from = NULL; n->arcs = NULL; n->refs = 0; n->mark = 0; n->order = 0; n->traverse = NULL; return n; } static void nodes_init(h) struct ohash *h; { ohash_init(h, HASH_START, &node_info); } static struct node * node_lookup(h, start, end) struct ohash *h; const char *start; const char *end; { unsigned int i; struct node * n; i = ohash_qlookupi(h, start, &end); n = ohash_find(h, i); if (n == NULL) n = ohash_insert(h, i, new_node(start, end)); return n; } #ifdef DEBUG static void dump_node(n) struct node *n; { struct link *l; if (n->refs == 0) return; printf("%s (%u): ", n->k, n->refs); for (l = n->arcs; l != NULL; l = l->next) if (n->refs != 0) printf("%s(%u) ", l->node->k, l->node->refs); putchar('\n'); } static void dump_array(a) struct array *a; { unsigned int i; for (i = 0; i < a->entries; i++) dump_node(a->t[i]); } static void dump_hash(h) struct hash *h; { unsigned int i; struct node *n; for (n = ohash_first(h, &i); n != NULL; n = ohash_next(h, &i)) dump_node(n); } #endif /*** *** Reading data. ***/ static void insert_arc(a, b) struct node *a, *b; { struct link *l; /* Check that this arc is not already present. */ for (l = a->arcs; l != NULL; l = l->next) { if (l->node == b) return; } b->refs++; l = emalloc(sizeof(struct link)); l->node = b; l->next = a->arcs; a->arcs = l; } static void read_pairs(f, h, reverse, name) FILE *f; struct ohash *h; int reverse; const char *name; { int toggle; struct node *a; size_t size; char *str; unsigned int o; o = 1; toggle = 1; a = NULL; while ((str = fgetln(f, &size)) != NULL) { char *sentinel; sentinel = str + size; for (;;) { char *e; while (isspace(*str) && str < sentinel) str++; if (str == sentinel) break; for (e = str; !isspace(*e) && e < sentinel; e++) continue; if (toggle) { a = node_lookup(h, str, e); if (a->order == 0) a->order = o++; } else { struct node *b; b = node_lookup(h, str, e); assert(a != NULL); if (b != a) { if (reverse) insert_arc(b, a); else insert_arc(a, b); } } toggle = !toggle; str = e; } } if (toggle == 0) errx(EX_DATAERR, "odd number of pairs in %s", name); if (!feof(f)) err(EX_IOERR, "error reading %s", name); } static void read_hints(f, h, name) FILE *f; struct ohash *h; const char *name; { char *str; size_t size; unsigned int i; i = 1; while ((str = fgetln(f, &size)) != NULL) { char *sentinel; sentinel = str + size; for (;;) { char *e; struct node *a; while (isspace(*str) && str < sentinel) str++; if (str == sentinel) break; for (e = str; !isspace(*e) && e < sentinel; e++) continue; a = node_lookup(h, str, e); if (a->order != 0) errx(EX_DATAERR, "duplicate node %s in hints file %s", a->k, name); else a->order = i++; str = e; } } } /*** *** Standard heap handling routines. ***/ static void heap_down(h, i) struct array *h; unsigned int i; { unsigned int j; struct node *swap; for (; (j=2*i+1) < h->entries; i = j) { if (j+1 < h->entries && h->t[j+1]->order < h->t[j]->order) j++; if (h->t[i]->order <= h->t[j]->order) break; swap = h->t[i]; h->t[i] = h->t[j]; h->t[j] = swap; } } static void heapify(h) struct array *h; { unsigned int i; for (i = h->entries; i != 0;) heap_down(h, --i); } #define DEQUEUE(h) ( hints_flag ? dequeue(h) : (h)->t[--(h)->entries] ) static struct node * dequeue(h) struct array *h; { struct node *n; if (h->entries == 0) n = NULL; else { n = h->t[0]; if (--h->entries != 0) { h->t[0] = h->t[h->entries]; heap_down(h, 0); } } return n; } #define ENQUEUE(h, n) do { \ if (hints_flag) \ enqueue((h), (n)); \ else \ (h)->t[(h)->entries++] = (n); \ } while(0); static void enqueue(h, n) struct array *h; struct node *n; { unsigned int i, j; struct node *swap; h->t[h->entries++] = n; for (i = h->entries-1; i > 0; i = j) { j = (i-1)/2; if (h->t[j]->order < h->t[i]->order) break; swap = h->t[j]; h->t[j] = h->t[i]; h->t[i] = swap; } } /*** *** Search through hash array for nodes. ***/ /* Split nodes into unrefed nodes/live nodes. */ static void split_nodes(hash, heap, remaining) struct ohash *hash; struct array *heap; struct array *remaining; { struct node *n; unsigned int i; heap->t = emalloc(sizeof(struct node *) * ohash_entries(hash)); remaining->t = emalloc(sizeof(struct node *) * ohash_entries(hash)); heap->entries = 0; remaining->entries = 0; for (n = ohash_first(hash, &i); n != NULL; n = ohash_next(hash, &i)) { if (n->refs == 0) heap->t[heap->entries++] = n; else remaining->t[remaining->entries++] = n; } } /* Good point to break a cycle: live node with as few refs as possible. */ static struct node * find_good_cycle_break(h) struct array *h; { unsigned int i; unsigned int best; struct node *u; best = UINT_MAX; u = NULL; assert(h->entries != 0); for (i = 0; i < h->entries; i++) { struct node *n = h->t[i]; /* No need to look further. */ if (n->refs == 1) return n; if (n->refs != 0 && n->refs < best) { best = n->refs; u = n; } } assert(u != NULL); return u; } /* Retrieve the node with the smallest order. */ static struct node * find_smallest_node(h) struct array *h; { unsigned int i; unsigned int best; struct node *u; best = UINT_MAX; u = NULL; assert(h->entries != 0); for (i = 0; i < h->entries; i++) { struct node *n = h->t[i]; if (n->refs != 0 && n->order < best) { best = n->order; u = n; } } assert(u != NULL); return u; } /*** *** Graph algorithms. ***/ /* Explore the nodes reachable from i to find a cycle containing it, store * it in c. This may fail. */ static int find_cycle_with(i, c) struct node *i; struct array *c; { struct node *n; n = i; /* XXX Previous cycle findings may have left this pointer non-null. */ i->from = NULL; for (;;) { /* Note that all marks are reversed before this code exits. */ n->mark = 1; if (n->traverse) n->traverse = n->traverse->next; else n->traverse = n->arcs; /* Skip over dead nodes. */ while (n->traverse && n->traverse->node->refs == 0) n->traverse = n->traverse->next; if (n->traverse) { struct node *go = n->traverse->node; if (go->mark) { if (go == i) { c->entries = 0; for (; n != NULL; n = n->from) c->t[c->entries++] = n; return 1; } } else { go->from = n; n = go; } } else { n->mark = 0; n = n->from; if (n == NULL) return 0; } } } /* Find a live predecessor of node n. This is a slow routine, as it needs * to go through the whole array, but it is not needed often. */ static struct node * find_predecessor(a, n) struct array *a; struct node *n; { unsigned int i; for (i = 0; i < a->entries; i++) { struct node *m; m = a->t[i]; if (m->refs != 0) { struct link *l; for (l = m->arcs; l != NULL; l = l->next) if (l->node == n) return m; } } assert(1 == 0); return NULL; } /* Traverse all strongly connected components reachable from node n. Start numbering them at o. Return the maximum order reached. Update the largest cycle found so far. */ static unsigned int traverse_node(n, o, c) struct node *n; unsigned int o; struct array *c; { unsigned int min, max; n->from = NULL; min = o; max = ++o; for (;;) { n->mark = o; if (DEBUG_TRAVERSE) printf("%s(%d) ", n->k, n->mark); /* Find next arc to explore. */ if (n->traverse) n->traverse = n->traverse->next; else n->traverse = n->arcs; /* Skip over dead nodes. */ while (n->traverse && n->traverse->node->refs == 0) n->traverse = n->traverse->next; /* If arc left. */ if (n->traverse) { struct node *go; go = n->traverse->node; /* Optimisation: if go->mark < min, we already * visited this strongly-connected component in * a previous pass. Hence, this can yield no new * cycle. */ /* Not part of the current path: go for it. */ if (go->mark == 0 || go->mark == min) { go->from = n; n = go; o++; if (o > max) max = o; /* Part of the current path: check cycle length. */ } else if (go->mark > min) { if (DEBUG_TRAVERSE) printf("%d\n", o - go->mark + 1); if (o - go->mark + 1 > c->entries) { struct node *t; unsigned int i; c->entries = o - go->mark + 1; i = 0; c->t[i++] = go; for (t = n; t != go; t = t->from) c->t[i++] = t; } } /* No arc left: backtrack. */ } else { n->mark = min; n = n->from; if (!n) return max; o--; } } } static void print_cycle(c) struct array *c; { unsigned int i; /* Printing in reverse order, since cycle discoveries finds reverse * edges. */ for (i = c->entries; i != 0;) { i--; warnx("%s", c->t[i]->k); } } static struct node * find_longest_cycle(h, c) struct array *h; struct array *c; { unsigned int i; unsigned int o; unsigned int best; struct node *n; static int notfirst = 0; assert(h->entries != 0); /* No cycle found yet. */ c->entries = 0; /* Reset the set of marks, except the first time around. */ if (notfirst) { for (i = 0; i < h->entries; i++) h->t[i]->mark = 0; } else notfirst = 1; o = 0; /* Traverse the array. Each unmarked, live node heralds a * new set of strongly connected components. */ for (i = 0; i < h->entries; i++) { n = h->t[i]; if (n->refs != 0 && n->mark == 0) { /* Each call to traverse_node uses a separate * interval of numbers to mark nodes. */ o++; o = traverse_node(n, o, c); } } assert(c->entries != 0); n = c->t[0]; best = n->refs; for (i = 0; i < c->entries; i++) { if (c->t[i]->refs < best) { n = c->t[i]; best = n->refs; } } return n; } #define plural(n) ((n) > 1 ? "s" : "") int main(argc, argv) int argc; char *argv[]; { struct ohash pairs; int reverse_flag, quiet_flag, long_flag, warn_flag, hints_flag, verbose_flag; reverse_flag = quiet_flag = long_flag = warn_flag = hints_flag = verbose_flag = 0; nodes_init(&pairs); { int c; while ((c = getopt(argc, argv, "h:flqrvw")) != -1) { switch(c) { case 'h': { FILE *f; f = fopen(optarg, "r"); if (f == NULL) err(EX_NOINPUT, "Can't open hint file %s", optarg); read_hints(f, &pairs, optarg); fclose(f); } /*FALLTHRU*/ case 'f': if (hints_flag == 1) usage(); hints_flag = 1; break; case 'l': long_flag = 1; break; case 'q': quiet_flag = 1; break; case 'r': reverse_flag = 1; break; case 'v': verbose_flag = 1; break; case 'w': warn_flag = 1; break; default: usage(); } } argc -= optind; argv += optind; } switch(argc) { case 1: { FILE *f; f = fopen(argv[0], "r"); if (f == NULL) err(EX_NOINPUT, "Can't open file %s", argv[1]); read_pairs(f, &pairs, reverse_flag, argv[1]); fclose(f); break; } case 0: read_pairs(stdin, &pairs, reverse_flag, "stdin"); break; default: usage(); } { struct array aux; /* Unrefed nodes/cycle reporting. */ struct array remaining; unsigned int broken_arcs, broken_cycles; unsigned int left; broken_arcs = 0; broken_cycles = 0; split_nodes(&pairs, &aux, &remaining); ohash_delete(&pairs); if (hints_flag) heapify(&aux); left = remaining.entries + aux.entries; while (left != 0) { /* Standard topological sort. */ while (aux.entries) { struct link *l; struct node *n; n = DEQUEUE(&aux); printf("%s\n", n->k); left--; /* We can't free nodes, as we don't know which * entry we can remove in the hash table. We * rely on refs == 0 to recognize live nodes. * Decrease ref count of live nodes, enter new * candidates into the unrefed list. */ for (l = n->arcs; l != NULL; l = l->next) if (l->node->refs != 0 && --l->node->refs == 0) { ENQUEUE(&aux, l->node); } } /* There are still cycles to break. */ if (left != 0) { struct node *n; broken_cycles++; /* XXX Simple cycle detection and long cycle * detection are mutually exclusive. */ if (long_flag) { n = find_longest_cycle(&remaining, &aux); } else { if (hints_flag) n = find_smallest_node(&remaining); else n = find_good_cycle_break(&remaining); if (!quiet_flag) { while (!find_cycle_with(n, &aux)) n = find_predecessor(&remaining, n); } } if (!quiet_flag) { warnx("cycle in data"); print_cycle(&aux); } if (verbose_flag) warnx("%u edge%s broken", n->refs, plural(n->refs)); broken_arcs += n->refs; n->refs = 0; /* Reinitialization, cycle reporting uses aux. */ aux.t[0] = n; aux.entries = 1; } } if (verbose_flag && broken_cycles != 0) warnx("%u cycle%s broken, for a total of %u edge%s", broken_cycles, plural(broken_cycles), broken_arcs, plural(broken_arcs)); if (warn_flag) exit(broken_cycles < 256 ? broken_cycles : 255); else exit(EX_OK); } } extern char *__progname; static void usage() { fprintf(stderr, "Usage: %s [-h file] [-flqrvw] [file]\n", __progname); exit(EX_USAGE); }