diff options
author | Marc Espie <espie@cvs.openbsd.org> | 2001-03-26 22:53:34 +0000 |
---|---|---|
committer | Marc Espie <espie@cvs.openbsd.org> | 2001-03-26 22:53:34 +0000 |
commit | 60ea328912452bb904fc34be01f64367de8fe8b3 (patch) | |
tree | 3a4743094e1fef5ac26efb7c9c1b9af4d6a1c1a5 /usr.bin | |
parent | 0ed499a8c4ed0e7ef2f74d6e681f98b82cce5028 (diff) |
Replacement for original tsort.
The old code suffers from a few defects:
- it does not even implement the standard optimal topological sort
algorithm. It's much slower.
- its longest cycle computation is completely bogus.
This is clean-slate code, that does implement the actual standard optimal
topological sort, together with a correct graph traversal to find longest
cycles.
It does also feature a `stable tsort' mode, where it uses a heap to yield
the least disturbed permutation of input nodes that satisfies the ordering
constraints (in particular, try tsort -f).
Thanks to the nature of the problem, the actual output won't exactly match
the old one, but it does pass the regression suite (and it is a topological
sorter).
Ok millert@
Diffstat (limited to 'usr.bin')
-rw-r--r-- | usr.bin/tsort/tsort.1 | 48 | ||||
-rw-r--r-- | usr.bin/tsort/tsort.c | 1215 |
2 files changed, 904 insertions, 359 deletions
diff --git a/usr.bin/tsort/tsort.1 b/usr.bin/tsort/tsort.1 index 48132883ff7..971db804316 100644 --- a/usr.bin/tsort/tsort.1 +++ b/usr.bin/tsort/tsort.1 @@ -1,4 +1,4 @@ -.\" $OpenBSD: tsort.1,v 1.6 2000/03/11 21:40:06 aaron Exp $ +.\" $OpenBSD: tsort.1,v 1.7 2001/03/26 22:53:33 espie Exp $ .\" $NetBSD: tsort.1,v 1.6 1996/01/17 20:37:49 mycroft Exp $ .\" .\" Copyright (c) 1990, 1993, 1994 @@ -37,7 +37,7 @@ .\" .\" @(#)tsort.1 8.3 (Berkeley) 4/1/94 .\" -.Dd April 1, 1994 +.Dd November 1, 1999 .Dt TSORT 1 .Os .Sh NAME @@ -45,11 +45,15 @@ .Nd topological sort of a directed graph .Sh SYNOPSIS .Nm tsort +.Op Fl f +.Op Fl h Ar file .Op Fl l .Op Fl q +.Op Fl r +.Op Fl w .Op Ar file .Sh DESCRIPTION -.Nm +.Nm tsort takes a list of pairs of node names representing directed arcs in a graph and prints the nodes in topological order on standard output. Input is taken from the named @@ -57,7 +61,7 @@ Input is taken from the named or from standard input if no file is given. .Pp -Node names in the input are separated by whitespace and there must +Node names in the input are separated by white space and there must be an even number of node pairs. .Pp Presence of a node in a graph can be represented by an arc from the node @@ -70,23 +74,45 @@ Cycles are reported on standard error. .Pp The options are as follows: .Bl -tag -width Ds +.It Fl f +Resolve ambiguities by selecting nodes based on the order of apparition +of the first component of the pairs. +.It Fl h Ar file +Use +.Ar file , +which holds an ordered list of nodes, to resolve ambiguities. .It Fl l Search for and display the longest cycle. -Can take a very long time. .It Fl q -Do not display informational messages about cycles. -This is primarily +Do not display informational messages about cycles. This is primarily intended for building libraries, where optimal ordering is not critical, and cycles occur often. +.It Fl r +Reverse the ordering relation. +.It Fl v +Inform on the exact number of edges broken while breaking cycles. +.It Fl w +Exit with exit code the number of cycles +.Nm +had to break. .El .Sh SEE ALSO -.Xr ar 1 +.Xr ar 1 , +.Xr lorder 1 , +.Rs +.%A Donald E. Knuth +.%B The Art of Computer Programming +.%V Vol. 1 +.%P pp 258-268 +.%D 1973 +.Re .Sh HISTORY A .Nm command appeared in .At v7 . This -.Nm -command and manual page are derived from sources contributed to Berkeley by -Michael Rendell of Memorial University of Newfoundland. +.Nm tsort +command was completely rewritten by Marc Espie for +.Ox , +to finally use the well-known optimal algorithms for topological sorting. diff --git a/usr.bin/tsort/tsort.c b/usr.bin/tsort/tsort.c index 8fc1acf0379..c43ace0cf96 100644 --- a/usr.bin/tsort/tsort.c +++ b/usr.bin/tsort/tsort.c @@ -1,12 +1,9 @@ -/* $OpenBSD: tsort.c,v 1.4 1997/01/15 23:43:26 millert Exp $ */ -/* $NetBSD: tsort.c,v 1.11 1996/01/17 20:37:53 mycroft Exp $ */ +/* $OpenBSD: tsort.c,v 1.5 2001/03/26 22:53:33 espie Exp $ */ +/* ex:ts=8 sw=4: + */ /* - * Copyright (c) 1989, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Michael Rendell of Memorial University of Newfoundland. + * Copyright (c) 1999-2001 Marc Espie. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -18,423 +15,945 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. + * This product includes software developed by Marc Espie for the OpenBSD + * Project. * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * THIS SOFTWARE IS PROVIDED BY THE OPENBSD PROJECT AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBSD + * PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef lint -static char copyright[] = -"@(#) Copyright (c) 1989, 1993, 1994\n\ - The Regents of the University of California. All rights reserved.\n"; -#endif /* not lint */ - -#ifndef lint -#if 0 -static char sccsid[] = "@(#)tsort.c 8.3 (Berkeley) 5/4/95"; -#endif -static char rcsid[] = "$OpenBSD: tsort.c,v 1.4 1997/01/15 23:43:26 millert Exp $"; -#endif /* not lint */ - #include <sys/types.h> - +#include <assert.h> #include <ctype.h> -#include <db.h> #include <err.h> -#include <errno.h> -#include <fcntl.h> +#include <limits.h> +#include <stddef.h> +#include <ohash.h> #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sysexits.h> #include <unistd.h> -/* - * Topological sort. Input is a list of pairs of strings separated by - * white space (spaces, tabs, and/or newlines); strings are written to - * standard output in sorted order, one per line. +/* The complexity of topological sorting is O(e), where e is the + * size of input. While reading input, vertices have to be identified, + * thus add the complexity of e keys retrieval among v keys using + * an appropriate data structure. This program uses open double hashing + * for that purpose. See Knuth for the expected complexity of double + * hashing (Brent variation should probably be used if v << e, as a user + * option). + * + * The algorithm used for longest cycle reporting is accurate, but somewhat + * expensive. It may need to build all free paths of the graph (a free + * path is a path that never goes twice through the same node), whose + * number can be as high as O(2^e). Usually, the number of free paths is + * much smaller though. This program's author does not believe that a + * significantly better worst-case complexity algorithm exists. + * + * In case of a hints file, the set of minimal nodes is maintained as a + * heap. The resulting complexity is O(e+v log v) for the worst case. + * The average should actually be near O(e). + * + * The simple topological sort algorithm detects cycles. This program + * goes further, breaking cycles through the use of simple heuristics. + * Each cycle break checks the whole set of nodes, hence if c cycles break + * are needed, this is an extra cost of O(c v). * - * usage: - * tsort [-l] [inputfile] - * If no input file is specified, standard input is read. + * Possible heuristics are as follows: + * - break cycle at node with lowest number of predecessors (default case), + * - break longest cycle at node with lowest number of predecessors, + * - break cycle at next node from the hints file. * - * Should be compatable with AT&T tsort HOWEVER the output is not identical - * (i.e. for most graphs there is more than one sorted order, and this tsort - * usually generates a different one then the AT&T tsort). Also, cycle - * reporting seems to be more accurate in this version (the AT&T tsort - * sometimes says a node is in a cycle when it isn't). + * Except for the hints file case, which sets an explicit constraint on + * which cycle to break, those heuristics locally result in the smallest + * number of broken edges. * - * Michael Rendell, michael@stretch.cs.mun.ca - Feb 26, '90 + * Those are admittedly greedy strategies, as is the selection of the next + * node from the hints file amongst equivalent candidates that is used for + * `stable' topological sorting. */ -#define HASHSIZE 53 /* doesn't need to be big */ -#define NF_MARK 0x1 /* marker for cycle detection */ -#define NF_ACYCLIC 0x2 /* this node is cycle free */ -#define NF_NODEST 0x4 /* Unreachable */ - -typedef struct node_str NODE; - -struct node_str { - NODE **n_prevp; /* pointer to previous node's n_next */ - NODE *n_next; /* next node in graph */ - NODE **n_arcs; /* array of arcs to other nodes */ - int n_narcs; /* number of arcs in n_arcs[] */ - int n_arcsize; /* size of n_arcs[] array */ - int n_refcnt; /* # of arcs pointing to this node */ - int n_flags; /* NF_* */ - char n_name[1]; /* name of this node */ + +#ifdef __GNUC__ +#define UNUSED __attribute__((unused)) +#else +#define UNUSED +#endif + +struct node; + +/* The set of arcs from a given node is stored as a linked list. */ +struct link { + struct link *next; + struct node *node; }; -typedef struct _buf { - char *b_buf; - int b_bsize; -} BUF; +struct node { + unsigned int refs; /* Number of arcs left, coming into this node . + * Note that nodes with a null count can't + * be part of cycles. */ + struct link *arcs; /* List of forward arcs. */ -DB *db; -NODE *graph, **cycle_buf, **longest_cycle; -int debug, longest, quiet; + unsigned int order; /* Order of nodes according to a hint file. */ -void add_arc __P((char *, char *)); -int find_cycle __P((NODE *, NODE *, int, int)); -NODE *get_node __P((char *)); -void *grow_buf __P((void *, int)); -void remove_node __P((NODE *)); -void tsort __P((void)); -void usage __P((void)); + /* Cycle detection algorithms build a free path of nodes. */ + struct node *from; /* Previous node in the current path. */ -int -main(argc, argv) - int argc; - char *argv[]; + unsigned int mark; /* Mark processed nodes in cycle discovery. */ + struct link *traverse; /* Next link to traverse when backtracking. */ + char k[1]; /* Name of this node. */ +}; + +#define HASH_START 9 + +struct array { + unsigned int entries; + struct node **t; +}; + +static void nodes_init __P((struct ohash *)); +static struct node *node_lookup __P((struct ohash *, const char *, const char *)); +static void usage __P((void)); +static struct node *new_node __P((const char *, const char *)); + +static void read_pairs __P((FILE *, struct ohash *, int, const char *)); +static void split_nodes __P((struct ohash *, struct array *, struct array *)); +static void insert_arc __P((struct node *, struct node *)); + +#ifdef DEBUG +static void dump_node __P((struct node *)); +static void dump_array __P((struct array *)); +static void dump_hash __P((struct ohash *)); +#endif +static void read_hints __P((FILE *, struct ohash *, const char *)); +static struct node *find_smallest_node __P((struct array *)); +static struct node *find_good_cycle_break __P((struct array *)); +static void print_cycle __P((struct array *)); +static int find_cycle_with __P((struct node *, struct array *)); +static struct node *find_predecessor __P((struct array *, struct node *)); +static unsigned int traverse_node __P((struct node *, unsigned int, struct array *)); +static struct node *find_longest_cycle __P((struct array *, struct array *)); + +static void heap_down __P((struct array *, unsigned int)); +static void heapify __P((struct array *)); +static struct node *dequeue __P((struct array *)); +static void enqueue __P((struct array *, struct node *)); + + + +#define erealloc(n, s) emem(realloc(n, s)) +static void *hash_alloc __P((size_t, void *)); +static void hash_free __P((void *, size_t, void *)); +static void* entry_alloc __P((size_t, void *)); +static void *emalloc __P((size_t)); +static void *emem __P((void *)); +#define DEBUG_TRAVERSE 0 +static struct ohash_info node_info = { + offsetof(struct node, k), NULL, hash_alloc, hash_free, entry_alloc }; + + +int main __P((int, char *[])); + + +/*** + *** Memory handling. + ***/ + +static void * +emem(p) + void *p; { - register BUF *b; - register int c, n; - FILE *fp; - int bsize, ch, nused; - BUF bufs[2]; - - while ((ch = getopt(argc, argv, "dlq")) != -1) - switch (ch) { - case 'd': - debug = 1; - break; - case 'l': - longest = 1; - break; - case 'q': - quiet = 1; - break; - case '?': - default: - usage(); - } - argc -= optind; - argv += optind; + if (p) + return p; + else + errx(EX_SOFTWARE, "Memory exhausted"); +} - switch (argc) { - case 0: - fp = stdin; - break; - case 1: - if ((fp = fopen(*argv, "r")) == NULL) - err(1, "%s", *argv); - break; - default: - usage(); - } +static void * +hash_alloc(s, u) + size_t s; + void *u UNUSED; +{ + return emem(calloc(s, 1)); +} - for (b = bufs, n = 2; --n >= 0; b++) - b->b_buf = grow_buf(NULL, b->b_bsize = 1024); +static void +hash_free(p, s, u) + void *p; + size_t s UNUSED; + void *u UNUSED; +{ + free(p); +} - /* parse input and build the graph */ - for (n = 0, c = getc(fp);;) { - while (c != EOF && isspace(c)) - c = getc(fp); - if (c == EOF) - break; +static void * +entry_alloc(s, u) + size_t s; + void *u UNUSED; +{ + return emalloc(s); +} - nused = 0; - b = &bufs[n]; - bsize = b->b_bsize; - do { - b->b_buf[nused++] = c; - if (nused == bsize) - b->b_buf = grow_buf(b->b_buf, bsize *= 2); - c = getc(fp); - } while (c != EOF && !isspace(c)); - - b->b_buf[nused] = '\0'; - b->b_bsize = bsize; - if (n) - add_arc(bufs[0].b_buf, bufs[1].b_buf); - n = !n; - } - (void)fclose(fp); - if (n) - errx(1, "odd data count"); +static void * +emalloc(s) + size_t s; +{ + return emem(malloc(s)); +} - /* do the sort */ - tsort(); - exit(0); + +/*** + *** Hash table. + ***/ + +/* Inserting and finding nodes in the hash structure. + * We handle interval strings for efficiency wrt fgetln. */ +static struct node * +new_node(start, end) + const char *start; + const char *end; +{ + struct node *n; + + n = ohash_create_entry(&node_info, start, &end); + n->from = NULL; + n->arcs = NULL; + n->refs = 0; + n->mark = 0; + n->order = 0; + n->traverse = NULL; + return n; } -/* double the size of oldbuf and return a pointer to the new buffer. */ -void * -grow_buf(bp, size) - void *bp; - int size; + +static void +nodes_init(h) + struct ohash *h; { - if ((bp = realloc(bp, (u_int)size)) == NULL) - err(1, NULL); - return (bp); + ohash_init(h, HASH_START, &node_info); } -/* - * add an arc from node s1 to node s2 in the graph. If s1 or s2 are not in - * the graph, then add them. - */ -void -add_arc(s1, s2) - char *s1, *s2; +static struct node * +node_lookup(h, start, end) + struct ohash *h; + const char *start; + const char *end; { - register NODE *n1; - NODE *n2; - int bsize, i; + unsigned int i; + struct node * n; - n1 = get_node(s1); + i = ohash_qlookupi(h, start, &end); - if (!strcmp(s1, s2)) + n = ohash_find(h, i); + if (n == NULL) + n = ohash_insert(h, i, new_node(start, end)); + return n; +} + +#ifdef DEBUG +static void +dump_node(n) + struct node *n; +{ + struct link *l; + + if (n->refs == 0) return; + printf("%s (%u): ", n->k, n->refs); + for (l = n->arcs; l != NULL; l = l->next) + if (n->refs != 0) + printf("%s(%u) ", l->node->k, l->node->refs); + putchar('\n'); +} + +static void +dump_array(a) + struct array *a; +{ + unsigned int i; - n2 = get_node(s2); + for (i = 0; i < a->entries; i++) + dump_node(a->t[i]); +} + +static void +dump_hash(h) + struct hash *h; +{ + unsigned int i; + struct node *n; + + for (n = ohash_first(h, &i); n != NULL; n = ohash_next(h, &i)) + dump_node(n); +} +#endif + + +/*** + *** Reading data. + ***/ + +static void +insert_arc(a, b) + struct node *a, *b; +{ + struct link *l; - /* - * Check if this arc is already here. - */ - for (i = 0; i < n1->n_narcs; i++) - if (n1->n_arcs[i] == n2) + /* Check that this arc is not already present. */ + for (l = a->arcs; l != NULL; l = l->next) { + if (l->node == b) return; - /* - * Add it. - */ - if (n1->n_narcs == n1->n_arcsize) { - if (!n1->n_arcsize) - n1->n_arcsize = 10; - bsize = n1->n_arcsize * sizeof(*n1->n_arcs) * 2; - n1->n_arcs = grow_buf(n1->n_arcs, bsize); - n1->n_arcsize = bsize / sizeof(*n1->n_arcs); } - n1->n_arcs[n1->n_narcs++] = n2; - ++n2->n_refcnt; + b->refs++; + l = emalloc(sizeof(struct link)); + l->node = b; + l->next = a->arcs; + a->arcs = l; } -/* Find a node in the graph (insert if not found) and return a pointer to it. */ -NODE * -get_node(name) - char *name; +static void +read_pairs(f, h, reverse, name) + FILE *f; + struct ohash *h; + int reverse; + const char *name; { - DBT data, key; - NODE *n; + int toggle; + struct node *a; + size_t size; + char *str; + unsigned int o; + + o = 1; + toggle = 1; + a = NULL; + + while ((str = fgetln(f, &size)) != NULL) { + char *sentinel; + + sentinel = str + size; + for (;;) { + char *e; + + while (isspace(*str) && str < sentinel) + str++; + if (str == sentinel) + break; + for (e = str; !isspace(*e) && e < sentinel; e++) + continue; + if (toggle) { + a = node_lookup(h, str, e); + if (a->order == 0) + a->order = o++; + } else { + struct node *b; + + b = node_lookup(h, str, e); + assert(a != NULL); + if (b != a) { + if (reverse) + insert_arc(b, a); + else + insert_arc(a, b); + } + } + toggle = !toggle; + str = e; + } + } + if (toggle == 0) + errx(EX_DATAERR, "odd number of pairs in %s", name); + if (!feof(f)) + err(EX_IOERR, "error reading %s", name); +} - if (db == NULL && - (db = dbopen(NULL, O_RDWR, 0, DB_HASH, NULL)) == NULL) - err(1, "db: %s", name); +static void +read_hints(f, h, name) + FILE *f; + struct ohash *h; + const char *name; +{ + char *str; + size_t size; + unsigned int i; - key.data = name; - key.size = strlen(name) + 1; + i = 1; - switch ((*db->get)(db, &key, &data, 0)) { - case 0: - bcopy(data.data, &n, sizeof(n)); - return (n); - case 1: - break; - default: - case -1: - err(1, "db: %s", name); - } + while ((str = fgetln(f, &size)) != NULL) { + char *sentinel; + + sentinel = str + size; + for (;;) { + char *e; + struct node *a; - if ((n = malloc(sizeof(NODE) + key.size)) == NULL) - err(1, NULL); - - n->n_narcs = 0; - n->n_arcsize = 0; - n->n_arcs = NULL; - n->n_refcnt = 0; - n->n_flags = 0; - bcopy(name, n->n_name, key.size); - - /* Add to linked list. */ - if ((n->n_next = graph) != NULL) - graph->n_prevp = &n->n_next; - n->n_prevp = &graph; - graph = n; - - /* Add to hash table. */ - data.data = &n; - data.size = sizeof(n); - if ((*db->put)(db, &key, &data, 0)) - err(1, "db: %s", name); - return (n); + while (isspace(*str) && str < sentinel) + str++; + if (str == sentinel) + break; + for (e = str; !isspace(*e) && e < sentinel; e++) + continue; + a = node_lookup(h, str, e); + if (a->order != 0) + errx(EX_DATAERR, + "duplicate node %s in hints file %s", + a->k, name); + else + a->order = i++; + str = e; + } + } } + +/*** + *** Standard heap handling routines. + ***/ -/* - * Clear the NODEST flag from all nodes. - */ -void -clear_cycle() +static void +heap_down(h, i) + struct array *h; + unsigned int i; { - NODE *n; + unsigned int j; + struct node *swap; - for (n = graph; n != NULL; n = n->n_next) - n->n_flags &= ~NF_NODEST; + for (; (j=2*i+1) < h->entries; i = j) { + if (j+1 < h->entries && h->t[j+1]->order < h->t[j]->order) + j++; + if (h->t[i]->order <= h->t[j]->order) + break; + swap = h->t[i]; + h->t[i] = h->t[j]; + h->t[j] = swap; + } } -/* do topological sort on graph */ -void -tsort() +static void +heapify(h) + struct array *h; { - register NODE *n, *next; - register int cnt, i; - - while (graph != NULL) { - /* - * Keep getting rid of simple cases until there are none left, - * if there are any nodes still in the graph, then there is - * a cycle in it. - */ - do { - for (cnt = 0, n = graph; n != NULL; n = next) { - next = n->n_next; - if (n->n_refcnt == 0) { - remove_node(n); - ++cnt; - } - } - } while (graph != NULL && cnt); + unsigned int i; - if (graph == NULL) - break; + for (i = h->entries; i != 0;) + heap_down(h, --i); +} - if (!cycle_buf) { - /* - * Allocate space for two cycle logs - one to be used - * as scratch space, the other to save the longest - * cycle. - */ - for (cnt = 0, n = graph; n != NULL; n = n->n_next) - ++cnt; - cycle_buf = malloc((u_int)sizeof(NODE *) * cnt); - longest_cycle = malloc((u_int)sizeof(NODE *) * cnt); - if (cycle_buf == NULL || longest_cycle == NULL) - err(1, NULL); +#define DEQUEUE(h) ( hints_flag ? dequeue(h) : (h)->t[--(h)->entries] ) + +static struct node * +dequeue(h) + struct array *h; +{ + struct node *n; + + if (h->entries == 0) + n = NULL; + else { + n = h->t[0]; + if (--h->entries != 0) { + h->t[0] = h->t[h->entries]; + heap_down(h, 0); } - for (n = graph; n != NULL; n = n->n_next) - if (!(n->n_flags & NF_ACYCLIC)) - if (cnt = find_cycle(n, n, 0, 0)) { - if (!quiet) { - warnx("cycle in data"); - for (i = 0; i < cnt; i++) - warnx("%s", - longest_cycle[i]->n_name); - } - remove_node(n); - clear_cycle(); - break; - } else { - /* to avoid further checks */ - n->n_flags |= NF_ACYCLIC; - clear_cycle(); - } + } + return n; +} + +#define ENQUEUE(h, n) do { \ + if (hints_flag) \ + enqueue((h), (n)); \ + else \ + (h)->t[(h)->entries++] = (n); \ + } while(0); + +static void +enqueue(h, n) + struct array *h; + struct node *n; +{ + unsigned int i, j; + struct node *swap; - if (n == NULL) - errx(1, "internal error -- could not find cycle"); + h->t[h->entries++] = n; + for (i = h->entries-1; i > 0; i = j) { + j = (i-1)/2; + if (h->t[j]->order < h->t[i]->order) + break; + swap = h->t[j]; + h->t[j] = h->t[i]; + h->t[i] = swap; } } -/* print node and remove from graph (does not actually free node) */ -void -remove_node(n) - register NODE *n; + +/*** + *** Search through hash array for nodes. + ***/ + +/* Split nodes into unrefed nodes/live nodes. */ +static void +split_nodes(hash, heap, remaining) + struct ohash *hash; + struct array *heap; + struct array *remaining; { - register NODE **np; - register int i; - - (void)printf("%s\n", n->n_name); - for (np = n->n_arcs, i = n->n_narcs; --i >= 0; np++) - --(*np)->n_refcnt; - n->n_narcs = 0; - *n->n_prevp = n->n_next; - if (n->n_next) - n->n_next->n_prevp = n->n_prevp; + + struct node *n; + unsigned int i; + + heap->t = emalloc(sizeof(struct node *) * ohash_entries(hash)); + remaining->t = emalloc(sizeof(struct node *) * ohash_entries(hash)); + heap->entries = 0; + remaining->entries = 0; + + for (n = ohash_first(hash, &i); n != NULL; n = ohash_next(hash, &i)) { + if (n->refs == 0) + heap->t[heap->entries++] = n; + else + remaining->t[remaining->entries++] = n; + } } +/* Good point to break a cycle: live node with as few refs as possible. */ +static struct node * +find_good_cycle_break(h) + struct array *h; +{ + unsigned int i; + unsigned int best; + struct node *u; + + best = UINT_MAX; + u = NULL; + + assert(h->entries != 0); + for (i = 0; i < h->entries; i++) { + struct node *n = h->t[i]; + /* No need to look further. */ + if (n->refs == 1) + return n; + if (n->refs != 0 && n->refs < best) { + best = n->refs; + u = n; + } + } + assert(u != NULL); + return u; +} + +/* Retrieve the node with the smallest order. */ +static struct node * +find_smallest_node(h) + struct array *h; +{ + unsigned int i; + unsigned int best; + struct node *u; + + best = UINT_MAX; + u = NULL; + + assert(h->entries != 0); + for (i = 0; i < h->entries; i++) { + struct node *n = h->t[i]; + if (n->refs != 0 && n->order < best) { + best = n->order; + u = n; + } + } + assert(u != NULL); + return u; +} -/* look for the longest? cycle from node from to node to. */ -int -find_cycle(from, to, longest_len, depth) - NODE *from, *to; - int depth, longest_len; + +/*** + *** Graph algorithms. + ***/ + +/* Explore the nodes reachable from i to find a cycle containing it, store + * it in c. This may fail. */ +static int +find_cycle_with(i, c) + struct node *i; + struct array *c; { - register NODE **np; - register int i, len; - - /* - * avoid infinite loops and ignore portions of the graph known - * to be acyclic - */ - if (from->n_flags & (NF_NODEST|NF_MARK|NF_ACYCLIC)) - return (0); - from->n_flags |= NF_MARK; - - for (np = from->n_arcs, i = from->n_narcs; --i >= 0; np++) { - cycle_buf[depth] = *np; - if (*np == to) { - if (depth + 1 > longest_len) { - longest_len = depth + 1; - (void)memcpy((char *)longest_cycle, - (char *)cycle_buf, - longest_len * sizeof(NODE *)); + struct node *n; + + n = i; + /* XXX Previous cycle findings may have left this pointer non-null. */ + i->from = NULL; + + for (;;) { + /* Note that all marks are reversed before this code exits. */ + n->mark = 1; + if (n->traverse) + n->traverse = n->traverse->next; + else + n->traverse = n->arcs; + /* Skip over dead nodes. */ + while (n->traverse && n->traverse->node->refs == 0) + n->traverse = n->traverse->next; + if (n->traverse) { + struct node *go = n->traverse->node; + + if (go->mark) { + if (go == i) { + c->entries = 0; + for (; n != NULL; n = n->from) + c->t[c->entries++] = n; + return 1; + } + } else { + go->from = n; + n = go; } } else { - if ((*np)->n_flags & (NF_MARK|NF_ACYCLIC|NF_NODEST)) - continue; - len = find_cycle(*np, to, longest_len, depth + 1); + n->mark = 0; + n = n->from; + if (n == NULL) + return 0; + } + } +} - if (debug) - (void)printf("%*s %s->%s %d\n", depth, "", - from->n_name, to->n_name, len); +/* Find a live predecessor of node n. This is a slow routine, as it needs + * to go through the whole array, but it is not needed often. + */ +static struct node * +find_predecessor(a, n) + struct array *a; + struct node *n; +{ + unsigned int i; - if (len == 0) - (*np)->n_flags |= NF_NODEST; + for (i = 0; i < a->entries; i++) { + struct node *m; - if (len > longest_len) - longest_len = len; + m = a->t[i]; + if (m->refs != 0) { + struct link *l; - if (len > 0 && !longest) - break; + for (l = m->arcs; l != NULL; l = l->next) + if (l->node == n) + return m; + } + } + assert(1 == 0); + return NULL; +} + +/* Traverse all strongly connected components reachable from node n. + Start numbering them at o. Return the maximum order reached. + Update the largest cycle found so far. + */ +static unsigned int +traverse_node(n, o, c) + struct node *n; + unsigned int o; + struct array *c; +{ + unsigned int min, max; + + n->from = NULL; + min = o; + max = ++o; + + for (;;) { + n->mark = o; + if (DEBUG_TRAVERSE) + printf("%s(%d) ", n->k, n->mark); + /* Find next arc to explore. */ + if (n->traverse) + n->traverse = n->traverse->next; + else + n->traverse = n->arcs; + /* Skip over dead nodes. */ + while (n->traverse && n->traverse->node->refs == 0) + n->traverse = n->traverse->next; + /* If arc left. */ + if (n->traverse) { + struct node *go; + + go = n->traverse->node; + /* Optimisation: if go->mark < min, we already + * visited this strongly-connected component in + * a previous pass. Hence, this can yield no new + * cycle. */ + + /* Not part of the current path: go for it. */ + if (go->mark == 0 || go->mark == min) { + go->from = n; + n = go; + o++; + if (o > max) + max = o; + /* Part of the current path: check cycle length. */ + } else if (go->mark > min) { + if (DEBUG_TRAVERSE) + printf("%d\n", o - go->mark + 1); + if (o - go->mark + 1 > c->entries) { + struct node *t; + unsigned int i; + + c->entries = o - go->mark + 1; + i = 0; + c->t[i++] = go; + for (t = n; t != go; t = t->from) + c->t[i++] = t; + } + } + + /* No arc left: backtrack. */ + } else { + n->mark = min; + n = n->from; + if (!n) + return max; + o--; + } + } +} + +static void +print_cycle(c) + struct array *c; +{ + unsigned int i; + + /* Printing in reverse order, since cycle discoveries finds reverse + * edges. */ + for (i = c->entries; i != 0;) { + i--; + warnx("%s", c->t[i]->k); + } +} + +static struct node * +find_longest_cycle(h, c) + struct array *h; + struct array *c; +{ + unsigned int i; + unsigned int o; + unsigned int best; + struct node *n; + static int notfirst = 0; + + assert(h->entries != 0); + + /* No cycle found yet. */ + c->entries = 0; + + /* Reset the set of marks, except the first time around. */ + if (notfirst) { + for (i = 0; i < h->entries; i++) + h->t[i]->mark = 0; + } else + notfirst = 1; + + o = 0; + + /* Traverse the array. Each unmarked, live node heralds a + * new set of strongly connected components. */ + for (i = 0; i < h->entries; i++) { + n = h->t[i]; + if (n->refs != 0 && n->mark == 0) { + /* Each call to traverse_node uses a separate + * interval of numbers to mark nodes. */ + o++; + o = traverse_node(n, o, c); + } + } + + assert(c->entries != 0); + n = c->t[0]; + best = n->refs; + for (i = 0; i < c->entries; i++) { + if (c->t[i]->refs < best) { + n = c->t[i]; + best = n->refs; } } - from->n_flags &= ~NF_MARK; - return (longest_len); + return n; } -void + +#define plural(n) ((n) > 1 ? "s" : "") + +int +main(argc, argv) + int argc; + char *argv[]; +{ + struct ohash pairs; + int reverse_flag, quiet_flag, long_flag, + warn_flag, hints_flag, verbose_flag; + + reverse_flag = quiet_flag = long_flag = + warn_flag = hints_flag = verbose_flag = 0; + nodes_init(&pairs); + + { + int c; + + while ((c = getopt(argc, argv, "h:flqrvw")) != -1) { + switch(c) { + case 'h': { + FILE *f; + + f = fopen(optarg, "r"); + if (f == NULL) + err(EX_NOINPUT, "Can't open hint file %s", + optarg); + read_hints(f, &pairs, optarg); + fclose(f); + } + /*FALLTHRU*/ + case 'f': + if (hints_flag == 1) + usage(); + hints_flag = 1; + break; + case 'l': + long_flag = 1; + break; + case 'q': + quiet_flag = 1; + break; + case 'r': + reverse_flag = 1; + break; + case 'v': + verbose_flag = 1; + break; + case 'w': + warn_flag = 1; + break; + default: + usage(); + } + } + + argc -= optind; + argv += optind; + } + + switch(argc) { + case 1: { + FILE *f; + + f = fopen(argv[0], "r"); + if (f == NULL) + err(EX_NOINPUT, "Can't open file %s", argv[1]); + read_pairs(f, &pairs, reverse_flag, argv[1]); + fclose(f); + break; + } + case 0: + read_pairs(stdin, &pairs, reverse_flag, "stdin"); + break; + default: + usage(); + } + + { + struct array aux; /* Unrefed nodes/cycle reporting. */ + struct array remaining; + unsigned int broken_arcs, broken_cycles; + unsigned int left; + + broken_arcs = 0; + broken_cycles = 0; + + split_nodes(&pairs, &aux, &remaining); + ohash_delete(&pairs); + + if (hints_flag) + heapify(&aux); + + left = remaining.entries + aux.entries; + while (left != 0) { + + /* Standard topological sort. */ + while (aux.entries) { + struct link *l; + struct node *n; + + n = DEQUEUE(&aux); + printf("%s\n", n->k); + left--; + /* We can't free nodes, as we don't know which + * entry we can remove in the hash table. We + * rely on refs == 0 to recognize live nodes. + * Decrease ref count of live nodes, enter new + * candidates into the unrefed list. */ + for (l = n->arcs; l != NULL; l = l->next) + if (l->node->refs != 0 && + --l->node->refs == 0) { + ENQUEUE(&aux, l->node); + } + } + /* There are still cycles to break. */ + if (left != 0) { + struct node *n; + + broken_cycles++; + /* XXX Simple cycle detection and long cycle + * detection are mutually exclusive. */ + + if (long_flag) { + n = find_longest_cycle(&remaining, &aux); + } else { + if (hints_flag) + n = find_smallest_node(&remaining); + else + n = find_good_cycle_break(&remaining); + if (!quiet_flag) { + while (!find_cycle_with(n, &aux)) + n = find_predecessor(&remaining, n); + } + } + + if (!quiet_flag) { + warnx("cycle in data"); + print_cycle(&aux); + } + + if (verbose_flag) + warnx("%u edge%s broken", n->refs, + plural(n->refs)); + broken_arcs += n->refs; + n->refs = 0; + /* Reinitialization, cycle reporting uses aux. */ + aux.t[0] = n; + aux.entries = 1; + } + } + if (verbose_flag && broken_cycles != 0) + warnx("%u cycle%s broken, for a total of %u edge%s", + broken_cycles, plural(broken_cycles), + broken_arcs, plural(broken_arcs)); + if (warn_flag) + exit(broken_cycles < 256 ? broken_cycles : 255); + else + exit(EX_OK); + } +} + + +extern char *__progname; + +static void usage() { - (void)fprintf(stderr, "usage: tsort [-lq] [file]\n"); - exit(1); + fprintf(stderr, "Usage: %s [-h file] [-flqrvw] [file]\n", __progname); + exit(EX_USAGE); } |