diff options
author | Todd C. Miller <millert@cvs.openbsd.org> | 2002-03-01 22:01:12 +0000 |
---|---|---|
committer | Todd C. Miller <millert@cvs.openbsd.org> | 2002-03-01 22:01:12 +0000 |
commit | a994239d65289488e751e7ee61bcdb678ee26164 (patch) | |
tree | ebbcfba3a236a6a08d7fb989449a1b5f9684f814 /usr.bin/spell/spellprog.c | |
parent | d0281a3ced86b9f7541ca7166fb7e3371be66794 (diff) |
spell command from 4.4BSD, now freed by Caldera.
I have made some significant changes:
o spell no longer uses hashed files. The algorithm for generating
the hashed files breaks down for large files. Instead, I use the
same approach as look(1) and do binary searches of mmap(2)ed files
which has acceptable performance. A side effect of this is that
spell can now search multiple word lists.
o The spell binary has been renamed to "spellprog" so as to not conflict
with the spell shell script. This is consistent with other operating
systems.
o The putsuf() and putword() functions have been renamed to
trypref() and tryword() respectively for consistency with modern
versions and for general sanity.
o the spell front-end shell script has been completely rewritten
o The code has been ANSIfied and re-style(9)d
spell currently must be compiled with -fwriteable-strings to make
the -b option work. This should be fixed.
Diffstat (limited to 'usr.bin/spell/spellprog.c')
-rw-r--r-- | usr.bin/spell/spellprog.c | 794 |
1 files changed, 794 insertions, 0 deletions
diff --git a/usr.bin/spell/spellprog.c b/usr.bin/spell/spellprog.c new file mode 100644 index 00000000000..acaad64c200 --- /dev/null +++ b/usr.bin/spell/spellprog.c @@ -0,0 +1,794 @@ +/* $OpenBSD: spellprog.c,v 1.1 2002/03/01 22:01:11 millert Exp $ */ + +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)spell.h 8.1 (Berkeley) 6/6/93 + */ +/* + * Copyright (C) Caldera International Inc. 2001-2002. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code and documentation must retain the above + * copyright notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed or owned by Caldera + * International, Inc. + * 4. Neither the name of Caldera International, Inc. nor the names of other + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA + * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT, + * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef lint +static const char copyright[] = +"@(#) Copyright (c) 1991, 1993\n\ + The Regents of the University of California. All rights reserved.\n"; +#endif /* not lint */ + +#ifndef lint +#if 0 +static const char sccsid[] = "@(#)spell.c 8.1 (Berkeley) 6/6/93"; +#else +#endif +static const char rcsid[] = "$OpenBSD: spellprog.c,v 1.1 2002/03/01 22:01:11 millert Exp $"; +#endif /* not lint */ + +#include <sys/param.h> +#include <sys/mman.h> +#include <sys/stat.h> + +#include <ctype.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <locale.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#define DLEV 2 + +int an(char *, char *, char *, int); +int bility(char *, char *, char *, int); +int es(char *, char *, char *, int); +int dict(char *, char *); +int i_to_y(char *, char *, char *, int); +int ily(char *, char *, char *, int); +int ize(char *, char *, char *, int); +int metry(char *, char *, char *, int); +int monosyl(char *, char *); +int ncy(char *, char *, char *, int); +int nop(void); +int trypref(char *, char *, int); +int tryword(char *, char *, int); +int s(char *, char *, char *, int); +int strip(char *, char *, char *, int); +int suffix(char *, int); +int tion(char *, char *, char *, int); +int vowel(int); +int y_to_e(char *, char *, char *, int); +int CCe(char *, char *, char *, int); +int VCe(char *, char *, char *, int); +char *lookuppref(char **, char *); +char *skipv(char *); +void ise(void); +void print_word(FILE *); +void ztos(char *); +__dead void usage(void); + +/* from look.c */ +int look(unsigned char *, unsigned char *, unsigned char *); + +struct suftab { + char *suf; /* XXX - needs to be writable for ise() */ + int (*p1)(); /* XXX - variable args */ + int n1; + char *d1; + char *a1; + int (*p2)(); /* XXX - variable args */ + int n2; + char *d2; + char *a2; +} suftab[] = { + {"ssen", ily, 4, "-y+iness", "+ness" }, + {"ssel", ily, 4, "-y+i+less", "+less" }, + {"se", s, 1, "", "+s", es, 2, "-y+ies", "+es" }, + {"s'", s, 2, "", "+'s"}, + {"s", s, 1, "", "+s"}, + {"ecn", ncy, 1, "", "-t+ce"}, + {"ycn", ncy, 1, "", "-cy+t"}, + {"ytilb", nop, 0, "", ""}, + {"ytilib", bility, 5, "-le+ility", ""}, + {"elbaif", i_to_y, 4, "-y+iable", ""}, + {"elba", CCe, 4, "-e+able", "+able"}, + {"yti", CCe, 3, "-e+ity", "+ity"}, + {"ylb", y_to_e, 1, "-e+y", ""}, + {"yl", ily, 2, "-y+ily", "+ly"}, + {"laci", strip, 2, "", "+al"}, + {"latnem", strip, 2, "", "+al"}, + {"lanoi", strip, 2, "", "+al"}, + {"tnem", strip, 4, "", "+ment"}, + {"gni", CCe, 3, "-e+ing", "+ing"}, + {"reta", nop, 0, "", ""}, + {"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"}, + {"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"}, + {"citsi", strip, 2, "", "+ic"}, + {"cihparg", i_to_y, 1, "-y+ic", ""}, + {"tse", strip, 2, "", "+st", i_to_y, 3, "-y+iest", "+est"}, + {"cirtem", i_to_y, 1, "-y+ic", ""}, + {"yrtem", metry, 0, "-ry+er", ""}, + {"cigol", i_to_y, 1, "-y+ic", ""}, + {"tsigol", i_to_y, 2, "-y+ist", ""}, + {"tsi", VCe, 3, "-e+ist", "+ist"}, + {"msi", VCe, 3, "-e+ism", "+ist"}, + {"noitacif", i_to_y, 6, "-y+ication", ""}, + {"noitazi", ize, 5, "-e+ation", ""}, + {"rota", tion, 2, "-e+or", ""}, + {"noit", tion, 3, "-e+ion", "+ion"}, + {"naino", an, 3, "", "+ian"}, + {"na", an, 1, "", "+n"}, + {"evit", tion, 3, "-e+ive", "+ive"}, + {"ezi", CCe, 3, "-e+ize", "+ize"}, + {"pihs", strip, 4, "", "+ship"}, + {"dooh", ily, 4, "-y+hood", "+hood"}, + {"ekil", strip, 4, "", "+like"}, + { NULL } +}; + +char *preftab[] = { + "anti", + "bio", + "dis", + "electro", + "en", + "fore", + "hyper", + "intra", + "inter", + "iso", + "kilo", + "magneto", + "meta", + "micro", + "milli", + "mis", + "mono", + "multi", + "non", + "out", + "over", + "photo", + "poly", + "pre", + "pseudo", + "re", + "semi", + "stereo", + "sub", + "super", + "thermo", + "ultra", + "under", /* must precede un */ + "un", + NULL +}; + +struct wlist { + int fd; + unsigned char *front; + unsigned char *back; +} *wlists; + +int vflag; +int xflag; +char word[LINE_MAX]; +char original[LINE_MAX]; +char *deriv[40]; +char affix[40]; + +/* + * The spellprog utility accepts a newline-delimited list of words + * on stdin. For arguments it expects the path to a word list and + * the path to a file in which to store found words. + * + * In normal usage, spell is called twice. The first time it is + * called with a stop list to flag commonly mispelled words. The + * remaining words are then passed to spell again, this time with + * the dictionary file as the first (non-flag) argument. + * + * Unlike historic versions of spellprog, this one does not use + * hashed files. Instead it simply requires that files be sorted + * lexigraphically and uses the same algorithm as the look utility. + * + * Note that spellprog should be called via the spell shell script + * and is not meant to be invoked directly by the user. + */ + +int +main(int argc, char **argv) +{ + char *ep, *cp, *dp; + char *outfile; + int ch, fold, i; + struct stat sb; + FILE *file, *found; + + setlocale(LC_ALL, ""); + + outfile = NULL; + while ((ch = getopt(argc, argv, "bvxo:")) != -1) { + switch (ch) { + case 'b': + /* Use British dictionary and convert ize -> ise. */ + ise(); + break; + case 'o': + outfile = optarg; + break; + case 'v': + /* Also write derivations to "found" file. */ + vflag++; + break; + case 'x': + /* Print plausible stems to stdout. */ + xflag++; + break; + default: + usage(); + } + + } + argc -= optind; + argv += optind; + if (argc < 1) + usage(); + + /* Open and mmap the word/stop lists. */ + if ((wlists = malloc(sizeof(struct wlist) * (argc + 1))) == NULL) + err(1, "malloc"); + for (i = 0; argc--; i++) { + wlists[i].fd = open(argv[i], O_RDONLY, 0); + if (wlists[i].fd == -1 || fstat(wlists[i].fd, &sb) != 0) + err(1, "%s", argv[i]); + if (sb.st_size > SIZE_T_MAX) + errx(1, "%s: %s", argv[i], strerror(EFBIG)); + wlists[i].front = mmap(NULL, (size_t)sb.st_size, PROT_READ, + MAP_PRIVATE, wlists[i].fd, (off_t)0); + if (wlists[i].front == MAP_FAILED) + err(1, "%s", argv[i]); + wlists[i].back = wlists[i].front + sb.st_size; + } + wlists[i].fd = -1; + + /* Open file where found words are to be saved. */ + if (outfile == NULL) + found = NULL; + else if ((found = fopen(outfile, "w")) == NULL) + err(1, "cannot open %s", outfile); + + /* XXX - this fprintf is for loop abuse */ + for (;; print_word(file)) { + affix[0] = '\0'; + file = found; + for (ep = word; (*ep = ch = getchar()) != '\n'; ep++) { + if (ep - word == sizeof(word) - 1) { + *ep = '\0'; + warnx("word too long (%s)", word); + while ((ch = getchar()) != '\n') + ; /* slurp until EOL */ + } + if (ch == EOF) { + if (found != NULL) + fclose(found); + exit(0); + } + } + for (cp = word, dp = original; cp < ep; ) + *dp++ = *cp++; + *dp = '\0'; + fold = 0; + for (cp = word; cp < ep; cp++) + if (islower(*cp)) + goto lcase; + if (trypref(ep, ".", 0)) + continue; + ++fold; + for (cp = original + 1, dp = word + 1; dp < ep; dp++, cp++) + *dp = tolower(*cp); +lcase: + if (trypref(ep, ".", 0) || suffix(ep, 0)) + continue; + if (isupper(word[0])) { + for (cp = original, dp = word; (*dp = *cp++); dp++) { + if (fold) + *dp = tolower(*dp); + } + word[0] = tolower(word[0]); + goto lcase; + } + file = stdout; + } + + exit(0); +} + +void +print_word(FILE *f) +{ + + if (f != NULL) { + if (vflag && affix[0] != '\0' && affix[0] != '.') + fprintf(f, "%s\t%s\n", affix, original); + else + fprintf(f, "%s\n", original); + } +} + +/* + * For each matching suffix in suftab, call the function associated + * with that suffix (p1 and p2). + */ +int +suffix(char *ep, int lev) +{ + struct suftab *t; + char *cp, *sp; + + lev += DLEV; + deriv[lev] = deriv[lev-1] = 0; + for (t = suftab; (sp = t->suf); t++) { + cp = ep; + while (*sp) { + if (*--cp != *sp++) + goto next; + } + for (sp = cp; --sp >= word && !vowel(*sp);) + ; /* nothing */ + if (sp < word) + return(0); + if ((*t->p1)(ep-t->n1, t->d1, t->a1, lev+1)) + return(1); + if (t->p2 != NULL) { + deriv[lev] = deriv[lev+1] = '\0'; + return((*t->p2)(ep-t->n2, t->d2, t->a2, lev)); + } + return(0); +next: ; + } + return(0); +} + +int +nop(void) +{ + + return(0); +} + +int +strip(char *ep, char *d, char *a, int lev) +{ + + return(trypref(ep, a, lev) || suffix(ep, lev)); +} + +int +s(char *ep, char *d, char *a, int lev) +{ + + if (lev > DLEV + 1) + return(0); + if (*ep == 's' && ep[-1] == 's') + return(0); + return(strip(ep, d, a, lev)); +} + +int +an(char *ep, char *d, char *a, int lev) +{ + + if (!isupper(*word)) /* must be proper name */ + return(0); + return(trypref(ep,a,lev)); +} + +int +ize(char *ep, char *d, char *a, int lev) +{ + + *ep++ = 'e'; + return(strip(ep ,"", d, lev)); +} + +int +y_to_e(char *ep, char *d, char *a, int lev) +{ + char c = *ep; + + *ep++ = 'e'; + if (strip(ep, "", d, lev)) + return (1); + ep[-1] = c; + return (0); +} + +int +ily(char *ep, char *d, char *a, int lev) +{ + + if (ep[-1] == 'i') + return(i_to_y(ep, d, a, lev)); + else + return(strip(ep, d, a, lev)); +} + +int +ncy(char *ep, char *d, char *a, int lev) +{ + + if (skipv(skipv(ep-1)) < word) + return(0); + ep[-1] = 't'; + return(strip(ep, d, a, lev)); +} + +int +bility(char *ep, char *d, char *a, int lev) +{ + + *ep++ = 'l'; + return(y_to_e(ep, d, a, lev)); +} + +int +i_to_y(char *ep, char *d, char *a, int lev) +{ + + if (ep[-1] == 'i') { + ep[-1] = 'y'; + a = d; + } + return(strip(ep, "", a, lev)); +} + +int +es(char *ep, char *d, char *a, int lev) +{ + + if (lev > DLEV) + return(0); + + switch (ep[-1]) { + default: + return(0); + case 'i': + return(i_to_y(ep, d, a, lev)); + case 's': + case 'h': + case 'z': + case 'x': + return(strip(ep, d, a, lev)); + } +} + +int +metry(char *ep, char *d, char *a, int lev) +{ + + ep[-2] = 'e'; + ep[-1] = 'r'; + return(strip(ep, d, a, lev)); +} + +int +tion(char *ep, char *d, char *a, int lev) +{ + + switch (ep[-2]) { + case 'c': + case 'r': + return(trypref(ep, a, lev)); + case 'a': + return(y_to_e(ep, d, a, lev)); + } + return(0); +} + +/* + * Possible consonant-consonant-e ending. + */ +int +CCe(char *ep, char *d, char *a, int lev) +{ + + switch (ep[-1]) { + case 'l': + if (vowel(ep[-2])) + break; + switch (ep[-2]) { + case 'l': + case 'r': + case 'w': + break; + default: + return(y_to_e(ep, d, a, lev)); + } + break; + case 's': + if (ep[-2] == 's') + break; + case 'c': + case 'g': + if (*ep == 'a') + return(0); + case 'v': + case 'z': + if (vowel(ep[-2])) + break; + case 'u': + if (y_to_e(ep, d, a, lev)) + return(1); + if (!(ep[-2] == 'n' && ep[-1] == 'g')) + return(0); + } + return(VCe(ep, d, a, lev)); +} + +/* + * Possible consonant-vowel-consonant-e ending. + */ +int +VCe(char *ep, char *d, char *a, int lev) +{ + char c; + + c = ep[-1]; + if (c == 'e') + return(0); + if (!vowel(c) && vowel(ep[-2])) { + c = *ep; + *ep++ = 'e'; + if (trypref(ep, d, lev) || suffix(ep, lev)) + return(1); + ep--; + *ep = c; + } + return(strip(ep, d, a, lev)); +} + +char * +lookuppref(char **wp, char *ep) +{ + char **sp; + char *bp,*cp; + + for (sp = preftab; *sp; sp++) { + bp = *wp; + for (cp = *sp; *cp; cp++, bp++) { + if (tolower(*bp) != *cp) + goto next; + } + for (cp = bp; cp < ep; cp++) { + if (vowel(*cp)) { + *wp = bp; + return(*sp); + } + } +next: ; + } + return(0); +} + +/* + * If the word is not in the dictionary, try stripping off prefixes + * until the word is found or we run out of prefixes to check. + */ +int +trypref(char *ep, char *a, int lev) +{ + char *cp; + char *bp; + char *pp; + int val = 0; + char space[20]; + + deriv[lev] = a; + if (tryword(word, ep, lev)) + return(1); + bp = word; + pp = space; + deriv[lev+1] = pp; + while ((cp = lookuppref(&bp, ep))) { + *pp++ = '+'; + while ((*pp = *cp++)) + pp++; + if (tryword(bp, ep, lev+1)) { + val = 1; + break; + } + if (pp - space >= sizeof(space)) + return(0); + } + deriv[lev+1] = deriv[lev+2] = '\0'; + return(val); +} + +int +tryword(char *bp, char *ep, int lev) +{ + int i, j; + char duple[3]; + + if (ep-bp <= 1) + return(0); + if (vowel(*ep) && monosyl(bp, ep)) + return(0); + + i = dict(bp, ep); + if (i == 0 && vowel(*ep) && ep[-1] == ep[-2] && monosyl(bp, ep-1)) { + ep--; + deriv[++lev] = duple; + duple[0] = '+'; + duple[1] = *ep; + duple[2] = '\0'; + i = dict(bp, ep); + } + if (vflag == 0 || i == 0) + return(i); + + /* Also tack on possible derivations. (XXX - warn on truncation?) */ + for (j = lev; j > 0; j--) { + if (deriv[j]) + strlcat(affix, deriv[j], sizeof(affix)); + } + return(i); +} + +int +monosyl(char *bp, char *ep) +{ + + if (ep < bp + 2) + return(0); + if (vowel(*--ep) || !vowel(*--ep) || ep[1] == 'x' || ep[1] == 'w') + return(0); + while (--ep >= bp) + if (vowel(*ep)) + return(0); + return(1); +} + +char * +skipv(char *s) +{ + + if (s >= word && vowel(*s)) + s--; + while (s >= word && !vowel(*s)) + s--; + return(s); +} + +int +vowel(int c) +{ + + switch (tolower(c)) { + case 'a': + case 'e': + case 'i': + case 'o': + case 'u': + case 'y': + return(1); + } + return(0); +} + +/* + * Crummy way to Britishise. + */ +void +ise(void) +{ + struct suftab *p; + + for (p = suftab; p->suf; p++) { + ztos(p->suf); + ztos(p->d1); + ztos(p->a1); + } +} + +void +ztos(char *s) +{ + + for (; *s; s++) + if (*s == 'z') + *s = 's'; +} + +/* + * Look up a word in the dictionary. + * Returns 1 if found, 0 if not. + */ +int +dict(char *bp, char *ep) +{ + char c; + int i, rval; + + c = *ep; + *ep = '\0'; + if (xflag) + printf("=%s\n", bp); + for (i = rval = 0; wlists[i].fd != -1; i++) { + if ((rval = look((unsigned char *)bp, wlists[i].front, + wlists[i].back)) == 1) + break; + } + *ep = c; + return(rval); +} + +__dead void +usage(void) +{ + extern char *__progname; + + fprintf(stderr, "usage: %s [-bvx] [-o found-words] word-list ...\n", + __progname); + exit(1); +} |