diff options
author | Ingo Schwarze <schwarze@cvs.openbsd.org> | 2011-11-13 10:28:39 +0000 |
---|---|---|
committer | Ingo Schwarze <schwarze@cvs.openbsd.org> | 2011-11-13 10:28:39 +0000 |
commit | c1a0cbbbec9055ccd98836dda5d211c0c0ddb81f (patch) | |
tree | 897769b6b09197b7682e04bd0db0774379179f4a /usr.bin/mandoc/apropos_db.c | |
parent | 63443608ab500be3da50120ea56a4fd3ab11782b (diff) |
Split the common code to be reused by other mandocdb clients,
for example the future man.cgi, out of apropos.c (which still
contains the apropos(1) main program, command line handling, and
the terminal output formatter) into apropos_db.{h,c} (which now
contains the database searching backend code).
... and actually commit the two new files. Oops.
Diffstat (limited to 'usr.bin/mandoc/apropos_db.c')
-rw-r--r-- | usr.bin/mandoc/apropos_db.c | 541 |
1 files changed, 541 insertions, 0 deletions
diff --git a/usr.bin/mandoc/apropos_db.c b/usr.bin/mandoc/apropos_db.c new file mode 100644 index 00000000000..0166ccef4da --- /dev/null +++ b/usr.bin/mandoc/apropos_db.c @@ -0,0 +1,541 @@ +/* $Id: apropos_db.c,v 1.1 2011/11/13 10:28:38 schwarze Exp $ */ +/* + * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include <assert.h> +#include <fcntl.h> +#include <regex.h> +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> + +#ifdef __linux__ +# include <db_185.h> +#else +# include <db.h> +#endif + +#include "apropos_db.h" +#include "mandoc.h" + +enum match { + MATCH_REGEX, + MATCH_REGEXCASE, + MATCH_STR, + MATCH_STRCASE +}; + +struct expr { + enum match match; + int mask; + char *v; + regex_t re; +}; + +struct type { + int mask; + const char *name; +}; + +static const struct type types[] = { + { TYPE_NAME, "name" }, + { TYPE_FUNCTION, "func" }, + { TYPE_UTILITY, "utility" }, + { TYPE_INCLUDES, "incl" }, + { TYPE_VARIABLE, "var" }, + { TYPE_STANDARD, "stand" }, + { TYPE_AUTHOR, "auth" }, + { TYPE_CONFIG, "conf" }, + { TYPE_DESC, "desc" }, + { TYPE_XREF, "xref" }, + { TYPE_PATH, "path" }, + { TYPE_ENV, "env" }, + { TYPE_ERR, "err" }, + { INT_MAX, "all" }, + { 0, NULL } +}; + +static DB *btree_open(void); +static int btree_read(const DBT *, const struct mchars *, char **); +static int exprexec(const struct expr *, char *); +static DB *index_open(void); +static int index_read(const DBT *, const DBT *, + const struct mchars *, struct rec *); +static void norm_string(const char *, + const struct mchars *, char **); +static size_t norm_utf8(unsigned int, char[7]); + +/* + * Open the keyword mandoc-db database. + */ +static DB * +btree_open(void) +{ + BTREEINFO info; + DB *db; + + memset(&info, 0, sizeof(BTREEINFO)); + info.flags = R_DUP; + + db = dbopen("mandoc.db", O_RDONLY, 0, DB_BTREE, &info); + if (NULL != db) + return(db); + + return(NULL); +} + +/* + * Read a keyword from the database and normalise it. + * Return 0 if the database is insane, else 1. + */ +static int +btree_read(const DBT *v, const struct mchars *mc, char **buf) +{ + + /* Sanity: are we nil-terminated? */ + + assert(v->size > 0); + if ('\0' != ((char *)v->data)[(int)v->size - 1]) + return(0); + + norm_string((char *)v->data, mc, buf); + return(1); +} + +/* + * Take a Unicode codepoint and produce its UTF-8 encoding. + * This isn't the best way to do this, but it works. + * The magic numbers are from the UTF-8 packaging. + * They're not as scary as they seem: read the UTF-8 spec for details. + */ +static size_t +norm_utf8(unsigned int cp, char out[7]) +{ + size_t rc; + + rc = 0; + + if (cp <= 0x0000007F) { + rc = 1; + out[0] = (char)cp; + } else if (cp <= 0x000007FF) { + rc = 2; + out[0] = (cp >> 6 & 31) | 192; + out[1] = (cp & 63) | 128; + } else if (cp <= 0x0000FFFF) { + rc = 3; + out[0] = (cp >> 12 & 15) | 224; + out[1] = (cp >> 6 & 63) | 128; + out[2] = (cp & 63) | 128; + } else if (cp <= 0x001FFFFF) { + rc = 4; + out[0] = (cp >> 18 & 7) | 240; + out[1] = (cp >> 12 & 63) | 128; + out[2] = (cp >> 6 & 63) | 128; + out[3] = (cp & 63) | 128; + } else if (cp <= 0x03FFFFFF) { + rc = 5; + out[0] = (cp >> 24 & 3) | 248; + out[1] = (cp >> 18 & 63) | 128; + out[2] = (cp >> 12 & 63) | 128; + out[3] = (cp >> 6 & 63) | 128; + out[4] = (cp & 63) | 128; + } else if (cp <= 0x7FFFFFFF) { + rc = 6; + out[0] = (cp >> 30 & 1) | 252; + out[1] = (cp >> 24 & 63) | 128; + out[2] = (cp >> 18 & 63) | 128; + out[3] = (cp >> 12 & 63) | 128; + out[4] = (cp >> 6 & 63) | 128; + out[5] = (cp & 63) | 128; + } else + return(0); + + out[rc] = '\0'; + return(rc); +} + +/* + * Normalise strings from the index and database. + * These strings are escaped as defined by mandoc_char(7) along with + * other goop in mandoc.h (e.g., soft hyphens). + * This function normalises these into a nice UTF-8 string. + * Returns 0 if the database is fucked. + */ +static void +norm_string(const char *val, const struct mchars *mc, char **buf) +{ + size_t sz, bsz; + char utfbuf[7]; + const char *seq, *cpp; + int len, u, pos; + enum mandoc_esc esc; + static const char res[] = { '\\', '\t', + ASCII_NBRSP, ASCII_HYPH, '\0' }; + + /* Pre-allocate by the length of the input */ + + bsz = strlen(val) + 1; + *buf = mandoc_realloc(*buf, bsz); + pos = 0; + + while ('\0' != *val) { + /* + * Halt on the first escape sequence. + * This also halts on the end of string, in which case + * we just copy, fallthrough, and exit the loop. + */ + if ((sz = strcspn(val, res)) > 0) { + memcpy(&(*buf)[pos], val, sz); + pos += (int)sz; + val += (int)sz; + } + + if (ASCII_HYPH == *val) { + (*buf)[pos++] = '-'; + val++; + continue; + } else if ('\t' == *val || ASCII_NBRSP == *val) { + (*buf)[pos++] = ' '; + val++; + continue; + } else if ('\\' != *val) + break; + + /* Read past the slash. */ + + val++; + u = 0; + + /* + * Parse the escape sequence and see if it's a + * predefined character or special character. + */ + + esc = mandoc_escape(&val, &seq, &len); + if (ESCAPE_ERROR == esc) + break; + + /* + * XXX - this just does UTF-8, but we need to know + * beforehand whether we should do text substitution. + */ + + switch (esc) { + case (ESCAPE_SPECIAL): + if (0 != (u = mchars_spec2cp(mc, seq, len))) + break; + /* FALLTHROUGH */ + default: + continue; + } + + /* + * If we have a Unicode codepoint, try to convert that + * to a UTF-8 byte string. + */ + + cpp = utfbuf; + if (0 == (sz = norm_utf8(u, utfbuf))) + continue; + + /* Copy the rendered glyph into the stream. */ + + sz = strlen(cpp); + bsz += sz; + + *buf = mandoc_realloc(*buf, bsz); + + memcpy(&(*buf)[pos], cpp, sz); + pos += (int)sz; + } + + (*buf)[pos] = '\0'; +} + +/* + * Open the filename-index mandoc-db database. + * Returns NULL if opening failed. + */ +static DB * +index_open(void) +{ + DB *db; + + db = dbopen("mandoc.index", O_RDONLY, 0, DB_RECNO, NULL); + if (NULL != db) + return(db); + + return(NULL); +} + +/* + * Safely unpack from an index file record into the structure. + * Returns 1 if an entry was unpacked, 0 if the database is insane. + */ +static int +index_read(const DBT *key, const DBT *val, + const struct mchars *mc, struct rec *rec) +{ + size_t left; + char *np, *cp; + +#define INDEX_BREAD(_dst) \ + do { \ + if (NULL == (np = memchr(cp, '\0', left))) \ + return(0); \ + norm_string(cp, mc, &(_dst)); \ + left -= (np - cp) + 1; \ + cp = np + 1; \ + } while (/* CONSTCOND */ 0) + + left = val->size; + cp = (char *)val->data; + + rec->rec = *(recno_t *)key->data; + + INDEX_BREAD(rec->file); + INDEX_BREAD(rec->cat); + INDEX_BREAD(rec->title); + INDEX_BREAD(rec->arch); + INDEX_BREAD(rec->desc); + return(1); +} + +/* + * Search the mandocdb database for the expression "expr". + * Filter out by "opts". + * Call "res" with the results, which may be zero. + */ +void +apropos_search(const struct opts *opts, const struct expr *expr, + void *arg, void (*res)(struct rec *, size_t, void *)) +{ + int i, len, root, leaf; + DBT key, val; + DB *btree, *idx; + struct mchars *mc; + int ch; + char *buf; + recno_t rec; + struct rec *recs; + struct rec srec; + + root = -1; + leaf = -1; + btree = NULL; + idx = NULL; + mc = NULL; + buf = NULL; + recs = NULL; + len = 0; + + memset(&srec, 0, sizeof(struct rec)); + + /* XXX: error out with bad regexp? */ + + mc = mchars_alloc(); + + /* XXX: return fact that we've errored? */ + + if (NULL == (btree = btree_open())) + goto out; + if (NULL == (idx = index_open())) + goto out; + + while (0 == (ch = (*btree->seq)(btree, &key, &val, R_NEXT))) { + /* + * Low-water mark for key and value. + * The key must have something in it, and the value must + * have the correct tags/recno mix. + */ + if (key.size < 2 || 8 != val.size) + break; + if ( ! btree_read(&key, mc, &buf)) + break; + + if ( ! exprexec(expr, buf)) + continue; + + memcpy(&rec, val.data + 4, sizeof(recno_t)); + + /* + * O(log n) scan for prior records. Since a record + * number is unbounded, this has decent performance over + * a complex hash function. + */ + + for (leaf = root; leaf >= 0; ) + if (rec > recs[leaf].rec && recs[leaf].rhs >= 0) + leaf = recs[leaf].rhs; + else if (rec < recs[leaf].rec && recs[leaf].lhs >= 0) + leaf = recs[leaf].lhs; + else + break; + + if (leaf >= 0 && recs[leaf].rec == rec) + continue; + + /* + * Now we actually extract the manpage's metadata from + * the index database. + */ + + key.data = &rec; + key.size = sizeof(recno_t); + + if (0 != (*idx->get)(idx, &key, &val, 0)) + break; + + srec.lhs = srec.rhs = -1; + if ( ! index_read(&key, &val, mc, &srec)) + break; + + if (opts->cat && strcasecmp(opts->cat, srec.cat)) + continue; + if (opts->arch && strcasecmp(opts->arch, srec.arch)) + continue; + + recs = mandoc_realloc + (recs, (len + 1) * sizeof(struct rec)); + + memcpy(&recs[len], &srec, sizeof(struct rec)); + + /* Append to our tree. */ + + if (leaf >= 0) { + if (rec > recs[leaf].rec) + recs[leaf].rhs = len; + else + recs[leaf].lhs = len; + } else + root = len; + + memset(&srec, 0, sizeof(struct rec)); + len++; + } + + if (1 == ch) + (*res)(recs, len, arg); + + /* XXX: else? corrupt database error? */ +out: + for (i = 0; i < len; i++) { + free(recs[i].file); + free(recs[i].cat); + free(recs[i].title); + free(recs[i].arch); + free(recs[i].desc); + } + + free(srec.file); + free(srec.cat); + free(srec.title); + free(srec.arch); + free(srec.desc); + + if (mc) + mchars_free(mc); + if (btree) + (*btree->close)(btree); + if (idx) + (*idx->close)(idx); + + free(buf); + free(recs); +} + +struct expr * +exprcomp(int cs, char *argv[], int argc) +{ + struct expr *p; + struct expr e; + int i, pos, ch; + + pos = 0; + + if (pos > argc) + return(NULL); + + for (i = 0; 0 != types[i].mask; i++) + if (0 == strcmp(types[i].name, argv[pos])) + break; + + if (0 == (e.mask = types[i].mask)) + return(NULL); + + if (++pos > argc--) + return(NULL); + + if ('-' != *argv[pos]) + e.match = cs ? MATCH_STRCASE : MATCH_STR; + else if (0 == strcmp("-eq", argv[pos])) + e.match = cs ? MATCH_STRCASE : MATCH_STR; + else if (0 == strcmp("-ieq", argv[pos])) + e.match = MATCH_STRCASE; + else if (0 == strcmp("-re", argv[pos])) + e.match = cs ? MATCH_REGEXCASE : MATCH_REGEX; + else if (0 == strcmp("-ire", argv[pos])) + e.match = MATCH_REGEXCASE; + else + return(NULL); + + if ('-' == *argv[pos]) + pos++; + + if (pos > argc--) + return(NULL); + + e.v = mandoc_strdup(argv[pos]); + + if (MATCH_REGEX == e.match || MATCH_REGEXCASE == e.match) { + ch = REG_EXTENDED | REG_NOSUB; + if (MATCH_REGEXCASE == e.match) + ch |= REG_ICASE; + if (regcomp(&e.re, e.v, ch)) + return(NULL); + } + + p = mandoc_calloc(1, sizeof(struct expr)); + memcpy(p, &e, sizeof(struct expr)); + return(p); +} + +void +exprfree(struct expr *p) +{ + + if (NULL == p) + return; + + if (MATCH_REGEX == p->match) + regfree(&p->re); + + free(p->v); + free(p); +} + +static int +exprexec(const struct expr *p, char *cp) +{ + + if (MATCH_STR == p->match) + return(0 == strcmp(p->v, cp)); + else if (MATCH_STRCASE == p->match) + return(0 == strcasecmp(p->v, cp)); + + assert(MATCH_REGEX == p->match); + return(0 == regexec(&p->re, cp, 0, NULL, 0)); +} |