summaryrefslogtreecommitdiff
path: root/usr.bin/awk/lex.c
diff options
context:
space:
mode:
authorkstailey <kstailey@cvs.openbsd.org>1997-08-25 16:17:15 +0000
committerkstailey <kstailey@cvs.openbsd.org>1997-08-25 16:17:15 +0000
commitc1a12065492ecafe5bd1c4d09bd507f692624839 (patch)
tree807c1f8e2f4da44ce9e68bde04d631cb316e4b5f /usr.bin/awk/lex.c
parent73138c6a805b3d20f0625381b3f31c910da81d50 (diff)
August 1997 version of "the one true awk"
Diffstat (limited to 'usr.bin/awk/lex.c')
-rw-r--r--usr.bin/awk/lex.c572
1 files changed, 572 insertions, 0 deletions
diff --git a/usr.bin/awk/lex.c b/usr.bin/awk/lex.c
new file mode 100644
index 00000000000..cb89504c0c5
--- /dev/null
+++ b/usr.bin/awk/lex.c
@@ -0,0 +1,572 @@
+/* $OpenBSD: lex.c,v 1.1 1997/08/25 16:17:11 kstailey Exp $ */
+/****************************************************************
+Copyright (C) Lucent Technologies 1997
+All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software and
+its documentation for any purpose and without fee is hereby
+granted, provided that the above copyright notice appear in all
+copies and that both that the copyright notice and this
+permission notice and warranty disclaimer appear in supporting
+documentation, and that the name Lucent Technologies or any of
+its entities not be used in advertising or publicity pertaining
+to distribution of the software without specific, written prior
+permission.
+
+LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
+IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
+SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
+THIS SOFTWARE.
+****************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include "awk.h"
+#include "ytab.h"
+
+extern YYSTYPE yylval;
+extern int infunc;
+
+int lineno = 1;
+int bracecnt = 0;
+int brackcnt = 0;
+int parencnt = 0;
+
+typedef struct Keyword {
+ char *word;
+ int sub;
+ int type;
+} Keyword;
+
+Keyword keywords[] ={ /* keep sorted: binary searched */
+ { "BEGIN", XBEGIN, XBEGIN },
+ { "END", XEND, XEND },
+ { "NF", VARNF, VARNF },
+ { "atan2", FATAN, BLTIN },
+ { "break", BREAK, BREAK },
+ { "close", CLOSE, CLOSE },
+ { "continue", CONTINUE, CONTINUE },
+ { "cos", FCOS, BLTIN },
+ { "delete", DELETE, DELETE },
+ { "do", DO, DO },
+ { "else", ELSE, ELSE },
+ { "exit", EXIT, EXIT },
+ { "exp", FEXP, BLTIN },
+ { "fflush", FFLUSH, BLTIN },
+ { "for", FOR, FOR },
+ { "func", FUNC, FUNC },
+ { "function", FUNC, FUNC },
+ { "getline", GETLINE, GETLINE },
+ { "gsub", GSUB, GSUB },
+ { "if", IF, IF },
+ { "in", IN, IN },
+ { "index", INDEX, INDEX },
+ { "int", FINT, BLTIN },
+ { "length", FLENGTH, BLTIN },
+ { "log", FLOG, BLTIN },
+ { "match", MATCHFCN, MATCHFCN },
+ { "next", NEXT, NEXT },
+ { "nextfile", NEXTFILE, NEXTFILE },
+ { "print", PRINT, PRINT },
+ { "printf", PRINTF, PRINTF },
+ { "rand", FRAND, BLTIN },
+ { "return", RETURN, RETURN },
+ { "sin", FSIN, BLTIN },
+ { "split", SPLIT, SPLIT },
+ { "sprintf", SPRINTF, SPRINTF },
+ { "sqrt", FSQRT, BLTIN },
+ { "srand", FSRAND, BLTIN },
+ { "sub", SUB, SUB },
+ { "substr", SUBSTR, SUBSTR },
+ { "system", FSYSTEM, BLTIN },
+ { "tolower", FTOLOWER, BLTIN },
+ { "toupper", FTOUPPER, BLTIN },
+ { "while", WHILE, WHILE },
+};
+
+#define DEBUG
+#ifdef DEBUG
+#define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); }
+#else
+#define RET(x) return(x)
+#endif
+
+int peek()
+{
+ int c = input();
+ unput(c);
+ return c;
+}
+
+int gettok(char **pbuf, int *psz) /* get next input token */
+{
+ int c;
+ char *buf = *pbuf;
+ int sz = *psz;
+ char *bp = buf;
+
+ c = input();
+ if (c == 0)
+ return 0;
+ buf[0] = c;
+ buf[1] = 0;
+ if (!isalnum(c) && c != '.' && c != '_')
+ return c;
+
+ *bp++ = c;
+ if (isalpha(c) || c == '_') { /* it's a varname */
+ for ( ; (c = input()) != 0; ) {
+ if (bp-buf >= sz)
+ if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
+ ERROR "out of space for name %.10s...", buf FATAL;
+ if (isalnum(c) || c == '_')
+ *bp++ = c;
+ else {
+ *bp = 0;
+ unput(c);
+ break;
+ }
+ }
+ } else { /* it's a number */
+ char *rem;
+ /* read input until can't be a number */
+ for ( ; (c = input()) != 0; ) {
+ if (bp-buf >= sz)
+ if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
+ ERROR "out of space for number %.10s...", buf FATAL;
+ if (isdigit(c) || c == 'e' || c == 'E'
+ || c == '.' || c == '+' || c == '-')
+ *bp++ = c;
+ else {
+ *bp = 0;
+ unput(c);
+ break;
+ }
+ }
+ strtod(buf, &rem); /* parse the number */
+ unputstr(rem); /* put rest back for later */
+ rem[0] = 0;
+ }
+ *pbuf = buf;
+ *psz = sz;
+ return buf[0];
+}
+
+int word(char *);
+int string(void);
+int regexpr(void);
+int sc = 0; /* 1 => return a } right now */
+int reg = 0; /* 1 => return a REGEXPR now */
+
+int yylex()
+{
+ int c, n;
+ static char *buf = 0;
+ static int bufsize = 500;
+
+ if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
+ ERROR "out of space in yylex" FATAL;
+ if (sc) {
+ sc = 0;
+ RET('}');
+ }
+ if (reg) {
+ reg = 0;
+ return regexpr();
+ }
+ for (;;) {
+ c = gettok(&buf, &bufsize);
+ if (c == 0)
+ return 0;
+ if (isalpha(c) || c == '_')
+ return word(buf);
+ if (isdigit(c) || c == '.') {
+ yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
+ /* should this also have STR set? */
+ RET(NUMBER);
+ }
+
+ yylval.i = c;
+ switch (c) {
+ case '\n': /* {EOL} */
+ RET(NL);
+ case '\r': /* assume \n is coming */
+ case ' ': /* {WS}+ */
+ case '\t':
+ break;
+ case '#': /* #.* strip comments */
+ while ((c = input()) != '\n' && c != 0)
+ ;
+ unput(c);
+ break;
+ case ';':
+ RET(';');
+ case '\\':
+ if (peek() == '\n') {
+ input(); lineno++;
+ } else if (peek() == '\r') {
+ input(); input(); /* \n */
+ lineno++;
+ } else {
+ RET(c);
+ }
+ break;
+ case '&':
+ if (peek() == '&') {
+ input(); RET(AND);
+ } else
+ RET('&');
+ case '|':
+ if (peek() == '|') {
+ input(); RET(BOR);
+ } else
+ RET('|');
+ case '!':
+ if (peek() == '=') {
+ input(); yylval.i = NE; RET(NE);
+ } else if (peek() == '~') {
+ input(); yylval.i = NOTMATCH; RET(MATCHOP);
+ } else
+ RET(NOT);
+ case '~':
+ yylval.i = MATCH;
+ RET(MATCHOP);
+ case '<':
+ if (peek() == '=') {
+ input(); yylval.i = LE; RET(LE);
+ } else {
+ yylval.i = LT; RET(LT);
+ }
+ case '=':
+ if (peek() == '=') {
+ input(); yylval.i = EQ; RET(EQ);
+ } else {
+ yylval.i = ASSIGN; RET(ASGNOP);
+ }
+ case '>':
+ if (peek() == '=') {
+ input(); yylval.i = GE; RET(GE);
+ } else if (peek() == '>') {
+ input(); yylval.i = APPEND; RET(APPEND);
+ } else {
+ yylval.i = GT; RET(GT);
+ }
+ case '+':
+ if (peek() == '+') {
+ input(); yylval.i = INCR; RET(INCR);
+ } else if (peek() == '=') {
+ input(); yylval.i = ADDEQ; RET(ASGNOP);
+ } else
+ RET('+');
+ case '-':
+ if (peek() == '-') {
+ input(); yylval.i = DECR; RET(DECR);
+ } else if (peek() == '=') {
+ input(); yylval.i = SUBEQ; RET(ASGNOP);
+ } else
+ RET('-');
+ case '*':
+ if (peek() == '=') { /* *= */
+ input(); yylval.i = MULTEQ; RET(ASGNOP);
+ } else if (peek() == '*') { /* ** or **= */
+ input(); /* eat 2nd * */
+ if (peek() == '=') {
+ input(); yylval.i = POWEQ; RET(ASGNOP);
+ } else {
+ RET(POWER);
+ }
+ } else
+ RET('*');
+ case '/':
+ if (peek() == '=') {
+ input(); yylval.i = DIVEQ; RET(ASGNOP);
+ } else
+ RET('/');
+ case '%':
+ if (peek() == '=') {
+ input(); yylval.i = MODEQ; RET(ASGNOP);
+ } else
+ RET('%');
+ case '^':
+ if (peek() == '=') {
+ input(); yylval.i = POWEQ; RET(ASGNOP);
+ } else
+ RET(POWER);
+
+ case '$':
+ /* BUG: awkward, if not wrong */
+ c = gettok(&buf, &bufsize);
+ if (c == '(' || c == '[' || (infunc && (n=isarg(buf)) >= 0)) {
+ unputstr(buf);
+ RET(INDIRECT);
+ } else if (isalpha(c)) {
+ if (strcmp(buf, "NF") == 0) { /* very special */
+ unputstr("(NF)");
+ RET(INDIRECT);
+ }
+ yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
+ RET(IVAR);
+ } else {
+ unputstr(buf);
+ RET(INDIRECT);
+ }
+
+ case '}':
+ if (--bracecnt < 0)
+ ERROR "extra }" SYNTAX;
+ sc = 1;
+ RET(';');
+ case ']':
+ if (--brackcnt < 0)
+ ERROR "extra ]" SYNTAX;
+ RET(']');
+ case ')':
+ if (--parencnt < 0)
+ ERROR "extra )" SYNTAX;
+ RET(')');
+ case '{':
+ bracecnt++;
+ RET('{');
+ case '[':
+ brackcnt++;
+ RET('[');
+ case '(':
+ parencnt++;
+ RET('(');
+
+ case '"':
+ return string(); /* BUG: should be like tran.c ? */
+
+ default:
+ RET(c);
+ }
+ }
+}
+
+int string()
+{
+ int c, n;
+ char *s, *bp;
+ static char *buf = 0;
+ static int bufsz = 500;
+
+ if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
+ ERROR "out of space for strings" FATAL;
+ for (bp = buf; (c = input()) != '"'; ) {
+ if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
+ ERROR "out of space for string %.10s...", buf FATAL;
+ switch (c) {
+ case '\n':
+ case '\r':
+ case 0:
+ ERROR "non-terminated string %.10s...", buf SYNTAX;
+ lineno++;
+ break;
+ case '\\':
+ c = input();
+ switch (c) {
+ case '"': *bp++ = '"'; break;
+ case 'n': *bp++ = '\n'; break;
+ case 't': *bp++ = '\t'; break;
+ case 'f': *bp++ = '\f'; break;
+ case 'r': *bp++ = '\r'; break;
+ case 'b': *bp++ = '\b'; break;
+ case 'v': *bp++ = '\v'; break;
+ case 'a': *bp++ = '\a'; break;
+ case '\\': *bp++ = '\\'; break;
+
+ case '0': case '1': case '2': /* octal: \d \dd \ddd */
+ case '3': case '4': case '5': case '6': case '7':
+ n = c - '0';
+ if ((c = peek()) >= '0' && c < '8') {
+ n = 8 * n + input() - '0';
+ if ((c = peek()) >= '0' && c < '8')
+ n = 8 * n + input() - '0';
+ }
+ *bp++ = n;
+ break;
+
+ case 'x': /* hex \x0-9a-fA-F + */
+ { char xbuf[100], *px;
+ for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
+ if (isdigit(c)
+ || (c >= 'a' && c <= 'f')
+ || (c >= 'A' && c <= 'F'))
+ *px++ = c;
+ else
+ break;
+ }
+ *px = 0;
+ unput(c);
+ sscanf(xbuf, "%x", &n);
+ *bp++ = n;
+ break;
+ }
+
+ default:
+ *bp++ = c;
+ break;
+ }
+ break;
+ default:
+ *bp++ = c;
+ break;
+ }
+ }
+ *bp = 0;
+ s = tostring(buf);
+ *bp++ = ' '; *bp++ = 0;
+ yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
+ RET(STRING);
+}
+
+
+int binsearch(char *w, Keyword *kp, int n)
+{
+ int cond, low, mid, high;
+
+ low = 0;
+ high = n - 1;
+ while (low <= high) {
+ mid = (low + high) / 2;
+ if ((cond = strcmp(w, kp[mid].word)) < 0)
+ high = mid - 1;
+ else if (cond > 0)
+ low = mid + 1;
+ else
+ return mid;
+ }
+ return -1;
+}
+
+int word(char *w)
+{
+ Keyword *kp;
+ int c, n;
+
+ n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
+ kp = keywords + n;
+ if (n != -1) { /* found in table */
+ yylval.i = kp->sub;
+ switch (kp->type) { /* special handling */
+ case FSYSTEM:
+ if (safe)
+ ERROR "system is unsafe" SYNTAX;
+ RET(kp->type);
+ case FUNC:
+ if (infunc)
+ ERROR "illegal nested function" SYNTAX;
+ RET(kp->type);
+ case RETURN:
+ if (!infunc)
+ ERROR "return not in function" SYNTAX;
+ RET(kp->type);
+ case VARNF:
+ yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
+ RET(VARNF);
+ default:
+ RET(kp->type);
+ }
+ }
+ c = peek(); /* look for '(' */
+ if (c != '(' && infunc && (n=isarg(w)) >= 0) {
+ yylval.i = n;
+ RET(ARG);
+ } else {
+ yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
+ if (c == '(') {
+ RET(CALL);
+ } else {
+ RET(VAR);
+ }
+ }
+}
+
+void startreg(void) /* next call to yyles will return a regular expression */
+{
+ reg = 1;
+}
+
+int regexpr()
+{
+ int c;
+ static char *buf = 0;
+ static int bufsz = 500;
+ char *bp;
+
+ if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
+ ERROR "out of space for rex expr" FATAL;
+ bp = buf;
+ for ( ; (c = input()) != '/' && c != 0; ) {
+ if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
+ ERROR "out of space for reg expr %.10s...", buf FATAL;
+ if (c == '\n') {
+ ERROR "newline in regular expression %.10s...", buf SYNTAX;
+ unput('\n');
+ break;
+ } else if (c == '\\') {
+ *bp++ = '\\';
+ *bp++ = input();
+ } else {
+ *bp++ = c;
+ }
+ }
+ *bp = 0;
+ yylval.s = tostring(buf);
+ unput('/');
+ RET(REGEXPR);
+}
+
+/* low-level lexical stuff, sort of inherited from lex */
+
+char ebuf[300];
+char *ep = ebuf;
+char yysbuf[100]; /* pushback buffer */
+char *yysptr = yysbuf;
+FILE *yyin = 0;
+
+int input(void) /* get next lexical input character */
+{
+ int c;
+ extern char *lexprog;
+
+ if (yysptr > yysbuf)
+ c = *--yysptr;
+ else if (lexprog != NULL) { /* awk '...' */
+ if ((c = *lexprog) != 0)
+ lexprog++;
+ } else /* awk -f ... */
+ c = pgetc();
+ if (c == '\n')
+ lineno++;
+ else if (c == EOF)
+ c = 0;
+ if (ep >= ebuf + sizeof ebuf)
+ ep = ebuf;
+ return *ep++ = c;
+}
+
+void unput(int c) /* put lexical character back on input */
+{
+ if (c == '\n')
+ lineno--;
+ if (yysptr >= yysbuf + sizeof(yysbuf))
+ ERROR "pushed back too much: %.20s...", yysbuf FATAL;
+ *yysptr++ = c;
+ if (--ep < ebuf)
+ ep = ebuf + sizeof(ebuf) - 1;
+}
+
+void unputstr(char *s) /* put a string back on input */
+{
+ int i;
+
+ for (i = strlen(s)-1; i >= 0; i--)
+ unput(s[i]);
+}