/* * scan.c: a simple command-line scanner. Cuts the line up into * space-delimited tokens; returns the kw value of the first * non-assigned token. * * This is probably the ugliest code around. It could be cleaner, but then * would be longer. The comments will try to explain what's going down, * but could do a much better job of what they're doing. */ #include #include #include "kw.h" #include "mm.h" extern char *mmalloc(); extern char *mrealloc(); extern char *strdup(); /* * keywords we need to look for inside the scanner. Some cause magic to * happen inside the scanner - most cause magic to happen inside the parser. * EXIT is included for ease of development. */ struct kwt { char *word; int key; } res[] = { "if", IF, "then", THEN, "elif", ELIF, "else", ELSE, "fi", FI, "for", FOR, "while", WHILE, "do", DO, "done", DONE, "exit", EXIT, (char*)0, 0 } ; /* * APPEND: is a macro to quickly add a character to the c_text field * of a CMD * variable. It's used by scan() to easily * build c_text. */ #define APPEND(p,c,i) {\ p->c_text[i++] = c; \ if (i%50 == 0) \ p->c_text = mrealloc(p->c_text, 50+i);\ } /* * delim(): returns the delimiter terminating a command. This command is * called by the parser after scan() has been called to get the latest * command stream. */ delim() { register char c; if (!eos()) if ((c = input()) == EOF) return EOF; else if (c == '&') { c = input(c); if (c == '&') return AND; else if (c == EOF) return EOF; unput(c); return BG; } else if (c == '|') { c = input(c); if (c == '|') return OR; else if (c == EOF) return EOF; unput(c); return PIPE; } return NEWLINE; } /* delim */ /* * isspc(): is this a special token that involves a command break * after the token? */ static isspc(ptr, fs, fe) register CMD *ptr; register fs; { register size, j; if (size = fe-fs) { /* * check for special reserved word */ for (j=0; res[j].word; j++) { if (strlen(res[j].word) == size && strncmp(res[j].word, fs + ptr->c_text, size) == 0) { ptr->c_type = res[j].key; break; } } if (ptr->c_type & (IF|THEN|ELIF|ELSE|WHILE|DO)) return 1; } return 0; } /* isspc */ /* * isdelim(): tells us whether the token we just hit is a command delimiter * or not. */ isdelim(c) register c; { return (c==';') || (c=='\n') || (c=='|') || (c=='&'); } /* isdelim */ /* * getddin(): converts a << input redirection into a conventional < redirection, * returning the name of the tempfile (which will demise as soon as the * command list gets garbage()d. */ static char * getddin() { register char *p; char match[100]; char text[200]; register c; register size=0; static int ddseq=0; register FILE *ddfile; register process=1; /* scan past whitespace */ while (!eos() && (c=input()) != EOF && isspace(c)) ; if (c == EOF) return (char*)0; else if (c == '#') { fprintf(stderr, "'newline or ;' unexpected\n"); return (char*)0; } /* now suck up the end-of << pattern. If we run into eos or a delimiter * token, cry wolf and abandon ship. */ while (!(eos() || isdelim(c))) { /* on << redirections, \ will always escape the next char (Un*x * compatability with shar.) Notice that this is not the way * scan() handles \\! */ if (c == '\\') { process=0; c = input(); if (c == EOF) return (char*)0; if (eos()) break; match[size++] = c; } else if (c == '"' || c == '\'') { register char qc = c; process=0; while(1) { c = input(); if (c == EOF) return (char*)0; else if (eos()) { fprintf(stderr, "badly escaped << token\n"); return (char*)0; } if (c == qc) break; match[size++] = c; } } else match[size++] = c; if (eos() || isspace(c = input())) break; if (c == EOF) return (char*)0; } if (size == 0) { fprintf(stderr, "unexpected symbol\n"); return (char *)0; } unput(eos()?'\n':c); match[size++] = '\n'; match[size] = 0; /* now we have the termination token, so we build a tempfile to hold * the text of the << redirection, then stroll through our input * stream, sucking up lines until we reach the termination token. * If we didn't escape the termination token, we'll need to do variable * substitution on all of the lines we read into the tempfile. */ p = mmalloc(11); sprintf(p, "sh%04x.tmp", ddseq++); if ((ddfile = fopen(p, "wb")) == (FILE*)0) { mfree(p); fprintf(stderr, "spoolfile open failure!\n"); return (char *)0; } while (1) { if (snatchline(text) == EOF) { fclose(ddfile); unlink(p); return (char*)0; } if (strcmp(match, text) == 0) { fclose(ddfile); return p; } else fputs(text, ddfile); } } /* getddin */ /* * scan(): gets a single command off input * * a command is composed of an optional list of variable assignments, * followed by a optional list of arguments and/or I/O redirection * operators, and terminated by end-of-line or one of the command * separator tokens (;, |, &, ||, &&). A token is a series of * nonwhite characters, terminated by whitespace, one of the command * separator tokens or a redirection operator. Tokens may also include * any other character, as long as it is quoted by a \ (any of the special * termination characters may be quoted; nothing else), or a "", '', or * `` string. (strings have special meaning to the backend scanner as * well, but are not dealt with here aside from just being recognised.) * * the list of variable assignments is composed of ident=token pairs. * An ident can contain alphanumeric characters and _, much like a C * language identifier. If, while scanning, scan() encounters a invalid * variable assignment (such as x,y=token), it will consider that to be * the first argument in the command. * * scan returns a pointer to a command structure that contains the text * of the command (c_text), the index of the start of the first argument * in the command (c_fs), and the index of the end of the first argument * in the command (c_fe). All items to the left of c_fs are variable * assignments. All items to the right of c_fe are arguments and i/o * redirection. (For a control block, the control expression is reparsed * by passing c_text + c_fe to the scanner; this allows expressions like * if a=7 true; then echo $a; fi to properly echo "7") * * Some exceptions and things to notice: * * o The keywords IF, THEN, ELIF, ELSE, WHILE, and DO cause an * immediate command break, to reduce the work needed to parse the * resulting command. (The command stream if x; then y will scan * to the four commands (if) (x) (then) (y), for example.) * * o At the end of the scan, the terminating token is left on the * input stream for the parser to pick up. Calling scan twice in * a row will lead to a null second command, due to the presence of * the terminator. * * o Comments are prefixed with a token beginning with #, and continue * to the end of the line they are on. */ CMD * scan() { register c; /* latest character read */ register i=0; /* index into output stream */ register CMD *ptr; /* what we're returning to caller */ register fe=(-1), fs=(-1); /* first arg end, first arg start */ register istok=0; /* are we in a token? */ register tts; /* this token start */ register wsf=0; /* word-so-far: to tell if a lhs of a leading ='s */ /* token is a valid identifier */ int esc; /* escaped char after \ */ /* initialize... */ ptr = (CMD *)mmalloc(sizeof *ptr); memset(ptr, 0, sizeof *ptr); ptr->c_next = (CMD *)0; ptr->c_text = mmalloc(50); ptr->c_type = 0; /* and away we go */ while (1) { c = input(); /* first check for interrupted input */ if (c == EOF) { toss: garbage(ptr); return (CMD *)0; } /* then check for end of command */ else if (isdelim(c) || (c == '#' && !istok)) { if (istok) { istok=0; if (fs >= 0 && fe < 0) fe=i; } ptr->c_text[i] = 0; break; } /* then check for whitespace & end of token */ else if (isspace(c)) { if (istok) { wsf=istok=0; if (fs >= 0 && fe < 0) if (isspc(ptr, fs, fe=i)) break; } else /* strip out multiple spaces */ continue; } else { if (c == '<' || c == '>') { if (fs >= 0 && fe < 0) if (isspc(ptr, fs, fe=i)) break; if (fs < 0) /* null first argument :-( */ fs = fe = i; if (c == '<' && !eos()) { c = input(); if (c == EOF) goto toss; if (c == '<') { register char *p; if (istok && tts) APPEND(ptr, ' ', i); APPEND(ptr, '<', i); if ((ptr->c_filein=getddin()) == (char *)0) { garbage(ptr); return (CMD *)0; } for (p=ptr->c_filein; *p; ) APPEND(ptr, *p++, i); APPEND(ptr, ' ', i); continue; } else { unput(c); c = '<'; } } if (istok) { register idx; /* * here we check for magical #> redirection */ for (idx=tts; idxc_text[idx])) break; if (idx < i) /* if not magical, break the line */ APPEND(ptr, ' ', i); } APPEND(ptr, c, i); /* * eat spaces before next token... */ while (!eos() && isspace(c=input())) ; if (c == EOF) goto toss; if (c != '&' && (eos() || isdelim(c) || c == '#')) { fprintf(stderr, "unexpected symbol\n"); goto toss; } istok=1; } if (!istok) { wsf=istok=1; if (fs < 0) fs=i; tts = i; } else if (c == '=' && wsf) if (fs >= 0 && fe < 0) fs=(-1); if (wsf && !isalnum(c) && c != '_') wsf = 0; if (c == '"' || c == '\'' || c == '`') { register char qc = c; do { APPEND(ptr, c, i); if (c == '\\') { if ((esc=input()) == EOF) goto toss; APPEND(ptr, esc, i); } } while ((c=input()) != qc && c != EOF); if (c == EOF) goto toss; } else if (c == '\\') { APPEND(ptr, c, i); if ((c = input()) == EOF) goto toss; } } APPEND(ptr, c, i); } if (c == '#') flushline(); else unput(c); ptr->c_text[i] = 0; ptr->c_fs = fs; ptr->c_fe = fe; isspc(ptr, fs, fe); return ptr; } /* scan */