Main Page | Class Hierarchy | Alphabetical List | Data Structures | Directories | File List | Data Fields | Globals

lex.c

Go to the documentation of this file.
00001 #include <stdio.h>
00002 #include <stdlib.h>
00003 #include <string.h>
00004 #include "cpp.h"
00005 
00006 /*
00007  * lexical FSM encoding
00008  *   when in state state, and one of the characters
00009  *   in ch arrives, enter nextstate.
00010  *   States >= S_SELF are either final, or at least require special action.
00011  *   In 'fsm' there is a line for each state X charset X nextstate.
00012  *   List chars that overwrite previous entries later (e.g. C_ALPH
00013  *   can be overridden by '_' in a later entry); C_XX is
00014  *   the universal set, and should always be first.
00015  *   States at or above S_SELF are represented in the big table as negative values.
00016  *   S_SELF and S_SELFB encode the resulting token type in the upper bits.
00017  *   These actions differ in that S_SELF doesn't have a lookahead char,
00018  *   S_SELFB does.
00019  *
00020  *   The encoding is blown out into a big table for time-efficiency.
00021  *   Entries have
00022  *      nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
00023  */
00024 
/*
 * MAXSTATE is the number of state columns in the expanded table and
 * the value at which the "action" states (S_SELF and up) begin.
 */
#define MAXSTATE 32

/*
 * ACT packs a token type and an action state into one table value:
 * the action lives in the low 7 bits, the token type above them.
 * GETACT recovers the token type (9 bits) from such a value.
 * Arguments and results are fully parenthesised so the macros are
 * safe inside larger expressions, e.g. GETACT(x) == NAME.
 */
#define ACT(tok,act)    (((tok)<<7)+(act))
#define QBSBIT  0100    /* marks an entry needing the '?' / '\' check */
#define GETACT(st)  (((st)>>7)&0x1ff)

/*
 * character classes: small codes used in place of a literal
 * character in an fsm rule.
 */
#define C_WS    1
#define C_ALPH  2       /* letters and '_' */
#define C_NUM   3       /* decimal digits */
#define C_EOF   4
#define C_XX    5       /* every character */

/*
 * Lexer states.  Values below MAXSTATE are ordinary scanning states;
 * values from S_SELF upward are action states.  The enumerator order
 * fixes the numeric values and must not be changed.
 */
enum state {
    /* ordinary states, numbered from 0 */
    START=0,
    NUM1, NUM2, NUM3,
    ID1,
    ST1, ST2, ST3,
    COM1, COM2, COM3, COM4,
    CC1, CC2,
    WS1,
    PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1,
    CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1,

    /* action states */
    S_SELF=MAXSTATE,    /* emit token, consuming the current char */
    S_SELFB,            /* emit token, current char is lookahead */
    S_EOF, S_NL, S_EOFSTR,
    S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME
};
00044 
00045 int tottok;
00046 int tokkind[256];
00047 struct  fsm {
00048     int state;      /* if in this state */
00049     uchar   ch[4];      /* and see one of these characters */
00050     int nextstate;  /* enter this state if +ve */
00051 };
00052 
00053 /*const*/ struct fsm fsm[] = {
00054     /* start state */
00055     START,  { C_XX },   ACT(UNCLASS,S_SELF),
00056     START,  { ' ', '\t', '\v' },    WS1,
00057     START,  { C_NUM },  NUM1,
00058     START,  { '.' },    NUM3,
00059     START,  { C_ALPH }, ID1,
00060     START,  { 'L' },    ST1,
00061     START,  { '"' },    ST2,
00062     START,  { '\'' },   CC1,
00063     START,  { '/' },    COM1,
00064     START,  { EOFC },   S_EOF,
00065     START,  { '\n' },   S_NL,
00066     START,  { '-' },    MINUS1,
00067     START,  { '+' },    PLUS1,
00068     START,  { '<' },    LT1,
00069     START,  { '>' },    GT1,
00070     START,  { '=' },    ASG1,
00071     START,  { '!' },    NOT1,
00072     START,  { '&' },    AND1,
00073     START,  { '|' },    OR1,
00074     START,  { '#' },    SHARP1,
00075     START,  { '%' },    PCT1,
00076     START,  { '[' },    ACT(SBRA,S_SELF),
00077     START,  { ']' },    ACT(SKET,S_SELF),
00078     START,  { '(' },    ACT(LP,S_SELF),
00079     START,  { ')' },    ACT(RP,S_SELF),
00080     START,  { '*' },    STAR1,
00081     START,  { ',' },    ACT(COMMA,S_SELF),
00082     START,  { '?' },    ACT(QUEST,S_SELF),
00083     START,  { ':' },    ACT(COLON,S_SELF),
00084     START,  { ';' },    ACT(SEMIC,S_SELF),
00085     START,  { '{' },    ACT(CBRA,S_SELF),
00086     START,  { '}' },    ACT(CKET,S_SELF),
00087     START,  { '~' },    ACT(TILDE,S_SELF),
00088     START,  { '^' },    CIRC1,
00089 
00090     /* saw a digit */
00091     NUM1,   { C_XX },   ACT(NUMBER,S_SELFB),
00092     NUM1,   { C_NUM, C_ALPH, '.' }, NUM1,
00093     NUM1,   { 'E', 'e' },   NUM2,
00094     NUM1,   { '_' },    ACT(NUMBER,S_SELFB),
00095 
00096     /* saw possible start of exponent, digits-e */
00097     NUM2,   { C_XX },   ACT(NUMBER,S_SELFB),
00098     NUM2,   { '+', '-' },   NUM1,
00099     NUM2,   { C_NUM, C_ALPH },  NUM1,
00100     NUM2,   { '_' },    ACT(NUMBER,S_SELFB),
00101 
00102     /* saw a '.', which could be a number or an operator */
00103     NUM3,   { C_XX },   ACT(DOT,S_SELFB),
00104     NUM3,   { '.' },    DOTS1,
00105     NUM3,   { C_NUM },  NUM1,
00106 
00107     DOTS1,  { C_XX },   ACT(UNCLASS, S_SELFB),
00108     DOTS1,  { C_NUM },  NUM1,
00109     DOTS1,  { '.' },    ACT(ELLIPS, S_SELF),
00110 
00111     /* saw a letter or _ */
00112     ID1,    { C_XX },   ACT(NAME,S_NAME),
00113     ID1,    { C_ALPH, C_NUM },  ID1,
00114 
00115     /* saw L (start of wide string?) */
00116     ST1,    { C_XX },   ACT(NAME,S_NAME),
00117     ST1,    { C_ALPH, C_NUM },  ID1,
00118     ST1,    { '"' },    ST2,
00119     ST1,    { '\'' },   CC1,
00120 
00121     /* saw " beginning string */
00122     ST2,    { C_XX },   ST2,
00123     ST2,    { '"' },    ACT(STRING, S_SELF),
00124     ST2,    { '\\' },   ST3,
00125     ST2,    { '\n' },   S_STNL,
00126     ST2,    { EOFC },   S_EOFSTR,
00127 
00128     /* saw \ in string */
00129     ST3,    { C_XX },   ST2,
00130     ST3,    { '\n' },   S_STNL,
00131     ST3,    { EOFC },   S_EOFSTR,
00132 
00133     /* saw ' beginning character const */
00134     CC1,    { C_XX },   CC1,
00135     CC1,    { '\'' },   ACT(CCON, S_SELF),
00136     CC1,    { '\\' },   CC2,
00137     CC1,    { '\n' },   S_STNL,
00138     CC1,    { EOFC },   S_EOFSTR,
00139 
00140     /* saw \ in ccon */
00141     CC2,    { C_XX },   CC1,
00142     CC2,    { '\n' },   S_STNL,
00143     CC2,    { EOFC },   S_EOFSTR,
00144 
00145     /* saw /, perhaps start of comment */
00146     COM1,   { C_XX },   ACT(SLASH, S_SELFB),
00147     COM1,   { '=' },    ACT(ASSLASH, S_SELF),
00148     COM1,   { '*' },    COM2,
00149     COM1,   { '/' },    COM4,
00150 
00151     /* saw / then *, start of comment */
00152     COM2,   { C_XX },   COM2,
00153     COM2,   { '\n' },   S_COMNL,
00154     COM2,   { '*' },    COM3,
00155     COM2,   { EOFC },   S_EOFCOM,
00156 
00157     /* saw the * possibly ending a comment */
00158     COM3,   { C_XX },   COM2,
00159     COM3,   { '\n' },   S_COMNL,
00160     COM3,   { '*' },    COM3,
00161     COM3,   { '/' },    S_COMMENT,
00162 
00163     /* // comment */
00164     COM4,   { C_XX },   COM4,
00165     COM4,   { '\n' },   S_NL,
00166     COM4,   { EOFC },   S_EOFCOM,
00167 
00168     /* saw white space, eat it up */
00169     WS1,    { C_XX },   S_WS,
00170     WS1,    { ' ', '\t', '\v' },    WS1,
00171 
00172     /* saw -, check --, -=, -> */
00173     MINUS1, { C_XX },   ACT(MINUS, S_SELFB),
00174     MINUS1, { '-' },    ACT(MMINUS, S_SELF),
00175     MINUS1, { '=' },    ACT(ASMINUS,S_SELF),
00176     MINUS1, { '>' },    ACT(ARROW,S_SELF),
00177 
00178     /* saw +, check ++, += */
00179     PLUS1,  { C_XX },   ACT(PLUS, S_SELFB),
00180     PLUS1,  { '+' },    ACT(PPLUS, S_SELF),
00181     PLUS1,  { '=' },    ACT(ASPLUS, S_SELF),
00182 
00183     /* saw <, check <<, <<=, <= */
00184     LT1,    { C_XX },   ACT(LT, S_SELFB),
00185     LT1,    { '<' },    LT2,
00186     LT1,    { '=' },    ACT(LEQ, S_SELF),
00187     LT2,    { C_XX },   ACT(LSH, S_SELFB),
00188     LT2,    { '=' },    ACT(ASLSH, S_SELF),
00189 
00190     /* saw >, check >>, >>=, >= */
00191     GT1,    { C_XX },   ACT(GT, S_SELFB),
00192     GT1,    { '>' },    GT2,
00193     GT1,    { '=' },    ACT(GEQ, S_SELF),
00194     GT2,    { C_XX },   ACT(RSH, S_SELFB),
00195     GT2,    { '=' },    ACT(ASRSH, S_SELF),
00196 
00197     /* = */
00198     ASG1,   { C_XX },   ACT(ASGN, S_SELFB),
00199     ASG1,   { '=' },    ACT(EQ, S_SELF),
00200 
00201     /* ! */
00202     NOT1,   { C_XX },   ACT(NOT, S_SELFB),
00203     NOT1,   { '=' },    ACT(NEQ, S_SELF),
00204 
00205     /* & */
00206     AND1,   { C_XX },   ACT(AND, S_SELFB),
00207     AND1,   { '&' },    ACT(LAND, S_SELF),
00208     AND1,   { '=' },    ACT(ASAND, S_SELF),
00209 
00210     /* | */
00211     OR1,    { C_XX },   ACT(OR, S_SELFB),
00212     OR1,    { '|' },    ACT(LOR, S_SELF),
00213     OR1,    { '=' },    ACT(ASOR, S_SELF),
00214 
00215     /* # */
00216     SHARP1, { C_XX },   ACT(SHARP, S_SELFB),
00217     SHARP1, { '#' },    ACT(DSHARP, S_SELF),
00218 
00219     /* % */
00220     PCT1,   { C_XX },   ACT(PCT, S_SELFB),
00221     PCT1,   { '=' },    ACT(ASPCT, S_SELF),
00222 
00223     /* * */
00224     STAR1,  { C_XX },   ACT(STAR, S_SELFB),
00225     STAR1,  { '=' },    ACT(ASSTAR, S_SELF),
00226 
00227     /* ^ */
00228     CIRC1,  { C_XX },   ACT(CIRC, S_SELFB),
00229     CIRC1,  { '=' },    ACT(ASCIRC, S_SELF),
00230 
00231     -1
00232 };
00233 
00234 /* first index is char, second is state */
00235 /* increase #states to power of 2 to encourage use of shift */
00236 short   bigfsm[256][MAXSTATE];
00237 
00238 void
00239 expandlex(void)
00240 {
00241     /*const*/ struct fsm *fp;
00242     int i, j, nstate;
00243 
00244     for (fp = fsm; fp->state>=0; fp++) {
00245         for (i=0; fp->ch[i]; i++) {
00246             nstate = fp->nextstate;
00247             if (nstate >= S_SELF)
00248                 nstate = ~nstate;
00249             switch (fp->ch[i]) {
00250 
00251             case C_XX:      /* random characters */
00252                 for (j=0; j<256; j++)
00253                     bigfsm[j][fp->state] = nstate;
00254                 continue;
00255             case C_ALPH:
00256                 for (j=0; j<=256; j++)
00257                     if ('a'<=j&&j<='z' || 'A'<=j&&j<='Z'
00258                       || j=='_')
00259                         bigfsm[j][fp->state] = nstate;
00260                 continue;
00261             case C_NUM:
00262                 for (j='0'; j<='9'; j++)
00263                     bigfsm[j][fp->state] = nstate;
00264                 continue;
00265             default:
00266                 bigfsm[fp->ch[i]][fp->state] = nstate;
00267             }
00268         }
00269     }
00270     /* install special cases for ? (trigraphs),  \ (splicing), runes, and EOB */
00271     for (i=0; i<MAXSTATE; i++) {
00272         for (j=0; j<0xFF; j++)
00273             if (j=='?' || j=='\\') {
00274                 if (bigfsm[j][i]>0)
00275                     bigfsm[j][i] = ~bigfsm[j][i];
00276                 bigfsm[j][i] &= ~QBSBIT;
00277             }
00278         bigfsm[EOB][i] = ~S_EOB;
00279         if (bigfsm[EOFC][i]>=0)
00280             bigfsm[EOFC][i] = ~S_EOF;
00281     }
00282 }
00283 
00284 void
00285 fixlex(void)
00286 {
00287     /* do C++ comments? */
00288     if (Cplusplus==0)
00289         bigfsm['/'][COM1] = bigfsm['x'][COM1];
00290 }
00291 
00292 /*
00293  * fill in a row of tokens from input, terminated by NL or END
00294  * First token is put at trp->lp.
00295  * Reset is non-zero when the input buffer can be "rewound."
00296  * The value is a flag indicating that possible macros have
00297  * been seen in the row.
00298  */
00299 int
00300 gettokens(Tokenrow *trp, int reset)
00301 {
00302     register int c, state, oldstate;
00303     register uchar *ip;
00304     register Token *tp, *maxp;
00305     int runelen;
00306     Source *s = cursource;
00307     int nmac = 0;
00308     extern char outbuf[];
00309 
00310     tp = trp->lp;
00311     ip = s->inp;
00312     if (reset) {
00313         s->lineinc = 0;
00314         if (ip>=s->inl) {       /* nothing in buffer */
00315             s->inl = s->inb;
00316             fillbuf(s);
00317             ip = s->inp = s->inb;
00318         } else if (ip >= s->inb+(3*INS/4)) {
00319             memmove(s->inb, ip, 4+s->inl-ip);
00320             s->inl = s->inb+(s->inl-ip);
00321             ip = s->inp = s->inb;
00322         }
00323     }
00324     maxp = &trp->bp[trp->max];
00325     runelen = 1;
00326     for (;;) {
00327        continue2:
00328         if (tp>=maxp) {
00329             trp->lp = tp;
00330             tp = growtokenrow(trp);
00331             maxp = &trp->bp[trp->max];
00332         }
00333         tp->type = UNCLASS;
00334         tp->hideset = 0;
00335         tp->t = ip;
00336         tp->wslen = 0;
00337         tp->flag = 0;
00338         state = START;
00339         for (;;) {
00340             oldstate = state;
00341             c = *ip;
00342             if ((state = bigfsm[c][state]) >= 0) {
00343                 ip += runelen;
00344                 runelen = 1;
00345                 continue;
00346             }
00347             state = ~state;
00348         reswitch:
00349             switch (state&0177) {
00350             case S_SELF:
00351                 ip += runelen;
00352                 runelen = 1;
00353             case S_SELFB:
00354                 tp->type = GETACT(state);
00355                 tp->len = ip - tp->t;
00356                 tp++;
00357                 goto continue2;
00358 
00359             case S_NAME:    /* like S_SELFB but with nmac check */
00360                 tp->type = NAME;
00361                 tp->len = ip - tp->t;
00362                 nmac |= quicklook(tp->t[0], tp->len>1?tp->t[1]:0);
00363                 tp++;
00364                 goto continue2;
00365 
00366             case S_WS:
00367                 tp->wslen = ip - tp->t;
00368                 tp->t = ip;
00369                 state = START;
00370                 continue;
00371 
00372             default:
00373                 if ((state&QBSBIT)==0) {
00374                     ip += runelen;
00375                     runelen = 1;
00376                     continue;
00377                 }
00378                 state &= ~QBSBIT;
00379                 s->inp = ip;
00380                 if (c=='?') {   /* check trigraph */
00381                     if (trigraph(s)) {
00382                         state = oldstate;
00383                         continue;
00384                     }
00385                     goto reswitch;
00386                 }
00387                 if (c=='\\') { /* line-folding */
00388                     if (foldline(s)) {
00389                         s->lineinc++;
00390                         state = oldstate;
00391                         continue;
00392                     }
00393                     goto reswitch;
00394                 }
00395                 error(WARNING, "Lexical botch in cpp");
00396                 ip += runelen;
00397                 runelen = 1;
00398                 continue;
00399 
00400             case S_EOB:
00401                 s->inp = ip;
00402                 fillbuf(cursource);
00403                 state = oldstate;
00404                 continue;
00405 
00406             case S_EOF:
00407                 tp->type = END;
00408                 tp->len = 0;
00409                 s->inp = ip;
00410                 if (tp!=trp->bp && (tp-1)->type!=NL && cursource->fd!=-1)
00411                     error(WARNING,"No newline at end of file");
00412                 trp->lp = tp+1;
00413                 return nmac;
00414 
00415             case S_STNL:
00416                 error(ERROR, "Unterminated string or char const");
00417             case S_NL:
00418                 tp->t = ip;
00419                 tp->type = NL;
00420                 tp->len = 1;
00421                 tp->wslen = 0;
00422                 s->lineinc++;
00423                 s->inp = ip+1;
00424                 trp->lp = tp+1;
00425                 return nmac;
00426 
00427             case S_EOFSTR:
00428                 error(FATAL, "EOF in string or char constant");
00429                 break;
00430 
00431             case S_COMNL:
00432                 s->lineinc++;
00433                 state = COM2;
00434                 ip += runelen;
00435                 runelen = 1;
00436                 if (ip >= s->inb+(7*INS/8)) { /* very long comment */
00437                     memmove(tp->t, ip, 4+s->inl-ip);
00438                     s->inl -= ip-tp->t;
00439                     ip = tp->t+1;
00440                 }
00441                 continue;
00442 
00443             case S_EOFCOM:
00444                 error(WARNING, "EOF inside comment");
00445                 --ip;
00446             case S_COMMENT:
00447                 ++ip;
00448                 tp->t = ip;
00449                 tp->t[-1] = ' ';
00450                 tp->wslen = 1;
00451                 state = START;
00452                 continue;
00453             }
00454             break;
00455         }
00456         ip += runelen;
00457         runelen = 1;
00458         tp->len = ip - tp->t;
00459         tp++;
00460     }
00461 }
00462 
00463 /* have seen ?; handle the trigraph it starts (if any) else 0 */
00464 int
00465 trigraph(Source *s)
00466 {
00467     int c;
00468 
00469     while (s->inp+2 >= s->inl && fillbuf(s)!=EOF)
00470         ;
00471     if (s->inp[1]!='?')
00472         return 0;
00473     c = 0;
00474     switch(s->inp[2]) {
00475     case '=':
00476         c = '#'; break;
00477     case '(':
00478         c = '['; break;
00479     case '/':
00480         c = '\\'; break;
00481     case ')':
00482         c = ']'; break;
00483     case '\'':
00484         c = '^'; break;
00485     case '<':
00486         c = '{'; break;
00487     case '!':
00488         c = '|'; break;
00489     case '>':
00490         c = '}'; break;
00491     case '-':
00492         c = '~'; break;
00493     }
00494     if (c) {
00495         *s->inp = c;
00496         memmove(s->inp+1, s->inp+3, s->inl-s->inp+2);
00497         s->inl -= 2;
00498     }
00499     return c;
00500 }
00501 
00502 int
00503 foldline(Source *s)
00504 {
00505     while (s->inp+1 >= s->inl && fillbuf(s)!=EOF)
00506         ;
00507     if (s->inp[1] == '\n') {
00508         memmove(s->inp, s->inp+2, s->inl-s->inp+3);
00509         s->inl -= 2;
00510         return 1;
00511     }
00512     return 0;
00513 }
00514 
00515 int
00516 fillbuf(Source *s)
00517 {
00518     int n, nr;
00519 
00520     nr = INS/8;
00521     if ((char *)s->inl+nr > (char *)s->inb+INS)
00522         error(FATAL, "Input buffer overflow");
00523     if (s->fd<0 || (n=read(s->fd, (char *)s->inl, INS/8)) <= 0)
00524         n = 0;
00525     if ((*s->inp&0xff) == EOB) /* sentinel character appears in input */
00526         *s->inp = EOFC;
00527     s->inl += n;
00528     s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOB;
00529     if (n==0) {
00530         s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOFC;
00531         return EOF;
00532     }
00533     return 0;
00534 }
00535 
00536 /*
00537  * Push down to new source of characters.
00538  * If fd>0 and str==NULL, then from a file `name';
00539  * if fd==-1 and str, then from the string.
00540  */
00541 Source *
00542 setsource(char *name, int fd, char *str)
00543 {
00544     Source *s = new(Source);
00545     int len;
00546 
00547     s->line = 1;
00548     s->lineinc = 0;
00549     s->fd = fd;
00550     s->filename = name;
00551     s->next = cursource;
00552     s->ifdepth = 0;
00553     cursource = s;
00554     /* slop at right for EOB */
00555     if (str) {
00556         len = strlen(str);
00557         s->inb = domalloc(len+4);
00558         s->inp = s->inb;
00559         strncpy((char *)s->inp, str, len);
00560     } else {
00561         s->inb = domalloc(INS+4);
00562         s->inp = s->inb;
00563         len = 0;
00564     }
00565     s->inl = s->inp+len;
00566     s->inl[0] = s->inl[1] = EOB;
00567     return s;
00568 }
00569 
00570 void
00571 unsetsource(void)
00572 {
00573     Source *s = cursource;
00574 
00575     if (s->fd>=0) {
00576         close(s->fd);
00577         dofree(s->inb);
00578     }
00579     cursource = s->next;
00580     dofree(s);
00581 }

Generated on Thu Aug 25 12:38:08 2005 for Quake III Arena by  doxygen 1.3.9.1