Logo Search packages:      
Sourcecode: 9base version File versions  Download package

regcomp.c

#include "lib9.h"
#include "regexp9.h"
#include "regcomp.h"

#define     TRUE  1
#define     FALSE 0

/*
 * Parser Information
 */
typedef
struct Node
{
      Reinst*     first;
      Reinst*     last;
}Node;

#define     NSTACK      20
static      Node  andstack[NSTACK];
static      Node  *andp;
static      int   atorstack[NSTACK];
static      int*  atorp;
static      int   cursubid;         /* id of current subexpression */
static      int   subidstack[NSTACK];     /* parallel to atorstack */
static      int*  subidp;
static      int   lastwasand; /* Last token was operand */
static      int   nbra;
static      char* exprp;            /* pointer to next character in source expression */
static      int   lexdone;
static      int   nclass;
static      Reclass*classp;
static      Reinst*     freep;
static      int   errors;
static      Rune  yyrune;           /* last lex'd rune */
static      Reclass*yyclassp; /* last lex'd class */

/* predeclared crap */
static      void  operator(int);
static      void  pushand(Reinst*, Reinst*);
static      void  pushator(int);
static      void  evaluntil(int);
static      int   bldcclass(void);

static jmp_buf regkaboom;

static      void
rcerror(char *s)
{
      errors++;
      regerror(s);
      longjmp(regkaboom, 1);
}

static      Reinst*
newinst(int t)
{
      freep->type = t;
      freep->u2.left = 0;
      freep->u1.right = 0;
      return freep++;
}

static      void
operand(int t)
{
      Reinst *i;

      if(lastwasand)
            operator(CAT);    /* catenate is implicit */
      i = newinst(t);

      if(t == CCLASS || t == NCCLASS)
            i->u1.cp = yyclassp;
      if(t == RUNE)
            i->u1.r = yyrune;

      pushand(i, i);
      lastwasand = TRUE;
}

static      void
operator(int t)
{
      if(t==RBRA && --nbra<0)
            rcerror("unmatched right paren");
      if(t==LBRA){
            if(++cursubid >= NSUBEXP)
                  rcerror ("too many subexpressions");
            nbra++;
            if(lastwasand)
                  operator(CAT);
      } else
            evaluntil(t);
      if(t != RBRA)
            pushator(t);
      lastwasand = FALSE;
      if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
            lastwasand = TRUE;      /* these look like operands */
}

static      void
regerr2(char *s, int c)
{
      char buf[100];
      char *cp = buf;
      while(*s)
            *cp++ = *s++;
      *cp++ = c;
      *cp = '\0'; 
      rcerror(buf);
}

static      void
cant(char *s)
{
      char buf[100];
      strcpy(buf, "can't happen: ");
      strcat(buf, s);
      rcerror(buf);
}

static      void
pushand(Reinst *f, Reinst *l)
{
      if(andp >= &andstack[NSTACK])
            cant("operand stack overflow");
      andp->first = f;
      andp->last = l;
      andp++;
}

static      void
pushator(int t)
{
      if(atorp >= &atorstack[NSTACK])
            cant("operator stack overflow");
      *atorp++ = t;
      *subidp++ = cursubid;
}

static      Node*
popand(int op)
{
      Reinst *inst;

      if(andp <= &andstack[0]){
            regerr2("missing operand for ", op);
            inst = newinst(NOP);
            pushand(inst,inst);
      }
      return --andp;
}

static      int
popator(void)
{
      if(atorp <= &atorstack[0])
            cant("operator stack underflow");
      --subidp;
      return *--atorp;
}

static      void
evaluntil(int pri)
{
      Node *op1, *op2;
      Reinst *inst1, *inst2;

      while(pri==RBRA || atorp[-1]>=pri){
            switch(popator()){
            default:
                  rcerror("unknown operator in evaluntil");
                  break;
            case LBRA:        /* must have been RBRA */
                  op1 = popand('(');
                  inst2 = newinst(RBRA);
                  inst2->u1.subid = *subidp;
                  op1->last->u2.next = inst2;
                  inst1 = newinst(LBRA);
                  inst1->u1.subid = *subidp;
                  inst1->u2.next = op1->first;
                  pushand(inst1, inst2);
                  return;
            case OR:
                  op2 = popand('|');
                  op1 = popand('|');
                  inst2 = newinst(NOP);
                  op2->last->u2.next = inst2;
                  op1->last->u2.next = inst2;
                  inst1 = newinst(OR);
                  inst1->u1.right = op1->first;
                  inst1->u2.left = op2->first;
                  pushand(inst1, inst2);
                  break;
            case CAT:
                  op2 = popand(0);
                  op1 = popand(0);
                  op1->last->u2.next = op2->first;
                  pushand(op1->first, op2->last);
                  break;
            case STAR:
                  op2 = popand('*');
                  inst1 = newinst(OR);
                  op2->last->u2.next = inst1;
                  inst1->u1.right = op2->first;
                  pushand(inst1, inst1);
                  break;
            case PLUS:
                  op2 = popand('+');
                  inst1 = newinst(OR);
                  op2->last->u2.next = inst1;
                  inst1->u1.right = op2->first;
                  pushand(op2->first, inst1);
                  break;
            case QUEST:
                  op2 = popand('?');
                  inst1 = newinst(OR);
                  inst2 = newinst(NOP);
                  inst1->u2.left = inst2;
                  inst1->u1.right = op2->first;
                  op2->last->u2.next = inst2;
                  pushand(inst1, inst2);
                  break;
            }
      }
}

static      Reprog*
optimize(Reprog *pp)
{
      Reinst *inst, *target;
      int size;
      Reprog *npp;
      Reclass *cl;
      int diff;

      /*
       *  get rid of NOOP chains
       */
      for(inst=pp->firstinst; inst->type!=END; inst++){
            target = inst->u2.next;
            while(target->type == NOP)
                  target = target->u2.next;
            inst->u2.next = target;
      }

      /*
       *  The original allocation is for an area larger than
       *  necessary.  Reallocate to the actual space used
       *  and then relocate the code.
       */
      size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
      npp = realloc(pp, size);
      if(npp==0 || npp==pp)
            return pp;
      diff = (char *)npp - (char *)pp;
      freep = (Reinst *)((char *)freep + diff);
      for(inst=npp->firstinst; inst<freep; inst++){
            switch(inst->type){
            case OR:
            case STAR:
            case PLUS:
            case QUEST:
                  *(char **)&inst->u1.right += diff;
                  break;
            case CCLASS:
            case NCCLASS:
                  *(char **)&inst->u1.right += diff;
                  cl = inst->u1.cp;
                  *(char **)&cl->end += diff;
                  break;
            }
            *(char **)&inst->u2.left += diff;
      }
      *(char **)&npp->startinst += diff;
      return npp;
}

#ifdef      DEBUG
static      void
dumpstack(void){
      Node *stk;
      int *ip;

      print("operators\n");
      for(ip=atorstack; ip<atorp; ip++)
            print("0%o\n", *ip);
      print("operands\n");
      for(stk=andstack; stk<andp; stk++)
            print("0%o\t0%o\n", stk->first->type, stk->last->type);
}

static      void
dump(Reprog *pp)
{
      Reinst *l;
      Rune *p;

      l = pp->firstinst;
      do{
            print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
                  l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
            if(l->type == RUNE)
                  print("\t%C\n", l->u1.r);
            else if(l->type == CCLASS || l->type == NCCLASS){
                  print("\t[");
                  if(l->type == NCCLASS)
                        print("^");
                  for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
                        if(p[0] == p[1])
                              print("%C", p[0]);
                        else
                              print("%C-%C", p[0], p[1]);
                  print("]\n");
            } else
                  print("\n");
      }while(l++->type);
}
#endif

static      Reclass*
newclass(void)
{
      if(nclass >= NCLASS)
            regerr2("too many character classes; limit", NCLASS+'0');
      return &(classp[nclass++]);
}

static      int
nextc(Rune *rp)
{
      if(lexdone){
            *rp = 0;
            return 1;
      }
      exprp += chartorune(rp, exprp);
      if(*rp == '\\'){
            exprp += chartorune(rp, exprp);
            return 1;
      }
      if(*rp == 0)
            lexdone = 1;
      return 0;
}

static      int
lex(int literal, int dot_type)
{
      int quoted;

      quoted = nextc(&yyrune);
      if(literal || quoted){
            if(yyrune == 0)
                  return END;
            return RUNE;
      }

      switch(yyrune){
      case 0:
            return END;
      case '*':
            return STAR;
      case '?':
            return QUEST;
      case '+':
            return PLUS;
      case '|':
            return OR;
      case '.':
            return dot_type;
      case '(':
            return LBRA;
      case ')':
            return RBRA;
      case '^':
            return BOL;
      case '$':
            return EOL;
      case '[':
            return bldcclass();
      }
      return RUNE;
}

static int
bldcclass(void)
{
      int type;
      Rune r[NCCRUNE];
      Rune *p, *ep, *np;
      Rune rune;
      int quoted;

      /* we have already seen the '[' */
      type = CCLASS;
      yyclassp = newclass();

      /* look ahead for negation */
      /* SPECIAL CASE!!! negated classes don't match \n */
      ep = r;
      quoted = nextc(&rune);
      if(!quoted && rune == '^'){
            type = NCCLASS;
            quoted = nextc(&rune);
            *ep++ = '\n';
            *ep++ = '\n';
      }

      /* parse class into a set of spans */
      for(; ep<&r[NCCRUNE];){
            if(rune == 0){
                  rcerror("malformed '[]'");
                  return 0;
            }
            if(!quoted && rune == ']')
                  break;
            if(!quoted && rune == '-'){
                  if(ep == r){
                        rcerror("malformed '[]'");
                        return 0;
                  }
                  quoted = nextc(&rune);
                  if((!quoted && rune == ']') || rune == 0){
                        rcerror("malformed '[]'");
                        return 0;
                  }
                  *(ep-1) = rune;
            } else {
                  *ep++ = rune;
                  *ep++ = rune;
            }
            quoted = nextc(&rune);
      }

      /* sort on span start */
      for(p = r; p < ep; p += 2){
            for(np = p; np < ep; np += 2)
                  if(*np < *p){
                        rune = np[0];
                        np[0] = p[0];
                        p[0] = rune;
                        rune = np[1];
                        np[1] = p[1];
                        p[1] = rune;
                  }
      }

      /* merge spans */
      np = yyclassp->spans;
      p = r;
      if(r == ep)
            yyclassp->end = np;
      else {
            np[0] = *p++;
            np[1] = *p++;
            for(; p < ep; p += 2)
                  if(p[0] <= np[1]){
                        if(p[1] > np[1])
                              np[1] = p[1];
                  } else {
                        np += 2;
                        np[0] = p[0];
                        np[1] = p[1];
                  }
            yyclassp->end = np+2;
      }

      return type;
}

static      Reprog*
regcomp1(char *s, int literal, int dot_type)
{
      int token;
      Reprog *volatile pp;

      /* get memory for the program */
      pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
      if(pp == 0){
            regerror("out of memory");
            return 0;
      }
      freep = pp->firstinst;
      classp = pp->class;
      errors = 0;

      if(setjmp(regkaboom))
            goto out;

      /* go compile the sucker */
      lexdone = 0;
      exprp = s;
      nclass = 0;
      nbra = 0;
      atorp = atorstack;
      andp = andstack;
      subidp = subidstack;
      lastwasand = FALSE;
      cursubid = 0;

      /* Start with a low priority operator to prime parser */
      pushator(START-1);
      while((token = lex(literal, dot_type)) != END){
            if((token&0300) == OPERATOR)
                  operator(token);
            else
                  operand(token);
      }

      /* Close with a low priority operator */
      evaluntil(START);

      /* Force END */
      operand(END);
      evaluntil(START);
#ifdef DEBUG
      dumpstack();
#endif
      if(nbra)
            rcerror("unmatched left paren");
      --andp;     /* points to first and only operand */
      pp->startinst = andp->first;
#ifdef DEBUG
      dump(pp);
#endif
      pp = optimize(pp);
#ifdef DEBUG
      print("start: %d\n", andp->first-pp->firstinst);
      dump(pp);
#endif
out:
      if(errors){
            free(pp);
            pp = 0;
      }
      return pp;
}

extern      Reprog*
regcomp(char *s)
{
      return regcomp1(s, 0, ANY);
}

extern      Reprog*
regcomplit(char *s)
{
      return regcomp1(s, 1, ANY);
}

extern      Reprog*
regcompnl(char *s)
{
      return regcomp1(s, 0, ANYNL);
}

Generated by  Doxygen 1.6.0   Back to index