Logo Search packages:      
Sourcecode: w3mmee version File versions  Download package

regex.c

/* 
 * regex: Regular expression pattern match library
 * 
 * by A.ITO, December 1989
 */

#ifdef REGEX_DEBUG
#include <sys/types.h>
#include <malloc.h>
#endif                        /* REGEX_DEBUG */
#include <ctype.h>
#include <gc.h>
#ifdef __EMX__
#include <strings.h>
#endif
#include "fm.h"
#include "regex.h"

/*
 * Sentinel wide-character values marking the beginning and the end of the
 * input.  They are handed to the parser through regex_desc_core, stored in
 * the search buffer by RegexSearch() when RE_FLAG_BOF / RE_FLAG_EOF is set,
 * and REGEX_EOF is also what re_fetch_wchar() returns once the pattern text
 * is exhausted (non-MOE build).
 */
#ifdef HAVE_MOE
#define REGEX_BOF (mb_notchar_bof)
#define REGEX_EOF (mb_notchar_eof)
#else
#define REGEX_BOF (~1U)
#define REGEX_EOF (~0U)
#endif

#ifdef JP_CHARSET
/* Pack the two bytes at p into one value: first byte in bits 8-15, second in bits 0-7. */
#define RE_KANJI(p)     (((unsigned char)*(p) << 8) | (unsigned char)*((p)+1))
#endif

/* Regex instance used by the wrappers that take no Regex argument
 * (regexCompile, regexSearch, regexMatch, matchedPosition). */
static Regex DefaultRegex;

/*
 * Character-range lists and the escape-macro descriptors built from them.
 *
 * Each wcrx_wcl_t initializer below is {pointer to another wcrx_wcl_t or
 * NULL, character, character}.
 * NOTE(review): presumably a linked list of inclusive [first, last] ranges
 * -- confirm against the wcrx_wcl_t declaration.
 *
 * For every list there is a "wccm_" descriptor (built with a
 * wcrx_compile_class_macro_* function) and a "wcem_" descriptor (built with
 * a wcrx_compile_expr_macro_* function); the "_non_" variants use the
 * *_complement functions on the same list.
 */

/* Newline only. */
static wcrx_wcl_t wcl_nl = {NULL, '\n', '\n'};

static wcrx_compile_macro_t wccm_nl = {{(wcrx_compile_macro_func_t)wcrx_compile_class_macro_class}, &wcl_nl};
static wcrx_compile_macro_t wcem_nl = {{(wcrx_compile_macro_func_t)wcrx_compile_expr_macro_class}, &wcl_nl};

/* Space, chained to carriage return, tab and (via wcl_nl) newline. */
static wcrx_wcl_t wcl_tab = {&wcl_nl, '\t', '\t'};
static wcrx_wcl_t wcl_cr = {&wcl_tab, '\r', '\r'};
static wcrx_wcl_t wcl_space = {&wcl_cr, ' ', ' '};

static wcrx_compile_macro_t wccm_space = {{(wcrx_compile_macro_func_t)wcrx_compile_class_macro_class}, &wcl_space};
static wcrx_compile_macro_t wcem_space = {{(wcrx_compile_macro_func_t)wcrx_compile_expr_macro_class}, &wcl_space};

static wcrx_compile_macro_t wccm_non_space = {{(wcrx_compile_macro_func_t)wcrx_compile_class_macro_complement}, &wcl_space};
static wcrx_compile_macro_t wcem_non_space = {{(wcrx_compile_macro_func_t)wcrx_compile_expr_macro_complement}, &wcl_space};

/* '0'..'9'. */
static wcrx_wcl_t wcl_digit = {NULL, '0', '9'};

static wcrx_compile_macro_t wccm_digit = {{(wcrx_compile_macro_func_t)wcrx_compile_class_macro_class}, &wcl_digit};
static wcrx_compile_macro_t wcem_digit = {{(wcrx_compile_macro_func_t)wcrx_compile_expr_macro_class}, &wcl_digit};

static wcrx_compile_macro_t wccm_non_digit = {{(wcrx_compile_macro_func_t)wcrx_compile_class_macro_complement}, &wcl_digit};
static wcrx_compile_macro_t wcem_non_digit = {{(wcrx_compile_macro_func_t)wcrx_compile_expr_macro_complement}, &wcl_digit};

/* '_', chained to 'a'..'z', 'A'..'Z' and (via wcl_digit) '0'..'9'. */
static wcrx_wcl_t wcl_upper_alpha = {&wcl_digit, 'A', 'Z'};
static wcrx_wcl_t wcl_lower_alpha = {&wcl_upper_alpha, 'a', 'z'};
static wcrx_wcl_t wcl_word = {&wcl_lower_alpha, '_', '_'};

static wcrx_compile_macro_t wccm_word = {{(wcrx_compile_macro_func_t)wcrx_compile_class_macro_class}, &wcl_word};
static wcrx_compile_macro_t wcem_word = {{(wcrx_compile_macro_func_t)wcrx_compile_expr_macro_class}, &wcl_word};

static wcrx_compile_macro_t wccm_non_word = {{(wcrx_compile_macro_func_t)wcrx_compile_class_macro_complement}, &wcl_word};
static wcrx_compile_macro_t wcem_non_word = {{(wcrx_compile_macro_func_t)wcrx_compile_expr_macro_complement}, &wcl_word};

/*
 * Escape-letter table built from the "wccm_" (class macro) descriptors:
 * maps D/S/W/d/n/s/w to the matching descriptor.
 * Entries are listed in ascending character-code order.
 * NOTE(review): presumably the lookup in the wcrx library relies on this
 * ordering -- confirm before inserting new entries.
 */
static wcrx_macro_item_t wccm_tab_itemv[] = {
  {'D', &wccm_non_digit},
  {'S', &wccm_non_space},
  {'W', &wccm_non_word},
  {'d', &wccm_digit},
  {'n', &wccm_nl},
  {'s', &wccm_space},
  {'w', &wccm_word},
};

/* Table descriptor: item vector plus its element count. */
static wcrx_macro_tab_t wccm_tab = {wccm_tab_itemv, sizeof(wccm_tab_itemv) / sizeof(wccm_tab_itemv[0])};

/*
 * Same escape letters as above, but bound to the "wcem_" (expression macro)
 * descriptors.  Kept in the same ascending character-code order.
 */
static wcrx_macro_item_t wcem_tab_itemv[] = {
  {'D', &wcem_non_digit},
  {'S', &wcem_non_space},
  {'W', &wcem_non_word},
  {'d', &wcem_digit},
  {'n', &wcem_nl},
  {'s', &wcem_space},
  {'w', &wcem_word},
};

/* Table descriptor: item vector plus its element count. */
static wcrx_macro_tab_t wcem_tab = {wcem_tab_itemv, sizeof(wcem_tab_itemv) / sizeof(wcem_tab_itemv[0])};

/*
 * State handed to re_fetch_wchar() while a pattern is being compiled.
 */
struct re_fetch_wchar_arg_st {
  char **pcursor;   /* address of the read cursor; advanced as characters are fetched */
  char *ep;         /* one past the last byte of the pattern text */
  int igncase;      /* case-insensitivity flag as stored by newRegex() */
};

/*
 * re_fetch_wchar: reader callback installed in regex_desc_core.
 *
 * arg points to a struct re_fetch_wchar_arg_st.  Fetches one wide character
 * from *pcursor, advances *pcursor past it and returns the character.
 */
static unsigned int
re_fetch_wchar(void *arg)
{
  struct re_fetch_wchar_arg_st *p;
  uirx_wc_t wc;
#ifdef HAVE_MOE
  int cn;

  p = arg;

  /* wc is filled in by mb_mem_to_wchar_internal; advance by the byte count
   * it reports, or by a single byte when it reports none.
   * NOTE(review): no explicit end-of-pattern test is visible in this
   * branch; presumably the converter yields the EOF sentinel itself --
   * confirm. */
  if ((cn = mb_mem_to_wchar_internal(*p->pcursor, p->ep - *p->pcursor, wc)) > 0)
    *p->pcursor += cn;
  else
    ++(*p->pcursor);
#else

  p = arg;

  /* Pattern exhausted: report the EOF sentinel without moving the cursor. */
  if (*p->pcursor >= p->ep)
    wc = REGEX_EOF;
  else
#ifdef JP_CHARSET
  /* Two-byte character: needs a lead byte and at least one more byte. */
  if (*p->pcursor + 1 < p->ep && IS_KANJI1(**p->pcursor)) {
    wc = RE_KANJI(*p->pcursor);
    *p->pcursor += 2;
  }
  else
#endif
    {
      /* Plain single byte, taken as unsigned so values >= 0x80 stay positive. */
      wc = (unsigned char)**p->pcursor;
      ++(*p->pcursor);
    }
#endif

  return wc;
}

/*
 * re_ws_tolower: lower-case, in place, the wide characters in [ws, ws_end).
 *
 * Only values that are safe to hand to <ctype.h> are examined:
 *   - multibyte builds (HAVE_MOE or LANG == JA): 7-bit ASCII only;
 *   - single-byte builds: anything that fits in one byte.
 * Everything else is left untouched.  This matters because the search
 * buffer may contain the REGEX_BOF / REGEX_EOF sentinels, which are far
 * outside the unsigned char range; passing them to isupper() is undefined
 * behaviour.
 */
static void
re_ws_tolower(uirx_wc_t *ws, uirx_wc_t *ws_end)
{
  for (; ws < ws_end ; ++ws) {
#if defined(HAVE_MOE) || LANG == JA
    if (*ws & ~0x7F)
      continue;
#else
    /* Keep locale-dependent folding of bytes 0x80..0xFF, skip sentinels. */
    if (*ws & ~0xFF)
      continue;
#endif

    if (isupper(*ws))
      *ws = tolower(*ws);
  }
}

/*
 * re_ci_wchar_filter: alpha filter installed by newRegex() for
 * case-insensitive patterns.  Returns wc folded by re_ws_tolower();
 * desc is not used.
 */
static unsigned int
re_ci_wchar_filter(uirx_wc_t wc, wcrx_parser_desc_t *desc)
{
  uirx_wc_t folded = wc;

  /* Treat the single character as a one-element array. */
  re_ws_tolower(&folded, &folded + 1);
  return folded;
}

/*
 * Working state for one RegexSearch() call.
 */
typedef struct regex_arg_st {
  Regex *re;                /* regex being matched */
  char *orgv[BUFSIZ + 1];   /* orgv[k]: source position of ws[k]; one extra slot for the end */
  char *cb;                 /* source position reported back through *pb */
  char *ce;                 /* source position reported back through *pe */
  uirx_wc_t ws[BUFSIZ];     /* decoded wide characters of the current chunk */
  uirx_wc_t *cur;           /* character in ws currently being fed to the matcher */
  int continued;            /* non-zero: keep feeding the running match instead of restarting */
} regex_arg_t;

/*
 * regexRecordBeginning: callback installed in regex_desc_core.
 * For index 0 it stores the source position of the character currently
 * being matched in re->position; other indices are ignored.
 * arg is the regex_arg_t of the running search.
 */
static void
regexRecordBeginning(uirx_wc_t i, void *arg)
{
  regex_arg_t *ctx;

  if (i)
    return;

  ctx = arg;
  ctx->re->position = ctx->orgv[ctx->cur - ctx->ws];
}

/*
 * regexRecordEnd: callback installed in regex_desc_core.
 * For index 0 it stores the source position of the character currently
 * being matched in re->lposition; other indices are ignored.
 * arg is the regex_arg_t of the running search.
 */
static void
regexRecordEnd(uirx_wc_t i, void *arg)
{
  regex_arg_t *ctx;

  if (i)
    return;

  ctx = arg;
  ctx->re->lposition = ctx->orgv[ctx->cur - ctx->ws];
}

/*
 * regex_evprintf: error-message sink installed in regex_desc_core.
 *
 * arg points to a char * holding the messages collected so far (NULL when
 * there are none).  The formatted message is stored in a fresh
 * GC_malloc_atomic() buffer; when an earlier message exists the result is
 * "<old>\n<new>".  *arg is updated to point at the new buffer.
 *
 * The separator is written directly over the position of the old string's
 * terminating NUL, so the appended text is actually part of the resulting
 * C string, and no byte beyond the old string's contents is read.
 *
 * NOTE(review): ap is passed to vsprintf_length() and then again to
 * vsprintf().  Unless vsprintf_length() works on its own copy, reusing ap
 * is not portable -- confirm, and switch to va_copy() if it does not.
 */
static void
regex_evprintf(void *arg, const char *frmt, va_list ap)
{
  char **emsg_ptr, *old, *new;
  int oldlen, newlen;

  /* Doubled as a safety margin over the computed length. */
  newlen = vsprintf_length(frmt, ap) * 2;

  if ((old = *(emsg_ptr = arg))) {
    oldlen = strlen(old);
    /* old text + '\n' + new text + terminating NUL */
    *emsg_ptr = GC_malloc_atomic(oldlen + 1 + newlen + 1);
    memcpy(*emsg_ptr, old, oldlen);
    (*emsg_ptr)[oldlen] = '\n';
    new = *emsg_ptr + oldlen + 1;
  }
  else
    *emsg_ptr = new = GC_malloc_atomic(newlen + sizeof(""));

  if (vsprintf(new, frmt, ap) > newlen)
    fputs("regex_evprintf: generated string is too long\n", stderr);
}

/*
 * Parser configuration shared by every pattern compiled in this file;
 * newRegex() points each wcrx_parser_desc_t at it.
 */
static wcrx_parser_desc_core_t regex_desc_core = {
  re_fetch_wchar,                       /* reader: fetches pattern characters */
  &wccm_tab, &wcem_tab,                 /* class-macro and expression-macro escape tables */
  REGEX_BOF, REGEX_EOF,                 /* begin/end-of-input sentinels */
  regexRecordBeginning, regexRecordEnd, /* match-position recording callbacks */
  regex_evprintf,                       /* error-message sink */
  1,                                    /* NOTE(review): meaning not visible here; see wcrx_parser_desc_core_t */
};

/* 
 * regexCompile: compile regular expression into the file-wide DefaultRegex.
 *
 * ex is the pattern text; a non-zero igncase requests case-insensitive
 * matching.  Returns the message pointer produced by newRegex() (NULL when
 * no message was generated).
 */
char *
regexCompile(char *ex, int igncase)
{
  char *errmsg;

  (void)newRegex(ex, igncase, &DefaultRegex, &errmsg);
  return errmsg;
}

/*
 * newRegex: compile the pattern ex into regex.
 *
 * ex      - NUL-terminated pattern text.
 * igncase - non-zero installs re_ci_wchar_filter as the parser's alpha
 *           filter and is recorded in regex->igncase.
 * regex   - object to (re)initialise; when NULL a new one is obtained from
 *           GC_malloc().  It is zeroed before compilation.
 * p_msg   - when non-NULL, receives the message collected through
 *           regex_evprintf() (NULL if there was none).
 *
 * Returns regex.  The parser is given a jmp_buf as its continuation; when
 * control comes back through it, regex->nfa is left as zeroed above.
 */
Regex *
newRegex(char *ex, int igncase, Regex * volatile regex, char **p_msg)
{
  struct re_fetch_wchar_arg_st reader;
  jmp_buf env;
  char *msg = NULL;

  wcrx_parser_desc_t desc = {
    &regex_desc_core,
    NULL,
    NULL,
    0,
    NULL,
    NULL,
  };

  /* Wire the per-call parts of the parser description. */
  desc.reader_arg = &reader;

  if (igncase)
    desc.alpha_filter = re_ci_wchar_filter;

  desc.evprintf_arg = &msg;
  desc.continuation = &env;

  if (!regex)
    regex = GC_malloc(sizeof(Regex));

  memset(regex, 0, sizeof(*regex));

  /* The reader walks ex itself, from its start up to the terminating NUL. */
  reader.pcursor = &ex;
  reader.ep = ex + strlen(ex);
  regex->igncase = igncase;
  reader.igncase = regex->igncase;

  /* Compile only on the direct return of setjmp(); a jump back skips it. */
  if (!setjmp(env))
    regex->nfa = wcrx_compile(&desc);

  if (p_msg)
    *p_msg = msg;

  return regex;
}

/* 
 * regexSearch: search regular expression
 */
int
regexSearch(char **pb, char **pe, int firstp)
{
  return RegexSearch(&DefaultRegex, pb, pe, firstp);
}

/*
 * doRegexSearch: run the matcher over the first i wide characters buffered
 * in arg->ws.
 *
 * The buffer is lower-cased first when the regex is case-insensitive.
 *
 * Continued mode (arg->continued non-zero): the characters are fed to the
 * already running match.  Returns TRUE if every character was accepted,
 * FALSE at the first rejected one; arg->cb is set to the source position
 * of arg->cur on exit.
 *
 * Fresh mode: every buffer position is tried in turn as the start of a
 * match.  Returns TRUE with arg->cb / arg->ce set to the source positions
 * of the start character and of arg->cur, or FALSE with both set to the
 * source position of the end of the buffer.
 *
 * On return arg->cur tells the caller how far the buffer was consumed.
 */
static int
doRegexSearch(regex_arg_t *arg, int i)
{
  Regex *re;
  uirx_wc_t *we, *next;

  re = arg->re;
  arg->cur = arg->ws;
  we = arg->cur + i;

  if (re->igncase)
    re_ws_tolower(arg->cur, we);

#ifdef HAVE_MOE
  tty_apply_convv(arg->cur, we, NULL);
#endif

  if (arg->continued) {
    int res;

    for (res = TRUE ; arg->cur < we ; ++(arg->cur))
      if (!uirx_match(re->nfa, arg, *arg->cur)) {
        res = FALSE;
        break;
      }

    arg->cb = arg->orgv[arg->cur - arg->ws];
    return res;
  }
  else {
    for (; arg->cur < we ; ++(arg->cur)) {
      uirx_match_start(re->nfa);

      if (uirx_match(re->nfa, arg, *arg->cur)) {
        /* next remembers the character right after the start position. */
        for (next = ++(arg->cur) ; arg->cur < we ; ++(arg->cur))
          if (!uirx_match(re->nfa, arg, *arg->cur)) {
            /* Both ends already recorded by the callbacks: report it. */
            if (re->position && re->lposition)
              break;

            /*
             * This attempt failed.  Rewind to its start position so that
             * the outer loop's increment resumes at the very next
             * character; rewinding to `next' would skip that character
             * as a candidate start.
             */
            arg->cur = next - 1;
            goto next_match;
          }

        arg->cb = arg->orgv[next - 1 - arg->ws];
        arg->ce = arg->orgv[arg->cur - arg->ws];
        return TRUE;
      }
    next_match:
      ;
    }

    arg->cb = arg->ce = arg->orgv[arg->cur - arg->ws];
    return FALSE;
  }
}

/*
 * RegexSearch: search the text [*pb, *pe) with the compiled regex re.
 *
 * re     - compiled regex; when it is NULL or has no NFA the function sets
 *          *pb = *pe and returns FALSE.
 * pb, pe - in: start and end of the text.  out: the positions left in the
 *          search state (arg.cb / arg.ce) by doRegexSearch().
 * firstp - bit mask of RE_FLAG_* values:
 *            RE_FLAG_REUSE  continue the running match instead of
 *                           resetting re->position / re->lposition;
 *            RE_FLAG_BOF    prepend the REGEX_BOF sentinel;
 *            RE_FLAG_EOL    append a '\n';
 *            RE_FLAG_EOF    append the REGEX_EOF sentinel.
 *
 * The text is decoded into wide characters in chunks of at most BUFSIZ and
 * each full chunk is handed to doRegexSearch(); arg.orgv[] maps every
 * buffered character back to its source position.
 *
 * Returns TRUE or FALSE.  Callers that need the exact matched span read
 * re->position / re->lposition (see MatchedPosition()).
 */
int
RegexSearch(Regex *re, char **pb, char **pe, int firstp)
{
  regex_arg_t arg;
  char *p = NULL, *ep;
  int i, res = FALSE;
#if defined(HAVE_MOE)
  int cn;
#elif LANG == JA
  int ctype;
#endif

  if (!re || !re->nfa) {
    *pb = *pe;
    return FALSE;
  }

  if (firstp & RE_FLAG_REUSE)
    arg.continued = TRUE;
  else {
    re->position = re->lposition = NULL;
    arg.continued = FALSE;
  }

  arg.re = re;
  arg.cb = p = *pb;
  /* Seed ce as well: it is stored through pe on every exit path, including
   * those on which doRegexSearch() never assigns it. */
  arg.ce = ep = *pe;

  if (firstp & RE_FLAG_BOF) {
    arg.orgv[0] = p;
    arg.ws[0] = REGEX_BOF;
    i = 1;
  }
  else
    i = 0;

  arg.orgv[i] = p;
retry:
  while (p < ep) {
#ifdef HAVE_MOE
    if ((cn = mb_mem_to_wchar_internal(p, ep - p, arg.ws[i])) < 1)
      cn = 1;

    p += cn;
#elif LANG == JA
    ctype = get_mctype(p);
    arg.ws[i] = mctowc(p, ctype);
    p += get_mclen(ctype);
#else
    arg.ws[i] = (unsigned char)*p++;
#endif
    arg.orgv[++i] = p;

    /* Buffer full: flush it through the matcher. */
    if (i >= sizeof(arg.ws) / sizeof(arg.ws[0])) {
      res = doRegexSearch(&arg, i);

      if (arg.cur < arg.ws + i) {
        /* The matcher stopped inside the chunk. */
        if (res || firstp & RE_FLAG_REUSE || arg.cb >= ep) {
          res = FALSE;
          goto end;
        }

        /* Restart one byte after the reported position. */
        p = ++(arg.cb);
      }

      arg.continued = res;
      arg.orgv[i = 0] = p;
    }
  }

  if (firstp & RE_FLAG_EOL) {
    /* Make room for the '\n' if the buffer is full. */
    if (i >= sizeof(arg.ws) / sizeof(arg.ws[0])) {
      res = doRegexSearch(&arg, i);

      if (arg.cur < arg.ws + i) {
        if (res || firstp & RE_FLAG_REUSE || arg.cb >= ep) {
          res = FALSE;
          goto end;
        }

        arg.orgv[i = 0] = p = ++(arg.cb);
        arg.continued = FALSE;
        goto retry;
      }

      arg.continued = res;
      arg.orgv[i = 0] = p;
    }

    arg.ws[i++] = '\n';
    arg.orgv[i] = p;
  }

  if (firstp & RE_FLAG_EOF) {
    /* Make room for the EOF sentinel if the buffer is full. */
    if (i >= sizeof(arg.ws) / sizeof(arg.ws[0])) {
      res = doRegexSearch(&arg, i);

      if (arg.cur < arg.ws + i) {
        if (res || firstp & RE_FLAG_REUSE || arg.cb >= ep) {
          res = FALSE;
          goto end;
        }

        arg.orgv[i = 0] = p = ++(arg.cb);
        arg.continued = FALSE;
        goto retry;
      }

      arg.continued = res;
      arg.orgv[i = 0] = p;
    }

    arg.ws[i++] = REGEX_EOF;
    arg.orgv[i] = p;
  }

  /* Flush whatever is left in the buffer. */
  if (i) {
    if ((res = doRegexSearch(&arg, i)) &&
        arg.cur == arg.ws + i && firstp & RE_FLAG_EOF)
      uirx_match_end(re->nfa, &arg);
    else if (arg.cur < arg.ws + i)
      res = FALSE;
  }
end:
  *pb = arg.cb;
  *pe = arg.ce;
  return res;
}

/* 
 * regexMatch: match regular expression
 */
int
regexMatch(char *p, int len)
{
  return RegexMatch(&DefaultRegex, p, len);
}

/*
 * RegexMatch: match the text at p against re, with both the beginning and
 * the end-of-input sentinels supplied (RE_FLAG_BOF | RE_FLAG_EOF).
 *
 * re  - compiled regex; NULL yields FALSE, mirroring RegexSearch(), which
 *       also accepts a NULL regex.
 * p   - start of the text.
 * len - number of bytes to examine; a negative value means "up to the
 *       terminating NUL".
 *
 * Returns TRUE when both re->position and re->lposition were recorded
 * during the search, FALSE otherwise.
 */
int
RegexMatch(Regex *re, char *p, int len)
{
  char *ep;

  /* RegexSearch() copes with NULL, but the result test below does not. */
  if (!re)
    return FALSE;

  ep = p + (len < 0 ? strlen(p) : len);
  RegexSearch(re, &p, &ep, RE_FLAG_BOF | RE_FLAG_EOF);
  return (re->position && re->lposition) ? TRUE : FALSE;
}

/* 
 * MatchedPosition: last matched position.
 *
 * Copies the positions recorded in re (re->position, then re->lposition)
 * into *first and *last.
 */
void
MatchedPosition(Regex *re, char **first, char **last)
{
  Regex *src = re;

  *first = src->position;
  *last = src->lposition;
}

void
matchedPosition(char **first, char **last)
{
  *first = DefaultRegex.position;
  *last = DefaultRegex.lposition;
}

Generated by  Doxygen 1.6.0   Back to index