マルチバイト対応正規表現関数(mb_ereg系)でUTF-8の文字列が正しく処理できない件について(続き)
昨日の件の続きです。
論より証拠、という事で差分を晒してみたいと思います。
まずは、Ruby 1.6.8 と Ruby 1.8.5-p2 の regex.c および regex.h の差分です。
--- ruby-1.6.8/regex.c 2002-11-19 22:36:29.000000000 +0900 +++ ruby-1.8.5-p2/regex.c 2006-08-07 12:43:42.000000000 +0900 @@ -12,7 +12,7 @@ Library General Public License for more details. You should have received a copy of the GNU Library General Public - License along with the GNU C Library; see the file COPYING.LIB. If not, + License along with the GNU C Library; see the file LGPL. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /* Multi-byte extension added May, 1993 by t^2 (Takahiro Tanimoto) @@ -51,7 +51,7 @@ # include <sys/types.h> #endif -#ifndef __STDC__ +#if !defined(__STDC__) && !defined(_MSC_VER) # define volatile #endif @@ -69,19 +69,11 @@ extern int rb_trap_pending; void rb_trap_exec _((void)); -# define CHECK_INTS if (!rb_prohibit_interrupt) {\ - if (rb_trap_pending) rb_trap_exec();\ -} - -#define xmalloc ruby_xmalloc -#define xcalloc ruby_xcalloc -#define xrealloc ruby_xrealloc -#define xfree ruby_xfree - -void *xmalloc _((size_t)); -void *xcalloc _((size_t,size_t)); -void *xrealloc _((void*,size_t)); -void xfree _((void*)); +# define CHECK_INTS do {\ + if (!rb_prohibit_interrupt) {\ + if (rb_trap_pending) rb_trap_exec();\ + }\ +} while (0) #endif /* Make alloca work the best possible way. 
*/ @@ -92,16 +84,19 @@ # endif # endif /* atarist */ #else -# if defined(HAVE_ALLOCA_H) +# ifdef HAVE_ALLOCA_H # include <alloca.h> -# elif !defined(alloca) -char *alloca(); -# endif -#endif /* __GNUC__ */ +# else +# ifdef _AIX + #pragma alloca +# else +# ifndef alloca /* predefined by HP cc +Olibcalls */ +void *alloca (); +# endif +# endif /* AIX */ +# endif /* HAVE_ALLOCA_H */ -#ifdef _AIX -#pragma alloca -#endif +#endif /* __GNUC__ */ #ifdef HAVE_STRING_H # include <string.h> @@ -193,6 +188,12 @@ #ifdef RUBY #include "util.h" +void rb_warn _((const char*, ...)); +# define re_warning(x) rb_warn(x) +#endif + +#ifndef re_warning +# define re_warning(x) #endif static void @@ -429,7 +430,6 @@ return 0; } - /* Macros for re_compile_pattern, which is found below these definitions. */ #define TRANSLATE_P() ((options&RE_OPTION_IGNORECASE) && translate) @@ -478,6 +478,19 @@ #define WC2MBC1ST(c) \ ((current_mbctype != MBCTYPE_UTF8) ? ((c<0x100) ? (c) : (((c)>>8)&0xff)) : utf8_firstbyte(c)) +typedef unsigned int (*mbc_startpos_func_t) _((const char *string, unsigned int pos)); + +static unsigned int asc_startpos _((const char *string, unsigned int pos)); +static unsigned int euc_startpos _((const char *string, unsigned int pos)); +static unsigned int sjis_startpos _((const char *string, unsigned int pos)); +static unsigned int utf8_startpos _((const char *string, unsigned int pos)); + +static const mbc_startpos_func_t mbc_startpos_func[4] = { + asc_startpos, euc_startpos, sjis_startpos, utf8_startpos +}; + +#define mbc_startpos(start, pos) (*mbc_startpos_func[current_mbctype])((start), (pos)) + static unsigned int utf8_firstbyte(c) unsigned long c; @@ -547,7 +560,7 @@ reset the pointers that pointed into the old allocation to point to the correct places in the new allocation. If extending the buffer results in it being larger than 1 << 16, then flag memory exhausted. 
*/ -#define EXTEND_BUFFER \ +#define EXTEND_BUFFER \ do { char *old_buffer = bufp->buffer; \ if (bufp->allocated == (1L<<16)) goto too_big; \ bufp->allocated *= 2; \ @@ -694,7 +707,18 @@ } static int -is_in_list(c, b) +is_in_list_sbc(c, b) + unsigned long c; + const unsigned char *b; +{ + unsigned short size; + + size = *b++; + return ((int)c / BYTEWIDTH < (int)size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH); +} + +static int +is_in_list_mbc(c, b) unsigned long c; const unsigned char *b; { @@ -702,9 +726,6 @@ unsigned short i, j; size = *b++; - if ((int)c / BYTEWIDTH < (int)size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH) { - return 1; - } b += size + 2; size = EXTRACT_UNSIGNED(&b[-2]); if (size == 0) return 0; @@ -719,9 +740,18 @@ } if (i < size && EXTRACT_MBC(&b[i*8]) <= c) return 1; + return 0; } +static int +is_in_list(c, b) + unsigned long c; + const unsigned char *b; +{ + return is_in_list_sbc(c, b) || (current_mbctype ? is_in_list_mbc(c, b) : 0); +} + static void print_partial_compiled_pattern(start, end) unsigned char *start; @@ -828,7 +858,7 @@ unsigned bit; unsigned char map_byte = p[c]; - putchar ('/'); + putchar('/'); for (bit = 0; bit < BYTEWIDTH; bit++) if (map_byte & (1 << bit)) @@ -836,10 +866,10 @@ } p += mcnt; mcnt = EXTRACT_UNSIGNED_AND_INCR(p); - printf("/"); + putchar('/'); while (mcnt--) { print_mbc(EXTRACT_MBC_AND_INCR(p)); - printf("-"); + putchar('-'); print_mbc(EXTRACT_MBC_AND_INCR(p)); } break; @@ -984,8 +1014,8 @@ { int mcnt; int max = 0; - char *p = start; - char *pend = end; + unsigned char *p = start; + unsigned char *pend = end; char *must = 0; if (start == NULL) return 0; @@ -1011,6 +1041,7 @@ break; case duplicate: + case option_set: p++; break; @@ -1036,7 +1067,6 @@ case push_dummy_failure: case start_paren: case stop_paren: - case option_set: break; case charset: @@ -1053,7 +1083,7 @@ EXTRACT_NUMBER_AND_INCR(mcnt, p); if (mcnt > 0) p += mcnt; if ((enum regexpcode)p[-3] == jump) { - p -= 2; + p -= 2; EXTRACT_NUMBER_AND_INCR(mcnt, p); 
if (mcnt > 0) p += mcnt; } @@ -1135,7 +1165,7 @@ PATFETCH_RAW(c); *pp = p; if (c == '\\') { - return read_special(p, pend, pp) | 0x80; + return read_special(--p, pend, pp) | 0x80; } else if (c == -1) return ~0; else { @@ -1149,12 +1179,13 @@ PATFETCH_RAW(c); *pp = p; if (c == '\\') { - c = read_special(p, pend, pp); + c = read_special(--p, pend, pp); } else if (c == '?') return 0177; else if (c == -1) return ~0; return c & 0x9f; default: + *pp = p + 1; return read_backslash(c); } @@ -1272,13 +1303,9 @@ if (bufp->allocated == 0) { bufp->allocated = INIT_BUF_SIZE; - if (bufp->buffer) - /* EXTEND_BUFFER loses when bufp->allocated is 0. */ - bufp->buffer = (char*)xrealloc(bufp->buffer, INIT_BUF_SIZE); - else - /* Caller did not allocate a buffer. Do it for them. */ - bufp->buffer = (char*)xmalloc(INIT_BUF_SIZE); - if (!bufp->buffer) goto memory_exhausted; + /* EXTEND_BUFFER loses when bufp->allocated is 0. */ + bufp->buffer = (char*)xrealloc(bufp->buffer, INIT_BUF_SIZE); + if (!bufp->buffer) goto memory_exhausted; /* this not happen */ begalt = b = bufp->buffer; } @@ -1430,8 +1457,7 @@ int size; unsigned last = (unsigned)-1; - if ((size = EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])) - || current_mbctype) { + if ((size = EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])) || current_mbctype) { /* Ensure the space is enough to hold another interval of multi-byte chars in charset(_not)?. 
*/ size = (1 << BYTEWIDTH) / BYTEWIDTH + 2 + size*8 + 8; @@ -1442,12 +1468,13 @@ if (range && had_char_class) { FREE_AND_RETURN(stackb, "invalid regular expression; can't use character class as an end value of range"); } - PATFETCH(c); + PATFETCH_RAW(c); if (c == ']') { if (p == p0 + 1) { if (p == pend) FREE_AND_RETURN(stackb, "invalid regular expression; empty character class"); + re_warning("character class has `]' without escape"); } else /* Stop if this isn't merely a ] inside a bracket @@ -1465,6 +1492,13 @@ } had_char_class = 0; + if (c == '-' && ((p != p0 + 1 && *p != ']') || + (p[0] == '-' && p[1] != ']') || + range)) + re_warning("character class has `-' without escape"); + if (c == '[' && *p != ':') + re_warning("character class has `[' without escape"); + /* \ escapes characters when inside [...]. */ if (c == '\\') { PATFETCH_RAW(c); @@ -1547,7 +1581,7 @@ case 'C': case 'c': { - char *pp; + const char *pp; --p; c = read_special(p, pend, &pp); @@ -1566,32 +1600,7 @@ break; } } - - /* Get a range. */ - if (range) { - if (last > c) - goto invalid_pattern; - - range = 0; - if (had_mbchar == 0) { - for (;last<=c;last++) - SET_LIST_BIT(last); - } - else if (had_mbchar == 2) { - set_list_bits(last, c, b); - } - else { - /* restriction: range between sbc and mbc */ - goto invalid_pattern; - } - } - else if (p[0] == '-' && p[1] != ']') { - last = c; - PATFETCH(c1); - range = 1; - goto range_retry; - } - else if (c == '[' && *p == ':') { + else if (c == '[' && *p == ':') { /* [:...:] */ /* Leave room for the null. */ char str[CHAR_CLASS_MAX_LENGTH + 1]; @@ -1603,7 +1612,7 @@ FREE_AND_RETURN(stackb, "invalid regular expression; re can't end '[[:'"); for (;;) { - PATFETCH (c); + PATFETCH_RAW(c); if (c == ':' || c == ']' || p == pend || c1 == CHAR_CLASS_MAX_LENGTH) break; @@ -1611,9 +1620,9 @@ } str[c1] = '\0'; - /* If isn't a word bracketed by `[:' and:`]': - undo the ending character, the letters, and leave - the leading `:' and `[' (but set bits for them). 
*/ + /* If isn't a word bracketed by `[:' and `:]': + undo the ending character, the letters, and + the leading `:' and `['. */ if (c == ':' && *p == ']') { int ch; char is_alnum = STREQ(str, "alnum"); @@ -1657,23 +1666,57 @@ SET_LIST_BIT(ch); } had_char_class = 1; + continue; } else { - c1++; + c1 += 2; while (c1--) PATUNFETCH; - SET_LIST_BIT(TRANSLATE_P()?translate['[']:'['); - SET_LIST_BIT(TRANSLATE_P()?translate[':']:':'); - had_char_class = 0; - last = ':'; + re_warning("character class has `[' without escape"); + c = '['; } } - else if (had_mbchar == 0 && (!current_mbctype || !had_num_literal)) { - SET_LIST_BIT(c); - had_num_literal = 0; + + /* Get a range. */ + if (range) { + if (last > c) + goto invalid_pattern; + + range = 0; + if (had_mbchar == 0) { + if (TRANSLATE_P()) { + for (;last<=c;last++) + SET_LIST_BIT(translate[last]); + } + else { + for (;last<=c;last++) + SET_LIST_BIT(last); + } + } + else if (had_mbchar == 2) { + set_list_bits(last, c, b); + } + else { + /* restriction: range between sbc and mbc */ + goto invalid_pattern; + } + } + else if (p[0] == '-' && p[1] != ']') { + last = c; + PATFETCH_RAW(c1); + range = 1; + goto range_retry; + } + else { + if (TRANSLATE_P() && c < 0x100) c = (unsigned char)translate[c]; + if (had_mbchar == 0 && (!current_mbctype || !had_num_literal)) { + SET_LIST_BIT(c); + had_num_literal = 0; + } + else { + set_list_bits(c, c, b); + } } - else - set_list_bits(c, c, b); had_mbchar = 0; } @@ -1682,9 +1725,10 @@ while ((int)b[-1] > 0 && b[b[-1] - 1] == 0) b[-1]--; if (b[-1] != (1 << BYTEWIDTH) / BYTEWIDTH) - memmove(&b[b[-1]], &b[(1 << BYTEWIDTH) / BYTEWIDTH], + memmove(&b[(unsigned char)b[-1]], &b[(1 << BYTEWIDTH) / BYTEWIDTH], 2 + EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])*8); - b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[b[-1]])*8; + b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[(unsigned char)b[-1]])*8; + had_num_literal = 0; break; case '(': @@ -1693,155 +1737,143 @@ int push_option = 0; int casefold = 0; - PATFETCH(c); - 
if (c == '?') { - int negative = 0; - - PATFETCH_RAW(c); - switch (c) { - case 'x': case 'p': case 'm': case 'i': case '-': - for (;;) { - switch (c) { - case '-': - negative = 1; - break; - - case ':': - case ')': - break; - - case 'x': - if (negative) - options &= ~RE_OPTION_EXTENDED; - else - options |= RE_OPTION_EXTENDED; - break; + PATFETCH(c); + if (c == '?') { + int negative = 0; - case 'p': - if (negative) { - if ((options&RE_OPTION_POSIXLINE) == RE_OPTION_POSIXLINE) { - options &= ~RE_OPTION_POSIXLINE; + PATFETCH_RAW(c); + switch (c) { + case 'x': case 'm': case 'i': case '-': + for (;;) { + switch (c) { + case '-': + negative = 1; + break; + + case ':': + case ')': + break; + + case 'x': + if (negative) + options &= ~RE_OPTION_EXTENDED; + else + options |= RE_OPTION_EXTENDED; + break; + + case 'm': + if (negative) { + if (options&RE_OPTION_MULTILINE) { + options &= ~RE_OPTION_MULTILINE; + } } - } - else if ((options&RE_OPTION_POSIXLINE) != RE_OPTION_POSIXLINE) { - options |= RE_OPTION_POSIXLINE; - } - push_option = 1; - break; - - case 'm': - if (negative) { - if (options&RE_OPTION_MULTILINE) { - options &= ~RE_OPTION_MULTILINE; + else if (!(options&RE_OPTION_MULTILINE)) { + options |= RE_OPTION_MULTILINE; } - } - else if (!(options&RE_OPTION_MULTILINE)) { - options |= RE_OPTION_MULTILINE; - } - push_option = 1; - break; + push_option = 1; + break; - case 'i': - if (negative) { - if (options&RE_OPTION_IGNORECASE) { - options &= ~RE_OPTION_IGNORECASE; + case 'i': + if (negative) { + if (options&RE_OPTION_IGNORECASE) { + options &= ~RE_OPTION_IGNORECASE; + } + } + else if (!(options&RE_OPTION_IGNORECASE)) { + options |= RE_OPTION_IGNORECASE; } - } - else if (!(options&RE_OPTION_IGNORECASE)) { - options |= RE_OPTION_IGNORECASE; - } casefold = 1; - break; + break; - default: - FREE_AND_RETURN(stackb, "undefined (?...) 
inline option"); - } - if (c == ')') { - c = '#'; /* read whole in-line options */ - break; + default: + FREE_AND_RETURN(stackb, "undefined (?...) inline option"); + } + if (c == ')') { + c = '#'; /* read whole in-line options */ + break; + } + if (c == ':') break; + PATFETCH_RAW(c); } - if (c == ':') break; - PATFETCH_RAW(c); - } - break; + break; - case '#': - for (;;) { - PATFETCH(c); - if (c == ')') break; - } - c = '#'; - break; + case '#': + for (;;) { + PATFETCH(c); + if (c == ')') break; + } + c = '#'; + break; - case ':': - case '=': - case '!': - case '>': - break; + case ':': + case '=': + case '!': + case '>': + break; - default: - FREE_AND_RETURN(stackb, "undefined (?...) sequence"); - } + default: + FREE_AND_RETURN(stackb, "undefined (?...) sequence"); + } } else { PATUNFETCH; c = '('; } if (c == '#') { - if (push_option) { - BUFPUSH(option_set); - BUFPUSH(options); - } + if (push_option) { + BUFPUSH(option_set); + BUFPUSH(options); + } if (casefold) { if (options & RE_OPTION_IGNORECASE) BUFPUSH(casefold_on); else BUFPUSH(casefold_off); - } + } break; - } - if (stackp+8 >= stacke) { - DOUBLE_STACK(int); - } - - /* Laststart should point to the start_memory that we are about - to push (unless the pattern has RE_NREGS or more ('s). */ - /* obsolete: now RE_NREGS is just a default register size. */ - *stackp++ = b - bufp->buffer; - *stackp++ = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; - *stackp++ = begalt - bufp->buffer; - switch (c) { - case '(': - BUFPUSH(start_memory); - BUFPUSH(regnum); - *stackp++ = regnum++; - *stackp++ = b - bufp->buffer; - BUFPUSH(0); - /* too many ()'s to fit in a byte. 
(max 254) */ - if (regnum >= RE_REG_MAX) goto too_big; - break; + } + if (stackp+8 >= stacke) { + DOUBLE_STACK(int); + } - case '=': - case '!': - case '>': - BUFPUSH(start_nowidth); - *stackp++ = b - bufp->buffer; - BUFPUSH(0); /* temporary value */ - BUFPUSH(0); - if (c != '!') break; + /* Laststart should point to the start_memory that we are about + to push (unless the pattern has RE_NREGS or more ('s). */ + /* obsolete: now RE_NREGS is just a default register size. */ + *stackp++ = b - bufp->buffer; + *stackp++ = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; + *stackp++ = begalt - bufp->buffer; + switch (c) { + case '(': + BUFPUSH(start_memory); + BUFPUSH(regnum); + *stackp++ = regnum++; + *stackp++ = b - bufp->buffer; + BUFPUSH(0); + /* too many ()'s to fit in a byte. (max 254) */ + if (regnum >= RE_REG_MAX) goto too_big; + break; - BUFPUSH(on_failure_jump); - *stackp++ = b - bufp->buffer; - BUFPUSH(0); /* temporary value */ - BUFPUSH(0); - break; + case '=': + case '!': + case '>': + BUFPUSH(start_nowidth); + *stackp++ = b - bufp->buffer; + BUFPUSH(0); /* temporary value */ + BUFPUSH(0); + if (c != '!') break; + + BUFPUSH(on_failure_jump); + *stackp++ = b - bufp->buffer; + BUFPUSH(0); /* temporary value */ + BUFPUSH(0); + break; - case ':': - BUFPUSH(start_paren); - pending_exact = 0; - default: - break; - } + case ':': + BUFPUSH(start_paren); + pending_exact = 0; + default: + break; + } if (push_option) { BUFPUSH(option_set); BUFPUSH(options); @@ -1852,11 +1884,11 @@ else BUFPUSH(casefold_off); } - *stackp++ = c; - *stackp++ = old_options; - fixup_alt_jump = 0; - laststart = 0; - begalt = b; + *stackp++ = c; + *stackp++ = old_options; + fixup_alt_jump = 0; + laststart = 0; + begalt = b; } break; @@ -2154,6 +2186,7 @@ unfetch_interval: /* If an invalid interval, match the characters as literals. 
*/ + re_warning("regexp has invalid interval"); p = beg_interval; beg_interval = 0; @@ -2205,9 +2238,9 @@ while ((int)b[-1] > 0 && b[b[-1] - 1] == 0) b[-1]--; if (b[-1] != (1 << BYTEWIDTH) / BYTEWIDTH) - memmove(&b[b[-1]], &b[(1 << BYTEWIDTH) / BYTEWIDTH], + memmove(&b[(unsigned char)b[-1]], &b[(1 << BYTEWIDTH) / BYTEWIDTH], 2 + EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])*8); - b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[b[-1]])*8; + b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[(unsigned char)b[-1]])*8; break; case 'w': @@ -2277,22 +2310,22 @@ case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - PATUNFETCH; + PATUNFETCH; p0 = p; - had_mbchar = 0; - c1 = 0; - GET_UNSIGNED_NUMBER(c1); - if (!ISDIGIT(c)) PATUNFETCH; + had_mbchar = 0; + c1 = 0; + GET_UNSIGNED_NUMBER(c1); + if (!ISDIGIT(c)) PATUNFETCH; if (9 < c1 && c1 >= regnum) { - /* need to get octal */ + /* need to get octal */ c = scan_oct(p0, 3, &numlen) & 0xff; p = p0 + numlen; - c1 = 0; - had_num_literal = 1; - goto numeric_char; - } + c1 = 0; + had_num_literal = 1; + goto numeric_char; + } laststart = b; BUFPUSH(duplicate); @@ -2334,6 +2367,10 @@ break; default: + if (c == ']') + re_warning("regexp has `]' without escape"); + else if (c == '}') + re_warning("regexp has `}' without escape"); normal_char: /* Expects the character in `c'. */ had_mbchar = 0; if (ismbchar(c)) { @@ -2344,9 +2381,10 @@ nextp = p + mbclen(c) - 1; if (!pending_exact || pending_exact + *pending_exact + 1 != b || *pending_exact >= (c1 ? 0176 : 0177) - || *nextp == '+' || *nextp == '?' - || *nextp == '*' || *nextp == '^' - || *nextp == '{') { + || (nextp < pend && + ( *nextp == '+' || *nextp == '?' 
+ || *nextp == '*' || *nextp == '^' + || *nextp == '{'))) { laststart = b; BUFPUSH(exactn); pending_exact = b; @@ -2379,7 +2417,6 @@ /* set optimize flags */ laststart = bufp->buffer; if (laststart != b) { - if (*laststart == start_memory) laststart += 3; if (*laststart == dummy_failure_jump) laststart += 3; else if (*laststart == try_next) laststart += 3; if (*laststart == anychar_repeat) { @@ -2591,9 +2628,9 @@ #define trans_eq(c1, c2, translate) (translate?(translate[c1]==translate[c2]):((c1)==(c2))) static int slow_match(little, lend, big, bend, translate) - unsigned char *little, *lend; - unsigned char *big, *bend; - unsigned char *translate; + const unsigned char *little, *lend; + const unsigned char *big, *bend; + const unsigned char *translate; { int c; @@ -2609,14 +2646,14 @@ static int slow_search(little, llen, big, blen, translate) - unsigned char *little; + const unsigned char *little; int llen; - unsigned char *big; + const unsigned char *big; int blen; - char *translate; + const char *translate; { - unsigned char *bsave = big; - unsigned char *bend = big + blen; + const unsigned char *bsave = big; + const unsigned char *bend = big + blen; register int c; int fescape = 0; @@ -2686,12 +2723,12 @@ static int bm_search(little, llen, big, blen, skip, translate) - unsigned char *little; + const unsigned char *little; int llen; - unsigned char *big; + const unsigned char *big; int blen; int *skip; - unsigned char *translate; + const unsigned char *translate; { int i, j, k; @@ -2798,8 +2835,11 @@ case casefold_on: bufp->options |= RE_MAY_IGNORECASE; + options |= RE_OPTION_IGNORECASE; + continue; + case casefold_off: - options ^= RE_OPTION_IGNORECASE; + options &= ~RE_OPTION_IGNORECASE; continue; case option_set: @@ -2889,6 +2929,7 @@ case duplicate: bufp->can_be_null = 1; + if (*p >= bufp->re_nsub) break; fastmap['\n'] = 1; case anychar_repeat: case anychar: @@ -3080,27 +3121,18 @@ /* Adjust startpos for mbc string */ if (current_mbctype && startpos>0 && 
!(bufp->options&RE_OPTIMIZE_BMATCH)) { - int i = 0; + int i = mbc_startpos(string, startpos); - if (range > 0) { - while (i<size) { - i += mbclen(string[i]); - if (startpos <= i) { - startpos = i; - break; - } + if (i < startpos) { + if (range > 0) { + startpos = i + mbclen(string[i]); } - } - else { - int w; - - while (i<size) { - w = mbclen(string[i]); - if (startpos < i + w) { + else { + int len = mbclen(string[i]); + if (i + len <= startpos) + startpos = i + len; + else startpos = i; - break; - } - i += w; } } } @@ -3108,6 +3140,9 @@ } +static int re_match_exec _((struct re_pattern_buffer *, const char *, int, int, int, + struct re_registers *)); + /* Using the compiled pattern in BUFP->buffer, first tries to match STRING, starting first at index STARTPOS, then at STARTPOS + 1, and so on. RANGE is the number of places to try before giving up. If @@ -3128,7 +3163,7 @@ struct re_registers *regs; { register char *fastmap = bufp->fastmap; - int val, anchor = 0; + int val, anchor = 0, initpos = startpos; /* Check for out-of-range starting position. 
*/ if (startpos < 0 || startpos > size) @@ -3170,7 +3205,7 @@ } } if (bufp->options & RE_OPTIMIZE_ANCHOR) { - if (bufp->options&RE_OPTION_SINGLELINE) { + if (bufp->options&RE_OPTION_MULTILINE && range > 0) { goto begbuf_match; } anchor = 1; @@ -3257,7 +3292,7 @@ if (startpos > size) return -1; if ((anchor || !bufp->can_be_null) && range > 0 && size > 0 && startpos == size) return -1; - val = re_match(bufp, string, size, startpos, regs); + val = re_match_exec(bufp, string, size, startpos, initpos, regs); if (val >= 0) return startpos; if (val == -2) return -2; @@ -3492,6 +3527,16 @@ int size, pos; struct re_registers *regs; { + return re_match_exec(bufp, string_arg, size, pos, pos, regs); +} + +static int +re_match_exec(bufp, string_arg, size, pos, beg, regs) + struct re_pattern_buffer *bufp; + const char *string_arg; + int size, pos, beg; + struct re_registers *regs; +{ register unsigned char *p = (unsigned char*)bufp->buffer; unsigned char *p1; @@ -3821,19 +3866,25 @@ int cc, c; PREFETCH; - cc = c = (unsigned char)*d++; + c = (unsigned char)*d++; if (ismbchar(c)) { if (d + mbclen(c) - 1 <= dend) { + cc = c; MBC2WC(c, d); + not = is_in_list_mbc(c, p); + if (!not) { + part = not = is_in_list_sbc(cc, p); + } + } else { + not = is_in_list(c, p); } } - else if (TRANSLATE_P()) - cc = c = (unsigned char)translate[c]; - - not = is_in_list(c, p); - if (!not && cc != c) { - part = not = is_in_list(cc, p); + else { + if (TRANSLATE_P()) + c = (unsigned char)translate[c]; + not = is_in_list(c, p); } + if (*(p - 1) == (unsigned char)charset_not) { not = !not; } @@ -3855,8 +3906,7 @@ case endline: if (AT_STRINGS_END(d)) { - if (size == 0 || d[-1] != '\n') - break; + break; } else if (*d == '\n') break; @@ -3877,8 +3927,7 @@ /* Match at the very end of the data. */ case endbuf2: if (AT_STRINGS_END(d)) { - if (size == 0 || d[-1] != '\n') - break; + break; } /* .. or newline just before the end of the data. 
*/ if (*d == '\n' && AT_STRINGS_END(d+1)) @@ -3903,7 +3952,7 @@ /* Match at the starting position. */ case begpos: - if (d - string == pos) + if (d - string == beg) break; goto fail; @@ -4411,7 +4460,7 @@ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; static const unsigned char mbctab_euc[] = { /* 0xA1-0xFE */ @@ -4423,17 +4472,17 @@ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, }; -static const unsigned char mbctab_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */ +static const unsigned char mbctab_sjis[] = { /* 0x81-0x9F,0xE0-0xFC */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -4442,14 +4491,33 @@ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 0, 0, 0 +}; + +static const unsigned char mbctab_sjis_trail[] = { /* 0x40-0x7E,0x80-0xFC */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }; static const unsigned char mbctab_utf8[] = { @@ -4468,7 +4536,7 @@ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0 + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0, }; const unsigned char *re_mbctab = mbctab_ascii; @@ -4496,3 +4564,85 @@ break; } } + +#define mbc_isfirst(t, c) (t)[(unsigned char)(c)] +#define mbc_len(t, c) ((t)[(unsigned char)(c)]+1) + +static unsigned int +asc_startpos(string, pos) + const char *string; + unsigned int pos; +{ + return pos; +} + +#define euc_islead(c) ((unsigned char)((c) - 0xa1) > 0xfe - 0xa1) +#define euc_mbclen(c) mbc_len(mbctab_euc, (c)) +static unsigned int +euc_startpos(string, pos) + const char *string; + unsigned int pos; +{ + unsigned int i = pos, w; + + while (i > 0 && !euc_islead(string[i])) { + --i; + } + if (i == pos || i + (w = euc_mbclen(string[i])) > pos) { + return i; + } + i += w; + return i + ((pos - i) & ~1); +} + +#define sjis_isfirst(c) 
mbc_isfirst(mbctab_sjis, (c)) +#define sjis_istrail(c) mbctab_sjis_trail[(unsigned char)(c)] +#define sjis_mbclen(c) mbc_len(mbctab_sjis, (c)) +static unsigned int +sjis_startpos(string, pos) + const char *string; + unsigned int pos; +{ + unsigned int i = pos, w; + + if (i > 0 && sjis_istrail(string[i])) { + do { + if (!sjis_isfirst(string[--i])) { + ++i; + break; + } + } while (i > 0); + } + if (i == pos || i + (w = sjis_mbclen(string[i])) > pos) { + return i; + } + i += w; + return i + ((pos - i) & ~1); +} + +#define utf8_islead(c) ((unsigned char)((c) & 0xc0) != 0x80) +#define utf8_mbclen(c) mbc_len(mbctab_utf8, (c)) +static unsigned int +utf8_startpos(string, pos) + const char *string; + unsigned int pos; +{ + unsigned int i = pos, w; + + while (i > 0 && !utf8_islead(string[i])) { + --i; + } + if (i == pos || i + (w = utf8_mbclen(string[i])) > pos) { + return i; + } + return i + w; +} + +/* + vi: sw=2 ts=8 + Local variables: + mode : C + c-file-style : "gnu" + tab-width : 8 + End : +*/ --- ruby-1.6.8/regex.h 2001-12-29 01:56:11.000000000 +0900 +++ ruby-1.8.5-p2/regex.h 2003-08-03 18:22:50.000000000 +0900 @@ -23,8 +23,8 @@ Last change: May 21, 1993 by t^2 */ /* modified for Ruby by matz@netlab.co.jp */ -#ifndef __REGEXP_LIBRARY -#define __REGEXP_LIBRARY +#ifndef REGEX_H +#define REGEX_H /* symbol mangling for ruby */ #ifdef RUBY @@ -73,8 +73,6 @@ #define RE_OPTION_MULTILINE (RE_OPTION_EXTENDED<<1) /* ^ and $ ignore newline */ #define RE_OPTION_SINGLELINE (RE_OPTION_MULTILINE<<1) -/* works line Perl's /s; it's called POSIX for wrong reason */ -#define RE_OPTION_POSIXLINE (RE_OPTION_MULTILINE|RE_OPTION_SINGLELINE) /* search for longest match, in accord with POSIX regexp */ #define RE_OPTION_LONGEST (RE_OPTION_SINGLELINE<<1) @@ -90,13 +88,10 @@ #define MBCTYPE_SJIS 2 #define MBCTYPE_UTF8 3 -#if defined IMPORT || defined USEIMPORTLIB -extern __declspec(dllimport) -#elif defined EXPORT -extern __declspec(dllexport) -#else extern -#endif +#if defined _WIN32 && 
!defined __GNUC__ && !defined RUBY_EXPORT +__declspec(dllimport) +# endif const unsigned char *re_mbctab; #if defined(__STDC__) void re_mbcinit (int); @@ -187,7 +182,6 @@ regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ } regmatch_t; - #ifdef __STDC__ extern char *re_compile_pattern (const char *, int, struct re_pattern_buffer *); @@ -224,4 +218,4 @@ #endif /* __STDC__ */ -#endif /* !__REGEXP_LIBRARY */ +#endif /* !REGEX_H */
続いて、PHP 4.4.4 の ext/mbstring/mbregex/mbregex.c と Ruby 1.6.8 の regex.c との差分です。
--- php-4.4.4/ext/mbstring/mbregex/mbregex.c 2003-10-25 20:58:46.000000000 +0900 +++ ruby-1.6.8/regex.c 2002-11-19 22:36:29.000000000 +0900 @@ -21,23 +21,7 @@ /* Perl5 extension added by matz <matz@caelum.co.jp> */ /* UTF-8 extension added Jan 16 1999 by Yoshida Masato <yoshidam@tau.bekkoame.ne.jp> */ -#include "php.h" - -#ifdef HAVE_CONFIG_H #include "config.h" -#endif - -#if HAVE_MBREGEX - -#define re_compile_pattern mbre_compile_pattern -#define re_free_pattern mbre_free_pattern -#define re_adjust_startpos mbre_adjust_startpos -#define re_compile_fastmap mbre_compile_fastmap -#define re_search mbre_search -#define re_match mbre_match -#define re_set_casetable mbre_set_casetable -#define re_copy_registers mbre_copy_registers -#define re_free_registers mbre_free_registers #ifdef HAVE_STRING_H # include <string.h> @@ -100,13 +84,6 @@ void xfree _((void*)); #endif - -#define xmalloc emalloc -#define xcalloc ecalloc -#define xrealloc erealloc -#define xfree efree - - /* Make alloca work the best possible way. */ #ifdef __GNUC__ # ifndef atarist @@ -185,19 +162,16 @@ } while (0) /* Get the interface, including the syntax bits. */ -#include "mbregex.h" +#include "regex.h" /* Subroutines for re_compile_pattern. */ static void store_jump _((char*, int, char*)); static void insert_jump _((int, char*, char*, char*)); static void store_jump_n _((char*, int, char*, unsigned)); static void insert_jump_n _((int, char*, char*, char*, unsigned)); -#if 0 static void insert_op _((int, char*, char*)); -#endif static void insert_op_2 _((int, char*, char*, int, int)); -static int memcmp_translate _((unsigned char*, unsigned char*, int, const unsigned char*)); -static const unsigned char* re_mbctab_get _((int)); +static int memcmp_translate _((unsigned char*, unsigned char*, int)); /* Define the syntax stuff, so we can do the \<, \>, etc. 
*/ @@ -208,137 +182,39 @@ #define SYNTAX(c) re_syntax_table[c] -static const char casetable[] = { - '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', - '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', - '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', - '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', - /* ' ' '!' '"' '#' '$' '%' '&' ''' */ - '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', - /* '(' ')' '*' '+' ',' '-' '.' '/' */ - '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', - /* '0' '1' '2' '3' '4' '5' '6' '7' */ - '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', - /* '8' '9' ':' ';' '<' '=' '>' '?' */ - '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', - /* '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' */ - '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', - /* 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' */ - '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', - /* 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' */ - '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', - /* 'X' 'Y' 'Z' '[' '\' ']' '^' '_' */ - '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', - /* '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' */ - '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', - /* 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' */ - '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', - /* 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' */ - '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', - /* 'x' 'y' 'z' '{' '|' '}' '~' */ - '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', - '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', - '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', - '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', - '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', - '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', - '\250', '\251', '\252', '\253', 
'\254', '\255', '\256', '\257', - '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', - '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', - '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', - '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', - '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', - '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', - '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', - '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', - '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', - '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', -}; - static char re_syntax_table[256]; static void init_syntax_once _((void)); -static const unsigned char *translate = (const unsigned char*)casetable; -static void init_regs _((struct mbre_registers*, unsigned int)); +static const unsigned char *translate = 0; +static void init_regs _((struct re_registers*, unsigned int)); static void bm_init_skip _((int *, unsigned char*, int, const unsigned char*)); -#if 0 static int current_mbctype = MBCTYPE_ASCII; -#endif #undef P -static unsigned long -scan_oct(start, len, retlen) -const char *start; -int len; -int *retlen; -{ - register const char *s = start; - register unsigned long retval = 0; - - while (len-- && *s >= '0' && *s <= '7') { - retval <<= 3; - retval |= *s++ - '0'; - } - *retlen = s - start; - return retval; -} - -static unsigned long -scan_hex(start, len, retlen) -const char *start; -int len; -int *retlen; -{ - static char hexdigit[] = "0123456789abcdef0123456789ABCDEFx"; - register const char *s = start; - register unsigned long retval = 0; - char *tmp; - - while (len-- && *s && (tmp = strchr(hexdigit, *s))) { - retval <<= 4; - retval |= (tmp - hexdigit) & 15; - s++; - } - *retlen = s - start; - return retval; -} +#ifdef RUBY +#include "util.h" +#endif -#define rt re_syntax_table static void init_syntax_once() { - register int 
c; - static int done = 0; + register int c; + static int done = 0; -#ifdef ZTS - extern MUTEX_T mbregex_locale_mutex; -#endif + if (done) + return; - if (done) { - return; - } -#ifdef ZTS - tsrm_mutex_lock( mbregex_locale_mutex ); -#endif - - memset(re_syntax_table, 0, sizeof(re_syntax_table)); - - for (c=0; c<=0x7f; c++) { - if (isalnum(c)) { - re_syntax_table[c] = Sword; - } - } - re_syntax_table['_'] = Sword; + memset(re_syntax_table, 0, sizeof re_syntax_table); - for (c=0x80; c<=0xff; c++) { - if (isalnum(c)) { - re_syntax_table[c] = Sword2; - } - } -#ifdef ZTS - tsrm_mutex_unlock( mbregex_locale_mutex ); -#endif - done = 1; + for (c=0; c<=0x7f; c++) + if (isalnum(c)) + re_syntax_table[c] = Sword; + re_syntax_table['_'] = Sword; + + for (c=0x80; c<=0xff; c++) + if (isalnum(c)) + re_syntax_table[c] = Sword2; + done = 1; } void @@ -477,11 +353,11 @@ start_memory, /* Start remembering the text that is matched, for storing in a memory register. Followed by one byte containing the register number. Register numbers - must be in the range 0 through MBRE_NREGS. */ + must be in the range 0 through RE_NREGS. */ stop_memory, /* Stop remembering the text that is matched and store it in a memory register. Followed by one byte containing the register number. Register - numbers must be in the range 0 through MBRE_NREGS. */ + numbers must be in the range 0 through RE_NREGS. */ start_paren, /* Place holder at the start of (?:..). */ stop_paren, /* Place holder at the end of (?:..). */ casefold_on, /* Turn on casefold flag. */ @@ -494,7 +370,6 @@ duplicate, /* Match a duplicate of something remembered. Followed by one byte containing the index of the memory register. */ - fail, /* always fails. */ wordchar, /* Matches any word-constituent character. */ notwordchar, /* Matches any char that is not a word-constituent. */ wordbeg, /* Succeeds if at word beginning. */ @@ -545,7 +420,7 @@ The argument SYNTAX is a bit-mask comprised of the various bits defined in regex.h. 
*/ -#if 0 + long re_set_syntax(syntax) long syntax; @@ -553,12 +428,12 @@ /* obsolete */ return 0; } -#endif + /* Macros for re_compile_pattern, which is found below these definitions. */ -#define TRANSLATE_P() ((options&MBRE_OPTION_IGNORECASE) && translate) -#define MAY_TRANSLATE() ((bufp->options&(MBRE_OPTION_IGNORECASE|MBRE_MAY_IGNORECASE)) && translate) +#define TRANSLATE_P() ((options&RE_OPTION_IGNORECASE) && translate) +#define MAY_TRANSLATE() ((bufp->options&(RE_OPTION_IGNORECASE|RE_MAY_IGNORECASE)) && translate) /* Fetch the next character in the uncompiled pattern---translating it if necessary. Also cast from a signed character in the constant string passed to us by the user to an unsigned char that we can use @@ -583,7 +458,7 @@ do { \ if (current_mbctype == MBCTYPE_UTF8) { \ int n = mbclen(c) - 1; \ - c &= (1<<(MBRE_BYTEWIDTH-2-n)) - 1; \ + c &= (1<<(BYTEWIDTH-2-n)) - 1; \ while (n--) { \ c = c << 6 | (*p++ & ((1<<6)-1)); \ } \ @@ -601,7 +476,7 @@ } while(0) #define WC2MBC1ST(c) \ - ((c<0x100)?(c):((current_mbctype != MBCTYPE_UTF8)?(((c)>>8)&0xff):utf8_firstbyte(c))) + ((current_mbctype != MBCTYPE_UTF8) ? ((c<0x100) ? 
(c) : (((c)>>8)&0xff)) : utf8_firstbyte(c)) static unsigned int utf8_firstbyte(c) @@ -620,33 +495,36 @@ #endif } -#if 0 static void print_mbc(c) unsigned int c; { if (current_mbctype == MBCTYPE_UTF8) { if (c < 0x80) - printf("%c", c); + printf("%c", (int)c); else if (c <= 0x7ff) - printf("%c%c", utf8_firstbyte(c), c&0x3f); + printf("%c%c", (int)utf8_firstbyte(c), (int)(c & 0x3f)); else if (c <= 0xffff) - printf("%c%c%c", utf8_firstbyte(c), (c>>6)&0x3f, c&0x3f); + printf("%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 6) & 0x3f), + (int)(c & 0x3f)); else if (c <= 0x1fffff) - printf("%c%c%c%c", utf8_firstbyte(c), (c>>12)&0x3f, (c>>6)&0x3f, c&0x3f); + printf("%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 12) & 0x3f), + (int)((c >> 6) & 0x3f), (int)(c & 0x3f)); else if (c <= 0x3ffffff) - printf("%c%c%c%c%c", utf8_firstbyte(c), (c>>18)&0x3f, (c>>12)&0x3f, (c>>6)&0x3f, c&0x3f); + printf("%c%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 18) & 0x3f), + (int)((c >> 12) & 0x3f), (int)((c >> 6) & 0x3f), (int)(c & 0x3f)); else if (c <= 0x7fffffff) - printf("%c%c%c%c%c%c", utf8_firstbyte(c), (c>>24)&0x3f, (c>>18)&0x3f, (c>>12)&0x3f, (c>>6)&0x3f, c&0x3f); + printf("%c%c%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 24) & 0x3f), + (int)((c >> 18) & 0x3f), (int)((c >> 12) & 0x3f), + (int)((c >> 6) & 0x3f), (int)(c & 0x3f)); } else if (c < 0xff) { - printf("\\%o", c); + printf("\\%o", (int)c); } else { - printf("%c%c", c>>MBRE_BYTEWIDTH, c&0xff); + printf("%c%c", (int)(c >> BYTEWIDTH), (int)(c &0xff)); } } -#endif /* If the buffer isn't allocated when it comes in, use this. */ #define INIT_BUF_SIZE 28 @@ -654,7 +532,7 @@ /* Make sure we have at least N more bytes of space in buffer. */ #define GET_BUFFER_SPACE(n) \ do { \ - while (b - bufp->buffer + (size_t)(n) >= (size_t)bufp->allocated) \ + while (b - bufp->buffer + (n) >= bufp->allocated) \ EXTEND_BUFFER; \ } while (0) @@ -690,8 +568,8 @@ /* Set the bit for character C in a character set list. 
*/ #define SET_LIST_BIT(c) \ - (b[(unsigned char)(c) / MBRE_BYTEWIDTH] \ - |= 1 << ((unsigned char)(c) % MBRE_BYTEWIDTH)) + (b[(unsigned char)(c) / BYTEWIDTH] \ + |= 1 << ((unsigned char)(c) % BYTEWIDTH)) /* Get the next unsigned number in the uncompiled pattern. */ #define GET_UNSIGNED_NUMBER(num) \ @@ -824,7 +702,7 @@ unsigned short i, j; size = *b++; - if ((int)c / MBRE_BYTEWIDTH < (int)size && b[c / MBRE_BYTEWIDTH] & 1 << c % MBRE_BYTEWIDTH) { + if ((int)c / BYTEWIDTH < (int)size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH) { return 1; } b += size + 2; @@ -839,13 +717,11 @@ else j = k; } - if (i < size && EXTRACT_MBC(&b[i*8]) <= c - && ((unsigned char)c != '\n' && (unsigned char)c != '\0')) + if (i < size && EXTRACT_MBC(&b[i*8]) <= c) return 1; return 0; } -#if 0 static void print_partial_compiled_pattern(start, end) unsigned char *start; @@ -954,9 +830,9 @@ putchar ('/'); - for (bit = 0; bit < MBRE_BYTEWIDTH; bit++) + for (bit = 0; bit < BYTEWIDTH; bit++) if (map_byte & (1 << bit)) - printf("%c", c * MBRE_BYTEWIDTH + bit); + printf("%c", c * BYTEWIDTH + bit); } p += mcnt; mcnt = EXTRACT_UNSIGNED_AND_INCR(p); @@ -1094,13 +970,12 @@ static void print_compiled_pattern(bufp) - struct mbre_pattern_buffer *bufp; + struct re_pattern_buffer *bufp; { unsigned char *buffer = (unsigned char*)bufp->buffer; print_partial_compiled_pattern(buffer, buffer + bufp->used); } -#endif static char* calculate_must_string(start, end) @@ -1170,7 +1045,7 @@ p += mcnt; mcnt = EXTRACT_UNSIGNED_AND_INCR(p); while (mcnt--) { - p += 4; + p += 8; } break; @@ -1292,27 +1167,27 @@ PATTERN is the address of the pattern string SIZE is the length of it. - BUFP is a struct mbre_pattern_buffer * which points to the info + BUFP is a struct re_pattern_buffer * which points to the info on where to store the byte commands. This structure contains a char * which points to the actual space, which should have been obtained with malloc. re_compile_pattern may use realloc to grow the buffer space. 
The number of bytes of commands can be found out by looking in - the `struct mbre_pattern_buffer' that bufp pointed to, after + the `struct re_pattern_buffer' that bufp pointed to, after re_compile_pattern returns. */ char * re_compile_pattern(pattern, size, bufp) const char *pattern; int size; - struct mbre_pattern_buffer *bufp; + struct re_pattern_buffer *bufp; { register char *b = bufp->buffer; register const char *p = pattern; const char *nextp; const char *pend = pattern + size; - register unsigned int c, c1=0; + register unsigned int c, c1 = 0; const char *p0; int numlen; #define ERROR_MSG_MAX_SIZE 200 @@ -1388,13 +1263,9 @@ int options = bufp->options; - int current_mbctype = bufp->mbctype; - const unsigned char *re_mbctab = re_mbctab_get(current_mbctype); - bufp->fastmap_accurate = 0; bufp->must = 0; bufp->must_skip = 0; - bufp->stclass = 0; /* Initialize the syntax table. */ init_syntax_once(); @@ -1416,7 +1287,7 @@ switch (c) { case '$': - if (bufp->options & MBRE_OPTION_SINGLELINE) { + if (bufp->options & RE_OPTION_SINGLELINE) { BUFPUSH(endbuf); } else { @@ -1436,7 +1307,7 @@ break; case '^': - if (bufp->options & MBRE_OPTION_SINGLELINE) + if (bufp->options & RE_OPTION_SINGLELINE) BUFPUSH(begbuf); else BUFPUSH(begline); @@ -1450,9 +1321,6 @@ snprintf(error_msg, ERROR_MSG_MAX_SIZE, "invalid regular expression; there's no previous pattern, to which '%c' would define cardinality at %d", c, p-pattern); - if (bufp->buffer) { - xfree(bufp->buffer); - } FREE_AND_RETURN(stackb, error_msg); } /* If there is a sequence of repetition chars, @@ -1536,7 +1404,7 @@ case '[': if (p == pend) FREE_AND_RETURN(stackb, "invalid regular expression; '[' can't be the last character ie. 
can't start range at the end of pattern"); - while ((b - bufp->buffer + 9 + (1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH) + while ((b - bufp->buffer + 9 + (1 << BYTEWIDTH) / BYTEWIDTH) > bufp->allocated) EXTEND_BUFFER; @@ -1549,9 +1417,9 @@ BUFPUSH(charset); p0 = p; - BUFPUSH((1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH); + BUFPUSH((1 << BYTEWIDTH) / BYTEWIDTH); /* Clear the whole map */ - memset(b, 0, (1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH + 2); + memset(b, 0, (1 << BYTEWIDTH) / BYTEWIDTH + 2); had_mbchar = 0; had_num_literal = 0; @@ -1562,11 +1430,11 @@ int size; unsigned last = (unsigned)-1; - if ((size = EXTRACT_UNSIGNED(&b[(1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH])) + if ((size = EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])) || current_mbctype) { /* Ensure the space is enough to hold another interval of multi-byte chars in charset(_not)?. */ - size = (1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH + 2 + size*8 + 8; + size = (1 << BYTEWIDTH) / BYTEWIDTH + 2 + size*8 + 8; while (b + size + 1 > bufp->buffer + bufp->allocated) EXTEND_BUFFER; } @@ -1602,7 +1470,7 @@ PATFETCH_RAW(c); switch (c) { case 'w': - for (c = 0; c < (1 << MBRE_BYTEWIDTH); c++) { + for (c = 0; c < (1 << BYTEWIDTH); c++) { if (SYNTAX(c) == Sword || (!current_mbctype && SYNTAX(c) == Sword2)) SET_LIST_BIT(c); @@ -1615,7 +1483,7 @@ continue; case 'W': - for (c = 0; c < (1 << MBRE_BYTEWIDTH); c++) { + for (c = 0; c < (1 << BYTEWIDTH); c++) { if (SYNTAX(c) != Sword && ((current_mbctype && !re_mbctab[c]) || (!current_mbctype && SYNTAX(c) != Sword2))) @@ -1662,6 +1530,7 @@ case 'x': c = scan_hex(p, 2, &numlen); + if (numlen == 0) goto invalid_escape; p += numlen; had_num_literal = 1; break; @@ -1677,11 +1546,15 @@ case 'M': case 'C': case 'c': - p0 = --p; - c = read_special(p, pend, &p0); - if (c > 255) goto invalid_escape; - p = p0; - had_num_literal = 1; + { + char *pp; + + --p; + c = read_special(p, pend, &pp); + if (c > 255) goto invalid_escape; + p = pp; + had_num_literal = 1; + } break; default: @@ -1768,7 
+1641,7 @@ if (p == pend) FREE_AND_RETURN(stackb, "invalid regular expression; range doesn't have ending ']' after a character class"); - for (ch = 0; ch < 1 << MBRE_BYTEWIDTH; ch++) { + for (ch = 0; ch < 1 << BYTEWIDTH; ch++) { if ( (is_alnum && ISALNUM(ch)) || (is_alpha && ISALPHA(ch)) || (is_blank && ISBLANK(ch)) @@ -1806,12 +1679,12 @@ /* Discard any character set/class bitmap bytes that are all 0 at the end of the map. Decrement the map-length byte too. */ - while ((int)b[-1] > 0 && b[(int)b[-1] - 1] == 0) + while ((int)b[-1] > 0 && b[b[-1] - 1] == 0) b[-1]--; - if (b[-1] != (1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH) - memmove(&b[(int)b[-1]], &b[(1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH], - 2 + EXTRACT_UNSIGNED(&b[(1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH])*8); - b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[(int)b[-1]])*8; + if (b[-1] != (1 << BYTEWIDTH) / BYTEWIDTH) + memmove(&b[b[-1]], &b[(1 << BYTEWIDTH) / BYTEWIDTH], + 2 + EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])*8); + b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[b[-1]])*8; break; case '(': @@ -1839,43 +1712,43 @@ case 'x': if (negative) - options &= ~MBRE_OPTION_EXTENDED; + options &= ~RE_OPTION_EXTENDED; else - options |= MBRE_OPTION_EXTENDED; + options |= RE_OPTION_EXTENDED; break; case 'p': if (negative) { - if ((options&MBRE_OPTION_POSIXLINE) == MBRE_OPTION_POSIXLINE) { - options &= ~MBRE_OPTION_POSIXLINE; + if ((options&RE_OPTION_POSIXLINE) == RE_OPTION_POSIXLINE) { + options &= ~RE_OPTION_POSIXLINE; } } - else if ((options&MBRE_OPTION_POSIXLINE) != MBRE_OPTION_POSIXLINE) { - options |= MBRE_OPTION_POSIXLINE; + else if ((options&RE_OPTION_POSIXLINE) != RE_OPTION_POSIXLINE) { + options |= RE_OPTION_POSIXLINE; } push_option = 1; break; case 'm': if (negative) { - if (options&MBRE_OPTION_MULTILINE) { - options &= ~MBRE_OPTION_MULTILINE; + if (options&RE_OPTION_MULTILINE) { + options &= ~RE_OPTION_MULTILINE; } } - else if (!(options&MBRE_OPTION_MULTILINE)) { - options |= MBRE_OPTION_MULTILINE; + else if 
(!(options&RE_OPTION_MULTILINE)) { + options |= RE_OPTION_MULTILINE; } push_option = 1; break; case 'i': if (negative) { - if (options&MBRE_OPTION_IGNORECASE) { - options &= ~MBRE_OPTION_IGNORECASE; + if (options&RE_OPTION_IGNORECASE) { + options &= ~RE_OPTION_IGNORECASE; } } - else if (!(options&MBRE_OPTION_IGNORECASE)) { - options |= MBRE_OPTION_IGNORECASE; + else if (!(options&RE_OPTION_IGNORECASE)) { + options |= RE_OPTION_IGNORECASE; } casefold = 1; break; @@ -1920,7 +1793,7 @@ BUFPUSH(options); } if (casefold) { - if (options & MBRE_OPTION_IGNORECASE) + if (options & RE_OPTION_IGNORECASE) BUFPUSH(casefold_on); else BUFPUSH(casefold_off); @@ -1932,8 +1805,8 @@ } /* Laststart should point to the start_memory that we are about - to push (unless the pattern has MBRE_NREGS or more ('s). */ - /* obsolete: now MBRE_NREGS is just a default register size. */ + to push (unless the pattern has RE_NREGS or more ('s). */ + /* obsolete: now RE_NREGS is just a default register size. */ *stackp++ = b - bufp->buffer; *stackp++ = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; *stackp++ = begalt - bufp->buffer; @@ -1945,7 +1818,7 @@ *stackp++ = b - bufp->buffer; BUFPUSH(0); /* too many ()'s to fit in a byte. 
(max 254) */ - if (regnum >= MBRE_REG_MAX) goto too_big; + if (regnum >= RE_REG_MAX) goto too_big; break; case '=': @@ -1974,7 +1847,7 @@ BUFPUSH(options); } if (casefold) { - if (options & MBRE_OPTION_IGNORECASE) + if (options & RE_OPTION_IGNORECASE) BUFPUSH(casefold_on); else BUFPUSH(casefold_off); @@ -2004,10 +1877,10 @@ store_jump(fixup_alt_jump, jump, b); } if (options != stackp[-1]) { - if ((options ^ stackp[-1]) & MBRE_OPTION_IGNORECASE) { - BUFPUSH((options&MBRE_OPTION_IGNORECASE)?casefold_off:casefold_on); + if ((options ^ stackp[-1]) & RE_OPTION_IGNORECASE) { + BUFPUSH((options&RE_OPTION_IGNORECASE)?casefold_off:casefold_on); } - if ((options ^ stackp[-1]) != MBRE_OPTION_IGNORECASE) { + if ((options ^ stackp[-1]) != RE_OPTION_IGNORECASE) { BUFPUSH(option_set); BUFPUSH(stackp[-1]); } @@ -2127,9 +2000,9 @@ if (lower_bound < 0 || c != '}') goto unfetch_interval; - if (lower_bound >= MBRE_DUP_MAX || upper_bound >= MBRE_DUP_MAX) + if (lower_bound >= RE_DUP_MAX || upper_bound >= RE_DUP_MAX) FREE_AND_RETURN(stackb, "too big quantifier in {,}"); - if (upper_bound < 0) upper_bound = MBRE_DUP_MAX; + if (upper_bound < 0) upper_bound = RE_DUP_MAX; if (lower_bound > upper_bound) FREE_AND_RETURN(stackb, "can't do {n,m} with n > m"); @@ -2145,7 +2018,7 @@ if (lower_bound == 0) { zero_times_ok = 1; - if (upper_bound == MBRE_DUP_MAX) { + if (upper_bound == RE_DUP_MAX) { many_times_ok = 1; goto repeat; } @@ -2159,7 +2032,7 @@ /* No need to repeat */ break; } - if (upper_bound == MBRE_DUP_MAX) { + if (upper_bound == RE_DUP_MAX) { many_times_ok = 1; zero_times_ok = 0; goto repeat; @@ -2226,7 +2099,7 @@ `upper_bound' is 1, though.) */ { /* If the upper bound is > 1, we need to insert more at the end of the loop. */ - unsigned int nbytes = (unsigned int)upper_bound == 1 ? 10 : 20; + unsigned nbytes = upper_bound == 1 ? 
10 : 20; GET_BUFFER_SPACE(nbytes); /* Initialize lower bound of the `succeed_n', even @@ -2300,7 +2173,7 @@ case 'S': case 'd': case 'D': - while (b - bufp->buffer + 9 + (1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH + while (b - bufp->buffer + 9 + (1 << BYTEWIDTH) / BYTEWIDTH > bufp->allocated) EXTEND_BUFFER; @@ -2312,8 +2185,8 @@ BUFPUSH(charset_not); } - BUFPUSH((1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH); - memset(b, 0, (1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH + 2); + BUFPUSH((1 << BYTEWIDTH) / BYTEWIDTH); + memset(b, 0, (1 << BYTEWIDTH) / BYTEWIDTH + 2); if (c == 's' || c == 'S') { SET_LIST_BIT(' '); SET_LIST_BIT('\t'); @@ -2331,10 +2204,10 @@ while ((int)b[-1] > 0 && b[b[-1] - 1] == 0) b[-1]--; - if (b[-1] != (1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH) - memmove(&b[(int)b[-1]], &b[(1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH], - 2 + EXTRACT_UNSIGNED(&b[(1 << MBRE_BYTEWIDTH) / MBRE_BYTEWIDTH])*8); - b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[(int)b[-1]])*8; + if (b[-1] != (1 << BYTEWIDTH) / BYTEWIDTH) + memmove(&b[b[-1]], &b[(1 << BYTEWIDTH) / BYTEWIDTH], + 2 + EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])*8); + b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[b[-1]])*8; break; case 'w': @@ -2370,7 +2243,7 @@ break; case 'Z': - if ((bufp->options & MBRE_OPTION_SINGLELINE) == 0) { + if ((bufp->options & RE_OPTION_SINGLELINE) == 0) { BUFPUSH(endbuf2); break; } @@ -2387,6 +2260,7 @@ case 'x': had_mbchar = 0; c = scan_hex(p, 2, &numlen); + if (numlen == 0) goto invalid_escape; p += numlen; had_num_literal = 1; goto numeric_char; @@ -2394,7 +2268,7 @@ /* octal */ case '0': had_mbchar = 0; - c = scan_oct(p, 3, &numlen); + c = scan_oct(p, 2, &numlen); p += numlen; had_num_literal = 1; goto numeric_char; @@ -2411,7 +2285,7 @@ GET_UNSIGNED_NUMBER(c1); if (!ISDIGIT(c)) PATUNFETCH; - if (9 < c1 && c1 >= (unsigned int)regnum) { + if (9 < c1 && c1 >= regnum) { /* need to get octal */ c = scan_oct(p0, 3, &numlen) & 0xff; p = p0 + numlen; @@ -2442,7 +2316,7 @@ break; case '#': - if (options & 
MBRE_OPTION_EXTENDED) { + if (options & RE_OPTION_EXTENDED) { while (p != pend) { PATFETCH(c); if (c == '\n') break; @@ -2456,7 +2330,7 @@ case '\f': case '\r': case '\n': - if (options & MBRE_OPTION_EXTENDED) + if (options & RE_OPTION_EXTENDED) break; default: @@ -2509,23 +2383,7 @@ if (*laststart == dummy_failure_jump) laststart += 3; else if (*laststart == try_next) laststart += 3; if (*laststart == anychar_repeat) { - bufp->options |= MBRE_OPTIMIZE_ANCHOR; - } - else if (*laststart == on_failure_jump) { - int mcnt; - - laststart++; - EXTRACT_NUMBER_AND_INCR(mcnt, laststart); - if (*laststart == charset || *laststart == charset_not) { - p0 = laststart; - mcnt = *++p0; - p0 += mcnt+1; - mcnt = EXTRACT_UNSIGNED_AND_INCR(p0); - p0 += 8*mcnt; - if (*p0 == maybe_finalize_jump) { - bufp->stclass = laststart; - } - } + bufp->options |= RE_OPTIMIZE_ANCHOR; } } @@ -2535,14 +2393,14 @@ if (laststart != b) { if (*laststart == start_memory) laststart += 3; if (*laststart == exactn) { - bufp->options |= MBRE_OPTIMIZE_EXACTN; + bufp->options |= RE_OPTIMIZE_EXACTN; bufp->must = laststart+1; } } if (!bufp->must) { bufp->must = calculate_must_string(bufp->buffer, b); } - if (current_mbctype == MBCTYPE_SJIS) bufp->options |= MBRE_OPTIMIZE_NO_BM; + if (current_mbctype == MBCTYPE_SJIS) bufp->options |= RE_OPTIMIZE_NO_BM; else if (bufp->must) { int i; int len = (unsigned char)bufp->must[0]; @@ -2550,12 +2408,12 @@ for (i=1; i<len; i++) { if ((unsigned char)bufp->must[i] == 0xff || (current_mbctype && ismbchar(bufp->must[i]))) { - bufp->options |= MBRE_OPTIMIZE_NO_BM; + bufp->options |= RE_OPTIMIZE_NO_BM; break; } } - if (!(bufp->options & MBRE_OPTIMIZE_NO_BM)) { - bufp->must_skip = (int *) xmalloc((1 << MBRE_BYTEWIDTH)*sizeof(int)); + if (!(bufp->options & RE_OPTIMIZE_NO_BM)) { + bufp->must_skip = (int *) xmalloc((1 << BYTEWIDTH)*sizeof(int)); bm_init_skip(bufp->must_skip, (unsigned char*)bufp->must+1, (unsigned char)bufp->must[0], (unsigned char*)(MAY_TRANSLATE()?translate:0)); @@ 
-2566,7 +2424,7 @@ bufp->regend = TMALLOC(regnum, unsigned char*); bufp->old_regstart = TMALLOC(regnum, unsigned char*); bufp->old_regend = TMALLOC(regnum, unsigned char*); - bufp->reg_info = TMALLOC(regnum, mbre_register_info_type); + bufp->reg_info = TMALLOC(regnum, register_info_type); bufp->best_regstart = TMALLOC(regnum, unsigned char*); bufp->best_regend = TMALLOC(regnum, unsigned char*); FREE_AND_RETURN(stackb, 0); @@ -2592,21 +2450,20 @@ void re_free_pattern(bufp) - struct mbre_pattern_buffer *bufp; + struct re_pattern_buffer *bufp; { - if(bufp){ - if (bufp->buffer) xfree(bufp->buffer); - if (bufp->fastmap) xfree(bufp->fastmap); - if (bufp->must_skip) xfree(bufp->must_skip); - - if (bufp->regstart) xfree(bufp->regstart); - if (bufp->regend) xfree(bufp->regend); - if (bufp->old_regstart) xfree(bufp->old_regstart); - if (bufp->old_regend) xfree(bufp->old_regend); - if (bufp->best_regstart) xfree(bufp->best_regstart); - if (bufp->best_regend) xfree(bufp->best_regend); - if (bufp->reg_info) xfree(bufp->reg_info); - } + xfree(bufp->buffer); + xfree(bufp->fastmap); + if (bufp->must_skip) xfree(bufp->must_skip); + + xfree(bufp->regstart); + xfree(bufp->regend); + xfree(bufp->old_regstart); + xfree(bufp->old_regend); + xfree(bufp->best_regstart); + xfree(bufp->best_regend); + xfree(bufp->reg_info); + xfree(bufp); } /* Store a jump of the form <OPCODE> <relative address>. @@ -2692,7 +2549,6 @@ If you call this function, you must zero out pending_exact. */ -#if 0 static void insert_op(op, there, current_end) int op; @@ -2706,7 +2562,7 @@ there[0] = (char)op; } -#endif + /* Open up space at location THERE, and insert operation OP followed by NUM_1 and NUM_2. 
CURRENT_END gives the end of the storage in use, so @@ -2752,13 +2608,12 @@ } static int -slow_search(little, llen, big, blen, translate, re_mbctab) +slow_search(little, llen, big, blen, translate) unsigned char *little; int llen; unsigned char *big; int blen; char *translate; - const unsigned char *re_mbctab; { unsigned char *bsave = big; unsigned char *bend = big + blen; @@ -2870,16 +2725,16 @@ } /* Given a pattern, compute a fastmap from it. The fastmap records - which of the (1 << MBRE_BYTEWIDTH) possible characters can start a string + which of the (1 << BYTEWIDTH) possible characters can start a string that matches the pattern. This fastmap is used by re_search to skip quickly over totally implausible text.