Loading ...
Sorry, an error occurred while loading the content.
 

[vim-multibyte] Multibyte regexp

Expand Messages
  • Taro Muraoka
    -Regular expression search and substitute. (16K) Please test. ... Taro Muraoka Problem: Cannot multibyte pattern search and
    Message 1 of 4 , Jan 15, 2000
      -Regular expression search and substitute. (16K)
      Please test.
      ----
      Taro Muraoka <koron@...>


      Problem: Multibyte pattern search and substitute do not work.
      Solution: Add a new regexp node for multibyte characters, plus related changes.
      Files: src/regexp.c src/misc2.c


      *** ./src.orig/regexp.c Sat May 15 22:48:52 1999
      --- ./src/regexp.c Sat Jan 15 14:45:42 2000
      ***************
      *** 195,200 ****
      --- 195,204 ----
      #define BACKREF 80 /* -89 node Match same string again \1-\9 */
      #define BRACE_COMPLEX 90 /* -99 node Match nodes between m & n times */

      + #ifdef MULTI_BYTE
      + #define MULTIBYTECODE 200 /* str Match multibyte code */
      + #endif
      +
      #define Magic(x) ((x) | ('\\' << 8))

      /*
      ***************
      *** 416,421 ****
      --- 420,433 ----
      {
      ++p;
      if (*p != ']' && *p != NUL)
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*p))
      + {
      + if (*++p == NUL) break;
      + ++p;
      + }
      + else
      + #endif
      ++p;
      }
      else if (*p == '\\'
      ***************
      *** 428,433 ****
      --- 440,452 ----
      ++p; /* It was not a class name */
      }
      else
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*p))
      + {
      + if (*++p == NUL) break;
      + }
      + else
      + #endif
      ++p;
      }

      ***************
      *** 491,496 ****
      --- 510,518 ----
      * Global work variables for vim_regcomp().
      */

      + #ifdef MULTI_BYTE
      + static int skip_multi; /* previous skip was multibyte */
      + #endif
      static char_u *regparse; /* Input-scan pointer. */
      static int num_complex_braces; /* Complex \{...} count */
      static int regnpar; /* () count. */
      ***************
      *** 534,539 ****
      --- 556,575 ----
      static int read_limits __ARGS((int, int, int *, int *));
      static void regtail __ARGS((char_u *, char_u *));
      static void regoptail __ARGS((char_u *, char_u *));
      + #ifdef MULTI_BYTE
      + static int re_ismultibytecode __ARGS((int));
      +
      + /*
      + * Is chr multi-byte? If no then return 0 else return leadbyte
      + */
      + int
      + re_ismultibytecode(c)
      + int c;
      + {
      + int lead = ( c >> 8 ) & 0xFF;
      + return IsLeadByte(lead) ? lead : 0;
      + }
      + #endif

      /*
      * Skip past regular expression.
      ***************
      *** 559,564 ****
      --- 595,607 ----
      }
      else if (p[0] == '\\' && p[1] != NUL)
      ++p; /* skip next character */
      + #ifdef MULTI_BYTE
      + else if (IsLeadByte(*p))
      + {
      + if (*++p == NUL)
      + break;
      + }
      + #endif
      }
      return p;
      }
      ***************
      *** 1148,1154 ****
      --- 1191,1201 ----
      case Magic('['):
      {
      char_u *p;
      + #ifdef MULTI_BYTE
      + char_u *base;

      + base = regparse;
      + #endif
      /*
      * If there is no matching ']', we assume the '[' is a normal
      * character. This makes ":help [" work.
      ***************
      *** 1177,1182 ****
      --- 1224,1262 ----
      regc('-');
      else
      {
      + #ifdef MULTI_BYTE
      + int lead, start, end;
      +
      + start = UCHARAT(regparse - 2) + 1;
      + end = UCHARAT(regparse);
      + if (IsTrailByte(base, regparse - 2))
      + {
      + if (!IsLeadByte(end))
      + EMSG_RET_NULL(e_invrange);
      + lead = UCHARAT(regparse - 3);
      + if (lead != end)
      + EMSG_RET_NULL(e_invrange);
      + end = UCHARAT(regparse + 1);
      + if (start > end +1)
      + EMSG_RET_NULL(e_invrange);
      + for (; start <= end; start++)
      + {
      + regc(lead);
      + regc(start);
      + }
      + regparse += 2;
      + }
      + else
      + {
      + if (IsLeadByte(end))
      + EMSG_RET_NULL(e_invrange);
      + if (start > end +1)
      + EMSG_RET_NULL(e_invrange);
      + for (; start <= end; start++)
      + regc(start);
      + regparse++;
      + }
      + #else
      int cclass;
      int cclassend;

      ***************
      *** 1187,1192 ****
      --- 1267,1273 ----
      for (; cclass <= cclassend; cclass++)
      regc(cclass);
      regparse++;
      + #endif
      }
      }
      /*
      ***************
      *** 1216,1222 ****
      --- 1297,1312 ----
      regc(cu);
      }
      else
      + #ifndef MULTI_BYTE
      regc(*regparse++);
      + #else
      + {
      + int c;
      + regc(c = *regparse++);
      + if (IsLeadByte(c))
      + regc(*regparse++);
      + }
      + #endif
      }
      regc('\0');
      if (*regparse != ']')
      ***************
      *** 1234,1239 ****
      --- 1324,1342 ----
      int chr;

      ungetchr();
      + #if defined(MULTI_BYTE)
      + chr = re_ismultibytecode(peekchr());
      + if (chr)
      + {
      + ret = regnode(MULTIBYTECODE);
      + regc(chr);
      + regc(PeekChr() & 0xFF);
      + skipchr();
      + *flagp |= HASWIDTH;
      + // *flagp |= HASWIDTH | SIMPLE;
      + break;
      + }
      + #endif
      len = 0;
      ret = regnode(EXACTLY);
      /*
      ***************
      *** 1542,1547 ****
      --- 1645,1655 ----
      curchr = regparse[0];
      }
      break;
      + #ifdef MULTI_BYTE
      + default:
      + if (IsLeadByte(curchr))
      + curchr = curchr << 8 | regparse[1];
      + #endif
      }
      }

      ***************
      *** 1551,1556 ****
      --- 1659,1673 ----
      static void
      skipchr()
      {
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*regparse))
      + {
      + skip_multi = 1;
      + regparse++;
      + }
      + else
      + skip_multi = 0;
      + #endif
      regparse++;
      prev_at_start = at_start;
      at_start = FALSE;
      ***************
      *** 1584,1589 ****
      --- 1701,1713 ----
      * Backup regparse as well; not because we will use what it points at,
      * but because skipchr() will bump it again.
      */
      + #ifdef MULTI_BYTE
      + if (skip_multi)
      + {
      + regparse--;
      + skip_multi = 0;
      + }
      + #endif
      regparse--;
      }

      ***************
      *** 1702,1707 ****
      --- 1826,1835 ----
      {
      if (cstrncmp(s, prog->regmust, prog->regmlen) == 0)
      break; /* Found it. */
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*s))
      + if (!(*++s)) break;
      + #endif
      s++;
      }
      if (s == NULL) /* Not present. */
      ***************
      *** 1730,1735 ****
      --- 1858,1867 ----
      {
      if (regtry(prog, s))
      return 1;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*s))
      + if (!(*++s)) break;
      + #endif
      s++;
      }
      else
      ***************
      *** 1738,1743 ****
      --- 1870,1879 ----
      {
      if (regtry(prog, s))
      return 1;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*s))
      + if (!(*++s)) break;
      + #endif
      } while (*s++ != '\0');

      /* Failure. */
      ***************
      *** 1855,1990 ****
      --- 1991,2207 ----
      case ANY:
      if (*reginput == '\0')
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case IDENT:
      if (!vim_isIDc(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case KWORD:
      if (!vim_iswordc(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case FNAME:
      if (!vim_isfilec(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case PRINT:
      if (charsize(*reginput) != 1)
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case SIDENT:
      if (isdigit(*reginput) || !vim_isIDc(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case SWORD:
      if (isdigit(*reginput) || !vim_iswordc(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case SFNAME:
      if (isdigit(*reginput) || !vim_isfilec(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case SPRINT:
      if (isdigit(*reginput) || charsize(*reginput) != 1)
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case WHITE:
      if (!vim_iswhite(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case NWHITE:
      if (*reginput == NUL || vim_iswhite(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case DIGIT:
      if (!ri_digit(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case NDIGIT:
      if (*reginput == NUL || ri_digit(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case HEX:
      if (!ri_hex(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case NHEX:
      if (*reginput == NUL || ri_hex(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case OCTAL:
      if (!ri_octal(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case NOCTAL:
      if (*reginput == NUL || ri_octal(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case WORD:
      if (!ri_word(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case NWORD:
      if (*reginput == NUL || ri_word(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case HEAD:
      if (!ri_head(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case NHEAD:
      if (*reginput == NUL || ri_head(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case ALPHA:
      if (!ri_alpha(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case NALPHA:
      if (*reginput == NUL || ri_alpha(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case LOWER:
      if (!ri_lower(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case NLOWER:
      if (*reginput == NUL || ri_lower(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case UPPER:
      if (!ri_upper(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case NUPPER:
      if (*reginput == NUL || ri_upper(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*reginput++))
      + #endif
      reginput++;
      break;
      case EXACTLY:
      ***************
      *** 2004,2016 ****
      --- 2221,2267 ----
      }
      break;
      case ANYOF:
      + #ifdef MULTI_BYTE
      + {
      + int c;
      +
      + if ((c = *reginput) == '\0' )
      + return 0;
      + if (IsLeadByte(c))
      + {
      + if (cstrchr(OPERAND(scan), c << 8 | reginput[1]) == NULL)
      + return 0;
      + reginput++;
      + }
      + else
      + if (cstrchr(OPERAND(scan), c) == NULL) return 0;
      + }
      + #else
      if (*reginput == '\0' || cstrchr(OPERAND(scan), *reginput) == NULL)
      return 0;
      + #endif
      reginput++;
      break;
      case ANYBUT:
      + #ifdef MULTI_BYTE
      + {
      + int c;
      +
      + if ((c = *reginput) == '\0' )
      + return 0;
      + if (IsLeadByte(c))
      + {
      + if (cstrchr(OPERAND(scan), c << 8 | reginput[1]) != NULL)
      + return 0;
      + reginput++;
      + }
      + else
      + if (cstrchr(OPERAND(scan), c) != NULL) return 0;
      + }
      + #else
      if (*reginput == '\0' || cstrchr(OPERAND(scan), *reginput) != NULL)
      return 0;
      + #endif
      reginput++;
      break;
      case NOTHING:
      ***************
      *** 2316,2321 ****
      --- 2567,2584 ----
      case END:
      return 1; /* Success! */
      /* break; Not Reached */
      + #ifdef MULTI_BYTE
      + case MULTIBYTECODE:
      + {
      + char_u *opnd;
      +
      + opnd = OPERAND(scan);
      + if( *opnd != *reginput || *(opnd+1) != *(reginput+1) )
      + return 0;
      + reginput += 2;
      + }
      + break;
      + #endif
      default:
      emsg(e_re_corr);
      #ifdef DEBUG
      ***************
      *** 2479,2497 ****
      --- 2742,2818 ----
      break;
      }
      case ANYOF:
      + #ifdef MULTI_BYTE
      + while (1)
      + {
      + int c;
      +
      + if ((c = *scan) == '\0')
      + break;
      + if (IsLeadByte(c))
      + {
      + if (cstrchr(opnd, c << 8 | *(scan + 1)) == NULL)
      + break;
      + count++;
      + scan++;
      + }
      + else
      + if (cstrchr(opnd, c) == NULL)
      + break;
      + count++;
      + scan++;
      + }
      + #else
      while (*scan != '\0' && cstrchr(opnd, *scan) != NULL)
      {
      count++;
      scan++;
      }
      + #endif
      break;
      case ANYBUT:
      + #ifdef MULTI_BYTE
      + while (1)
      + {
      + int c;
      +
      + if ((c = *scan) == '\0')
      + break;
      + if (IsLeadByte(c))
      + {
      + if (cstrchr(opnd, c << 8 | *(scan + 1)) != NULL)
      + break;
      + count++;
      + scan++;
      + }
      + else
      + if (cstrchr(opnd, c) != NULL)
      + break;
      + count++;
      + scan++;
      + }
      + #else
      while (*scan != '\0' && cstrchr(opnd, *scan) == NULL)
      {
      count++;
      scan++;
      }
      + #endif
      break;
      + #ifdef MULTI_BYTE
      + case MULTIBYTECODE:
      + {
      + int cl, ct;
      + cl = opnd[0];
      + ct = opnd[1];
      + while( scan[0] == cl && scan[1] == ct )
      + {
      + count += 2;
      + scan++;
      + }
      + }
      + break;
      + #endif
      default: /* Oh dear. Called inappropriately. */
      emsg(e_re_corr);
      #ifdef DEBUG
      ***************
      *** 2769,2774 ****
      --- 3090,3100 ----
      sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
      p = NULL;
      break;
      + #ifdef MULTI_BYTE
      + case MULTIBYTECODE:
      + p = 'MULTIBYTE CODE';
      + break;
      + #endif
      default:
      sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
      p = NULL;
      ***************
      *** 2807,2812 ****
      --- 3133,3142 ----

      if (!reg_ic)
      return vim_strchr(s, c);
      + #ifdef MULTI_BYTE
      + if (re_ismultibytecode(c))
      + return vim_strchr(s, c);
      + #endif

      /* tolower() and toupper() can be slow, comparing twice should be a lot
      * faster (esp. when using MS Visual C++!) */
      ***************
      *** 2820,2825 ****
      --- 3150,3159 ----
      for (p = s; *p; ++p)
      if (*p == c || *p == cc)
      return p;
      + #ifdef MULTI_BYTE
      + else if (IsLeadByte(*p))
      + if (!(*++p)) break;
      + #endif
      return NULL;
      }

      ***************
      *** 3046,3051 ****
      --- 3380,3393 ----
      }
      if (copy)
      {
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(c) && *src != NUL)
      + {
      + *dst++ = c;
      + *dst = *src++;
      + }
      + else
      + #endif
      if (func == (fptr)NULL) /* just copy */
      *dst = c;
      else /* change case */
      *** ./src.orig/misc2.c Sun Jan 09 09:50:54 2000
      --- ./src/misc2.c Sun Jan 09 09:48:06 2000
      ***************
      *** 1017,1028 ****
      --- 1017,1043 ----
      {
      char_u *p;
      int c;
      + #ifdef MULTI_BYTE
      + int n2 = -1;
      + if (n > 255)
      + {
      + n2 = n & 0xFF;
      + n = (n >> 8) & 0xFF;
      + }
      + #endif

      p = string;
      while ((c = *p) != NUL)
      {
      if (c == n)
      + #ifdef MULTI_BYTE
      + if (n2 == -1 || p[1] == n2)
      + #endif
      return p;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(c))
      + if (!(*++p)) break;
      + #endif
      ++p;
      }
      return NULL;
      ***************
      *** 1039,1044 ****
      --- 1054,1063 ----
      {
      if (*string == n)
      retval = string;
      + #ifdef MULTI_BYTE
      + if (IsLeadByte(*string))
      + if (!(*++string)) break;
      + #endif
      ++string;
      }
      return retval;
    • Bram Moolenaar
      ... I'm glad Taro looked into this and made a patch. One question, like with the other ones: Doesn't is_dbcs need to be used here? -- hundred-and-one symptoms
      Message 2 of 4 , Jan 15, 2000
        Taro Muraoka wrote:

        > -Regular expression search and substitute. (16K)
        > Please test.

        I'm glad Taro looked into this and made a patch.

        One question, like with the other ones: Doesn't is_dbcs need to be used here?

        --
        hundred-and-one symptoms of being an internet addict:
        189. You put your e-mail address in the upper left-hand corner of envelopes.

        --/-/---- Bram Moolenaar ---- Bram@... ---- Bram@... ---\-\--
        \ \ www.vim.org/iccf www.moolenaar.net www.vim.org / /
      • Taro Muraoka
        ... Yes, it does. Sorry, I forgot about 8-bit characters. It is almost always necessary to check is_dbcs before calling IsLeadByte(). I have amended the patch. ... Taro Muraoka
        Message 3 of 4 , Jan 15, 2000
          Bram Moolenaar wrote:
          > One question, like with the other ones: Doesn't is_dbcs need to be used here?

          Yes, it does. Sorry, I forgot about 8-bit characters. It is almost always necessary
          to check is_dbcs before calling IsLeadByte(). I have amended the patch.
          ----
          Taro Muraoka koron@...


          Problem: Multibyte pattern search and substitute do not work.
          Solution: Add a new regexp node for multibyte characters, plus related changes.
          Files: src/regexp.c src/misc2.c


          *** ./src.orig/regexp.c Sat May 15 22:48:52 1999
          --- ./src/regexp.c Sun Jan 16 14:33:24 2000
          ***************
          *** 195,200 ****
          --- 195,204 ----
          #define BACKREF 80 /* -89 node Match same string again \1-\9 */
          #define BRACE_COMPLEX 90 /* -99 node Match nodes between m & n times */

          + #ifdef MULTI_BYTE
          + #define MULTIBYTECODE 200 /* str Match multibyte code */
          + #endif
          +
          #define Magic(x) ((x) | ('\\' << 8))

          /*
          ***************
          *** 416,421 ****
          --- 420,433 ----
          {
          ++p;
          if (*p != ']' && *p != NUL)
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*p))
          + {
          + if (*++p == NUL) break;
          + ++p;
          + }
          + else
          + #endif
          ++p;
          }
          else if (*p == '\\'
          ***************
          *** 428,433 ****
          --- 440,452 ----
          ++p; /* It was not a class name */
          }
          else
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*p))
          + {
          + if (*++p == NUL) break;
          + }
          + else
          + #endif
          ++p;
          }

          ***************
          *** 491,496 ****
          --- 510,518 ----
          * Global work variables for vim_regcomp().
          */

          + #ifdef MULTI_BYTE
          + static int skip_multi; /* previous skip was multibyte */
          + #endif
          static char_u *regparse; /* Input-scan pointer. */
          static int num_complex_braces; /* Complex \{...} count */
          static int regnpar; /* () count. */
          ***************
          *** 534,539 ****
          --- 556,575 ----
          static int read_limits __ARGS((int, int, int *, int *));
          static void regtail __ARGS((char_u *, char_u *));
          static void regoptail __ARGS((char_u *, char_u *));
          + #ifdef MULTI_BYTE
          + static int re_ismultibytecode __ARGS((int));
          +
          + /*
          + * Is chr multi-byte? If no then return 0 else return leadbyte
          + */
          + int
          + re_ismultibytecode(c)
          + int c;
          + {
          + int lead = ( c >> 8 ) & 0xFF;
          + return (is_dbcs && IsLeadByte(lead)) ? lead : 0;
          + }
          + #endif

          /*
          * Skip past regular expression.
          ***************
          *** 559,564 ****
          --- 595,607 ----
          }
          else if (p[0] == '\\' && p[1] != NUL)
          ++p; /* skip next character */
          + #ifdef MULTI_BYTE
          + else if (is_dbcs && IsLeadByte(*p))
          + {
          + if (*++p == NUL)
          + break;
          + }
          + #endif
          }
          return p;
          }
          ***************
          *** 1148,1154 ****
          --- 1191,1201 ----
          case Magic('['):
          {
          char_u *p;
          + #ifdef MULTI_BYTE
          + char_u *base;

          + base = regparse;
          + #endif
          /*
          * If there is no matching ']', we assume the '[' is a normal
          * character. This makes ":help [" work.
          ***************
          *** 1175,1180 ****
          --- 1222,1264 ----
          regparse++;
          if (*regparse == ']' || *regparse == '\0')
          regc('-');
          + #ifdef MULTI_BYTE
          + else if (is_dbcs)
          + {
          +
          + int lead, start, end;
          +
          + start = UCHARAT(regparse - 2) + 1;
          + end = UCHARAT(regparse);
          + if (IsTrailByte(base, regparse - 2))
          + {
          + if (!IsLeadByte(end))
          + EMSG_RET_NULL(e_invrange);
          + lead = UCHARAT(regparse - 3);
          + if (lead != end)
          + EMSG_RET_NULL(e_invrange);
          + end = UCHARAT(regparse + 1);
          + if (start > end +1)
          + EMSG_RET_NULL(e_invrange);
          + for (; start <= end; start++)
          + {
          + regc(lead);
          + regc(start);
          + }
          + regparse += 2;
          + }
          + else
          + {
          + if (IsLeadByte(end))
          + EMSG_RET_NULL(e_invrange);
          + if (start > end +1)
          + EMSG_RET_NULL(e_invrange);
          + for (; start <= end; start++)
          + regc(start);
          + regparse++;
          + }
          + }
          + #endif
          else
          {
          int cclass;
          ***************
          *** 1216,1222 ****
          --- 1300,1315 ----
          regc(cu);
          }
          else
          + #ifndef MULTI_BYTE
          regc(*regparse++);
          + #else
          + {
          + int c;
          + regc(c = *regparse++);
          + if (is_dbcs && IsLeadByte(c))
          + regc(*regparse++);
          + }
          + #endif
          }
          regc('\0');
          if (*regparse != ']')
          ***************
          *** 1234,1239 ****
          --- 1327,1344 ----
          int chr;

          ungetchr();
          + #ifdef MULTI_BYTE
          + chr = re_ismultibytecode(peekchr());
          + if (chr)
          + {
          + ret = regnode(MULTIBYTECODE);
          + regc(chr);
          + regc(PeekChr() & 0xFF);
          + skipchr();
          + *flagp |= HASWIDTH;
          + break;
          + }
          + #endif
          len = 0;
          ret = regnode(EXACTLY);
          /*
          ***************
          *** 1542,1547 ****
          --- 1647,1657 ----
          curchr = regparse[0];
          }
          break;
          + #ifdef MULTI_BYTE
          + default:
          + if (is_dbcs && IsLeadByte(curchr))
          + curchr = curchr << 8 | regparse[1];
          + #endif
          }
          }

          ***************
          *** 1551,1556 ****
          --- 1661,1675 ----
          static void
          skipchr()
          {
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*regparse))
          + {
          + skip_multi = 1;
          + regparse++;
          + }
          + else
          + skip_multi = 0;
          + #endif
          regparse++;
          prev_at_start = at_start;
          at_start = FALSE;
          ***************
          *** 1584,1589 ****
          --- 1703,1715 ----
          * Backup regparse as well; not because we will use what it points at,
          * but because skipchr() will bump it again.
          */
          + #ifdef MULTI_BYTE
          + if (skip_multi)
          + {
          + regparse--;
          + skip_multi = 0;
          + }
          + #endif
          regparse--;
          }

          ***************
          *** 1702,1707 ****
          --- 1828,1837 ----
          {
          if (cstrncmp(s, prog->regmust, prog->regmlen) == 0)
          break; /* Found it. */
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*s))
          + if (!(*++s)) break;
          + #endif
          s++;
          }
          if (s == NULL) /* Not present. */
          ***************
          *** 1730,1735 ****
          --- 1860,1869 ----
          {
          if (regtry(prog, s))
          return 1;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*s))
          + if (!(*++s)) break;
          + #endif
          s++;
          }
          else
          ***************
          *** 1738,1743 ****
          --- 1872,1881 ----
          {
          if (regtry(prog, s))
          return 1;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*s))
          + if (!(*++s)) break;
          + #endif
          } while (*s++ != '\0');

          /* Failure. */
          ***************
          *** 1855,1990 ****
          --- 1993,2209 ----
          case ANY:
          if (*reginput == '\0')
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case IDENT:
          if (!vim_isIDc(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case KWORD:
          if (!vim_iswordc(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case FNAME:
          if (!vim_isfilec(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case PRINT:
          if (charsize(*reginput) != 1)
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case SIDENT:
          if (isdigit(*reginput) || !vim_isIDc(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case SWORD:
          if (isdigit(*reginput) || !vim_iswordc(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case SFNAME:
          if (isdigit(*reginput) || !vim_isfilec(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case SPRINT:
          if (isdigit(*reginput) || charsize(*reginput) != 1)
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case WHITE:
          if (!vim_iswhite(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case NWHITE:
          if (*reginput == NUL || vim_iswhite(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case DIGIT:
          if (!ri_digit(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case NDIGIT:
          if (*reginput == NUL || ri_digit(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case HEX:
          if (!ri_hex(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case NHEX:
          if (*reginput == NUL || ri_hex(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case OCTAL:
          if (!ri_octal(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case NOCTAL:
          if (*reginput == NUL || ri_octal(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case WORD:
          if (!ri_word(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case NWORD:
          if (*reginput == NUL || ri_word(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case HEAD:
          if (!ri_head(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case NHEAD:
          if (*reginput == NUL || ri_head(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case ALPHA:
          if (!ri_alpha(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case NALPHA:
          if (*reginput == NUL || ri_alpha(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case LOWER:
          if (!ri_lower(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case NLOWER:
          if (*reginput == NUL || ri_lower(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case UPPER:
          if (!ri_upper(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case NUPPER:
          if (*reginput == NUL || ri_upper(*reginput))
          return 0;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
          + #endif
          reginput++;
          break;
          case EXACTLY:
          ***************
          *** 2004,2016 ****
          --- 2223,2269 ----
          }
          break;
          case ANYOF:
          + #ifdef MULTI_BYTE
          + {
          + int c;
          +
          + if ((c = *reginput) == '\0' )
          + return 0;
          + if (is_dbcs && IsLeadByte(c))
          + {
          + if (cstrchr(OPERAND(scan), c << 8 | reginput[1]) == NULL)
          + return 0;
          + reginput++;
          + }
          + else
          + if (cstrchr(OPERAND(scan), c) == NULL) return 0;
          + }
          + #else
          if (*reginput == '\0' || cstrchr(OPERAND(scan), *reginput) == NULL)
          return 0;
          + #endif
          reginput++;
          break;
          case ANYBUT:
          + #ifdef MULTI_BYTE
          + {
          + int c;
          +
          + if ((c = *reginput) == '\0' )
          + return 0;
          + if (is_dbcs && IsLeadByte(c))
          + {
          + if (cstrchr(OPERAND(scan), c << 8 | reginput[1]) != NULL)
          + return 0;
          + reginput++;
          + }
          + else
          + if (cstrchr(OPERAND(scan), c) != NULL) return 0;
          + }
          + #else
          if (*reginput == '\0' || cstrchr(OPERAND(scan), *reginput) != NULL)
          return 0;
          + #endif
          reginput++;
          break;
          case NOTHING:
          ***************
          *** 2316,2321 ****
          --- 2569,2586 ----
          case END:
          return 1; /* Success! */
          /* break; Not Reached */
          + #ifdef MULTI_BYTE
          + case MULTIBYTECODE:
          + {
          + char_u *opnd;
          +
          + opnd = OPERAND(scan);
          + if( *opnd != *reginput || *(opnd+1) != *(reginput+1) )
          + return 0;
          + reginput += 2;
          + }
          + break;
          + #endif
          default:
          emsg(e_re_corr);
          #ifdef DEBUG
          ***************
          *** 2479,2497 ****
          --- 2744,2820 ----
          break;
          }
          case ANYOF:
          + #ifdef MULTI_BYTE
          + while (1)
          + {
          + int c;
          +
          + if ((c = *scan) == '\0')
          + break;
          + if (is_dbcs && IsLeadByte(c)) /* if multibyte, extra forward */
          + {
          + if (cstrchr(opnd, c << 8 | *(scan + 1)) == NULL)
          + break;
          + count++;
          + scan++;
          + }
          + else
          + if (cstrchr(opnd, c) == NULL)
          + break;
          + count++;
          + scan++;
          + }
          + #else
          while (*scan != '\0' && cstrchr(opnd, *scan) != NULL)
          {
          count++;
          scan++;
          }
          + #endif
          break;
          case ANYBUT:
          + #ifdef MULTI_BYTE
          + while (1)
          + {
          + int c;
          +
          + if ((c = *scan) == '\0')
          + break;
          + if (is_dbcs && IsLeadByte(c)) /* if multibyte, extra forward */
          + {
          + if (cstrchr(opnd, c << 8 | *(scan + 1)) != NULL)
          + break;
          + count++;
          + scan++;
          + }
          + else
          + if (cstrchr(opnd, c) != NULL)
          + break;
          + count++;
          + scan++;
          + }
          + #else
          while (*scan != '\0' && cstrchr(opnd, *scan) == NULL)
          {
          count++;
          scan++;
          }
          + #endif
          break;
          + #ifdef MULTI_BYTE
          + case MULTIBYTECODE:
          + {
          + int cl, ct;
          + cl = opnd[0];
          + ct = opnd[1];
          + while( scan[0] == cl && scan[1] == ct )
          + {
          + count += 2;
          + scan++;
          + }
          + }
          + break;
          + #endif
          default: /* Oh dear. Called inappropriately. */
          emsg(e_re_corr);
          #ifdef DEBUG
          ***************
          *** 2769,2774 ****
          --- 3092,3102 ----
          sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
          p = NULL;
          break;
          + #ifdef MULTI_BYTE
          + case MULTIBYTECODE:
          + p = 'MULTIBYTE CODE';
          + break;
          + #endif
          default:
          sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
          p = NULL;
          ***************
          *** 2807,2812 ****
          --- 3135,3144 ----

          if (!reg_ic)
          return vim_strchr(s, c);
          + #ifdef MULTI_BYTE
          + if (re_ismultibytecode(c))
          + return vim_strchr(s, c);
          + #endif

          /* tolower() and toupper() can be slow, comparing twice should be a lot
          * faster (esp. when using MS Visual C++!) */
          ***************
          *** 2820,2825 ****
          --- 3152,3161 ----
          for (p = s; *p; ++p)
          if (*p == c || *p == cc)
          return p;
          + #ifdef MULTI_BYTE
          + else if (is_dbcs && IsLeadByte(*p))
          + if (!(*++p)) break;
          + #endif
          return NULL;
          }

          ***************
          *** 3046,3051 ****
          --- 3382,3395 ----
          }
          if (copy)
          {
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(c) && *src != NUL )
          + {
          + *dst++ = c;
          + *dst = *src++;
          + }
          + else
          + #endif
          if (func == (fptr)NULL) /* just copy */
          *dst = c;
          else /* change case */
          *** ./src.orig/misc2.c Sun Jan 09 09:50:54 2000
          --- ./src/misc2.c Sun Jan 16 14:02:04 2000
          ***************
          *** 1017,1028 ****
          --- 1017,1043 ----
          {
          char_u *p;
          int c;
          + #ifdef MULTI_BYTE
          + int n2 = -1;
          + if (n > 255)
          + {
          + n2 = n & 0xFF;
          + n = (n >> 8) & 0xFF;
          + }
          + #endif

          p = string;
          while ((c = *p) != NUL)
          {
          if (c == n)
          + #ifdef MULTI_BYTE
          + if (n2 == -1 || p[1] == n2)
          + #endif
          return p;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(c))
          + if (!(*++p)) break;
          + #endif
          ++p;
          }
          return NULL;
          ***************
          *** 1039,1044 ****
          --- 1054,1063 ----
          {
          if (*string == n)
          retval = string;
          + #ifdef MULTI_BYTE
          + if (is_dbcs && IsLeadByte(*string))
          + if (!(*++string)) break;
          + #endif
          ++string;
          }
          return retval;

          ----
          Taro Muraoka mailto:koron@...
        • Chong-Dae Park
          I've added a boundary check for Taro's work. Please test it. -- Chong-Dae Park -- Warning: Your signature is longer than 4 lines. Since signatures usually do not
          Message 4 of 4 , May 17 6:25 AM
            I've added a boundary check for Taro's work.

            Please test it.

            --
            Chong-Dae Park
            --
            Warning: Your signature is longer than 4 lines. Since signatures usually do
            not transport any useful information, they should be as short as
            possible. - tin's Warning message -
          Your message has been successfully submitted and will be delivered to recipients shortly.