Loading ...
Sorry, an error occurred while loading the content.
 

[vim-multibyte] Re: Multibyte regexp

Expand Messages
  • Taro Muraoka
    ... Yes it needs. Sorry I forgot about 8-bit character. It is almost always needed checking is_dbcs before IsLeadByte(). And I amended patch. ... Taro Muraoka
    Message 1 of 4 , Jan 15, 2000
      Bram Moolenaar wrote:
      > One question, like with the other ones: Doesn't is_dbcs need to be used here?

      Yes, it does. Sorry, I forgot about 8-bit characters. Checking is_dbcs before
      calling IsLeadByte() is almost always needed. I have amended the patch.
      ----
      Taro Muraoka koron@...


      Problem: Cannot search for or substitute multibyte patterns.
      Solution: Add a new node for multibyte characters, among other changes.
      Files: src/regexp.c src/misc2.c


      *** ./src.orig/regexp.c Sat May 15 22:48:52 1999
      --- ./src/regexp.c Sun Jan 16 14:33:24 2000
      ***************
      *** 195,200 ****
      --- 195,204 ----
      #define BACKREF 80 /* -89 node Match same string again \1-\9 */
      #define BRACE_COMPLEX 90 /* -99 node Match nodes between m & n times */

      + #ifdef MULTI_BYTE
      + #define MULTIBYTECODE 200 /* str Match multibyte code */
      + #endif
      +
      #define Magic(x) ((x) | ('\\' << 8))

      /*
      ***************
      *** 416,421 ****
      --- 420,433 ----
      {
      ++p;
      if (*p != ']' && *p != NUL)
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*p))
      + {
      + if (*++p == NUL) break;
      + ++p;
      + }
      + else
      + #endif
      ++p;
      }
      else if (*p == '\\'
      ***************
      *** 428,433 ****
      --- 440,452 ----
      ++p; /* It was not a class name */
      }
      else
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*p))
      + {
      + if (*++p == NUL) break;
      + }
      + else
      + #endif
      ++p;
      }

      ***************
      *** 491,496 ****
      --- 510,518 ----
      * Global work variables for vim_regcomp().
      */

      + #ifdef MULTI_BYTE
      + static int skip_multi; /* previous skip was multibyte */
      + #endif
      static char_u *regparse; /* Input-scan pointer. */
      static int num_complex_braces; /* Complex \{...} count */
      static int regnpar; /* () count. */
      ***************
      *** 534,539 ****
      --- 556,575 ----
      static int read_limits __ARGS((int, int, int *, int *));
      static void regtail __ARGS((char_u *, char_u *));
      static void regoptail __ARGS((char_u *, char_u *));
      + #ifdef MULTI_BYTE
      + static int re_ismultibytecode __ARGS((int));
      +
      + /*
      + * Is chr multi-byte? If no then return 0 else return leadbyte
      + */
      + int
      + re_ismultibytecode(c)
      + int c;
      + {
      + int lead = ( c >> 8 ) & 0xFF;
      + return (is_dbcs && IsLeadByte(lead)) ? lead : 0;
      + }
      + #endif

      /*
      * Skip past regular expression.
      ***************
      *** 559,564 ****
      --- 595,607 ----
      }
      else if (p[0] == '\\' && p[1] != NUL)
      ++p; /* skip next character */
      + #ifdef MULTI_BYTE
      + else if (is_dbcs && IsLeadByte(*p))
      + {
      + if (*++p == NUL)
      + break;
      + }
      + #endif
      }
      return p;
      }
      ***************
      *** 1148,1154 ****
      --- 1191,1201 ----
      case Magic('['):
      {
      char_u *p;
      + #ifdef MULTI_BYTE
      + char_u *base;

      + base = regparse;
      + #endif
      /*
      * If there is no matching ']', we assume the '[' is a normal
      * character. This makes ":help [" work.
      ***************
      *** 1175,1180 ****
      --- 1222,1264 ----
      regparse++;
      if (*regparse == ']' || *regparse == '\0')
      regc('-');
      + #ifdef MULTI_BYTE
      + else if (is_dbcs)
      + {
      +
      + int lead, start, end;
      +
      + start = UCHARAT(regparse - 2) + 1;
      + end = UCHARAT(regparse);
      + if (IsTrailByte(base, regparse - 2))
      + {
      + if (!IsLeadByte(end))
      + EMSG_RET_NULL(e_invrange);
      + lead = UCHARAT(regparse - 3);
      + if (lead != end)
      + EMSG_RET_NULL(e_invrange);
      + end = UCHARAT(regparse + 1);
      + if (start > end +1)
      + EMSG_RET_NULL(e_invrange);
      + for (; start <= end; start++)
      + {
      + regc(lead);
      + regc(start);
      + }
      + regparse += 2;
      + }
      + else
      + {
      + if (IsLeadByte(end))
      + EMSG_RET_NULL(e_invrange);
      + if (start > end +1)
      + EMSG_RET_NULL(e_invrange);
      + for (; start <= end; start++)
      + regc(start);
      + regparse++;
      + }
      + }
      + #endif
      else
      {
      int cclass;
      ***************
      *** 1216,1222 ****
      --- 1300,1315 ----
      regc(cu);
      }
      else
      + #ifndef MULTI_BYTE
      regc(*regparse++);
      + #else
      + {
      + int c;
      + regc(c = *regparse++);
      + if (is_dbcs && IsLeadByte(c))
      + regc(*regparse++);
      + }
      + #endif
      }
      regc('\0');
      if (*regparse != ']')
      ***************
      *** 1234,1239 ****
      --- 1327,1344 ----
      int chr;

      ungetchr();
      + #ifdef MULTI_BYTE
      + chr = re_ismultibytecode(peekchr());
      + if (chr)
      + {
      + ret = regnode(MULTIBYTECODE);
      + regc(chr);
      + regc(PeekChr() & 0xFF);
      + skipchr();
      + *flagp |= HASWIDTH;
      + break;
      + }
      + #endif
      len = 0;
      ret = regnode(EXACTLY);
      /*
      ***************
      *** 1542,1547 ****
      --- 1647,1657 ----
      curchr = regparse[0];
      }
      break;
      + #ifdef MULTI_BYTE
      + default:
      + if (is_dbcs && IsLeadByte(curchr))
      + curchr = curchr << 8 | regparse[1];
      + #endif
      }
      }

      ***************
      *** 1551,1556 ****
      --- 1661,1675 ----
      static void
      skipchr()
      {
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*regparse))
      + {
      + skip_multi = 1;
      + regparse++;
      + }
      + else
      + skip_multi = 0;
      + #endif
      regparse++;
      prev_at_start = at_start;
      at_start = FALSE;
      ***************
      *** 1584,1589 ****
      --- 1703,1715 ----
      * Backup regparse as well; not because we will use what it points at,
      * but because skipchr() will bump it again.
      */
      + #ifdef MULTI_BYTE
      + if (skip_multi)
      + {
      + regparse--;
      + skip_multi = 0;
      + }
      + #endif
      regparse--;
      }

      ***************
      *** 1702,1707 ****
      --- 1828,1837 ----
      {
      if (cstrncmp(s, prog->regmust, prog->regmlen) == 0)
      break; /* Found it. */
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*s))
      + if (!(*++s)) break;
      + #endif
      s++;
      }
      if (s == NULL) /* Not present. */
      ***************
      *** 1730,1735 ****
      --- 1860,1869 ----
      {
      if (regtry(prog, s))
      return 1;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*s))
      + if (!(*++s)) break;
      + #endif
      s++;
      }
      else
      ***************
      *** 1738,1743 ****
      --- 1872,1881 ----
      {
      if (regtry(prog, s))
      return 1;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*s))
      + if (!(*++s)) break;
      + #endif
      } while (*s++ != '\0');

      /* Failure. */
      ***************
      *** 1855,1990 ****
      --- 1993,2209 ----
      case ANY:
      if (*reginput == '\0')
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case IDENT:
      if (!vim_isIDc(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case KWORD:
      if (!vim_iswordc(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case FNAME:
      if (!vim_isfilec(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case PRINT:
      if (charsize(*reginput) != 1)
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case SIDENT:
      if (isdigit(*reginput) || !vim_isIDc(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case SWORD:
      if (isdigit(*reginput) || !vim_iswordc(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case SFNAME:
      if (isdigit(*reginput) || !vim_isfilec(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case SPRINT:
      if (isdigit(*reginput) || charsize(*reginput) != 1)
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case WHITE:
      if (!vim_iswhite(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case NWHITE:
      if (*reginput == NUL || vim_iswhite(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case DIGIT:
      if (!ri_digit(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case NDIGIT:
      if (*reginput == NUL || ri_digit(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case HEX:
      if (!ri_hex(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case NHEX:
      if (*reginput == NUL || ri_hex(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case OCTAL:
      if (!ri_octal(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case NOCTAL:
      if (*reginput == NUL || ri_octal(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case WORD:
      if (!ri_word(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case NWORD:
      if (*reginput == NUL || ri_word(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case HEAD:
      if (!ri_head(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case NHEAD:
      if (*reginput == NUL || ri_head(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case ALPHA:
      if (!ri_alpha(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case NALPHA:
      if (*reginput == NUL || ri_alpha(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case LOWER:
      if (!ri_lower(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case NLOWER:
      if (*reginput == NUL || ri_lower(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case UPPER:
      if (!ri_upper(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case NUPPER:
      if (*reginput == NUL || ri_upper(*reginput))
      return 0;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
      + #endif
      reginput++;
      break;
      case EXACTLY:
      ***************
      *** 2004,2016 ****
      --- 2223,2269 ----
      }
      break;
      case ANYOF:
      + #ifdef MULTI_BYTE
      + {
      + int c;
      +
      + if ((c = *reginput) == '\0' )
      + return 0;
      + if (is_dbcs && IsLeadByte(c))
      + {
      + if (cstrchr(OPERAND(scan), c << 8 | reginput[1]) == NULL)
      + return 0;
      + reginput++;
      + }
      + else
      + if (cstrchr(OPERAND(scan), c) == NULL) return 0;
      + }
      + #else
      if (*reginput == '\0' || cstrchr(OPERAND(scan), *reginput) == NULL)
      return 0;
      + #endif
      reginput++;
      break;
      case ANYBUT:
      + #ifdef MULTI_BYTE
      + {
      + int c;
      +
      + if ((c = *reginput) == '\0' )
      + return 0;
      + if (is_dbcs && IsLeadByte(c))
      + {
      + if (cstrchr(OPERAND(scan), c << 8 | reginput[1]) != NULL)
      + return 0;
      + reginput++;
      + }
      + else
      + if (cstrchr(OPERAND(scan), c) != NULL) return 0;
      + }
      + #else
      if (*reginput == '\0' || cstrchr(OPERAND(scan), *reginput) != NULL)
      return 0;
      + #endif
      reginput++;
      break;
      case NOTHING:
      ***************
      *** 2316,2321 ****
      --- 2569,2586 ----
      case END:
      return 1; /* Success! */
      /* break; Not Reached */
      + #ifdef MULTI_BYTE
      + case MULTIBYTECODE:
      + {
      + char_u *opnd;
      +
      + opnd = OPERAND(scan);
      + if( *opnd != *reginput || *(opnd+1) != *(reginput+1) )
      + return 0;
      + reginput += 2;
      + }
      + break;
      + #endif
      default:
      emsg(e_re_corr);
      #ifdef DEBUG
      ***************
      *** 2479,2497 ****
      --- 2744,2820 ----
      break;
      }
      case ANYOF:
      + #ifdef MULTI_BYTE
      + while (1)
      + {
      + int c;
      +
      + if ((c = *scan) == '\0')
      + break;
      + if (is_dbcs && IsLeadByte(c)) /* if multibyte, extra forward */
      + {
      + if (cstrchr(opnd, c << 8 | *(scan + 1)) == NULL)
      + break;
      + count++;
      + scan++;
      + }
      + else
      + if (cstrchr(opnd, c) == NULL)
      + break;
      + count++;
      + scan++;
      + }
      + #else
      while (*scan != '\0' && cstrchr(opnd, *scan) != NULL)
      {
      count++;
      scan++;
      }
      + #endif
      break;
      case ANYBUT:
      + #ifdef MULTI_BYTE
      + while (1)
      + {
      + int c;
      +
      + if ((c = *scan) == '\0')
      + break;
      + if (is_dbcs && IsLeadByte(c)) /* if multibyte, extra forward */
      + {
      + if (cstrchr(opnd, c << 8 | *(scan + 1)) != NULL)
      + break;
      + count++;
      + scan++;
      + }
      + else
      + if (cstrchr(opnd, c) != NULL)
      + break;
      + count++;
      + scan++;
      + }
      + #else
      while (*scan != '\0' && cstrchr(opnd, *scan) == NULL)
      {
      count++;
      scan++;
      }
      + #endif
      break;
      + #ifdef MULTI_BYTE
      + case MULTIBYTECODE:
      + {
      + int cl, ct;
      + cl = opnd[0];
      + ct = opnd[1];
      + while( scan[0] == cl && scan[1] == ct )
      + {
      + count += 2;
      + scan++;
      + }
      + }
      + break;
      + #endif
      default: /* Oh dear. Called inappropriately. */
      emsg(e_re_corr);
      #ifdef DEBUG
      ***************
      *** 2769,2774 ****
      --- 3092,3102 ----
      sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
      p = NULL;
      break;
      + #ifdef MULTI_BYTE
      + case MULTIBYTECODE:
      + p = 'MULTIBYTE CODE';
      + break;
      + #endif
      default:
      sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
      p = NULL;
      ***************
      *** 2807,2812 ****
      --- 3135,3144 ----

      if (!reg_ic)
      return vim_strchr(s, c);
      + #ifdef MULTI_BYTE
      + if (re_ismultibytecode(c))
      + return vim_strchr(s, c);
      + #endif

      /* tolower() and toupper() can be slow, comparing twice should be a lot
      * faster (esp. when using MS Visual C++!) */
      ***************
      *** 2820,2825 ****
      --- 3152,3161 ----
      for (p = s; *p; ++p)
      if (*p == c || *p == cc)
      return p;
      + #ifdef MULTI_BYTE
      + else if (is_dbcs && IsLeadByte(*p))
      + if (!(*++p)) break;
      + #endif
      return NULL;
      }

      ***************
      *** 3046,3051 ****
      --- 3382,3395 ----
      }
      if (copy)
      {
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(c) && *src != NUL )
      + {
      + *dst++ = c;
      + *dst = *src++;
      + }
      + else
      + #endif
      if (func == (fptr)NULL) /* just copy */
      *dst = c;
      else /* change case */
      *** ./src.orig/misc2.c Sun Jan 09 09:50:54 2000
      --- ./src/misc2.c Sun Jan 16 14:02:04 2000
      ***************
      *** 1017,1028 ****
      --- 1017,1043 ----
      {
      char_u *p;
      int c;
      + #ifdef MULTI_BYTE
      + int n2 = -1;
      + if (n > 255)
      + {
      + n2 = n & 0xFF;
      + n = (n >> 8) & 0xFF;
      + }
      + #endif

      p = string;
      while ((c = *p) != NUL)
      {
      if (c == n)
      + #ifdef MULTI_BYTE
      + if (n2 == -1 || p[1] == n2)
      + #endif
      return p;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(c))
      + if (!(*++p)) break;
      + #endif
      ++p;
      }
      return NULL;
      ***************
      *** 1039,1044 ****
      --- 1054,1063 ----
      {
      if (*string == n)
      retval = string;
      + #ifdef MULTI_BYTE
      + if (is_dbcs && IsLeadByte(*string))
      + if (!(*++string)) break;
      + #endif
      ++string;
      }
      return retval;

      ----
      Taro Muraoka mailto:koron@...
    • Chong-Dae Park
      I've added a boundary check for Taro's work. Please test it. -- Chong-Dae Park -- Warning: Your signature is longer than 4 lines. Since signatures usually do not
      Message 2 of 4 , May 17, 2000
        I've added a boundary check for Taro's work.

        Please test it.

        --
        Chong-Dae Park
        --
        Warning: Your signature is longer than 4 lines. Since signatures usually do
        not transport any useful information, they should be as short as
        possible. - tin's Warning message -
      Your message has been successfully submitted and would be delivered to recipients shortly.