Loading ...
Sorry, an error occurred while loading the content.

[vim-multibyte] Re: Multibyte regexp

Expand Messages
  • Bram Moolenaar
    ... I'm glad Taro looked into this and made a patch. One question, like with the other ones: Doesn't is_dbcs need to be used here? -- hundred-and-one symptoms
    Message 1 of 4 , Jan 15, 2000
    View Source
    • 0 Attachment
      Taro Muraoka wrote:

      > -Regular expression search and substitute. (16K)
      > Please test.

      I'm glad Taro looked into this and made a patch.

      One question, like with the other ones: Doesn't is_dbcs need to be used here?

      --
      hundred-and-one symptoms of being an internet addict:
      189. You put your e-mail address in the upper left-hand corner of envelopes.

      --/-/---- Bram Moolenaar ---- Bram@... ---- Bram@... ---\-\--
      \ \ www.vim.org/iccf www.moolenaar.net www.vim.org / /
    • Taro Muraoka
      ... Yes, it does. Sorry, I forgot about 8-bit characters. It is almost always necessary to check is_dbcs before calling IsLeadByte(). I have amended the patch. ... Taro Muraoka
      Message 2 of 4 , Jan 15, 2000
      View Source
      • 0 Attachment
        Bram Moolenaar wrote:
        > One question, like with the other ones: Doesn't is_dbcs need to be used here?

        Yes, it does. Sorry, I forgot about 8-bit characters. It is almost always
        necessary to check is_dbcs before calling IsLeadByte(). I have amended the patch.
        ----
        Taro Muraoka koron@...


        Problem: Cannot search for or substitute patterns containing multibyte characters.
        Solution: Add a new regexp node for multibyte characters, and make related fixes.
        Files: src/regexp.c src/misc2.c


        *** ./src.orig/regexp.c Sat May 15 22:48:52 1999
        --- ./src/regexp.c Sun Jan 16 14:33:24 2000
        ***************
        *** 195,200 ****
        --- 195,204 ----
        #define BACKREF 80 /* -89 node Match same string again \1-\9 */
        #define BRACE_COMPLEX 90 /* -99 node Match nodes between m & n times */

        + #ifdef MULTI_BYTE
        + #define MULTIBYTECODE 200 /* str Match multibyte code */
        + #endif
        +
        #define Magic(x) ((x) | ('\\' << 8))

        /*
        ***************
        *** 416,421 ****
        --- 420,433 ----
        {
        ++p;
        if (*p != ']' && *p != NUL)
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*p))
        + {
        + if (*++p == NUL) break;
        + ++p;
        + }
        + else
        + #endif
        ++p;
        }
        else if (*p == '\\'
        ***************
        *** 428,433 ****
        --- 440,452 ----
        ++p; /* It was not a class name */
        }
        else
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*p))
        + {
        + if (*++p == NUL) break;
        + }
        + else
        + #endif
        ++p;
        }

        ***************
        *** 491,496 ****
        --- 510,518 ----
        * Global work variables for vim_regcomp().
        */

        + #ifdef MULTI_BYTE
        + static int skip_multi; /* previous skip was multibyte */
        + #endif
        static char_u *regparse; /* Input-scan pointer. */
        static int num_complex_braces; /* Complex \{...} count */
        static int regnpar; /* () count. */
        ***************
        *** 534,539 ****
        --- 556,575 ----
        static int read_limits __ARGS((int, int, int *, int *));
        static void regtail __ARGS((char_u *, char_u *));
        static void regoptail __ARGS((char_u *, char_u *));
        + #ifdef MULTI_BYTE
        + static int re_ismultibytecode __ARGS((int));
        +
        + /*
        + * Is chr multi-byte? If no then return 0 else return leadbyte
        + */
        + int
        + re_ismultibytecode(c)
        + int c;
        + {
        + int lead = ( c >> 8 ) & 0xFF;
        + return (is_dbcs && IsLeadByte(lead)) ? lead : 0;
        + }
        + #endif

        /*
        * Skip past regular expression.
        ***************
        *** 559,564 ****
        --- 595,607 ----
        }
        else if (p[0] == '\\' && p[1] != NUL)
        ++p; /* skip next character */
        + #ifdef MULTI_BYTE
        + else if (is_dbcs && IsLeadByte(*p))
        + {
        + if (*++p == NUL)
        + break;
        + }
        + #endif
        }
        return p;
        }
        ***************
        *** 1148,1154 ****
        --- 1191,1201 ----
        case Magic('['):
        {
        char_u *p;
        + #ifdef MULTI_BYTE
        + char_u *base;

        + base = regparse;
        + #endif
        /*
        * If there is no matching ']', we assume the '[' is a normal
        * character. This makes ":help [" work.
        ***************
        *** 1175,1180 ****
        --- 1222,1264 ----
        regparse++;
        if (*regparse == ']' || *regparse == '\0')
        regc('-');
        + #ifdef MULTI_BYTE
        + else if (is_dbcs)
        + {
        +
        + int lead, start, end;
        +
        + start = UCHARAT(regparse - 2) + 1;
        + end = UCHARAT(regparse);
        + if (IsTrailByte(base, regparse - 2))
        + {
        + if (!IsLeadByte(end))
        + EMSG_RET_NULL(e_invrange);
        + lead = UCHARAT(regparse - 3);
        + if (lead != end)
        + EMSG_RET_NULL(e_invrange);
        + end = UCHARAT(regparse + 1);
        + if (start > end +1)
        + EMSG_RET_NULL(e_invrange);
        + for (; start <= end; start++)
        + {
        + regc(lead);
        + regc(start);
        + }
        + regparse += 2;
        + }
        + else
        + {
        + if (IsLeadByte(end))
        + EMSG_RET_NULL(e_invrange);
        + if (start > end +1)
        + EMSG_RET_NULL(e_invrange);
        + for (; start <= end; start++)
        + regc(start);
        + regparse++;
        + }
        + }
        + #endif
        else
        {
        int cclass;
        ***************
        *** 1216,1222 ****
        --- 1300,1315 ----
        regc(cu);
        }
        else
        + #ifndef MULTI_BYTE
        regc(*regparse++);
        + #else
        + {
        + int c;
        + regc(c = *regparse++);
        + if (is_dbcs && IsLeadByte(c))
        + regc(*regparse++);
        + }
        + #endif
        }
        regc('\0');
        if (*regparse != ']')
        ***************
        *** 1234,1239 ****
        --- 1327,1344 ----
        int chr;

        ungetchr();
        + #ifdef MULTI_BYTE
        + chr = re_ismultibytecode(peekchr());
        + if (chr)
        + {
        + ret = regnode(MULTIBYTECODE);
        + regc(chr);
        + regc(PeekChr() & 0xFF);
        + skipchr();
        + *flagp |= HASWIDTH;
        + break;
        + }
        + #endif
        len = 0;
        ret = regnode(EXACTLY);
        /*
        ***************
        *** 1542,1547 ****
        --- 1647,1657 ----
        curchr = regparse[0];
        }
        break;
        + #ifdef MULTI_BYTE
        + default:
        + if (is_dbcs && IsLeadByte(curchr))
        + curchr = curchr << 8 | regparse[1];
        + #endif
        }
        }

        ***************
        *** 1551,1556 ****
        --- 1661,1675 ----
        static void
        skipchr()
        {
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*regparse))
        + {
        + skip_multi = 1;
        + regparse++;
        + }
        + else
        + skip_multi = 0;
        + #endif
        regparse++;
        prev_at_start = at_start;
        at_start = FALSE;
        ***************
        *** 1584,1589 ****
        --- 1703,1715 ----
        * Backup regparse as well; not because we will use what it points at,
        * but because skipchr() will bump it again.
        */
        + #ifdef MULTI_BYTE
        + if (skip_multi)
        + {
        + regparse--;
        + skip_multi = 0;
        + }
        + #endif
        regparse--;
        }

        ***************
        *** 1702,1707 ****
        --- 1828,1837 ----
        {
        if (cstrncmp(s, prog->regmust, prog->regmlen) == 0)
        break; /* Found it. */
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*s))
        + if (!(*++s)) break;
        + #endif
        s++;
        }
        if (s == NULL) /* Not present. */
        ***************
        *** 1730,1735 ****
        --- 1860,1869 ----
        {
        if (regtry(prog, s))
        return 1;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*s))
        + if (!(*++s)) break;
        + #endif
        s++;
        }
        else
        ***************
        *** 1738,1743 ****
        --- 1872,1881 ----
        {
        if (regtry(prog, s))
        return 1;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*s))
        + if (!(*++s)) break;
        + #endif
        } while (*s++ != '\0');

        /* Failure. */
        ***************
        *** 1855,1990 ****
        --- 1993,2209 ----
        case ANY:
        if (*reginput == '\0')
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case IDENT:
        if (!vim_isIDc(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case KWORD:
        if (!vim_iswordc(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case FNAME:
        if (!vim_isfilec(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case PRINT:
        if (charsize(*reginput) != 1)
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case SIDENT:
        if (isdigit(*reginput) || !vim_isIDc(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case SWORD:
        if (isdigit(*reginput) || !vim_iswordc(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case SFNAME:
        if (isdigit(*reginput) || !vim_isfilec(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case SPRINT:
        if (isdigit(*reginput) || charsize(*reginput) != 1)
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case WHITE:
        if (!vim_iswhite(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case NWHITE:
        if (*reginput == NUL || vim_iswhite(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case DIGIT:
        if (!ri_digit(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case NDIGIT:
        if (*reginput == NUL || ri_digit(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case HEX:
        if (!ri_hex(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case NHEX:
        if (*reginput == NUL || ri_hex(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case OCTAL:
        if (!ri_octal(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case NOCTAL:
        if (*reginput == NUL || ri_octal(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case WORD:
        if (!ri_word(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case NWORD:
        if (*reginput == NUL || ri_word(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case HEAD:
        if (!ri_head(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case NHEAD:
        if (*reginput == NUL || ri_head(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case ALPHA:
        if (!ri_alpha(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case NALPHA:
        if (*reginput == NUL || ri_alpha(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case LOWER:
        if (!ri_lower(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case NLOWER:
        if (*reginput == NUL || ri_lower(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case UPPER:
        if (!ri_upper(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case NUPPER:
        if (*reginput == NUL || ri_upper(*reginput))
        return 0;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*reginput)) reginput++;
        + #endif
        reginput++;
        break;
        case EXACTLY:
        ***************
        *** 2004,2016 ****
        --- 2223,2269 ----
        }
        break;
        case ANYOF:
        + #ifdef MULTI_BYTE
        + {
        + int c;
        +
        + if ((c = *reginput) == '\0' )
        + return 0;
        + if (is_dbcs && IsLeadByte(c))
        + {
        + if (cstrchr(OPERAND(scan), c << 8 | reginput[1]) == NULL)
        + return 0;
        + reginput++;
        + }
        + else
        + if (cstrchr(OPERAND(scan), c) == NULL) return 0;
        + }
        + #else
        if (*reginput == '\0' || cstrchr(OPERAND(scan), *reginput) == NULL)
        return 0;
        + #endif
        reginput++;
        break;
        case ANYBUT:
        + #ifdef MULTI_BYTE
        + {
        + int c;
        +
        + if ((c = *reginput) == '\0' )
        + return 0;
        + if (is_dbcs && IsLeadByte(c))
        + {
        + if (cstrchr(OPERAND(scan), c << 8 | reginput[1]) != NULL)
        + return 0;
        + reginput++;
        + }
        + else
        + if (cstrchr(OPERAND(scan), c) != NULL) return 0;
        + }
        + #else
        if (*reginput == '\0' || cstrchr(OPERAND(scan), *reginput) != NULL)
        return 0;
        + #endif
        reginput++;
        break;
        case NOTHING:
        ***************
        *** 2316,2321 ****
        --- 2569,2586 ----
        case END:
        return 1; /* Success! */
        /* break; Not Reached */
        + #ifdef MULTI_BYTE
        + case MULTIBYTECODE:
        + {
        + char_u *opnd;
        +
        + opnd = OPERAND(scan);
        + if( *opnd != *reginput || *(opnd+1) != *(reginput+1) )
        + return 0;
        + reginput += 2;
        + }
        + break;
        + #endif
        default:
        emsg(e_re_corr);
        #ifdef DEBUG
        ***************
        *** 2479,2497 ****
        --- 2744,2820 ----
        break;
        }
        case ANYOF:
        + #ifdef MULTI_BYTE
        + while (1)
        + {
        + int c;
        +
        + if ((c = *scan) == '\0')
        + break;
        + if (is_dbcs && IsLeadByte(c)) /* if multibyte, extra forward */
        + {
        + if (cstrchr(opnd, c << 8 | *(scan + 1)) == NULL)
        + break;
        + count++;
        + scan++;
        + }
        + else
        + if (cstrchr(opnd, c) == NULL)
        + break;
        + count++;
        + scan++;
        + }
        + #else
        while (*scan != '\0' && cstrchr(opnd, *scan) != NULL)
        {
        count++;
        scan++;
        }
        + #endif
        break;
        case ANYBUT:
        + #ifdef MULTI_BYTE
        + while (1)
        + {
        + int c;
        +
        + if ((c = *scan) == '\0')
        + break;
        + if (is_dbcs && IsLeadByte(c)) /* if multibyte, extra forward */
        + {
        + if (cstrchr(opnd, c << 8 | *(scan + 1)) != NULL)
        + break;
        + count++;
        + scan++;
        + }
        + else
        + if (cstrchr(opnd, c) != NULL)
        + break;
        + count++;
        + scan++;
        + }
        + #else
        while (*scan != '\0' && cstrchr(opnd, *scan) == NULL)
        {
        count++;
        scan++;
        }
        + #endif
        break;
        + #ifdef MULTI_BYTE
        + case MULTIBYTECODE:
        + {
        + int cl, ct;
        + cl = opnd[0];
        + ct = opnd[1];
        + while( scan[0] == cl && scan[1] == ct )
        + {
        + count += 2;
        + scan++;
        + }
        + }
        + break;
        + #endif
        default: /* Oh dear. Called inappropriately. */
        emsg(e_re_corr);
        #ifdef DEBUG
        ***************
        *** 2769,2774 ****
        --- 3092,3102 ----
        sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
        p = NULL;
        break;
        + #ifdef MULTI_BYTE
        + case MULTIBYTECODE:
        + p = 'MULTIBYTE CODE';
        + break;
        + #endif
        default:
        sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
        p = NULL;
        ***************
        *** 2807,2812 ****
        --- 3135,3144 ----

        if (!reg_ic)
        return vim_strchr(s, c);
        + #ifdef MULTI_BYTE
        + if (re_ismultibytecode(c))
        + return vim_strchr(s, c);
        + #endif

        /* tolower() and toupper() can be slow, comparing twice should be a lot
        * faster (esp. when using MS Visual C++!) */
        ***************
        *** 2820,2825 ****
        --- 3152,3161 ----
        for (p = s; *p; ++p)
        if (*p == c || *p == cc)
        return p;
        + #ifdef MULTI_BYTE
        + else if (is_dbcs && IsLeadByte(*p))
        + if (!(*++p)) break;
        + #endif
        return NULL;
        }

        ***************
        *** 3046,3051 ****
        --- 3382,3395 ----
        }
        if (copy)
        {
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(c) && *src != NUL )
        + {
        + *dst++ = c;
        + *dst = *src++;
        + }
        + else
        + #endif
        if (func == (fptr)NULL) /* just copy */
        *dst = c;
        else /* change case */
        *** ./src.orig/misc2.c Sun Jan 09 09:50:54 2000
        --- ./src/misc2.c Sun Jan 16 14:02:04 2000
        ***************
        *** 1017,1028 ****
        --- 1017,1043 ----
        {
        char_u *p;
        int c;
        + #ifdef MULTI_BYTE
        + int n2 = -1;
        + if (n > 255)
        + {
        + n2 = n & 0xFF;
        + n = (n >> 8) & 0xFF;
        + }
        + #endif

        p = string;
        while ((c = *p) != NUL)
        {
        if (c == n)
        + #ifdef MULTI_BYTE
        + if (n2 == -1 || p[1] == n2)
        + #endif
        return p;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(c))
        + if (!(*++p)) break;
        + #endif
        ++p;
        }
        return NULL;
        ***************
        *** 1039,1044 ****
        --- 1054,1063 ----
        {
        if (*string == n)
        retval = string;
        + #ifdef MULTI_BYTE
        + if (is_dbcs && IsLeadByte(*string))
        + if (!(*++string)) break;
        + #endif
        ++string;
        }
        return retval;

        ----
        Taro Muraoka mailto:koron@...
      • Chong-Dae Park
        I've added a boundary check for Taro's work. Please test it. -- Chong-Dae Park -- Warning: Your signature is longer than 4 lines. Since signatures usually do not
        Message 3 of 4 , May 17, 2000
        View Source
        • 0 Attachment
          I've added a boundary check for Taro's work.

          Please test it.

          --
          Chong-Dae Park
          --
          Warning: Your signature is longer than 4 lines. Since signatures usually do
          not transport any useful information, they should be as short as
          possible. - tin's Warning message -
        Your message has been successfully submitted and would be delivered to recipients shortly.