Loading ...
Sorry, an error occurred while loading the content.

patch: Enabling utf-8 hangul input.

Expand Messages
  • Shawn Y.H. Kim
    In response to the following comment made by Bram on Aug 2, 2007: (can be viewed at
    Message 1 of 9 , May 1, 2011
    • 0 Attachment
      In response to the following comment made by Bram on Aug 2, 2007:
      (can be viewed at http://groups.google.com/group/vim_dev/browse_thread/thread/3b73a504c77ba803/)

      > I hesitate removing the Hangul support without knowing for sure that it
      > is not needed. Browsing through the messages I do see remarks that it
      > might still be useful to a few people.
      >
      > Perhaps the Hangul support can be changed to also work for UTF-8?

      I made (finally) a patch that enables hangul-input module to work for
      UTF-8.

      First, hg head:
      orchistro.ubuntu:~/work/vim-hangulin/src$ hg head
      changeset: 2790:08c36bef2004
      tag: tip
      user: Bram Moolenaar <bram@...>
      date: Thu Apr 28 19:05:05 2011 +0200
      summary: Added tag v7-3-170 for changeset 64c3402df964

      changeset: 2572:ee53a39d5896
      branch: vim73
      user: Bram Moolenaar <bram@...>
      date: Sun Aug 15 15:24:20 2010 +0200
      summary: Last changes for the 7.3 release!

      Secondly, hg st:
      orchistro.ubuntu:~/work/vim-hangulin/src$ hg st
      M src/getchar.c
      M src/gui.c
      M src/hangulin.c
      M src/screen.c

      Finally, hg diff:
      ... It is too long. But I cannot find a way to attach a file, so, here
      goes the diff:

      orchistro.ubuntu:~/work/vim-hangulin/src$ hg diff
      diff -r 08c36bef2004 src/getchar.c
      --- a/src/getchar.c Thu Apr 28 19:05:05 2011 +0200
      +++ b/src/getchar.c Sun May 01 20:06:42 2011 +0900
      @@ -1722,8 +1722,23 @@
      buf[i] = vgetorpeek(TRUE);
      if (buf[i] == K_SPECIAL
      #ifdef FEAT_GUI
      +#ifdef FEAT_HANGULIN
      + /* Since hangul's utf-8 code has many 0x9b instances.
      + * Any hangul unicodes of U+XX1B, U+X6CX, U+X6DX, U+X6EX, U+X6FX
      + * has 0x9B when encoded by utf-8 encoding scheme.
      + * To correctly input those characters with
      + * utf-8 hangul-input module in gvim,
      + * Checking CSI is disabled when hangulinput module is featured
      in.
      + *
      + * But I(orchistro _at_ gmail.com) am not sure if it is right
      + * choice to skip checking CSI,
      + * because I have no idea what CSI does exactly.
      + */
      + || (buf[i] == CSI && !gui.in_use)
      +#else
      || buf[i] == CSI
      #endif
      +#endif
      )
      {
      /* Must be a K_SPECIAL - KS_SPECIAL - KE_FILLER sequence,
      diff -r 08c36bef2004 src/gui.c
      --- a/src/gui.c Thu Apr 28 19:05:05 2011 +0200
      +++ b/src/gui.c Sun May 01 20:06:42 2011 +0900
      @@ -39,6 +39,8 @@

      static int can_update_cursor = TRUE; /* can display the cursor */

      +extern size_t hangul_width[];
      +
      /*
      * The Athena scrollbars can move the thumb to after the end of the
      scrollbar,
      * this makes the thumb indicate the part of the text that is shown.
      Motif
      @@ -1064,7 +1066,7 @@
      gui.highlight_mask = (cattr | attr);
      #ifdef FEAT_HANGULIN
      if (composing_hangul)
      - (void)gui_outstr_nowrap(composing_hangul_buffer, 2,
      + (void)gui_outstr_nowrap(composing_hangul_buffer,
      hangul_width[enc_utf8],
      GUI_MON_IS_CURSOR | GUI_MON_NOCLEAR, cfg, cbg, 0);
      else
      #endif
      @@ -2384,7 +2386,7 @@
      #ifdef FEAT_HANGULIN
      if (composing_hangul
      && gui.col == gui.cursor_col && gui.row == gui.cursor_row)
      - (void)gui_outstr_nowrap(composing_hangul_buffer, 2,
      + (void)gui_outstr_nowrap(composing_hangul_buffer,
      hangul_width[enc_utf8],
      GUI_MON_IS_CURSOR | GUI_MON_NOCLEAR,
      gui.norm_pixel, gui.back_pixel, 0);
      else
      diff -r 08c36bef2004 src/hangulin.c
      --- a/src/hangulin.c Thu Apr 28 19:05:05 2011 +0200
      +++ b/src/hangulin.c Sun May 01 20:06:42 2011 +0900
      @@ -39,6 +39,8 @@
      static int convert_3_to_ks __ARGS((int fv, int mv, int lv, char_u
      *des));
      static int hangul_automata2 __ARGS((char_u *buf, unsigned int *c));
      static int hangul_automata3 __ARGS((char_u *buf, unsigned int *c));
      +size_t hangul_width[] = {2, /* ksc 5601, 2 bytes */
      + 3 /* utf-8, 3 bytes. when enc_utf8 is 1
      */ };

      #define push(x) {stack[ sp++ ] = *(x); stack[sp++] = *((x)+1);}
      #define pop(x) {*((x) + 1) = stack[--sp]; *(x) = stack[--sp];}
      @@ -435,12 +437,14 @@
      hangul_input_state_set(0);
      if (composing_hangul)
      {
      - push_raw_key(composing_hangul_buffer, 2);
      + push_raw_key(composing_hangul_buffer, hangul_width[enc_utf8]);
      composing_hangul = 0;
      }
      }
      else
      + {
      hangul_input_state_set(1);
      + }

      if (showmode())
      {
      @@ -745,6 +749,31 @@
      }
      }

      +static void
      +hangul_convert_to_utf_8(char_u *buf_utf_8, char_u *buf_euc_kr)
      +{
      + vimconv_T hangul_conv;
      + iconv_t hangul_iconv_fd = (iconv_t)-1;
      + int conv_len = 2;
      + char_u *converted_str;
      +
      + hangul_iconv_fd = (iconv_t)my_iconv_open("utf-8", "euc-kr");
      +
      + hangul_conv.vc_type = CONV_ICONV;
      + hangul_conv.vc_factor = 0;
      + hangul_conv.vc_fd = hangul_iconv_fd;
      + hangul_conv.vc_fail = 0;
      +
      + converted_str = string_convert(&hangul_conv, buf_euc_kr,
      &conv_len);
      +
      + /*
      + * Copying utf-8 code to the buffer
      + */
      + memcpy(buf_utf_8, converted_str, conv_len);
      +
      + iconv_close(hangul_iconv_fd);
      +}
      +
      int
      hangul_input_process(s, len)
      char_u *s;
      @@ -752,7 +781,8 @@
      {
      int n;
      unsigned int c;
      - char_u hanbuf[20];
      + char_u hanbuf[2][30]; /* hanbuf[0] : for ks c 5601 codes
      + hanbuf[1] : for utf-8 codes */

      if (len == 1)
      /* normal key press */
      @@ -768,37 +798,70 @@
      else
      {
      if (composing_hangul)
      - push_raw_key(composing_hangul_buffer, 2);
      + push_raw_key(composing_hangul_buffer, hangul_width[enc_utf8]);
      hangul_input_clear();
      composing_hangul = 0;
      return len;
      }

      + /*
      + * TODO: It will look better if automata for du-bul-sik(2-way
      korean keyboard)
      + * and se-bul-sik(3-way korean keyboard) is called
      automatically
      + * by a function pointer
      + */
      if (hangul_keyboard_type == 2)
      - n = hangul_automata2(hanbuf, &c);
      + {
      + n = hangul_automata2(hanbuf[0], &c); /* Character the automata
      created is
      + always in
      ks_c_5601-1987 */
      + }
      else
      - n = hangul_automata3(hanbuf, &c);
      + {
      + n = hangul_automata3(hanbuf[0], &c); /* Character the automata
      created is
      + always in
      ks_c_5601-1987 */
      + }

      if (n == AUTOMATA_CORRECT)
      {
      - STRNCPY(composing_hangul_buffer, hanbuf, 2);
      + if (enc_utf8)
      + hangul_convert_to_utf_8(hanbuf[1], hanbuf[0]);
      +
      + STRNCPY(composing_hangul_buffer, hanbuf[enc_utf8],
      hangul_width[enc_utf8]);
      gui_update_cursor(TRUE, FALSE);
      return 0;
      }
      else if (n == AUTOMATA_NEW)
      {
      + if (enc_utf8)
      + hangul_convert_to_utf_8(hanbuf[1], hanbuf[0]);
      if (composing_hangul)
      - push_raw_key(composing_hangul_buffer, 2);
      - STRNCPY(composing_hangul_buffer, hanbuf, 2);
      + push_raw_key(composing_hangul_buffer, hangul_width[enc_utf8]);
      + STRNCPY(composing_hangul_buffer, hanbuf[enc_utf8],
      hangul_width[enc_utf8]);
      composing_hangul = 1;
      gui_update_cursor(TRUE, FALSE);
      return 0;
      }
      else if (n == AUTOMATA_CORRECT_NEW)
      {
      + if (enc_utf8)
      + hangul_convert_to_utf_8(hanbuf[1], hanbuf[0]);
      if (composing_hangul)
      - push_raw_key(hanbuf, 2);
      - STRNCPY(composing_hangul_buffer, hanbuf+2, 2);
      + {
      + push_raw_key(hanbuf[enc_utf8], hangul_width[enc_utf8]);
      + /*
      + * Because the content of hanbuf has been pushed into the
      "inbuf",
      + * it is turn for the next character.
      + * Convert next character and put it into
      composing_hangul_buffer.
      + */
      + if (enc_utf8)
      + {
      + hangul_convert_to_utf_8(&hanbuf[1][3], &hanbuf[0][2]);
      + }
      + }
      +
      + STRNCPY(composing_hangul_buffer,
      + hanbuf[enc_utf8] + hangul_width[enc_utf8],
      + hangul_width[enc_utf8]);
      +
      composing_hangul = 1;
      gui_update_cursor(TRUE, FALSE);
      return 0;
      @@ -816,7 +879,7 @@
      {
      if (composing_hangul)
      {
      - push_raw_key(composing_hangul_buffer, 2);
      + push_raw_key(composing_hangul_buffer, hangul_width[enc_utf8]);
      composing_hangul = 0;
      }
      *s = c;
      @@ -1538,6 +1601,13 @@
      }
      }

      +/*
      + * convert_3_to_ks()
      + *
      + * Receive Cho, Jung, Jong-Seong
      + * and build ks c 5601-1992 code.
      + * Returns ks c 5601-1992 code in des array.
      + */
      static int
      convert_3_to_ks(fv, mv, lv, des)
      int fv;
      diff -r 08c36bef2004 src/screen.c
      --- a/src/screen.c Thu Apr 28 19:05:05 2011 +0200
      +++ b/src/screen.c Sun May 01 20:06:42 2011 +0900
      @@ -9466,7 +9466,12 @@
      if (gui.in_use)
      {
      if (hangul_input_state_get())
      - MSG_PUTS_ATTR(" \307\321\261\333", attr); /* HANGUL */
      + if (enc_utf8)
      + /* Displays Korean Letters for "Hangul" in
      utf-8 */
      + MSG_PUTS_ATTR(" \355\225\234\352\270\200",
      attr);
      + else
      + /* Displays Korean Letters for "Hangul" in
      euc-kr */
      + MSG_PUTS_ATTR(" \307\321\261\333", attr);
      }
      #endif
      #ifdef FEAT_INS_EXPAND

      --
      You received this message from the "vim_dev" maillist.
      Do not top-post! Type your reply below the text you are replying to.
      For more information, visit http://www.vim.org/maillist.php
    • Bram Moolenaar
      ... Great, thanks. I'll have a closer look later. -- The 50-50-90 rule: Anytime you have a 50-50 chance of getting something right, there's a 90% probability
      Message 2 of 9 , May 1, 2011
      • 0 Attachment
        Shawn Y.H. Kim wrote:

        > In response to the following comment made by Bram on Aug 2, 2007:
        > (can be viewed at http://groups.google.com/group/vim_dev/browse_thread/thread/3b73a504c77ba803/)
        >
        > > I hesitate removing the Hangul support without knowing for sure that it
        > > is not needed. Browsing through the messages I do see remarks that it
        > > might still be useful to a few people.
        > >
        > > Perhaps the Hangul support can be changed to also work for UTF-8?
        >
        > I made (finally) a patch that enables hangul-input module to work for
        > UTF-8.
        >
        > First, hg head:
        > orchistro.ubuntu:~/work/vim-hangulin/src$ hg head
        > changeset: 2790:08c36bef2004
        > tag: tip
        > user: Bram Moolenaar <bram@...>
        > date: Thu Apr 28 19:05:05 2011 +0200
        > summary: Added tag v7-3-170 for changeset 64c3402df964
        >
        > changeset: 2572:ee53a39d5896
        > branch: vim73
        > user: Bram Moolenaar <bram@...>
        > date: Sun Aug 15 15:24:20 2010 +0200
        > summary: Last changes for the 7.3 release!
        >
        > Secondly, hg st:
        > orchistro.ubuntu:~/work/vim-hangulin/src$ hg st
        > M src/getchar.c
        > M src/gui.c
        > M src/hangulin.c
        > M src/screen.c
        >
        > Finally, hg diff:
        > ... It is too long. But I cannot find a way to attach a file, so, here
        > goes the diff:

        Great, thanks. I'll have a closer look later.


        --
        The 50-50-90 rule: Anytime you have a 50-50 chance of getting
        something right, there's a 90% probability you'll get it wrong.

        /// Bram Moolenaar -- Bram@... -- http://www.Moolenaar.net \\\
        /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
        \\\ an exciting new programming language -- http://www.Zimbu.org ///
        \\\ help me help AIDS victims -- http://ICCF-Holland.org ///

        --
        You received this message from the "vim_dev" maillist.
        Do not top-post! Type your reply below the text you are replying to.
        For more information, visit http://www.vim.org/maillist.php
      • Bram Moolenaar
        ... Thanks. I'm glad to finally see this implemented. It still needs some work though. ... Please do send this as an attachment. Long lines got wrapped,
        Message 3 of 9 , May 10, 2011
        • 0 Attachment
          Shawn Y.H. Kim wrote:

          > In response to the following comment made by Bram on Aug 2, 2007:
          > (can be viewed at http://groups.google.com/group/vim_dev/browse_thread/thread/3b73a504c77ba803/)
          >
          > > I hesitate removing the Hangul support without knowing for sure that it
          > > is not needed. Browsing through the messages I do see remarks that it
          > > might still be useful to a few people.
          > >
          > > Perhaps the Hangul support can be changed to also work for UTF-8?
          >
          > I made (finally) a patch that enables hangul-input module to work for
          > UTF-8.

          Thanks. I'm glad to finally see this implemented.
          It still needs some work though.

          > Finally, hg diff:
          > ... It is too long. But I cannot find a way to attach a file, so, here
          > goes the diff:

          Please do send this as an attachment. Long lines got wrapped, making it
          impossible to apply.

          The change to getchar.c should not be there. Perhaps you are not
          encoding the strings that go into the input buffer correctly? A CSI
          should be put there as three characters: CSI KS_EXTRA KE_CSI.
          I guess fix_input_buffer() can be used in push_raw_key().

          It should be possible to keep hangul_width[] inside hangulin.c. Instead
          of:
          (void)gui_outstr_nowrap(composing_hangul_buffer, hangul_width[enc_utf8],
          Use something like:
          (void)hangul_outstr_composing();

          The code for calling push_raw_key() can also be put in one place.

          The call to my_iconv_open() must be inside a check for USE_ICONV.
          Perhaps when this is not available all of the patch won't work?
          Then you need to add a lot more #ifdefs.


          --
          hundred-and-one symptoms of being an internet addict:
          82. AT&T names you Customer of the Month for the third consecutive time.

          /// Bram Moolenaar -- Bram@... -- http://www.Moolenaar.net \\\
          /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
          \\\ an exciting new programming language -- http://www.Zimbu.org ///
          \\\ help me help AIDS victims -- http://ICCF-Holland.org ///

          --
          You received this message from the "vim_dev" maillist.
          Do not top-post! Type your reply below the text you are replying to.
          For more information, visit http://www.vim.org/maillist.php
        • Shawn
          ... Yes. Next time, I will send it as an attachment using google mail :-) ... As I wrote in the comment, any unicode that falls into the range of U+XX1B,
          Message 4 of 9 , May 14, 2011
          • 0 Attachment
            On Wed, May 11, 2011 at 12:05 AM, Bram Moolenaar <Bram@...> wrote:
            >
            > Shawn Y.H. Kim wrote:
            >
            >> In response to the following comment made by Bram on Aug 2, 2007:
            >> (can be viewed at http://groups.google.com/group/vim_dev/browse_thread/thread/3b73a504c77ba803/)
            >>
            >> > I hesitate removing the Hangul support without knowing for sure that it
            >> > is not needed.  Browsing through the messages I do see remarks that it
            >> > might still be useful to a few people.
            >> >
            >> > Perhaps the Hangul support can be changed to also work for UTF-8?
            >>
            >> I made (finally) a patch that enables hangul-input module to work for
            >> UTF-8.
            >
            > Thanks.  I'm glad to finally see this implemented.
            > It still needs some work though.
            >
            >> Finally, hg diff:
            >> ... It is too long. But I cannot find a way to attach a file, so, here
            >> goes the diff:
            >
            > Please do send this as an attachment.  Long lines got wrapped, making it
            > impossible to apply.

            Yes. Next time, I will send it as an attachment using google mail :-)

            > The change to getchar.c should not be there.  Perhaps you are not
            > encoding the strings that go into the input buffer correctly?  A CSI
            > should be put there as three characters: CSI KS_EXTRA KE_CSI.
            > I guess fix_input_buffer() can be used in push_raw_key().

            As I wrote in the comment, any unicode that falls into the range of
            U+XX1B, U+X6CX, U+X6DX, U+X6EX, U+X6FX will have 0x9B (CSI)
            if it is encoded using UTF-8 encoding scheme.
            Question is, is it correct to input raw UTF-8 code into the input buffer?
            If it is correct, then the logic that I commented out has some problem, I guess.
            I will look into fix_input_buffer() function to check if it will do
            the trick, thanks for
            the advice :-)

            > It should be possible to keep hangul_width[] inside hangulin.c.  Instead
            > of:
            >        (void)gui_outstr_nowrap(composing_hangul_buffer, hangul_width[enc_utf8],
            > Use something like:
            >        (void)hangul_outstr_composing();
            >
            > The code for calling push_raw_key() can also be put in one place.

            Good point. I will look for more graceful way to blend hangul input codes in.

            > The call to my_iconv_open() must be inside a check for USE_ICONV.
            > Perhaps when this is not available all of the patch won't work?
            > Then you need to add a lot more #ifdefs.

            You are right, iconv related routines should be surrounded by USE_ICONV check.
            As you pointed out, if iconv is not available, the patch won't work, right.
            I will work on it.

            After I am done with all of the points you mentioned, I am going to send
            another patch. Please wait for me until then. It will take some time, though.

            Thanks for your comments, helped a lot.
            Look forward to the next patch ;-)

            >
            >
            > --
            > hundred-and-one symptoms of being an internet addict:
            > 82. AT&T names you Customer of the Month for the third consecutive time.
            >
            >  /// Bram Moolenaar -- Bram@... -- http://www.Moolenaar.net   \\\
            > ///        sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
            > \\\  an exciting new programming language -- http://www.Zimbu.org        ///
            >  \\\            help me help AIDS victims -- http://ICCF-Holland.org    ///
            >



            --
            May the life be a ever lasting rapture.

            Shawn Y.H. Kim
            +82 16 7287 5874
            orchistro at gmail dot com
            http://orchistro.tistory.com

            --
            You received this message from the "vim_dev" maillist.
            Do not top-post! Type your reply below the text you are replying to.
            For more information, visit http://www.vim.org/maillist.php
          • Shawn Kim
            ... 1. I took a look into fix_input_buffer() and used it to fix hangul input buffer. But fix_input_buffer() function did not do anything. It escapes CSI into
            Message 5 of 9 , May 28, 2011
            • 0 Attachment
              >
              > Shawn Y.H. Kim wrote:
              >
              >> In response to the following comment made by Bram on Aug 2, 2007:
              >> (can be viewed at http://groups.google.com/group/vim_dev/browse_thread/thread/3b73a504c77ba803/)
              >>
              >>> I hesitate removing the Hangul support without knowing for sure that it
              >>> is not needed. Browsing through the messages I do see remarks that it
              >>> might still be useful to a few people.
              >>>
              >>> Perhaps the Hangul support can be changed to also work for UTF-8?
              >>
              >> I made (finally) a patch that enables hangul-input module to work for
              >> UTF-8.
              >
              > Thanks. I'm glad to finally see this implemented.
              > It still needs some work though.
              >
              >> Finally, hg diff:
              >> ... It is too long. But I cannot find a way to attach a file, so, here
              >> goes the diff:
              >
              > Please do send this as an attachment. Long lines got wrapped, making it
              > impossible to apply.
              >
              > The change to getchar.c should not be there. Perhaps you are not
              > encoding the strings that go into the input buffer correctly? A CSI
              > should be put there as three characters: CSI KS_EXTRA KE_CSI.
              > I guess fix_input_buffer() can be used in push_raw_key().

              1. I took a look into fix_input_buffer() and used it to "fix" hangul input buffer.
              But fix_input_buffer() function did not do anything.
              It escapes CSI into K_SPECIAL KS_EXTRA KE_CSI sequence
              only when the first byte of the input buffer is CSI.
              But the hangul codes in question have 0x9b in the middle or at the end,
              e.g) EB A0 9B.
              The function does not have any chance to "fix" the buffer.

              2. 0x9b in hangul codes is valid code. I encoded the strings correctly.
              0x9b(CSI) is part of utf-8 encoded hangul code.

              3. Question: I guess that the CSI is some kind of special character that
              indicates subsequent characters have some special meaning, right? Then,
              in gui mode, in what case a user can generate CSI code?
              If I knew what the CSI does and when the CSI is generated, it would be
              much easier for me to do the job.

              Now I'm working on the advices you made before :-)
              As soon as you shed some light on the secret of CSI, I will work on it.

              Looking forward to your kind advice.

              Regards.

              >
              > It should be possible to keep hangul_width[] inside hangulin.c. Instead
              > of:
              > (void)gui_outstr_nowrap(composing_hangul_buffer, hangul_width[enc_utf8],
              > Use something like:
              > (void)hangul_outstr_composing();
              >
              > The code for calling push_raw_key() can also be put in one place.
              >
              > The call to my_iconv_open() must be inside a check for USE_ICONV.
              > Perhaps when this is not available all of the patch won't work?
              > Then you need to add a lot more #ifdefs.
              >
              >
              > --
              > hundred-and-one symptoms of being an internet addict:
              > 82. AT&T names you Customer of the Month for the third consecutive time.
              >
              > /// Bram Moolenaar -- Bram@... -- http://www.Moolenaar.net \\\
              > /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
              > \\\ an exciting new programming language -- http://www.Zimbu.org ///
              > \\\ help me help AIDS victims -- http://ICCF-Holland.org ///

              --
              You received this message from the "vim_dev" maillist.
              Do not top-post! Type your reply below the text you are replying to.
              For more information, visit http://www.vim.org/maillist.php
            • Bram Moolenaar
              ... I think that when CSI appears halfway a utf-8 byte sequence it doesn't need to be escaped. That only happens when it's at the start of a character, it
              Message 6 of 9 , May 29, 2011
              • 0 Attachment
                Shawn Y.H. Kim wrote:

                > >> In response to the following comment made by Bram on Aug 2, 2007:
                > >> (can be viewed at http://groups.google.com/group/vim_dev/browse_thread/thread/3b73a504c77ba803/)
                > >>
                > >>> I hesitate removing the Hangul support without knowing for sure that it
                > >>> is not needed. Browsing through the messages I do see remarks that it
                > >>> might still be useful to a few people.
                > >>>
                > >>> Perhaps the Hangul support can be changed to also work for UTF-8?
                > >>
                > >> I made (finally) a patch that enables hangul-input module to work for
                > >> UTF-8.
                > >
                > > Thanks. I'm glad to finally see this implemented.
                > > It still needs some work though.
                > >
                > >> Finally, hg diff:
                > >> ... It is too long. But I cannot find a way to attach a file, so, here
                > >> goes the diff:
                > >
                > > Please do send this as an attachment. Long lines got wrapped, making it
                > > impossible to apply.
                > >
                > > The change to getchar.c should not be there. Perhaps you are not
                > > encoding the strings that go into the input buffer correctly? A CSI
                > > should be put there as three characters: CSI KS_EXTRA KE_CSI.
                > > I guess fix_input_buffer() can be used in push_raw_key().
                >
                > 1. I took a look into fix_input_buffer() and used it to "fix" hangul input buffer.
                > But fix_input_buffer() function did not do anything.
                > It escapes CSI into K_SPECIAL KS_EXTRA KE_CSI sequence
                > only when the first byte of the input buffer is CSI.
                > But the hangul codes in question have 0x9b in the middle or at the end,
                > e.g) EB A0 9B.
                > The function does not have any chance to "fix" the buffer.

                I think that when CSI appears halfway a utf-8 byte sequence it doesn't
                need to be escaped. That only happens when it's at the start of a
                character, it needs to be escaped to avoid it being interpreted as a
                special key byte sequence.

                > 2. 0x9b in hangul codes is valid code. I encoded the strings correctly.
                > 0x9b(CSI) is part of utf-8 encoded hangul code.

                The encoding in the input buffer is a bit weird, it includes special
                byte sequences, and then what the user types has to be escaped to avoid
                that byte sequence being handled in the wrong way.

                > 3. Question: I guest that the CSI is some kind of special character that
                > indicates subsequent characters have some special meaning, right? Then,
                > in gui mode, in what case a user can generate CSI code?
                > If I knew what does the CSI do and when the CSI is generated, it would be
                > much easier for me to do the job.

                In the GUI it's a bit different, we don't read raw bytes from what the
                user types, but create a byte stream from events. E.g. in
                src/gui_gtk_x11.c in key_press_event().

                > Now I'm working on the advices you made before :-)
                > As soon as you shed some light on the secret of CSI, I will work on it.
                >
                > Looking forward to your kind advice.

                I hope this helps.

                --
                hundred-and-one symptoms of being an internet addict:
                120. You ask a friend, "What's that big shiny thing?" He says, "It's the sun."

                /// Bram Moolenaar -- Bram@... -- http://www.Moolenaar.net \\\
                /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
                \\\ an exciting new programming language -- http://www.Zimbu.org ///
                \\\ help me help AIDS victims -- http://ICCF-Holland.org ///

                --
                You received this message from the "vim_dev" maillist.
                Do not top-post! Type your reply below the text you are replying to.
                For more information, visit http://www.vim.org/maillist.php
              • Shawn Kim
                ... Yes, I also believe the 0x9b in the middle of an encoded byte does not need to be escaped. It's part of valid code. ... The hangul input automata is
                Message 7 of 9 , May 29, 2011
                • 0 Attachment
                  >>>> In response to the following comment made by Bram on Aug 2, 2007:
                  >>>> (can be viewed at http://groups.google.com/group/vim_dev/browse_thread/thread/3b73a504c77ba803/)
                  >>>>
                  >>>>> I hesitate removing the Hangul support without knowing for sure that it
                  >>>>> is not needed. Browsing through the messages I do see remarks that it
                  >>>>> might still be useful to a few people.
                  >>>>>
                  >>>>> Perhaps the Hangul support can be changed to also work for UTF-8?
                  >>>>
                  >>>> I made (finally) a patch that enables hangul-input module to work for
                  >>>> UTF-8.
                  >>>
                  >>> Thanks. I'm glad to finally see this implemented.
                  >>> It still needs some work though.
                  >>>
                  >>>> Finally, hg diff:
                  >>>> ... It is too long. But I cannot find a way to attach a file, so, here
                  >>>> goes the diff:
                  >>>
                  >>> Please do send this as an attachment. Long lines got wrapped, making it
                  >>> impossible to apply.
                  >>>
                  >>> The change to getchar.c should not be there. Perhaps you are not
                  >>> encoding the strings that go into the input buffer correctly? A CSI
                  >>> should be put there as three characters: CSI KS_EXTRA KE_CSI.
                  >>> I guess fix_input_buffer() can be used in push_raw_key().
                  >>
                  >> 1. I took a look into fix_input_buffer() and used it to "fix" hangul input buffer.
                  >> But fix_input_buffer() function did not do anything.
                  >> It escapes CSI into K_SPECIAL KS_EXTRA KE_CSI sequence
                  >> only when the first byte of the input buffer is CSI.
                  >> But the hangul codes in question have 0x9b in the middle or at the end,
                  >> e.g) EB A0 9B.
                  >> The function does not have any chance to "fix" the buffer.
                  >
                  > I think that when CSI appears halfway a utf-8 byte sequence it doesn't
                  > need to be escaped. That only happens when it's at the start of a
                  > character, it needs to be escaped to avoid it being interpreted as a
                  > special key byte sequence.

                  Yes, I also believe the 0x9b in the middle of an encoded byte
                  does not need to be escaped. It's part of valid code.

                  >> 2. 0x9b in hangul codes is valid code. I encoded the strings correctly.
                  >> 0x9b(CSI) is part of utf-8 encoded hangul code.
                  >
                  > The encoding in the input buffer is a bit weird, it includes special
                  > byte sequences, and then what the user types has to be escaped to avoid
                  > that byte sequence being handled in the wrong way.
                  >
                  >> 3. Question: I guess that the CSI is some kind of special character that
                  >> indicates subsequent characters have some special meaning, right? Then,
                  >> in gui mode, in what case can a user generate CSI code?
                  >> If I knew what the CSI does and when the CSI is generated, it would be
                  >> much easier for me to do the job.
                  >
                  > In the GUI it's a bit different, we don't read raw bytes from what the
                  > user types, but create a byte stream from events. E.g. in
                  > src/gui_gtk_x11.c in key_press_event().

                  The hangul input automaton is initiated from THAT routine.
                  The following is the call stack when the hangul input automaton is in action:

                  src/gui_gtk_x11.c: key_press_event()
                  --> src/ui.c: add_to_input_buffer()
                  --> src/hangulin.c: hangul_input_process() (the automata)

                  or

                  src/gui_x11.c: gui_x11_key_hit_cb()
                  --> src/ui.c: add_to_input_buffer()
                  --> src/hangulin.c: hangul_input_process() (the automata)

                  The hangul_input_process() creates hangul code from what user has typed in.
                  And then it puts the hangul code in "inbuf" buffer by calling push_raw_key().

                  And then somewhere along the way, the "inbuf" is processed by vgetc() in src/getchar.c.
                  The function finds out that the 0x9b(CSI) is in the middle
                  of the code, and the routine I commented out (src/getchar.c: vgetc())
                  interprets the 0x9b as a special code, and modifies "inbuf", where it
                  should not be interpreted as a special key, but be preserved as it is.

                  Am I missing something?
                  And, what should I do to avoid interpreting 0x9b as CSI?

                  Please consider that hangul input routine is meaningful only when
                  MULTIBYTE and GUI option is enabled.

                  >
                  >> Now I'm working on the advice you gave before :-)
                  >> As soon as you shed some light on the secret of CSI, I will work on it.
                  >>
                  >> Looking forward to your kind advice.
                  >
                  > I hope this helps.
                  >
                  > --
                  > hundred-and-one symptoms of being an internet addict:
                  > 120. You ask a friend, "What's that big shiny thing?" He says, "It's the sun."
                  >
                  > /// Bram Moolenaar -- Bram@... -- http://www.Moolenaar.net \\\
                  > /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
                  > \\\ an exciting new programming language -- http://www.Zimbu.org ///
                  > \\\ help me help AIDS victims -- http://ICCF-Holland.org ///

                  Regards,
                  Shawn.

                  --
                  You received this message from the "vim_dev" maillist.
                  Do not top-post! Type your reply below the text you are replying to.
                  For more information, visit http://www.vim.org/maillist.php
                • Bram Moolenaar
                  ... I was wrong, it does need to be escaped. But for the GUI this happens early on, not in fix_input_buffer(). See key_press_event(), first use of CSI. ...
                  Message 8 of 9 , May 29, 2011
                  • 0 Attachment
                    Shawn Kim wrote:

                    > >>>> In response to the following comment made by Bram on Aug 2, 2007:
                    > >>>> (can be viewed at http://groups.google.com/group/vim_dev/browse_thread/thread/3b73a504c77ba803/)
                    > >>>>
                    > >>>>> I hesitate removing the Hangul support without knowing for sure that it
                    > >>>>> is not needed. Browsing through the messages I do see remarks that it
                    > >>>>> might still be useful to a few people.
                    > >>>>>
                    > >>>>> Perhaps the Hangul support can be changed to also work for UTF-8?
                    > >>>>
                    > >>>> I made (finally) a patch that enables hangul-input module to work for
                    > >>>> UTF-8.
                    > >>>
                    > >>> Thanks. I'm glad to finally see this implemented.
                    > >>> It still needs some work though.
                    > >>>
                    > >>>> Finally, hg diff:
                    > >>>> ... It is too long. But I cannot find a way to attach a file, so, here
                    > >>>> goes the diff:
                    > >>>
                    > >>> Please do send this as an attachment. Long lines got wrapped, making it
                    > >>> impossible to apply.
                    > >>>
                    > >>> The change to getchar.c should not be there. Perhaps you are not
                    > >>> encoding the strings that go into the input buffer correctly? A CSI
                    > >>> should be put there as three characters: CSI KS_EXTRA KE_CSI.
                    > >>> I guess fix_input_buffer() can be used in push_raw_key().
                    > >>
                    > >> 1. I took a look into fix_input_buffer() and used it to "fix" hangul input buffer.
                    > >> But fix_input_buffer() function did not do anything.
                    > >> It escapes CSI into K_SPECIAL KS_EXTRA KE_CSI sequence
                    > >> only when the first byte of the input buffer is CSI.
                    > >> But the hangul codes in question have 0x9b in the middle or at the end,
                    > >> e.g) EB A0 9B.
                    > >> The function does not have any chance to "fix" the buffer.
                    > >
                    > > I think that when CSI appears halfway a utf-8 byte sequence it doesn't
                    > > need to be escaped. That only happens when it's at the start of a
                    > > character, it needs to be escaped to avoid it being interpreted as a
                    > > special key byte sequence.
                    >
                    > Yes, I also believe the 0x9b in the middle of an encoded byte
                    > does not need to be escaped. It's part of valid code.

                    I was wrong, it does need to be escaped. But for the GUI this happens
                    early on, not in fix_input_buffer(). See key_press_event(), first use
                    of CSI.

                    > >> 2. 0x9b in hangul codes is valid code. I encoded the strings correctly.
                    > >> 0x9b(CSI) is part of utf-8 encoded hangul code.
                    > >
                    > > The encoding in the input buffer is a bit weird, it includes special
                    > > byte sequences, and then what the user types has to be escaped to avoid
                    > > that byte sequence being handled in the wrong way.
                    > >
                    > >> 3. Question: I guess that the CSI is some kind of special character that
                    > >> indicates subsequent characters have some special meaning, right? Then,
                    > >> in gui mode, in what case can a user generate CSI code?
                    > >> If I knew what the CSI does and when the CSI is generated, it would be
                    > >> much easier for me to do the job.
                    > >
                    > > In the GUI it's a bit different, we don't read raw bytes from what the
                    > > user types, but create a byte stream from events. E.g. in
                    > > src/gui_gtk_x11.c in key_press_event().
                    >
                    > The hangul input automata is initiated from THAT routine.
                    > Following is the callstack when hangul input automata is being in action:
                    >
                    > src/gui_gtk_x11.c: key_press_event()
                    > --> src/ui.c: add_to_input_buffer()
                    > --> src/hangulin.c: hangul_input_process() (the automata)
                    >
                    > or
                    >
                    > src/gui_x11.c: gui_x11_key_hit_cb()
                    > --> src/ui.c: add_to_input_buffer()
                    > --> src/hangulin.c: hangul_input_process() (the automata)
                    >
                    > The hangul_input_process() creates hangul code from what user has
                    > typed in. And then it puts the hangul code in "inbuf" buffer by
                    > calling push_raw_key().
                    >
                    > And then somewhere in the way, the "inbuf" is processed by vgetc() in
                    > src/getchar.c. The function finds out that the 0x9b(CSI) is in the
                    > middle of the code, and the routine I commented out (src/getchar.c:
                    > vgetc()) interprets the 0x9b as a special code, and modifies "inbuf",
                    > where it should not be interpreted as a special key, but be preserved
                    > as they are.
                    >
                    > Am I missing something?
                    > And, what should I do to avoid interpreting 0x9b as CSI?
                    >
                    > Please consider that hangul input routine is meaningful only when
                    > MULTIBYTE and GUI option is enabled.

                    You need to do the same thing as what happens in the loop in
                    key_press_event() to escape the CSI characters.

                    Also see the comment above add_to_input_buf().

                    --
                    hundred-and-one symptoms of being an internet addict:
                    121. You ask for e-mail addresses instead of telephone numbers.

                    /// Bram Moolenaar -- Bram@... -- http://www.Moolenaar.net \\\
                    /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
                    \\\ an exciting new programming language -- http://www.Zimbu.org ///
                    \\\ help me help AIDS victims -- http://ICCF-Holland.org ///

                    --
                    You received this message from the "vim_dev" maillist.
                    Do not top-post! Type your reply below the text you are replying to.
                    For more information, visit http://www.vim.org/maillist.php
                  • Bram Moolenaar
                    ... Any update on this? -- Compilation process failed successfully. /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net ///
                    Message 9 of 9 , Jan 4, 2012
                    • 0 Attachment
                      I replied to Shawn Kim (long ago):

                      > > >>>> In response to the following comment made by Bram on Aug 2, 2007:
                      > > >>>> (can be viewed at http://groups.google.com/group/vim_dev/browse_thread/thread/3b73a504c77ba803/)
                      > > >>>>
                      > > >>>>> I hesitate removing the Hangul support without knowing for sure that it
                      > > >>>>> is not needed. Browsing through the messages I do see remarks that it
                      > > >>>>> might still be useful to a few people.
                      > > >>>>>
                      > > >>>>> Perhaps the Hangul support can be changed to also work for UTF-8?
                      > > >>>>
                      > > >>>> I made (finally) a patch that enables hangul-input module to work for
                      > > >>>> UTF-8.
                      > > >>>
                      > > >>> Thanks. I'm glad to finally see this implemented.
                      > > >>> It still needs some work though.
                      > > >>>
                      > > >>>> Finally, hg diff:
                      > > >>>> ... It is too long. But I cannot find a way to attach a file, so, here
                      > > >>>> goes the diff:
                      > > >>>
                      > > >>> Please do send this as an attachment. Long lines got wrapped, making it
                      > > >>> impossible to apply.
                      > > >>>
                      > > >>> The change to getchar.c should not be there. Perhaps you are not
                      > > >>> encoding the strings that go into the input buffer correctly? A CSI
                      > > >>> should be put there as three characters: CSI KS_EXTRA KE_CSI.
                      > > >>> I guess fix_input_buffer() can be used in push_raw_key().
                      > > >>
                      > > >> 1. I took a look into fix_input_buffer() and used it to "fix" hangul input buffer.
                      > > >> But fix_input_buffer() function did not do anything.
                      > > >> It escapes CSI into K_SPECIAL KS_EXTRA KE_CSI sequence
                      > > >> only when the first byte of the input buffer is CSI.
                      > > >> But the hangul codes in question have 0x9b in the middle or at the end,
                      > > >> e.g) EB A0 9B.
                      > > >> The function does not have any chance to "fix" the buffer.
                      > > >
                      > > > I think that when CSI appears halfway a utf-8 byte sequence it doesn't
                      > > > need to be escaped. That only happens when it's at the start of a
                      > > > character, it needs to be escaped to avoid it being interpreted as a
                      > > > special key byte sequence.
                      > >
                      > > Yes, I also believe the 0x9b in the middle of an encoded byte
                      > > does not need to be escaped. It's part of valid code.
                      >
                      > I was wrong, it does need to be escaped. But for the GUI this happens
                      > early on, not in fix_input_buffer(). See key_press_event(), first use
                      > of CSI.
                      >
                      > > >> 2. 0x9b in hangul codes is valid code. I encoded the strings correctly.
                      > > >> 0x9b(CSI) is part of utf-8 encoded hangul code.
                      > > >
                      > > > The encoding in the input buffer is a bit weird, it includes special
                      > > > byte sequences, and then what the user types has to be escaped to avoid
                      > > > that byte sequence being handled in the wrong way.
                      > > >
                      > > >> 3. Question: I guess that the CSI is some kind of special character that
                      > > >> indicates subsequent characters have some special meaning, right? Then,
                      > > >> in gui mode, in what case can a user generate CSI code?
                      > > >> If I knew what the CSI does and when the CSI is generated, it would be
                      > > >> much easier for me to do the job.
                      > > >
                      > > > In the GUI it's a bit different, we don't read raw bytes from what the
                      > > > user types, but create a byte stream from events. E.g. in
                      > > > src/gui_gtk_x11.c in key_press_event().
                      > >
                      > > The hangul input automata is initiated from THAT routine.
                      > > Following is the callstack when hangul input automata is being in action:
                      > >
                      > > src/gui_gtk_x11.c: key_press_event()
                      > > --> src/ui.c: add_to_input_buffer()
                      > > --> src/hangulin.c: hangul_input_process() (the automata)
                      > >
                      > > or
                      > >
                      > > src/gui_x11.c: gui_x11_key_hit_cb()
                      > > --> src/ui.c: add_to_input_buffer()
                      > > --> src/hangulin.c: hangul_input_process() (the automata)
                      > >
                      > > The hangul_input_process() creates hangul code from what user has
                      > > typed in. And then it puts the hangul code in "inbuf" buffer by
                      > > calling push_raw_key().
                      > >
                      > > And then somewhere in the way, the "inbuf" is processed by vgetc() in
                      > > src/getchar.c. The function finds out that the 0x9b(CSI) is in the
                      > > middle of the code, and the routine I commented out (src/getchar.c:
                      > > vgetc()) interprets the 0x9b as a special code, and modifies "inbuf",
                      > > where it should not be interpreted as a special key, but be preserved
                      > > as they are.
                      > >
                      > > Am I missing something?
                      > > And, what should I do to avoid interpreting 0x9b as CSI?
                      > >
                      > > Please consider that hangul input routine is meaningful only when
                      > > MULTIBYTE and GUI option is enabled.
                      >
                      > You need to do the same thing as what happens in the loop in
                      > key_press_event() to escape the CSI characters.
                      >
                      > Also see the comment above add_to_input_buf().

                      Any update on this?

                      --
                      Compilation process failed successfully.

                      /// Bram Moolenaar -- Bram@... -- http://www.Moolenaar.net \\\
                      /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
                      \\\ an exciting new programming language -- http://www.Zimbu.org ///
                      \\\ help me help AIDS victims -- http://ICCF-Holland.org ///

                      --
                      You received this message from the "vim_dev" maillist.
                      Do not top-post! Type your reply below the text you are replying to.
                      For more information, visit http://www.vim.org/maillist.php
                    Your message has been successfully submitted and will be delivered to recipients shortly.