Loading ...
Sorry, an error occurred while loading the content.

kemiripan kata bahasa indonesia (phonetic dan edit distance)

Expand Messages
  • ginanjar_utama
    http://students.if.itb.ac.id/~if13092/riset/soundex/Normalisasi_String.doc aku lagi coba yang ini http://delphi.log.web.id/blogs/delphi/000087.html
    Message 1 of 1 , Jun 1, 2006
    • 0 Attachment
      http://students.if.itb.ac.id/~if13092/riset/soundex/Normalisasi_String.doc
      aku lagi coba yang ini

      http://delphi.log.web.id/blogs/delphi/000087.html
      http://priyadi.net/archives/2005/12/21/algoritma-fonetik-bahasa-indonesia/

      algoritma Priyadi dan Jaimy :)
      kalau ada waktu nanti dicoba juga insya Allah, sekalian belajar regex

      untuk yang bahasa inggris
      http://www.s-direktnet.de/homepages/neumann/rb_prgs/Soundex.rb
      http://po-ru.com/files/metaphone/0.4/metaphone.rb

      http://po-ru.com/files/levenshtein/1.3/levenshtein.rb
      untuk yang edit distance

      buku baru yang menjelaskan algoritma di atas secara sederhana
      dan test driven:
      Wrox, "Beginning Algorithms"

      berguna untuk spell check dan data cleansing


      wassalam,
      Ginanjar Utama

      kode yang masih salah :)

      require 'test/unit'
      require 'SoundexPhoneticEncoder'

      # Tests for SoundexBandungEuy: spelling normalisation of Indonesian names
      # and the Soundex codes produced from the normalised spelling.
      #
      # The method calls on @encoder were masked to "@..." by the mail archive;
      # they are restored here as @encoder.normalize_word / @encoder.encode.
      #
      # NOTE(review): the original post labels this code as still incorrect.
      # Hand-tracing suggests the RESULT rows for "Ginandjar" and "Endjang"
      # currently encode to G526 / E525 rather than the expected values --
      # confirm by running the suite.
      class SoundexBandungEuyTestCase < Test::Unit::TestCase
        # Pairs of [input name, expected normalised spelling]. The expected
        # spelling is upcased before it is compared.
        EXPECTED = [
          %w{Rakhmat Rahhmat},
          %w{Endjang Enjjang},
          %w{Itjang Iccang},
          %w{Erick Erikk},
          %w{Philip Ffilip},
          %w{Dzikri Zzikri},
          %w{Sjahrir Syahrir},
          %w{Syifa Ssifa},
          %w{Ardhi Arddi},
          %w{Arbhi Arbbi},
          %w{Saviena Safiina},
          %w{Wicaksono Wicaxxono},
          %w{Wahyoedi Wahyuudi},
          %w{Nurcholis Nurhholis},
          %w{Christian Kkristian},
          %w{Chandra Ccandra},
          %w{Dajat Dayat},
          %w{Joni Joni},
          %w{Ginandjar Ginanjjar}
        ].freeze

        # Pairs of [input name, expected Soundex code]. Spelling variants of
        # the same name are expected to share one code.
        RESULT = [
          %w{Rahmat R530},
          %w{Rachmat R530},
          %w{Rakhmat R530},
          %w{Dzikri Z260},
          %w{Zikri Z260},
          %w{Dajat D300},
          %w{Dayat D300},
          %w{Efendi E153},
          %w{Efendhi E153},
          %w{Effendhy E153},
          %w{Effendi E153},
          %w{Effendy E153},
          %w{Ginanjar G560},
          %w{Ginandjar G560},
          %w{Enjang E520},
          %w{Endjang E520}
        ].freeze

        def setup
          @encoder = SoundexBandungEuy.new
          @soundex = SoundexPhoneticEncoder.new
        end

        # Every name in EXPECTED must normalise to its upcased expected form.
        def testNormalizeWord
          EXPECTED.each do |name, normalized|
            assert_equal(normalized.upcase, @encoder.normalize_word(name), name)
          end
        end

        # Every name in RESULT must encode to its expected Soundex code.
        def testNamaIndonesia
          RESULT.each do |name, code|
            assert_equal(code, @encoder.encode(name), name)
          end
        end
      end


      # Soundex encoder that first normalises Indonesian spelling variants
      # (old and new orthography, doubled digraphs) and then delegates to the
      # inherited SoundexPhoneticEncoder#encode.
      class SoundexBandungEuy < SoundexPhoneticEncoder
        # Ordered [pattern, replacement] pairs applied to the lower-cased word.
        # Patterns are lower-case and replacements are upper-case, so text that
        # has already been rewritten is never matched again by a later rule
        # (e.g. "sj" -> "SY" is not turned into "SS" by the /sy/ rule).
        #
        # NOTE(review): /\Bj/ also rewrites a non-initial "j" in modern
        # spelling (the "j" in "Ginanjar" becomes "Y"), while "dj" becomes
        # "JJ", so "Ginanjar" and "Ginandjar" normalise differently.
        RULES = [
          # Regexp, replacement
          [/kh/, 'HH'],
          [/dj/, 'JJ'],
          [/tj/, 'CC'],
          [/ck/, 'KK'],
          [/cq/, 'KK'],
          [/ph/, 'FF'],
          [/dz/, 'ZZ'],
          [/sj/, 'SY'],
          [/sy/, 'SS'],
          [/dh/, 'DD'],
          [/bh/, 'BB'],
          [/gh/, 'GG'],
          [/jh/, 'JJ'],
          [/sh/, 'SS'],
          [/th/, 'TT'],
          [/zh/, 'ZZ'],
          [/v/, 'F'],
          [/ks/, 'XX'],
          [/oe/, 'UU'],
          [/ie/, 'II'],
          # Trailing "y" -> "I". The anchor must follow the letter: the
          # previous /\zy/ required a character after end-of-string and so
          # could never match.
          [/y\z/, 'I'],
          [/\Achr/, 'KKR'],
          [/\Ach/, 'CC'],
          [/ch/, 'HH'],
          # Non-initial "j" (old spelling for the "y" sound).
          [/\Bj/, 'Y']
        ].freeze

        # Normalises the spelling of +w+.
        #
        # The word is lower-cased, every character outside a-z is dropped,
        # RULES are applied in order, and the result is returned upper-cased.
        def normalize_word(w)
          s = w.downcase.gsub(/[^a-z]/, '')
          RULES.each { |rx, rep| s.gsub!(rx, rep) }
          s.upcase
        end

        # Kept so existing callers can still reach the inherited encoder.
        alias encode_orig encode

        # Encodes +string+ by normalising it first and then applying the
        # inherited Soundex encoding. A trace line is written to stderr only
        # when Ruby runs in debug mode ($DEBUG).
        def encode(string)
          warn "called for #{string}" if $DEBUG
          super(normalize_word(string))
        end
      end


      # Soundex-style phonetic encoder producing a fixed-length, 4-character
      # code: the first character of the input followed by up to three digits,
      # right-padded with "0".
      class SoundexPhoneticEncoder
        # Letters and, position by position, the digit each one maps to.
        LETTERS = 'AEIOUYWHBPFVCSKGJQXZDTLMNR'.freeze
        DIGITS  = '00000000111122222222334556'.freeze

        # Length of every code returned by #encode.
        CODE_LENGTH = 4

        # Encodes +string+ into a 4-character code.
        #
        # The input is upper-cased and its first character is kept verbatim
        # (even if it is not a letter). Each following character is mapped
        # with #get_code; a digit is appended unless it is "0" or equal to the
        # last character already emitted. Zero-coded characters are skipped
        # without resetting that comparison, so equal codes separated only by
        # such characters are emitted once.
        #
        # An empty (or nil) input yields "0000"; previously an empty string
        # produced the 3-character "000".
        #
        # A trace line is written to stderr only when Ruby runs in debug mode.
        def encode(string)
          warn "orig called #{string}" if $DEBUG
          string = string.to_s.upcase
          return '0' * CODE_LENGTH if string.empty?

          result = string[0, 1]
          string[1..-1].each_char do |char|
            break if result.length >= CODE_LENGTH

            code = get_code(char)
            result << code unless code == '0' || code == result[-1, 1]
          end
          result.ljust(CODE_LENGTH, '0')
        end

        # Returns the digit string for +char+, or "0" when nothing in +char+
        # is one of the (upper-case) LETTERS. The argument is not modified.
        def get_code(char)
          coded = char.tr(LETTERS, DIGITS)
          coded == char ? '0' : coded
        end
      end

      require 'test/unit'

      # Tests for SoundexPhoneticEncoder#encode.
      #
      # The calls on @encoder were masked to "@...(" by the mail archive; they
      # are restored here as @encoder.encode(...).
      class SoundexPhoneticEncoderTestCase < Test::Unit::TestCase
        def setup
          @encoder = SoundexPhoneticEncoder.new
        end

        # The first character of the input is always the first character of
        # the code, and the code is always four characters long.
        def testFirstLetterIsAlwaysUsed
          ('A'..'Z').each do |c|
            result = @encoder.encode(c + "-")
            assert_not_nil(result)
            assert_equal(4, result.length)
            assert_equal(c, result[0, 1])
          end
        end

        # Asserts that each letter in +chars+, encoded after a leading "-",
        # yields "-" + expected + "00".
        def assert_all_equal(expected, chars)
          chars.each do |x|
            result = @encoder.encode("-" + x)
            assert_equal(4, result.length)
            assert_equal("-" + expected + "00", result)
          end
        end

        def testVowelsAreIgnored
          assert_all_equal("0", %w{a e i o u h w y})
        end

        def testLettersRepresentedByOne
          assert_all_equal("1", %w{b f p v})
        end

        def testLettersRepresentedByTwo
          assert_all_equal("2", %w{c g j k q s x z})
        end

        def testLettersRepresentedByThree
          assert_all_equal("3", %w{d t})
        end

        def testLettersRepresentedByFour
          assert_all_equal("4", %w{l})
        end

        def testLettersRepresentedByFive
          assert_all_equal("5", %w{m n})
        end

        def testLettersRepresentedBySix
          assert_all_equal("6", %w{r})
        end

        def testDuplicateCodesAreDropped
          assert_equal("B100", @encoder.encode("BFPV"))
          assert_equal("C200", @encoder.encode("CGJKQSXZ"))
          assert_equal("D300", @encoder.encode("DDT"))
          assert_equal("L400", @encoder.encode("LLL"))
          assert_equal("M500", @encoder.encode("MNMN"))
          assert_equal("R600", @encoder.encode("RRR"))
        end

        def testEnglishNames
          assert_equal("S530", @encoder.encode("Smith"))
          assert_equal("S530", @encoder.encode("Smythe"))
          assert_equal("M235", @encoder.encode("McDonald"))
          assert_equal("M235", @encoder.encode("MacDonald"))
          assert_equal("H620", @encoder.encode("Harris"))
          assert_equal("H620", @encoder.encode("Harrys"))
        end

        def teardown
        end
      end
    Your message has been successfully submitted and would be delivered to recipients shortly.