#
# An implementation of the Metaphone phonetic coding system in Ruby
#
# Author::  Paul BATTLEY (pbattley @ gmail.com)
# Version:: 0.4
# Date::    2005-04-18
# Status::  Alpha
#
# == About
#
# Metaphone encodes names into a phonetic form such that similar-sounding names
# have the same or similar Metaphone encodings.
#
# As there are multiple implementations of Metaphone, each with their own bugs,
# I have based this on my reading of the specification. This implementation
# has been only lightly tested so far; please report any bugs found to the
# email address given above.
#
# I have also compared this implementation with that found in PHP's standard
# library. The present implementation follows the algorithm description,
# whilst PHP's implementation mimics the behaviour of LP's original BASIC
# implementation, which appears to contain bugs (specifically with the handling
# of CC and MB). The changes required for 100% compatibility are noted in the
# code, marked with [PHP]. It would be useful to compare the behaviour of
# other implementations as well.
#
# The original system described by Lawrence Philips in Computer Language Vol. 7
# No. 12, December 1990, pp 39-43:
#
# The 16 consonant sounds:
#
#   B X S K J T F H L M N P R 0 W Y
#
# 0 represents the "th" sound.
#
# Exceptions:
#
#   Initial kn-, gn-, pn-, ae- or wr-  -> drop first letter
#   Initial x-                         -> change to "s"
#   Initial wh-                        -> change to "w"
#
# Transformations:
#
#   Vowels are kept only when they are the first letter.
#
#   B -> B       unless at the end of a word after "m" as in "dumb"
#   C -> X (sh)  if -cia- or -ch-
#        S       if -ci-, -ce- or -cy-
#        K       otherwise, including -sch-
#   D -> J       if in -dge-, -dgy- or -dgi-
#        T       otherwise
#   F -> F
#   G -> silent  if in -gh- and not at end or before a vowel
#                in -gn- or -gned- (also see dge etc. above)
#        J       if before i or e or y if not double gg
#        K       otherwise
#   H -> silent  if after vowel and no vowel follows
#        H       otherwise
#   J -> J
#   K -> silent  if after "c"
#        K       otherwise
#   L -> L
#   M -> M
#   N -> N
#   P -> F       if before "h"
#        P       otherwise
#   Q -> K
#   R -> R
#   S -> X (sh)  if before "h" or in -sio- or -sia-
#        S       otherwise
#   T -> X (sh)  if -tia- or -tio-
#        0 (th)  if before "h"
#        silent  if in -tch-
#        T       otherwise
#   V -> F
#   W -> silent  if not followed by a vowel
#        W       if followed by a vowel
#   X -> KS
#   Y -> silent  if not followed by a vowel
#        Y       if followed by a vowel
#   Z -> S
#
# == Usage
#
#   require 'metaphone'
#   Metaphone.metaphone('foo bar')  # => 'F BR'
#
# == Revision history
#
# * 2005-05-18 0.4 More efficient double-letter removal method.
# * 2005-05-18 0.3 More test cases, caught bug with initial SCH.
# * 2005-05-18 0.2 Fixed a bug with GG.
# * 2005-05-18 0.1 Initial release.
#
# == Licence
#
# Copyright (c) 2005 Paul Battley
#
# Usage of the works is permitted provided that this instrument is retained
# with the works, so that any entity that uses the works is notified of this
# instrument.
#
# DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
#
module Metaphone
  #
  # Metaphone rules, as [regexp, replacement] pairs. These are simply applied
  # in order, each one globally over the whole word.
  #
  # The patterns only match lower-case letters, whereas every non-empty
  # literal replacement is upper case, so a letter that has already been
  # encoded is not picked up again by a later rule.
  #
  RULES = [
    # Regexp,                             replacement
    [ /([bcdfhjklmnpqrstvwxyz])\1+/,      '\1' ],  # Remove doubled consonants except g.
                                                   # [PHP] remove c from regexp.
    [ /^ae/,                              'E'  ],
    [ /^[gkp]n/,                          'N'  ],
    [ /^wr/,                              'R'  ],
    [ /^x/,                               'S'  ],
    [ /^wh/,                              'W'  ],
    [ /mb$/,                              'M'  ],  # [PHP] remove $ from regexp.
    [ /(?!^)sch/,                         'SK' ],
    [ /th/,                               '0'  ],
    [ /t?ch|sh/,                          'X'  ],
    [ /c(?=ia)/,                          'X'  ],
    [ /[st](?=i[ao])/,                    'X'  ],
    [ /s?c(?=[iey])/,                     'S'  ],
    [ /[cq]/,                             'K'  ],
    [ /dg(?=[iey])/,                      'J'  ],
    [ /d/,                                'T'  ],
    [ /g(?=h[^aeiou])/,                   ''   ],
    [ /gn(ed)?/,                          'N'  ],
    [ /([^g]|^)g(?=[iey])/,               '\1J' ],
    [ /g+/,                               'K'  ],
    [ /ph/,                               'F'  ],
    [ /([aeiou])h(?=\b|[^aeiou])/,        '\1' ],
    [ /[wy](?![aeiou])/,                  ''   ],
    [ /z/,                                'S'  ],
    [ /v/,                                'F'  ],
    [ /(?!^)[aeiou]+/,                    ''   ],
  ].freeze

  #
  # Finds the Metaphone value for a word. Note that only the letters A-Z are
  # supported, so any language-specific processing should be done beforehand.
  #
  # w:: the word to encode (a String); every character other than A-Z
  #     (in either case) is discarded before encoding.
  #
  # Returns the upper-case Metaphone code as a new String.
  #
  def metaphone_word(w)
    # Normalise case and drop everything that is not a plain letter a-z.
    s = w.downcase.gsub(/[^a-z]/, '')
    # Apply the Metaphone rules in table order, rewriting the working copy
    # in place (gsub! returning nil for "no match" is deliberately ignored).
    RULES.each { |rx, rep| s.gsub!(rx, rep) }
    s.upcase
  end

  #
  # Finds the Metaphone values for a string containing multiple words by
  # calling metaphone_word on each whitespace-separated word.
  #
  # str:: the String to encode.
  #
  # Returns the individual codes joined with single spaces.
  #
  def metaphone(str)
    str.strip.split(/\s+/).map { |w| Metaphone.metaphone_word(w) }.join(' ')
  end

  # Make the methods callable directly on the module (Metaphone.metaphone).
  extend self
end

# Self-test: only runs when this file is executed directly.
if __FILE__ == $PROGRAM_NAME
  require 'test/unit'

  class MetaphoneTest < Test::Unit::TestCase
    #
    # Based on the table at http://aspell.net/metaphone/metaphone-kuhn.txt
    # but with brain-dead results changed to 'correct' ones (according to
    # my interpretation).
    # Added more results from around the web.
    #
    # [PHP] MCCALL: MKL -> MKKL;
    #       MCCROREY: MKRR -> MKKRR;
    #       CAMBRILLO: KMBRL -> KMRL
    #
    # Flat list of alternating input / expected-code entries.
    #
    KNOWN = %w[
      ANASTHA       ANS0
      DAVIS-CARTER  TFSKRTR
      ESCARMANT     ESKRMNT
      MCCALL        MKL
      MCCROREY      MKRR
      MERSEAL       MRSL
      PIEURISSAINT  PRSNT
      ROTMAN        RTMN
      SCHEVEL       SXFL
      SCHROM        SXRM
      SEAL          SL
      SPARR         SPR
      STARLEPER     STRLPR
      THRASH        0RX
      LOGGING       LKNK
      LOGIC         LJK
      JUDGES        JJS
      SHOOS         XS
      SHOES         XS
      CHUTE         XT
      SCHUSS        SXS
      OTTO          OT
      ERIC          ERK
      DAVE          TF
      CATHERINE     K0RN
      KATHERINE     K0RN
      AUBREY        ABR
      BRYAN         BRYN
      BRYCE         BRS
      STEVEN        STFN
      RICHARD       RXRT
      HEIDI         HT
      AUTO          AT
      MAURICE       MRS
      RANDY         RNT
      CAMBRILLO     KMBRL
      BRIAN         BRN
      RAY           R
      GEOFF         JF
      BOB           BB
      AHA           AH
      AAH           A
      PAUL          PL
      BATTLEY       BTL
      WROTE         RT
      THIS          0S
    ].freeze

    # Turn the flat KNOWN list into an { input => expected code } hash.
    def setup
      @known = KNOWN.each_slice(2).to_h
    end

    # Every known input must encode to its listed code.
    def test_known
      @known.each_pair do |input, output|
        assert_equal(output, Metaphone.metaphone(input), "input: #{input}")
      end
    end

    # Characters outside A-Z must be ignored.
    def test_junk
      assert_equal(
        Metaphone.metaphone('foobar'),
        Metaphone.metaphone('%^@#$^f%^&o%^o@b#a@#r%^^&')
      )
    end

    # Encoding must not depend on the case of the input.
    def test_caps
      assert_equal(
        Metaphone.metaphone('foobar'),
        Metaphone.metaphone('FOOBAR')
      )
    end

    # Multi-word strings are encoded word by word.
    def test_string
      assert_equal('F BR BS', Metaphone.metaphone('foo bar baz'))
      assert_equal('N WT', Metaphone.metaphone('gnu what'))
    end
  end
end