#
# An implementation of the Metaphone phonetic coding system in Ruby
#
# Author::  Paul BATTLEY (pbattley @ gmail.com)
# Version:: 0.4
# Date::    2005-04-18
# Status::  Alpha
#
# == About
#
# Metaphone encodes names into a phonetic form such that similar-sounding names
# have the same or similar Metaphone encodings.
#
# As there are multiple implementations of Metaphone, each with their own bugs,
# I have based this on my reading of the specification.  This implementation
# has been only lightly tested so far; please report any bugs found to the
# email address given above.
#
# I have also compared this implementation with that found in PHP's standard
# library.  The present implementation follows the algorithm description,
# whilst PHP's implementation mimics the behaviour of LP's original BASIC
# implementation, which appears to contain bugs (specifically with the handling
# of CC and MB).  The changes required for 100% compatibility are noted in the
# code, marked with [PHP].  It would be useful to compare the behaviour of
# other implementations as well.
#
# The original system described by Lawrence Philips in Computer Language Vol. 7
# No. 12, December 1990, pp 39-43:
#
#   The 16 consonant sounds:
#   
#   B X S K J T F H L M N P R 0 W Y
#   
#   0 represents the "th" sound.
#
#   Exceptions:
#    
#   Initial  kn-, gn-, pn-, ae- or wr-    -> drop first letter
#   Initial  x-                           -> change to "s"
#   Initial  wh-                          -> change to "w"
#   
#   Transformations:
#
#   Vowels are kept only when they are the first letter.
#    
#   B -> B   unless at the end of a word after "m" as in "dumb"
#   C -> X    (sh) if -cia- or -ch-
#        S   if -ci-, -ce- or -cy-
#        K   otherwise, including -sch-
#   D -> J   if in -dge-, -dgy- or -dgi-
#        T   otherwise
#   F -> F
#   G ->     silent if in -gh- and not at end or before a vowel
#            in -gn- or -gned- (also see dge etc. above)
#        J   if before i or e or y if not double gg
#        K   otherwise
#   H ->     silent if after vowel and no vowel follows
#        H   otherwise
#   J -> J
#   K ->     silent if after "c"
#        K   otherwise
#   L -> L   
#   M -> M
#   N -> N
#   P -> F   if before "h"
#        P   otherwise
#   Q -> K
#   R -> R
#   S -> X   (sh) if before "h" or in -sio- or -sia-
#        S   otherwise
#   T -> X   (sh) if -tia- or -tio-
#        0   (th) if before "h"
#            silent if in -tch-
#        T   otherwise
#   V -> F
#   W ->     silent if not followed by a vowel
#        W   if followed by a vowel
#   X -> KS
#   Y ->     silent if not followed by a vowel
#        Y   if followed by a vowel
#   Z -> S 
#
# == Usage
#
# require 'metaphone'
# Metaphone.metaphone('foo bar') # => 'F BR'
#
# == Revision history
#
# * 2005-05-18 0.4 More efficient double-letter removal method.
# * 2005-05-18 0.3 More test cases, caught bug with initial SCH.
# * 2005-05-18 0.2 Fixed a bug with GG.
# * 2005-05-18 0.1 Initial release.
#
# == Licence
#
# Copyright (c) 2005 Paul Battley
#
# Usage of the works is permitted provided that this instrument is retained
# with the works, so that any entity that uses the works is notified of this
# instrument.
#
# DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.  
#

module Metaphone

    #
    # Metaphone rules as [Regexp, replacement] pairs.  They are applied in
    # order by metaphone_word, each one to the result of the previous one.
    #
    # The working string is lower case, while replacement text is upper case
    # (or empty, or a back-reference to the matched text).  The patterns only
    # encode lower-case letters, so a letter that has already been turned
    # into a code is not encoded a second time by a later rule.
    #
    # The table and every pair in it are frozen so that the rules cannot be
    # changed by accident at runtime.
    #
    RULES = [
        # Regexp, replacement
        [ /([bcdfhjklmnpqrstvwxyz])\1+/,
                           '\1' ],  # Remove doubled consonants except g.
                                    # [PHP] remove c from the character
                                    # class (PHP keeps a doubled c).
        [ /^ae/,            'E' ],  # Initial ae-: drop the a.
        [ /^[gkp]n/,        'N' ],  # Initial gn-, kn-, pn-: drop first letter.
        [ /^wr/,            'R' ],  # Initial wr-: drop the w.
        [ /^x/,             'S' ],  # Initial x- sounds like s.
        [ /^wh/,            'W' ],  # Initial wh- sounds like w.
        [ /mb$/,            'M' ],  # [PHP] remove $ from regexp.
        [ /(?!^)sch/,      'SK' ],  # sch -> SK, but not at the very start;
                                    # an initial sch is left for the ch rule.
        [ /th/,             '0' ],  # 0 stands for the "th" sound.
        [ /t?ch|sh/,        'X' ],  # ch, tch and sh -> X ("sh" sound).
        [ /c(?=ia)/,        'X' ],
        [ /[st](?=i[ao])/,  'X' ],  # -sia-, -sio-, -tia-, -tio-.
        [ /s?c(?=[iey])/,   'S' ],  # Soft c; a preceding s is absorbed.
        [ /[cq]/,           'K' ],  # Every remaining c, and q.
        [ /dg(?=[iey])/,    'J' ],  # -dge-, -dgy-, -dgi-.
        [ /d/,              'T' ],
        [ /g(?=h[^aeiou])/, ''  ],  # Silent g in gh followed by a non-vowel.
        [ /gn(ed)?/,        'N' ],  # Silent g in gn.  The optional (ed) can
                                    # never match here: every d has already
                                    # been rewritten by the rules above.
        [ /([^g]|^)g(?=[iey])/,
                          '\1J' ],  # Soft g, unless it follows another g.
        [ /g+/,             'K' ],  # Any remaining run of g.
        [ /ph/,             'F' ],
        [ /([aeiou])h(?=\b|[^aeiou])/,
                           '\1' ],  # Silent h: after a vowel with no vowel
                                    # following.
        [ /[wy](?![aeiou])/, '' ],  # w and y are silent unless a vowel follows.
        [ /z/,              'S' ],
        [ /v/,              'F' ],
        [ /(?!^)[aeiou]+/,  ''  ],  # Drop vowels except in first position.
    ].each { |rule| rule.each(&:freeze).freeze }.freeze

    #
    # Finds the Metaphone value for a word.  Note that only the letters A-Z are
    # supported, so any language-specific processing should be done beforehand.
    #
    # w:: the word to encode (a String).
    #
    # Returns the upper-case Metaphone code.  The result is an empty string
    # when w contains none of the letters A-Z.
    #
    def metaphone_word(w)
        # Normalise case and discard every character that is not a-z.
        s = w.downcase.gsub(/[^a-z]/, '')
        # Apply the Metaphone rules in order.  gsub! is used only for its side
        # effect on s (it returns nil when a rule does not match).
        RULES.each { |rx, rep| s.gsub!(rx, rep) }
        s.upcase
    end

    #
    # Finds the Metaphone values for a string containing multiple words by
    # calling metaphone_word on each whitespace-separated word.
    #
    # str:: the text to encode (a String).
    #
    # Returns the codes of the individual words joined by single spaces.
    #
    def metaphone(str)
        str.strip.split(/\s+/).map { |w| Metaphone.metaphone_word(w) }.join(' ')
    end

    extend self
end

if __FILE__ == $PROGRAM_NAME
    require 'test/unit'

    # Self-test, executed only when this file is run directly as a script.
    class MetaphoneTest < Test::Unit::TestCase

        #
        # Flat list of alternating inputs and expected Metaphone codes.
        #
        # Based on the table at http://aspell.net/metaphone/metaphone-kuhn.txt
        # but with brain-dead results changed to 'correct' ones (according to
        # the author's interpretation), plus more results from around the web.
        #
        # Entries whose expected code changes under the [PHP] variant:
        #
        #   MCCALL:    MKL   -> MKKL
        #   MCCROREY:  MKRR  -> MKKRR
        #   CAMBRILLO: KMBRL -> KMRL
        #
        KNOWN = %w[
            ANASTHA         ANS0
            DAVIS-CARTER    TFSKRTR
            ESCARMANT       ESKRMNT
            MCCALL          MKL
            MCCROREY        MKRR
            MERSEAL         MRSL
            PIEURISSAINT    PRSNT
            ROTMAN          RTMN
            SCHEVEL         SXFL
            SCHROM          SXRM
            SEAL            SL
            SPARR           SPR
            STARLEPER       STRLPR
            THRASH          0RX

            LOGGING         LKNK
            LOGIC           LJK
            JUDGES          JJS

            SHOOS           XS
            SHOES           XS
            CHUTE           XT
            SCHUSS          SXS

            OTTO            OT
            ERIC            ERK
            DAVE            TF
            CATHERINE       K0RN
            KATHERINE       K0RN
            AUBREY          ABR
            BRYAN           BRYN
            BRYCE           BRS
            STEVEN          STFN
            RICHARD         RXRT
            HEIDI           HT
            AUTO            AT
            MAURICE         MRS
            RANDY           RNT
            CAMBRILLO       KMBRL
            BRIAN           BRN
            RAY             R
            GEOFF           JF
            BOB             BB

            AHA             AH
            AAH             A

            PAUL            PL
            BATTLEY         BTL
            WROTE           RT
            THIS            0S
        ]

        # Builds the input => expected-code lookup from consecutive pairs
        # of KNOWN before each test.
        def setup
            @known = {}
            KNOWN.each_slice(2) { |word, code| @known[word] = code }
        end

        # Every known input must produce its expected code.
        def test_known
            @known.each do |word, expected|
                assert_equal(expected, Metaphone.metaphone(word))
            end
        end

        # Characters other than letters must not influence the result.
        def test_junk
            clean = Metaphone.metaphone('foobar')
            noisy = Metaphone.metaphone('%^@#$^f%^&o%^o@b#a@#r%^^&')
            assert_equal(clean, noisy)
        end

        # Letter case must not influence the result.
        def test_caps
            lower = Metaphone.metaphone('foobar')
            upper = Metaphone.metaphone('FOOBAR')
            assert_equal(lower, upper)
        end

        # Strings of several words are encoded word by word.
        def test_string
            assert_equal('F BR BS', Metaphone.metaphone('foo bar baz'))
            assert_equal('N WT', Metaphone.metaphone('gnu what'))
        end

    end
end
