#
# HTML entity decoding for Ruby
#
# Author:: Paul BATTLEY (pbattley @ gmail.com)
# Version:: 1.0
# Date:: 2005-08-03
#
# == About
#
# This library extends the String class to allow decoding of HTML/XML entities
# into their corresponding UTF-8 codepoints.
#
# == Licence
#
# Copyright (c) 2005 Paul Battley
#
# Usage of the works is permitted provided that this instrument is retained
# with the works, so that any entity that uses the works is notified of this
# instrument.
#
# DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
#
module HTMLEntities
  #
  # MAP associates each known entity name with its Unicode codepoint.
  # The names were collected from the w3schools entity reference:
  # http://www.w3schools.com/html/html_entitiesref.asp
  # Keys are the bare entity names, i.e. without the leading ampersand
  # and the trailing semicolon; values are integer codepoints.
  #
  MAP = {
    # Markup-significant characters
    'quot'   => 34,
    'apos'   => 39,
    'amp'    => 38,
    'lt'     => 60,
    'gt'     => 62,

    # Latin-1 symbols and punctuation
    'nbsp'   => 160,
    'iexcl'  => 161,
    'curren' => 164,
    'cent'   => 162,
    'pound'  => 163,
    'yen'    => 165,
    'brvbar' => 166,
    'sect'   => 167,
    'uml'    => 168,
    'copy'   => 169,
    'ordf'   => 170,
    'laquo'  => 171,
    'not'    => 172,
    'shy'    => 173,
    'reg'    => 174,
    'trade'  => 8482,
    'macr'   => 175,
    'deg'    => 176,
    'plusmn' => 177,
    'sup2'   => 178,
    'sup3'   => 179,
    'acute'  => 180,
    'micro'  => 181,
    'para'   => 182,
    'middot' => 183,
    'cedil'  => 184,
    'sup1'   => 185,
    'ordm'   => 186,
    'raquo'  => 187,
    'frac14' => 188,
    'frac12' => 189,
    'frac34' => 190,
    'iquest' => 191,
    'times'  => 215,
    'divide' => 247,

    # Latin-1 upper-case letters
    'Agrave' => 192,
    'Aacute' => 193,
    'Acirc'  => 194,
    'Atilde' => 195,
    'Auml'   => 196,
    'Aring'  => 197,
    'AElig'  => 198,
    'Ccedil' => 199,
    'Egrave' => 200,
    'Eacute' => 201,
    'Ecirc'  => 202,
    'Euml'   => 203,
    'Igrave' => 204,
    'Iacute' => 205,
    'Icirc'  => 206,
    'Iuml'   => 207,
    'ETH'    => 208,
    'Ntilde' => 209,
    'Ograve' => 210,
    'Oacute' => 211,
    'Ocirc'  => 212,
    'Otilde' => 213,
    'Ouml'   => 214,
    'Oslash' => 216,
    'Ugrave' => 217,
    'Uacute' => 218,
    'Ucirc'  => 219,
    'Uuml'   => 220,
    'Yacute' => 221,
    'THORN'  => 222,

    # Latin-1 lower-case letters
    'szlig'  => 223,
    'agrave' => 224,
    'aacute' => 225,
    'acirc'  => 226,
    'atilde' => 227,
    'auml'   => 228,
    'aring'  => 229,
    'aelig'  => 230,
    'ccedil' => 231,
    'egrave' => 232,
    'eacute' => 233,
    'ecirc'  => 234,
    'euml'   => 235,
    'igrave' => 236,
    'iacute' => 237,
    'icirc'  => 238,
    'iuml'   => 239,
    'eth'    => 240,
    'ntilde' => 241,
    'ograve' => 242,
    'oacute' => 243,
    'ocirc'  => 244,
    'otilde' => 245,
    'ouml'   => 246,
    'oslash' => 248,
    'ugrave' => 249,
    'uacute' => 250,
    'ucirc'  => 251,
    'uuml'   => 252,
    'yacute' => 253,
    'thorn'  => 254,
    'yuml'   => 255,

    # Characters beyond Latin-1
    'OElig'  => 338,
    'oelig'  => 339,
    'Scaron' => 352,
    'scaron' => 353,
    'Yuml'   => 376,
    'circ'   => 710,
    'tilde'  => 732,
    'ensp'   => 8194,
    'emsp'   => 8195,
    'thinsp' => 8201,
    'zwnj'   => 8204,
    'zwj'    => 8205,
    'lrm'    => 8206,
    'rlm'    => 8207,
    'ndash'  => 8211,
    'mdash'  => 8212,
    'lsquo'  => 8216,
    'rsquo'  => 8217,
    'sbquo'  => 8218,
    'ldquo'  => 8220,
    'rdquo'  => 8221,
    'bdquo'  => 8222,
    'dagger' => 8224,
    'Dagger' => 8225,
    'hellip' => 8230,
    'permil' => 8240,
    'lsaquo' => 8249,
    'rsaquo' => 8250,
    'euro'   => 8364
  }

  # Shortest and longest entity-name lengths found in MAP, computed in a
  # single pass over the keys.
  MIN_LENGTH, MAX_LENGTH = MAP.keys.map(&:length).minmax
end
class String
  # Precompiled pattern for a named entity reference such as "&amp;" or
  # "&frac12;". Capture 1 is the bare entity name. A name starts with a
  # letter and may continue with letters or digits (sup2, frac14, ...); the
  # length bounds come from the shortest and longest keys of
  # HTMLEntities::MAP.
  NAMED_ENTITY_REGEXP =
    /&([a-z][a-z0-9]{#{HTMLEntities::MIN_LENGTH - 1},#{HTMLEntities::MAX_LENGTH - 1}});/i

  # Pattern matching any supported reference in a single pass:
  #   capture 1 - entity name         (&name;)
  #   capture 2 - decimal codepoint   (&#8220;)
  #   capture 3 - hexadecimal digits  (&#x201C;)
  # The digit limits are wide enough for the largest Unicode codepoint
  # (1114111 decimal, 10FFFF hexadecimal).
  HTML_ENTITY_REGEXP =
    /#{NAMED_ENTITY_REGEXP}|&#([0-9]{1,7});|&#x([0-9a-f]{1,6});/i

  #
  # Decode XML and HTML 4.01 entities in a string into their UTF-8
  # equivalents. The receiver is expected to be UTF-8 already; convert it
  # first, otherwise the output will contain mixed encodings.
  #
  # Named, decimal (&#NNN;) and hexadecimal (&#xHHH;) references are handled
  # in one pass, so text produced by decoding is never decoded a second time
  # ("&amp;#38;" becomes "&#38;", not "&").
  #
  # References that cannot be decoded are left exactly as written: unknown
  # names (the lookup is case-sensitive, so "&AMP;" is untouched) and numeric
  # references that are not valid Unicode scalar values (surrogates or
  # anything above U+10FFFF).
  #
  # Returns a new String; the receiver is not modified.
  #
  def decode_entities
    gsub(HTML_ENTITY_REGEXP) do
      reference = Regexp.last_match
      codepoint =
        if reference[1]
          HTMLEntities::MAP[reference[1]]
        elsif reference[2]
          reference[2].to_i
        else
          reference[3].to_i(16)
        end

      # pack('U') would happily emit invalid UTF-8 for surrogates and for
      # values past U+10FFFF, so those are passed through unchanged.
      decodable = codepoint &&
                  codepoint <= 0x10FFFF &&
                  !(0xD800..0xDFFF).cover?(codepoint)

      decodable ? [codepoint].pack('U') : reference[0]
    end
  end
end
# Self-test: runs only when this file is executed directly, not when it is
# loaded as a library.
if __FILE__ == $PROGRAM_NAME
  require 'test/unit'

  # Unit tests for String#decode_entities. Every input is written as an
  # entity reference so that each assertion actually exercises the decoder
  # instead of comparing an already-decoded literal with itself.
  class TestHTMLEntities < Test::Unit::TestCase
    # The markup-significant entities.
    def test_basic
      assert_equal('&', '&amp;'.decode_entities)
      assert_equal('<', '&lt;'.decode_entities)
      assert_equal('"', '&quot;'.decode_entities)
    end

    # Named entities outside ASCII, including case-sensitive pairs.
    def test_extended
      assert_equal('±', '&plusmn;'.decode_entities)
      assert_equal('ð', '&eth;'.decode_entities)
      assert_equal('Œ', '&OElig;'.decode_entities)
      assert_equal('œ', '&oelig;'.decode_entities)
    end

    # Decimal numeric character references.
    def test_decimal
      assert_equal('“', '&#8220;'.decode_entities)
      assert_equal('…', '&#8230;'.decode_entities)
      assert_equal(' ', '&#32;'.decode_entities)
    end

    # Hexadecimal numeric character references, with and without leading
    # zeros.
    def test_hexadecimal
      assert_equal('−', '&#x2212;'.decode_entities)
      assert_equal('—', '&#x2014;'.decode_entities)
      assert_equal('`', '&#x0060;'.decode_entities)
      assert_equal('`', '&#x60;'.decode_entities)
    end

    # Named and numeric references mixed into ordinary text.
    def test_mixed
      # Just a random headline - something with accented letters was needed.
      assert_equal('Le tabac pourrait bientôt être banni dans tous les lieux publics en France',
                   'Le tabac pourrait bient&ocirc;t &#234;tre banni dans tous les lieux publics en France'.decode_entities)
    end

    # Empty input and unknown entity names must pass through unchanged.
    def test_edge_cases
      assert_equal('', ''.decode_entities)
      assert_equal('&bogus;', '&bogus;'.decode_entities)
    end
  end
end