From 379dfddd5b5b47a6b93ae81ed52bd66cb22b6be9 Mon Sep 17 00:00:00 2001 From: jim teeuwen Date: Wed, 2 Dec 2009 21:44:02 +0100 Subject: [PATCH] rewrote loadNonStandardEntities() to use a more reliable token representation for each character: "\uxxxx" notation. --- src/entitymap.go | 506 +++++++++++++++++++++++------------------------ 1 file changed, 252 insertions(+), 254 deletions(-) diff --git a/src/entitymap.go b/src/entitymap.go index b6a016c..3835aea 100644 --- a/src/entitymap.go +++ b/src/entitymap.go @@ -33,7 +33,6 @@ func UTF8ToHtml(token string) string { return fmt.Sprintf("&#%d;", rune); } - /* http://www.w3.org/TR/html4/sgml/entities.html @@ -49,259 +48,258 @@ func UTF8ToHtml(token string) string { error and abort the parsing. Hence the ability to supply this map. */ func loadNonStandardEntities(em *map[string]string) { - // Generic entities string([]uint8{160}); - (*em)["nbsp"] = " "; - (*em)["iexcl"] = "¡"; - (*em)["cent"] = "¢"; - (*em)["pound"] = "£"; - (*em)["curren"] = "¤"; - (*em)["yen"] = "¥"; - (*em)["brvbar"] = "¦"; - (*em)["sect"] = "§"; - (*em)["uml"] = "¨"; - (*em)["copy"] = "©"; - (*em)["ordf"] = "ª"; - (*em)["laquo"] = "«"; - (*em)["not"] = "¬"; - (*em)["shy"] = "­"; - (*em)["reg"] = "®"; - (*em)["macr"] = "¯"; - (*em)["deg"] = "°"; - (*em)["plusmn"] = "±"; - (*em)["sup"] = "²"; - (*em)["sup"] = "³"; - (*em)["acute"] = "´"; - (*em)["micro"] = "µ"; - (*em)["para"] = "¶"; - (*em)["middot"] = "·"; - (*em)["cedil"] = "¸"; - (*em)["sup"] = "¹"; - (*em)["ordm"] = "º"; - (*em)["raquo"] = "»"; - (*em)["frac14"] = "¼"; - (*em)["frac12"] = "½"; - (*em)["frac34"] = "¾"; - (*em)["iquest"] = "¿"; - (*em)["Agrave"] = "À"; - (*em)["Aacute"] = "Á"; - (*em)["Acirc"] = "Â"; - (*em)["Atilde"] = "Ã"; - (*em)["Auml"] = "Ä"; - (*em)["Aring"] = "Å"; - (*em)["AElig"] = "Æ"; - (*em)["Ccedil"] = "Ç"; - (*em)["Egrave"] = "È"; - (*em)["Eacute"] = "É"; - (*em)["Ecirc"] = "Ê"; - (*em)["Euml"] = "Ë"; - (*em)["Igrave"] = "Ì"; - (*em)["Iacute"] = "Í"; - (*em)["Icirc"] = "Î"; - (*em)["Iuml"] = "Ï"; - (*em)["ETH"] = "Ð"; - (*em)["Ntilde"] = "Ñ"; - (*em)["Ograve"] = "Ò"; - (*em)["Oacute"] = "Ó"; - (*em)["Ocirc"] = "Ô"; - (*em)["Otilde"] = "Õ"; - (*em)["Ouml"] = "Ö"; - (*em)["times"] = "×"; - (*em)["Oslash"] = "Ø"; - (*em)["Ugrave"] = "Ù"; - (*em)["Uacute"] = "Ú"; - (*em)["Ucirc"] = "Û"; - (*em)["Uuml"] = "Ü"; - (*em)["Yacute"] = "Ý"; - (*em)["THORN"] = "Þ"; - (*em)["szlig"] = "ß"; - (*em)["agrave"] = "à"; - (*em)["aacute"] = "á"; - (*em)["acirc"] = "â"; - (*em)["atilde"] = "ã"; - (*em)["auml"] = "ä"; - (*em)["aring"] = "å"; - (*em)["aelig"] = "æ"; - (*em)["ccedil"] = "ç"; - (*em)["egrave"] = "è"; - (*em)["eacute"] = "é"; - (*em)["ecirc"] = "ê"; - (*em)["euml"] = "ë"; - (*em)["igrave"] = "ì"; - (*em)["iacute"] = "í"; - (*em)["icirc"] = "î"; - (*em)["iuml"] = "ï"; - (*em)["eth"] = "ð"; - (*em)["ntilde"] = "ñ"; - (*em)["ograve"] = "ò"; - (*em)["oacute"] = "ó"; - (*em)["ocirc"] = "ô"; - (*em)["otilde"] = "õ"; - (*em)["ouml"] = "ö"; - (*em)["divide"] = "÷"; - (*em)["oslash"] = "ø"; - (*em)["ugrave"] = "ù"; - (*em)["uacute"] = "ú"; - (*em)["ucirc"] = "û"; - (*em)["uuml"] = "ü"; - (*em)["yacute"] = "ý"; - (*em)["thorn"] = "þ"; - (*em)["yuml"] = "ÿ"; - (*em)["fnof"] = "ƒ"; - (*em)["Alpha"] = "Α"; - (*em)["Beta"] = "Β"; - (*em)["Gamma"] = "Γ"; - (*em)["Delta"] = "Δ"; - (*em)["Epsilon"] = "Ε"; - (*em)["Zeta"] = "Ζ"; - (*em)["Eta"] = "Η"; - (*em)["Theta"] = "Θ"; - (*em)["Iota"] = "Ι"; - (*em)["Kappa"] = "Κ"; - (*em)["Lambda"] = "Λ"; - (*em)["Mu"] = "Μ"; - (*em)["Nu"] = "Ν"; - (*em)["Xi"] = "Ξ"; - (*em)["Omicron"] = "Ο"; - (*em)["Pi"] = "Π"; - (*em)["Rho"] = "Ρ"; - (*em)["Sigma"] = "Σ"; - (*em)["Tau"] = "Τ"; - (*em)["Upsilon"] = "Υ"; - (*em)["Phi"] = "Φ"; - (*em)["Chi"] = "Χ"; - (*em)["Psi"] = "Ψ"; - (*em)["Omega"] = "Ω"; - (*em)["alpha"] = "α"; - (*em)["beta"] = "β"; - (*em)["gamma"] = "γ"; - (*em)["delta"] = "δ"; - (*em)["epsilon"] = "ε"; - (*em)["zeta"] = "ζ"; - (*em)["eta"] = "η"; - (*em)["theta"] = "θ"; - (*em)["iota"] = "ι"; - (*em)["kappa"] = "κ"; - (*em)["lambda"] = "λ"; - (*em)["mu"] = "μ"; - (*em)["nu"] = "ν"; - (*em)["xi"] = "ξ"; - (*em)["omicron"] = "ο"; - (*em)["pi"] = "π"; - (*em)["rho"] = "ρ"; - (*em)["sigmaf"] = "ς"; - (*em)["sigma"] = "σ"; - (*em)["tau"] = "τ"; - (*em)["upsilon"] = "υ"; - (*em)["phi"] = "φ"; - (*em)["chi"] = "χ"; - (*em)["psi"] = "ψ"; - (*em)["omega"] = "ω"; - (*em)["thetasym"] = "ϑ"; - (*em)["upsih"] = "ϒ"; - (*em)["piv"] = "ϖ"; - (*em)["bull"] = "•"; - (*em)["hellip"] = "…"; - (*em)["prime"] = "′"; - (*em)["Prime"] = "″"; - (*em)["oline"] = "‾"; - (*em)["frasl"] = "⁄"; - (*em)["weierp"] = "℘"; - (*em)["image"] = "ℑ"; - (*em)["real"] = "ℜ"; - (*em)["trade"] = "™"; - (*em)["alefsym"] = "ℵ"; - (*em)["larr"] = "←"; - (*em)["uarr"] = "↑"; - (*em)["rarr"] = "→"; - (*em)["darr"] = "↓"; - (*em)["harr"] = "↔"; - (*em)["crarr"] = "↵"; - (*em)["lArr"] = "⇐"; - (*em)["uArr"] = "⇑"; - (*em)["rArr"] = "⇒"; - (*em)["dArr"] = "⇓"; - (*em)["hArr"] = "⇔"; - (*em)["forall"] = "∀"; - (*em)["part"] = "∂"; - (*em)["exist"] = "∃"; - (*em)["empty"] = "∅"; - (*em)["nabla"] = "∇"; - (*em)["isin"] = "∈"; - (*em)["notin"] = "∉"; - (*em)["ni"] = "∋"; - (*em)["prod"] = "∏"; - (*em)["sum"] = "∑"; - (*em)["minus"] = "−"; - (*em)["lowast"] = "∗"; - (*em)["radic"] = "√"; - (*em)["prop"] = "∝"; - (*em)["infin"] = "∞"; - (*em)["ang"] = "∠"; - (*em)["and"] = "∧"; - (*em)["or"] = "∨"; - (*em)["cap"] = "∩"; - (*em)["cup"] = "∪"; - (*em)["int"] = "∫"; - (*em)["there4"] = "∴"; - (*em)["sim"] = "∼"; - (*em)["cong"] = "≅"; - (*em)["asymp"] = "≈"; - (*em)["ne"] = "≠"; - (*em)["equiv"] = "≡"; - (*em)["le"] = "≤"; - (*em)["ge"] = "≥"; - (*em)["sub"] = "⊂"; - (*em)["sup"] = "⊃"; - (*em)["nsub"] = "⊄"; - (*em)["sube"] = "⊆"; - (*em)["supe"] = "⊇"; - (*em)["oplus"] = "⊕"; - (*em)["otimes"] = "⊗"; - (*em)["perp"] = "⊥"; - (*em)["sdot"] = "⋅"; - (*em)["lceil"] = "⌈"; - (*em)["rceil"] = "⌉"; - (*em)["lfloor"] = "⌊"; - (*em)["rfloor"] = "⌋"; - (*em)["lang"] = "〈"; - (*em)["rang"] = "〉"; - (*em)["loz"] = "◊"; - (*em)["spades"] = "♠"; - (*em)["clubs"] = "♣"; - (*em)["hearts"] = "♥"; - (*em)["diams"] = "♦"; - (*em)["quot"] = "\""; - (*em)["amp"] = "&"; - (*em)["lt"] = "<"; - (*em)["gt"] = ">"; - (*em)["OElig"] = "Œ"; - (*em)["oelig"] = "œ"; - (*em)["Scaron"] = "Š"; - (*em)["scaron"] = "š"; - (*em)["Yuml"] = "Ÿ"; - (*em)["circ"] = "ˆ"; - (*em)["tilde"] = "˜"; - (*em)["ensp"] = " "; - (*em)["emsp"] = " "; - (*em)["thinsp"] = " "; - (*em)["zwnj"] = "‌"; - (*em)["zwj"] = "‍"; - (*em)["lrm"] = "‎"; - (*em)["rlm"] = "‏"; - (*em)["ndash"] = "–"; - (*em)["mdash"] = "—"; - (*em)["lsquo"] = "‘"; - (*em)["rsquo"] = "’"; - (*em)["sbquo"] = "‚"; - (*em)["ldquo"] = "“"; - (*em)["rdquo"] = "”"; - (*em)["bdquo"] = "„"; - (*em)["dagger"] = "†"; - (*em)["Dagger"] = "‡"; - (*em)["permil"] = "‰"; - (*em)["lsaquo"] = "‹"; - (*em)["rsaquo"] = "›"; - (*em)["euro"] = "€"; + (*em)["pi"] = "\u03c0"; + (*em)["nabla"] = "\u2207"; + (*em)["isin"] = "\u2208"; + (*em)["loz"] = "\u25ca"; + (*em)["prop"] = "\u221d"; + (*em)["para"] = "\u00b6"; + (*em)["Aring"] = "\u00c5"; + (*em)["euro"] = "\u20ac"; + (*em)["sup3"] = "\u00b3"; + (*em)["sup2"] = "\u00b2"; + (*em)["sup1"] = "\u00b9"; + (*em)["prod"] = "\u220f"; + (*em)["gamma"] = "\u03b3"; + (*em)["perp"] = "\u22a5"; + (*em)["lfloor"] = "\u230a"; + (*em)["fnof"] = "\u0192"; + (*em)["frasl"] = "\u2044"; + (*em)["rlm"] = "\u200f"; + (*em)["omega"] = "\u03c9"; + (*em)["part"] = "\u2202"; + (*em)["euml"] = "\u00eb"; + (*em)["Kappa"] = "\u039a"; + (*em)["nbsp"] = "\u00a0"; + (*em)["Eacute"] = "\u00c9"; + (*em)["brvbar"] = "\u00a6"; + (*em)["otimes"] = "\u2297"; + (*em)["ndash"] = "\u2013"; + (*em)["thinsp"] = "\u2009"; + (*em)["nu"] = "\u03bd"; + (*em)["Upsilon"] = "\u03a5"; + (*em)["upsih"] = "\u03d2"; + (*em)["raquo"] = "\u00bb"; + (*em)["yacute"] = "\u00fd"; + (*em)["delta"] = "\u03b4"; + (*em)["eth"] = "\u00f0"; + (*em)["supe"] = "\u2287"; + (*em)["ne"] = "\u2260"; + (*em)["ni"] = "\u220b"; + (*em)["eta"] = "\u03b7"; + (*em)["uArr"] = "\u21d1"; + (*em)["image"] = "\u2111"; + (*em)["asymp"] = "\u2248"; + (*em)["oacute"] = "\u00f3"; + (*em)["rarr"] = "\u2192"; + (*em)["emsp"] = "\u2003"; + (*em)["acirc"] = "\u00e2"; + (*em)["shy"] = "\u00ad"; + (*em)["yuml"] = "\u00ff"; + (*em)["acute"] = "\u00b4"; + (*em)["int"] = "\u222b"; + (*em)["ccedil"] = "\u00e7"; + (*em)["Acirc"] = "\u00c2"; + (*em)["Ograve"] = "\u00d2"; + (*em)["times"] = "\u00d7"; + (*em)["weierp"] = "\u2118"; + (*em)["Tau"] = "\u03a4"; + (*em)["omicron"] = "\u03bf"; + (*em)["lt"] = "\u003c"; + (*em)["Mu"] = "\u039c"; + (*em)["Ucirc"] = "\u00db"; + (*em)["sub"] = "\u2282"; + (*em)["le"] = "\u2264"; + (*em)["sum"] = "\u2211"; + (*em)["sup"] = "\u2283"; + (*em)["lrm"] = "\u200e"; + (*em)["frac34"] = "\u00be"; + (*em)["Iota"] = "\u0399"; + (*em)["Ugrave"] = "\u00d9"; + (*em)["THORN"] = "\u00de"; + (*em)["rsaquo"] = "\u203a"; + (*em)["not"] = "\u00ac"; + (*em)["sigma"] = "\u03c3"; + (*em)["iuml"] = "\u00ef"; + (*em)["epsilon"] = "\u03b5"; + (*em)["spades"] = "\u2660"; + (*em)["theta"] = "\u03b8"; + (*em)["divide"] = "\u00f7"; + (*em)["Atilde"] = "\u00c3"; + (*em)["uacute"] = "\u00fa"; + (*em)["Rho"] = "\u03a1"; + (*em)["trade"] = "\u2122"; + (*em)["chi"] = "\u03c7"; + (*em)["agrave"] = "\u00e0"; + (*em)["or"] = "\u2228"; + (*em)["circ"] = "\u02c6"; + (*em)["middot"] = "\u00b7"; + (*em)["plusmn"] = "\u00b1"; + (*em)["aring"] = "\u00e5"; + (*em)["lsquo"] = "\u2018"; + (*em)["Yacute"] = "\u00dd"; + (*em)["oline"] = "\u203e"; + (*em)["copy"] = "\u00a9"; + (*em)["icirc"] = "\u00ee"; + (*em)["lowast"] = "\u2217"; + (*em)["Oacute"] = "\u00d3"; + (*em)["aacute"] = "\u00e1"; + (*em)["oplus"] = "\u2295"; + (*em)["crarr"] = "\u21b5"; + (*em)["thetasym"] = "\u03d1"; + (*em)["Beta"] = "\u0392"; + (*em)["laquo"] = "\u00ab"; + (*em)["rang"] = "\u232a"; + (*em)["tilde"] = "\u02dc"; + (*em)["Uuml"] = "\u00dc"; + (*em)["zwj"] = "\u200d"; + (*em)["mu"] = "\u03bc"; + (*em)["Ccedil"] = "\u00c7"; + (*em)["infin"] = "\u221e"; + (*em)["ouml"] = "\u00f6"; + (*em)["rfloor"] = "\u230b"; + (*em)["pound"] = "\u00a3"; + (*em)["szlig"] = "\u00df"; + (*em)["thorn"] = "\u00fe"; + (*em)["forall"] = "\u2200"; + (*em)["piv"] = "\u03d6"; + (*em)["rdquo"] = "\u201d"; + (*em)["frac12"] = "\u00bd"; + (*em)["frac14"] = "\u00bc"; + (*em)["Ocirc"] = "\u00d4"; + (*em)["Ecirc"] = "\u00ca"; + (*em)["kappa"] = "\u03ba"; + (*em)["Euml"] = "\u00cb"; + (*em)["minus"] = "\u2212"; + (*em)["cong"] = "\u2245"; + (*em)["hellip"] = "\u2026"; + (*em)["equiv"] = "\u2261"; + (*em)["cent"] = "\u00a2"; + (*em)["Uacute"] = "\u00da"; + (*em)["darr"] = "\u2193"; + (*em)["Eta"] = "\u0397"; + (*em)["sbquo"] = "\u201a"; + (*em)["rArr"] = "\u21d2"; + (*em)["igrave"] = "\u00ec"; + (*em)["uml"] = "\u00a8"; + (*em)["lambda"] = "\u03bb"; + (*em)["oelig"] = "\u0153"; + (*em)["harr"] = "\u2194"; + (*em)["ang"] = "\u2220"; + (*em)["clubs"] = "\u2663"; + (*em)["and"] = "\u2227"; + (*em)["permil"] = "\u2030"; + (*em)["larr"] = "\u2190"; + (*em)["Yuml"] = "\u0178"; + (*em)["cup"] = "\u222a"; + (*em)["Xi"] = "\u039e"; + (*em)["Alpha"] = "\u0391"; + (*em)["phi"] = "\u03c6"; + (*em)["ucirc"] = "\u00fb"; + (*em)["oslash"] = "\u00f8"; + (*em)["rsquo"] = "\u2019"; + (*em)["AElig"] = "\u00c6"; + (*em)["mdash"] = "\u2014"; + (*em)["psi"] = "\u03c8"; + (*em)["eacute"] = "\u00e9"; + (*em)["otilde"] = "\u00f5"; + (*em)["yen"] = "\u00a5"; + (*em)["gt"] = "\u003e"; + (*em)["Iuml"] = "\u00cf"; + (*em)["Prime"] = "\u2033"; + (*em)["Chi"] = "\u03a7"; + (*em)["ge"] = "\u2265"; + (*em)["reg"] = "\u00ae"; + (*em)["hearts"] = "\u2665"; + (*em)["auml"] = "\u00e4"; + (*em)["Agrave"] = "\u00c0"; + (*em)["sect"] = "\u00a7"; + (*em)["sube"] = "\u2286"; + (*em)["sigmaf"] = "\u03c2"; + (*em)["Gamma"] = "\u0393"; + (*em)["amp"] = "\u0026"; + (*em)["ensp"] = "\u2002"; + (*em)["ETH"] = "\u00d0"; + (*em)["Igrave"] = "\u00cc"; + (*em)["Omega"] = "\u03a9"; + (*em)["Lambda"] = "\u039b"; + (*em)["Omicron"] = "\u039f"; + (*em)["there4"] = "\u2234"; + (*em)["ntilde"] = "\u00f1"; + (*em)["xi"] = "\u03be"; + (*em)["dagger"] = "\u2020"; + (*em)["egrave"] = "\u00e8"; + (*em)["Delta"] = "\u0394"; + (*em)["OElig"] = "\u0152"; + (*em)["diams"] = "\u2666"; + (*em)["ldquo"] = "\u201c"; + (*em)["radic"] = "\u221a"; + (*em)["Oslash"] = "\u00d8"; + (*em)["Ouml"] = "\u00d6"; + (*em)["lceil"] = "\u2308"; + (*em)["uarr"] = "\u2191"; + (*em)["atilde"] = "\u00e3"; + (*em)["iquest"] = "\u00bf"; + (*em)["lsaquo"] = "\u2039"; + (*em)["Epsilon"] = "\u0395"; + (*em)["iacute"] = "\u00ed"; + (*em)["cap"] = "\u2229"; + (*em)["deg"] = "\u00b0"; + (*em)["Otilde"] = "\u00d5"; + (*em)["zeta"] = "\u03b6"; + (*em)["ocirc"] = "\u00f4"; + (*em)["scaron"] = "\u0161"; + (*em)["ecirc"] = "\u00ea"; + (*em)["ordm"] = "\u00ba"; + (*em)["tau"] = "\u03c4"; + (*em)["Auml"] = "\u00c4"; + (*em)["dArr"] = "\u21d3"; + (*em)["ordf"] = "\u00aa"; + (*em)["alefsym"] = "\u2135"; + (*em)["notin"] = "\u2209"; + (*em)["Pi"] = "\u03a0"; + (*em)["sdot"] = "\u22c5"; + (*em)["upsilon"] = "\u03c5"; + (*em)["iota"] = "\u03b9"; + (*em)["hArr"] = "\u21d4"; + (*em)["Sigma"] = "\u03a3"; + (*em)["lang"] = "\u2329"; + (*em)["curren"] = "\u00a4"; + (*em)["Theta"] = "\u0398"; + (*em)["lArr"] = "\u21d0"; + (*em)["Phi"] = "\u03a6"; + (*em)["Nu"] = "\u039d"; + (*em)["rho"] = "\u03c1"; + (*em)["alpha"] = "\u03b1"; + (*em)["iexcl"] = "\u00a1"; + (*em)["micro"] = "\u00b5"; + (*em)["cedil"] = "\u00b8"; + (*em)["Ntilde"] = "\u00d1"; + (*em)["Psi"] = "\u03a8"; + (*em)["Dagger"] = "\u2021"; + (*em)["Egrave"] = "\u00c8"; + (*em)["Icirc"] = "\u00ce"; + (*em)["nsub"] = "\u2284"; + (*em)["bdquo"] = "\u201e"; + (*em)["empty"] = "\u2205"; + (*em)["aelig"] = "\u00e6"; + (*em)["ograve"] = "\u00f2"; + (*em)["macr"] = "\u00af"; + (*em)["Zeta"] = "\u0396"; + (*em)["beta"] = "\u03b2"; + (*em)["sim"] = "\u223c"; + (*em)["uuml"] = "\u00fc"; + (*em)["Aacute"] = "\u00c1"; + (*em)["Iacute"] = "\u00cd"; + (*em)["exist"] = "\u2203"; + (*em)["prime"] = "\u2032"; + (*em)["rceil"] = "\u2309"; + (*em)["real"] = "\u211c"; + (*em)["zwnj"] = "\u200c"; + (*em)["bull"] = "\u2022"; + (*em)["quot"] = "\u0022"; + (*em)["Scaron"] = "\u0160"; + (*em)["ugrave"] = "\u00f9"; }