2009-11-23 17:28:44 +00:00
|
|
|
package xmlx
|
|
|
|
|
2009-11-24 16:49:27 +00:00
|
|
|
import "fmt"
|
|
|
|
import "utf8"
|
|
|
|
import "regexp"
|
|
|
|
import "strconv"
|
|
|
|
|
|
|
|
// reg_entity matches a complete decimal numeric character reference such as
// "&#9827;": the "&#" prefix, one or more ASCII digits and the closing ";".
var reg_entity = regexp.MustCompile("^&#[0-9]+;$")

// HtmlToUTF8 converts a single decimal numeric HTML entity to the UTF-8
// encoded string for the code point it names.
//
//	ex: "&#9827;" -> "♣"
//
// It returns "" when entity is not of the form "&#<digits>;", when the number
// cannot be parsed into an int, or when it is not a valid Unicode code point
// (above U+10FFFF or a surrogate half).
func HtmlToUTF8(entity string) string {
	// Make sure we have a valid entity.
	if !reg_entity.MatchString(entity) {
		return ""
	}

	// Convert the digits between "&#" and ";" to a number.
	num, err := strconv.Atoi(entity[2 : len(entity)-1])
	if err != nil {
		return ""
	}

	// Check the range before narrowing to rune so that large values cannot
	// wrap around into a valid-looking code point.
	if num > utf8.MaxRune || !utf8.ValidRune(rune(num)) {
		return ""
	}

	// The conversion yields exactly the encoded bytes (1 to 4 of them), with
	// no padding from a fixed-size buffer.
	return string(rune(num))
}
|
|
|
|
|
|
|
|
// UTF8ToHtml converts the first UTF-8 encoded character of token to its
// decimal numeric HTML entity. Any bytes after that first character are
// ignored.
//
//	ex: "♣" -> "&#9827;"
//
// It returns "" when token is empty or does not start with a valid UTF-8
// sequence.
func UTF8ToHtml(token string) string {
	r, size := utf8.DecodeRuneInString(token)

	// size == 0 means token is empty. RuneError with size 1 means the leading
	// bytes are malformed; a genuine U+FFFD in the input decodes with size 3
	// and is still converted.
	if size == 0 || (r == utf8.RuneError && size == 1) {
		return ""
	}

	return fmt.Sprintf("&#%d;", r)
}
|
|
|
|
|
2009-11-23 17:28:44 +00:00
|
|
|
// loadNonStandardEntities fills the supplied map with HTML entity names
// mapped to their Go UTF-8 string equivalents.
//
// Entity list source:
//
//	http://www.w3.org/TR/html4/sgml/entities.html
//
//	Portions © International Organization for Standardization 1986
//	Permission to copy in any form is granted for use with
//	conforming SGML systems and applications as defined in
//	ISO 8879, provided this notice is included in all copies.
//
// The map can be assigned to xml.Parser.Entity, where it is used to map
// non-standard XML entities to a proper value. If the parser encounters any
// unknown entities, it will throw a syntax error and abort the parsing;
// hence the ability to supply this map.
//
// em is a pointer to the map to fill. A nil em is ignored. If em points to a
// nil map, a new map is allocated and stored through the pointer. Entries
// already present under other keys are left untouched; entries with the same
// key are overwritten.
func loadNonStandardEntities(em *map[string]string) {
	if em == nil {
		return
	}
	if *em == nil {
		// Writing to a nil map panics, so allocate one for the caller.
		*em = make(map[string]string)
	}
	m := *em

	m["pi"] = "\u03c0"
	m["nabla"] = "\u2207"
	m["isin"] = "\u2208"
	m["loz"] = "\u25ca"
	m["prop"] = "\u221d"
	m["para"] = "\u00b6"
	m["Aring"] = "\u00c5"
	m["euro"] = "\u20ac"
	m["sup3"] = "\u00b3"
	m["sup2"] = "\u00b2"
	m["sup1"] = "\u00b9"
	m["prod"] = "\u220f"
	m["gamma"] = "\u03b3"
	m["perp"] = "\u22a5"
	m["lfloor"] = "\u230a"
	m["fnof"] = "\u0192"
	m["frasl"] = "\u2044"
	m["rlm"] = "\u200f"
	m["omega"] = "\u03c9"
	m["part"] = "\u2202"
	m["euml"] = "\u00eb"
	m["Kappa"] = "\u039a"
	m["nbsp"] = "\u00a0"
	m["Eacute"] = "\u00c9"
	m["brvbar"] = "\u00a6"
	m["otimes"] = "\u2297"
	m["ndash"] = "\u2013"
	m["thinsp"] = "\u2009"
	m["nu"] = "\u03bd"
	m["Upsilon"] = "\u03a5"
	m["upsih"] = "\u03d2"
	m["raquo"] = "\u00bb"
	m["yacute"] = "\u00fd"
	m["delta"] = "\u03b4"
	m["eth"] = "\u00f0"
	m["supe"] = "\u2287"
	m["ne"] = "\u2260"
	m["ni"] = "\u220b"
	m["eta"] = "\u03b7"
	m["uArr"] = "\u21d1"
	m["image"] = "\u2111"
	m["asymp"] = "\u2248"
	m["oacute"] = "\u00f3"
	m["rarr"] = "\u2192"
	m["emsp"] = "\u2003"
	m["acirc"] = "\u00e2"
	m["shy"] = "\u00ad"
	m["yuml"] = "\u00ff"
	m["acute"] = "\u00b4"
	m["int"] = "\u222b"
	m["ccedil"] = "\u00e7"
	m["Acirc"] = "\u00c2"
	m["Ograve"] = "\u00d2"
	m["times"] = "\u00d7"
	m["weierp"] = "\u2118"
	m["Tau"] = "\u03a4"
	m["omicron"] = "\u03bf"
	m["lt"] = "\u003c"
	m["Mu"] = "\u039c"
	m["Ucirc"] = "\u00db"
	m["sub"] = "\u2282"
	m["le"] = "\u2264"
	m["sum"] = "\u2211"
	m["sup"] = "\u2283"
	m["lrm"] = "\u200e"
	m["frac34"] = "\u00be"
	m["Iota"] = "\u0399"
	m["Ugrave"] = "\u00d9"
	m["THORN"] = "\u00de"
	m["rsaquo"] = "\u203a"
	m["not"] = "\u00ac"
	m["sigma"] = "\u03c3"
	m["iuml"] = "\u00ef"
	m["epsilon"] = "\u03b5"
	m["spades"] = "\u2660"
	m["theta"] = "\u03b8"
	m["divide"] = "\u00f7"
	m["Atilde"] = "\u00c3"
	m["uacute"] = "\u00fa"
	m["Rho"] = "\u03a1"
	m["trade"] = "\u2122"
	m["chi"] = "\u03c7"
	m["agrave"] = "\u00e0"
	m["or"] = "\u2228"
	m["circ"] = "\u02c6"
	m["middot"] = "\u00b7"
	m["plusmn"] = "\u00b1"
	m["aring"] = "\u00e5"
	m["lsquo"] = "\u2018"
	m["Yacute"] = "\u00dd"
	m["oline"] = "\u203e"
	m["copy"] = "\u00a9"
	m["icirc"] = "\u00ee"
	m["lowast"] = "\u2217"
	m["Oacute"] = "\u00d3"
	m["aacute"] = "\u00e1"
	m["oplus"] = "\u2295"
	m["crarr"] = "\u21b5"
	m["thetasym"] = "\u03d1"
	m["Beta"] = "\u0392"
	m["laquo"] = "\u00ab"
	m["rang"] = "\u232a"
	m["tilde"] = "\u02dc"
	m["Uuml"] = "\u00dc"
	m["zwj"] = "\u200d"
	m["mu"] = "\u03bc"
	m["Ccedil"] = "\u00c7"
	m["infin"] = "\u221e"
	m["ouml"] = "\u00f6"
	m["rfloor"] = "\u230b"
	m["pound"] = "\u00a3"
	m["szlig"] = "\u00df"
	m["thorn"] = "\u00fe"
	m["forall"] = "\u2200"
	m["piv"] = "\u03d6"
	m["rdquo"] = "\u201d"
	m["frac12"] = "\u00bd"
	m["frac14"] = "\u00bc"
	m["Ocirc"] = "\u00d4"
	m["Ecirc"] = "\u00ca"
	m["kappa"] = "\u03ba"
	m["Euml"] = "\u00cb"
	m["minus"] = "\u2212"
	m["cong"] = "\u2245"
	m["hellip"] = "\u2026"
	m["equiv"] = "\u2261"
	m["cent"] = "\u00a2"
	m["Uacute"] = "\u00da"
	m["darr"] = "\u2193"
	m["Eta"] = "\u0397"
	m["sbquo"] = "\u201a"
	m["rArr"] = "\u21d2"
	m["igrave"] = "\u00ec"
	m["uml"] = "\u00a8"
	m["lambda"] = "\u03bb"
	m["oelig"] = "\u0153"
	m["harr"] = "\u2194"
	m["ang"] = "\u2220"
	m["clubs"] = "\u2663"
	m["and"] = "\u2227"
	m["permil"] = "\u2030"
	m["larr"] = "\u2190"
	m["Yuml"] = "\u0178"
	m["cup"] = "\u222a"
	m["Xi"] = "\u039e"
	m["Alpha"] = "\u0391"
	m["phi"] = "\u03c6"
	m["ucirc"] = "\u00fb"
	m["oslash"] = "\u00f8"
	m["rsquo"] = "\u2019"
	m["AElig"] = "\u00c6"
	m["mdash"] = "\u2014"
	m["psi"] = "\u03c8"
	m["eacute"] = "\u00e9"
	m["otilde"] = "\u00f5"
	m["yen"] = "\u00a5"
	m["gt"] = "\u003e"
	m["Iuml"] = "\u00cf"
	m["Prime"] = "\u2033"
	m["Chi"] = "\u03a7"
	m["ge"] = "\u2265"
	m["reg"] = "\u00ae"
	m["hearts"] = "\u2665"
	m["auml"] = "\u00e4"
	m["Agrave"] = "\u00c0"
	m["sect"] = "\u00a7"
	m["sube"] = "\u2286"
	m["sigmaf"] = "\u03c2"
	m["Gamma"] = "\u0393"
	m["amp"] = "\u0026"
	m["ensp"] = "\u2002"
	m["ETH"] = "\u00d0"
	m["Igrave"] = "\u00cc"
	m["Omega"] = "\u03a9"
	m["Lambda"] = "\u039b"
	m["Omicron"] = "\u039f"
	m["there4"] = "\u2234"
	m["ntilde"] = "\u00f1"
	m["xi"] = "\u03be"
	m["dagger"] = "\u2020"
	m["egrave"] = "\u00e8"
	m["Delta"] = "\u0394"
	m["OElig"] = "\u0152"
	m["diams"] = "\u2666"
	m["ldquo"] = "\u201c"
	m["radic"] = "\u221a"
	m["Oslash"] = "\u00d8"
	m["Ouml"] = "\u00d6"
	m["lceil"] = "\u2308"
	m["uarr"] = "\u2191"
	m["atilde"] = "\u00e3"
	m["iquest"] = "\u00bf"
	m["lsaquo"] = "\u2039"
	m["Epsilon"] = "\u0395"
	m["iacute"] = "\u00ed"
	m["cap"] = "\u2229"
	m["deg"] = "\u00b0"
	m["Otilde"] = "\u00d5"
	m["zeta"] = "\u03b6"
	m["ocirc"] = "\u00f4"
	m["scaron"] = "\u0161"
	m["ecirc"] = "\u00ea"
	m["ordm"] = "\u00ba"
	m["tau"] = "\u03c4"
	m["Auml"] = "\u00c4"
	m["dArr"] = "\u21d3"
	m["ordf"] = "\u00aa"
	m["alefsym"] = "\u2135"
	m["notin"] = "\u2209"
	m["Pi"] = "\u03a0"
	m["sdot"] = "\u22c5"
	m["upsilon"] = "\u03c5"
	m["iota"] = "\u03b9"
	m["hArr"] = "\u21d4"
	m["Sigma"] = "\u03a3"
	m["lang"] = "\u2329"
	m["curren"] = "\u00a4"
	m["Theta"] = "\u0398"
	m["lArr"] = "\u21d0"
	m["Phi"] = "\u03a6"
	m["Nu"] = "\u039d"
	m["rho"] = "\u03c1"
	m["alpha"] = "\u03b1"
	m["iexcl"] = "\u00a1"
	m["micro"] = "\u00b5"
	m["cedil"] = "\u00b8"
	m["Ntilde"] = "\u00d1"
	m["Psi"] = "\u03a8"
	m["Dagger"] = "\u2021"
	m["Egrave"] = "\u00c8"
	m["Icirc"] = "\u00ce"
	m["nsub"] = "\u2284"
	m["bdquo"] = "\u201e"
	m["empty"] = "\u2205"
	m["aelig"] = "\u00e6"
	m["ograve"] = "\u00f2"
	m["macr"] = "\u00af"
	m["Zeta"] = "\u0396"
	m["beta"] = "\u03b2"
	m["sim"] = "\u223c"
	m["uuml"] = "\u00fc"
	m["Aacute"] = "\u00c1"
	m["Iacute"] = "\u00cd"
	m["exist"] = "\u2203"
	m["prime"] = "\u2032"
	m["rceil"] = "\u2309"
	m["real"] = "\u211c"
	m["zwnj"] = "\u200c"
	m["bull"] = "\u2022"
	m["quot"] = "\u0022"
	m["Scaron"] = "\u0160"
	m["ugrave"] = "\u00f9"
}
|
|
|
|
|
|
|
|
|
|
|
|
|