Added HtmlToUTF8() and UTF8ToHtml() functions for converting non-standard HTML entities.

This commit is contained in:
jim teeuwen 2009-11-24 17:49:27 +01:00
parent dcae3f9bff
commit 5bbe4dbe10
2 changed files with 48 additions and 6 deletions

View File

@ -60,7 +60,7 @@ func New() *Document {
// if need be, this method can be called to fill the map with the entire set // if need be, this method can be called to fill the map with the entire set
// defined on http://www.w3.org/TR/html4/sgml/entities.html // defined on http://www.w3.org/TR/html4/sgml/entities.html
func (this *Document) LoadExtendedEntityMap() { func (this *Document) LoadExtendedEntityMap() {
entitymap_load(&this.Entity); loadNonStandardEntities(&this.Entity);
} }
func (this *Document) String() string { func (this *Document) String() string {

View File

@ -1,12 +1,54 @@
package xmlx package xmlx
import "fmt"
import "utf8"
import "regexp"
import "strconv"
// reg_entity matches a complete decimal numeric character reference such as
// "&#9827;": an ampersand, a hash, one or more ASCII digits and a semicolon,
// with nothing before or after.
var reg_entity = regexp.MustCompile("^&#[0-9]+;$")

// HtmlToUTF8 converts a single decimal numeric HTML entity to the UTF-8
// encoding of the code point it names.
// ex: "&#9827;" -> "♣"
//
// It returns "" when entity is not of the form "&#<digits>;" or when the
// number does not fit in 32 bits. A number that fits but is not a valid
// Unicode code point (a surrogate, or a value above U+10FFFF) yields the
// replacement character U+FFFD.
func HtmlToUTF8(entity string) string {
	// Make sure we have a valid entity.
	if !reg_entity.MatchString(entity) {
		return ""
	}

	// Strip the leading "&#" and the trailing ";" and parse what is left.
	// bitSize 32 makes oversized numbers fail here instead of silently
	// wrapping around when narrowed to a rune below.
	num, err := strconv.ParseInt(entity[2:len(entity)-1], 10, 32)
	if err != nil {
		return ""
	}

	// The conversion emits exactly as many bytes as the code point needs
	// (1 to 4), so short encodings carry no padding bytes and code points
	// above U+FFFF are not truncated.
	return string(rune(num))
}
// UTF8ToHtml converts the first UTF-8 encoded character of token to its
// decimal numeric HTML entity. Any characters after the first are ignored.
// ex: "♣" -> "&#9827;"
//
// It returns "" when token is empty.
func UTF8ToHtml(token string) string {
	// A decoded width of zero means there was nothing to decode.
	if char, width := utf8.DecodeRuneInString(token); width != 0 {
		return fmt.Sprintf("&#%d;", char)
	}
	return ""
}
/* /*
Portions © International Organization for Standardization 1986 http://www.w3.org/TR/html4/sgml/entities.html
Permission to copy in any form is granted for use with
conforming SGML systems and applications as defined in Portions © International Organization for Standardization 1986
ISO 8879, provided this notice is included in all copies. Permission to copy in any form is granted for use with
conforming SGML systems and applications as defined in
ISO 8879, provided this notice is included in all copies.
Fills the supplied map with html entities mapped to their Go utf8
equivalents. This map can be assigned to xml.Parser.Entity
It will be used to map non-standard xml entities to a proper value.
If the parser encounters any unknown entities, it will throw a syntax
error and abort the parsing. Hence the ability to supply this map.
*/ */
func entitymap_load(em *map[string]string) { func loadNonStandardEntities(em *map[string]string) {
// Generic entities string([]uint8{160}); // Generic entities string([]uint8{160});
(*em)["nbsp"] = " "; (*em)["nbsp"] = " ";
(*em)["iexcl"] = "¡"; (*em)["iexcl"] = "¡";