Added HtmlToUTF8() and UTF8ToHtml() functions for converting non-standard HTML entities.
This commit is contained in:
parent
dcae3f9bff
commit
5bbe4dbe10
|
@ -60,7 +60,7 @@ func New() *Document {
|
||||||
// if need be, this method can be called to fill the map with the entire set
|
// if need be, this method can be called to fill the map with the entire set
|
||||||
// defined on http://www.w3.org/TR/html4/sgml/entities.html
|
// defined on http://www.w3.org/TR/html4/sgml/entities.html
|
||||||
func (this *Document) LoadExtendedEntityMap() {
|
func (this *Document) LoadExtendedEntityMap() {
|
||||||
entitymap_load(&this.Entity);
|
loadNonStandardEntities(&this.Entity);
|
||||||
}
|
}
|
||||||
|
|
||||||
func (this *Document) String() string {
|
func (this *Document) String() string {
|
||||||
|
|
|
@ -1,12 +1,54 @@
|
||||||
package xmlx
|
package xmlx
|
||||||
|
|
||||||
|
import "fmt"
|
||||||
|
import "utf8"
|
||||||
|
import "regexp"
|
||||||
|
import "strconv"
|
||||||
|
|
||||||
|
var reg_entity = regexp.MustCompile("^&#[0-9]+;$");
|
||||||
|
|
||||||
|
// Converts a single numerical html entity to a regular Go utf-token.
|
||||||
|
// ex: "&#9827;" -> "♣"
|
||||||
|
func HtmlToUTF8(entity string) string {
|
||||||
|
// Make sure we have a valid entity: &#123;
|
||||||
|
ok := reg_entity.MatchString(entity);
|
||||||
|
if !ok { return "" }
|
||||||
|
|
||||||
|
// Convert entity to number
|
||||||
|
num, err := strconv.Atoi(entity[2:len(entity)-1]);
|
||||||
|
if err != nil { return "" }
|
||||||
|
|
||||||
|
var arr [3]byte;
|
||||||
|
size := utf8.EncodeRune(num, &arr);
|
||||||
|
if size == 0 { return "" }
|
||||||
|
|
||||||
|
return string(&arr);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Converts a single Go utf-token to its numeric Html entity.
|
||||||
|
// ex: "♣" -> "&#9827;"
|
||||||
|
func UTF8ToHtml(token string) string {
|
||||||
|
rune, size := utf8.DecodeRuneInString(token);
|
||||||
|
if size == 0 { return "" }
|
||||||
|
return fmt.Sprintf("&#%d;", rune);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
http://www.w3.org/TR/html4/sgml/entities.html
|
||||||
|
|
||||||
Portions © International Organization for Standardization 1986
|
Portions © International Organization for Standardization 1986
|
||||||
Permission to copy in any form is granted for use with
|
Permission to copy in any form is granted for use with
|
||||||
conforming SGML systems and applications as defined in
|
conforming SGML systems and applications as defined in
|
||||||
ISO 8879, provided this notice is included in all copies.
|
ISO 8879, provided this notice is included in all copies.
|
||||||
|
|
||||||
|
Fills the supplied map with html entities mapped to their Go utf8
|
||||||
|
equivalents. This map can be assigned to xml.Parser.Entity.
|
||||||
|
It will be used to map non-standard xml entities to a proper value.
|
||||||
|
If the parser encounters any unknown entities, it will throw a syntax
|
||||||
|
error and abort the parsing. Hence the ability to supply this map.
|
||||||
*/
|
*/
|
||||||
func entitymap_load(em *map[string]string) {
|
func loadNonStandardEntities(em *map[string]string) {
|
||||||
// Generic entities string([]uint8{160});
|
// Generic entities string([]uint8{160});
|
||||||
(*em)["nbsp"] = " ";
|
(*em)["nbsp"] = " ";
|
||||||
(*em)["iexcl"] = "¡";
|
(*em)["iexcl"] = "¡";
|
||||||
|
|
Loading…
Reference in New Issue