Added HtmlToUTF8() and UTF8ToHtml() functions for converting non-standard html entities.
This commit is contained in:
parent
dcae3f9bff
commit
5bbe4dbe10
|
@ -60,7 +60,7 @@ func New() *Document {
|
|||
// if need be, this method can be called to fill the map with the entire set
|
||||
// defined on http://www.w3.org/TR/html4/sgml/entities.html
|
||||
func (this *Document) LoadExtendedEntityMap() {
|
||||
entitymap_load(&this.Entity);
|
||||
loadNonStandardEntities(&this.Entity);
|
||||
}
|
||||
|
||||
func (this *Document) String() string {
|
||||
|
|
|
@ -1,12 +1,54 @@
|
|||
package xmlx
|
||||
|
||||
import "fmt"
|
||||
import "utf8"
|
||||
import "regexp"
|
||||
import "strconv"
|
||||
|
||||
var reg_entity = regexp.MustCompile("^&#[0-9]+;$");
|
||||
|
||||
// HtmlToUTF8 converts a single numerical html entity to a regular Go utf-token.
// It returns "" when the input does not match reg_entity, when the digits
// cannot be parsed by strconv.Atoi, or when utf8.EncodeRune reports size 0.
|
||||
// ex: "&#9827;" -> "♣"
|
||||
func HtmlToUTF8(entity string) string {
|
||||
// Make sure we have a valid entity of the form: &#123;
|
||||
ok := reg_entity.MatchString(entity);
|
||||
if !ok { return "" }
|
||||
|
||||
// Convert entity to number: drop the leading "&#" and the trailing ";".
|
||||
num, err := strconv.Atoi(entity[2:len(entity)-1]);
|
||||
if err != nil { return "" }
|
||||
|
||||
// NOTE(review): arr holds only 3 bytes, while a UTF-8 sequence can be up to
// 4 bytes long -- confirm how code points above U+FFFF are handled.
var arr [3]byte;
|
||||
size := utf8.EncodeRune(num, &arr);
|
||||
if size == 0 { return "" }
|
||||
|
||||
// NOTE(review): the whole array is converted here, not just the first size
// bytes -- presumably shorter encodings carry trailing zero bytes; verify.
return string(&arr);
|
||||
}
|
||||
|
||||
// UTF8ToHtml converts a single Go utf-token to its numerical Html entity.
// Only the first rune decoded from token is used; when the decoded size is 0
// the function returns "".
|
||||
// ex: "♣" -> "&#9827;"
|
||||
func UTF8ToHtml(token string) string {
|
||||
rune, size := utf8.DecodeRuneInString(token);
|
||||
if size == 0 { return "" }
|
||||
// Format the code point as a decimal numeric character reference.
return fmt.Sprintf("&#%d;", rune);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Portions © International Organization for Standardization 1986
|
||||
Permission to copy in any form is granted for use with
|
||||
conforming SGML systems and applications as defined in
|
||||
ISO 8879, provided this notice is included in all copies.
|
||||
http://www.w3.org/TR/html4/sgml/entities.html
|
||||
|
||||
Portions © International Organization for Standardization 1986
|
||||
Permission to copy in any form is granted for use with
|
||||
conforming SGML systems and applications as defined in
|
||||
ISO 8879, provided this notice is included in all copies.
|
||||
|
||||
Fills the supplied map with html entities mapped to their Go utf8
|
||||
equivalents. This map can be assigned to xml.Parser.Entity
|
||||
It will be used to map non-standard xml entities to a proper value.
|
||||
If the parser encounters any unknown entities, it will throw a syntax
|
||||
error and abort the parsing. Hence the ability to supply this map.
|
||||
*/
|
||||
func entitymap_load(em *map[string]string) {
|
||||
func loadNonStandardEntities(em *map[string]string) {
|
||||
// Generic entities string([]uint8{160});
|
||||
(*em)["nbsp"] = " ";
|
||||
(*em)["iexcl"] = "¡";
|
||||
|
|
Loading…
Reference in New Issue