2011-01-18 20:31:56 +00:00
|
|
|
// Copyright (c) 2010, Jim Teeuwen. All rights reserved.
|
|
|
|
// This code is subject to a 1-clause BSD license.
|
|
|
|
// The contents of which can be found in the LICENSE file.
|
2009-11-23 04:16:27 +00:00
|
|
|
|
2011-01-18 20:31:56 +00:00
|
|
|
/*
|
2009-11-23 04:16:27 +00:00
|
|
|
This package wraps the standard XML library and uses it to build a node tree of
|
|
|
|
any document you load. This allows you to look up nodes forwards and backwards,
|
2011-01-18 20:31:56 +00:00
|
|
|
as well as perform simple search queries.
|
2009-11-23 04:16:27 +00:00
|
|
|
|
|
|
|
Nodes now simply become collections and don't require you to read them in the
|
|
|
|
order in which the xml.Parser finds them.
|
|
|
|
|
2011-01-18 20:31:56 +00:00
|
|
|
The Document currently implements 2 search functions which allow you to
|
2009-11-23 04:16:27 +00:00
|
|
|
look for specific nodes.
|
2010-05-06 03:36:48 +00:00
|
|
|
|
2011-01-18 20:31:56 +00:00
|
|
|
*xmlx.Document.SelectNode(namespace, name string) *Node;
|
|
|
|
*xmlx.Document.SelectNodes(namespace, name string) []*Node;
|
2010-05-06 03:36:48 +00:00
|
|
|
|
2009-11-23 04:16:27 +00:00
|
|
|
SelectNode() returns the first, single node it finds matching the given name
|
|
|
|
and namespace. SelectNodes() returns a slice containing all the matching nodes.
|
2010-05-06 03:36:48 +00:00
|
|
|
|
2009-11-23 04:16:27 +00:00
|
|
|
Note that these search functions can be invoked on individual nodes as well.
|
|
|
|
This allows you to search only a subset of the entire document.
|
|
|
|
*/
|
|
|
|
package xmlx
|
|
|
|
|
2011-01-18 20:31:56 +00:00
|
|
|
import (
|
|
|
|
"os"
|
|
|
|
"io"
|
|
|
|
"io/ioutil"
|
|
|
|
"path"
|
|
|
|
"strings"
|
|
|
|
"xml"
|
|
|
|
"fmt"
|
|
|
|
"http"
|
2011-01-20 21:48:03 +00:00
|
|
|
iconv "github.com/sloonz/go-iconv/src"
|
2011-01-18 20:31:56 +00:00
|
|
|
)
|
2009-11-23 04:16:27 +00:00
|
|
|
|
|
|
|
type Document struct {
|
2010-05-06 03:36:48 +00:00
|
|
|
Version string
|
|
|
|
Encoding string
|
|
|
|
StandAlone string
|
|
|
|
SaveDocType bool
|
|
|
|
Root *Node
|
|
|
|
Entity map[string]string
|
|
|
|
Verbose bool
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func New() *Document {
|
|
|
|
return &Document{
|
2010-05-06 03:36:48 +00:00
|
|
|
Version: "1.0",
|
|
|
|
Encoding: "utf-8",
|
|
|
|
StandAlone: "yes",
|
2009-11-23 04:16:27 +00:00
|
|
|
SaveDocType: true,
|
2010-05-06 03:36:48 +00:00
|
|
|
Entity: make(map[string]string),
|
|
|
|
Verbose: false,
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-11-23 17:28:44 +00:00
|
|
|
// This loads a rather massive table of non-conventional xml escape sequences.
|
|
|
|
// Needed to make the parser map them to characters properly. It is advised to
|
|
|
|
// set only those entities needed manually using the document.Entity map, but
|
|
|
|
// if need be, this method can be called to fill the map with the entire set
|
|
|
|
// defined on http://www.w3.org/TR/html4/sgml/entities.html
|
2010-08-22 03:07:38 +00:00
|
|
|
func (this *Document) LoadExtendedEntityMap() { loadNonStandardEntities(this.Entity) }
|
2009-11-23 17:28:44 +00:00
|
|
|
|
2009-11-23 04:16:27 +00:00
|
|
|
func (this *Document) String() string {
|
2010-05-06 03:36:48 +00:00
|
|
|
s, _ := this.SaveString()
|
|
|
|
return s
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (this *Document) SelectNode(namespace, name string) *Node {
|
2010-05-06 03:36:48 +00:00
|
|
|
return this.Root.SelectNode(namespace, name)
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (this *Document) SelectNodes(namespace, name string) []*Node {
|
2010-05-06 03:36:48 +00:00
|
|
|
return this.Root.SelectNodes(namespace, name)
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// *****************************************************************************
|
|
|
|
// *** Satisfy ILoader interface
|
|
|
|
// *****************************************************************************
|
|
|
|
func (this *Document) LoadString(s string) (err os.Error) {
|
2011-01-18 20:31:56 +00:00
|
|
|
// Ensure we are passing UTF-8 encoding content to the XML tokenizer.
|
|
|
|
if s, err = this.correctEncoding(s); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// tokenize data
|
2010-05-06 03:36:48 +00:00
|
|
|
xp := xml.NewParser(strings.NewReader(s))
|
|
|
|
xp.Entity = this.Entity
|
2009-11-23 16:50:29 +00:00
|
|
|
|
2010-05-06 03:36:48 +00:00
|
|
|
this.Root = NewNode(NT_ROOT)
|
|
|
|
ct := this.Root
|
2009-11-23 04:16:27 +00:00
|
|
|
|
2010-05-26 00:24:44 +00:00
|
|
|
var tok xml.Token
|
2010-08-22 03:07:38 +00:00
|
|
|
var t *Node
|
|
|
|
var i int
|
|
|
|
var doctype string
|
|
|
|
var v xml.Attr
|
|
|
|
|
2009-11-23 04:16:27 +00:00
|
|
|
for {
|
2010-05-26 00:24:44 +00:00
|
|
|
if tok, err = xp.Token(); err != nil {
|
|
|
|
if err == os.EOF {
|
|
|
|
return nil
|
2009-11-24 13:37:17 +00:00
|
|
|
}
|
2009-11-23 04:16:27 +00:00
|
|
|
|
2010-05-26 00:24:44 +00:00
|
|
|
if this.Verbose {
|
|
|
|
fmt.Fprintf(os.Stderr, "Xml Error: %s\n", err)
|
|
|
|
}
|
|
|
|
return err
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2010-05-26 00:24:44 +00:00
|
|
|
switch tt := tok.(type) {
|
|
|
|
case xml.SyntaxError:
|
|
|
|
return os.NewError(tt.String())
|
|
|
|
case xml.CharData:
|
2010-08-22 03:07:38 +00:00
|
|
|
ct.Value = strings.TrimSpace(string([]byte(tt)))
|
2010-05-26 00:24:44 +00:00
|
|
|
case xml.Comment:
|
2010-05-06 03:36:48 +00:00
|
|
|
t := NewNode(NT_COMMENT)
|
2010-08-22 03:07:38 +00:00
|
|
|
t.Value = strings.TrimSpace(string([]byte(tt)))
|
2010-05-06 03:36:48 +00:00
|
|
|
ct.AddChild(t)
|
2010-05-26 00:24:44 +00:00
|
|
|
case xml.Directive:
|
2010-08-22 03:07:38 +00:00
|
|
|
t = NewNode(NT_DIRECTIVE)
|
|
|
|
t.Value = strings.TrimSpace(string([]byte(tt)))
|
2010-05-06 03:36:48 +00:00
|
|
|
ct.AddChild(t)
|
2010-05-26 00:24:44 +00:00
|
|
|
case xml.StartElement:
|
2010-08-22 03:07:38 +00:00
|
|
|
t = NewNode(NT_ELEMENT)
|
2010-05-26 00:24:44 +00:00
|
|
|
t.Name = tt.Name
|
2010-08-22 03:07:38 +00:00
|
|
|
t.Attributes = make([]*Attr, len(tt.Attr))
|
|
|
|
for i, v = range tt.Attr {
|
|
|
|
t.Attributes[i] = new(Attr)
|
2010-05-06 03:36:48 +00:00
|
|
|
t.Attributes[i].Name = v.Name
|
|
|
|
t.Attributes[i].Value = v.Value
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
2010-05-06 03:36:48 +00:00
|
|
|
ct.AddChild(t)
|
|
|
|
ct = t
|
2010-05-26 00:24:44 +00:00
|
|
|
case xml.ProcInst:
|
|
|
|
if tt.Target == "xml" { // xml doctype
|
2010-08-22 03:07:38 +00:00
|
|
|
doctype = strings.TrimSpace(string(tt.Inst))
|
|
|
|
if i = strings.Index(doctype, `standalone="`); i > -1 {
|
|
|
|
this.StandAlone = doctype[i+len(`standalone="`) : len(doctype)]
|
|
|
|
i = strings.Index(this.StandAlone, `"`)
|
|
|
|
this.StandAlone = this.StandAlone[0:i]
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
2010-05-26 00:24:44 +00:00
|
|
|
} else {
|
2010-08-22 03:07:38 +00:00
|
|
|
t = NewNode(NT_PROCINST)
|
2010-05-26 00:24:44 +00:00
|
|
|
t.Target = strings.TrimSpace(tt.Target)
|
|
|
|
t.Value = strings.TrimSpace(string(tt.Inst))
|
2010-05-06 03:36:48 +00:00
|
|
|
ct.AddChild(t)
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
2010-05-26 00:24:44 +00:00
|
|
|
case xml.EndElement:
|
|
|
|
if ct = ct.Parent; ct == nil {
|
|
|
|
return
|
|
|
|
}
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-05-06 03:36:48 +00:00
|
|
|
return
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2010-05-26 00:24:44 +00:00
|
|
|
func (this *Document) LoadFile(filename string) (err os.Error) {
|
|
|
|
var data []byte
|
2009-11-23 04:16:27 +00:00
|
|
|
|
2010-05-26 00:24:44 +00:00
|
|
|
if data, err = ioutil.ReadFile(path.Clean(filename)); err != nil {
|
|
|
|
return
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2010-05-26 00:24:44 +00:00
|
|
|
return this.LoadString(string(data))
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2009-11-25 01:50:06 +00:00
|
|
|
func (this *Document) LoadUri(uri string) (err os.Error) {
|
2010-12-16 20:08:01 +00:00
|
|
|
var r *http.Response
|
|
|
|
if r, _, err = http.Get(uri); err != nil {
|
2009-11-25 01:50:06 +00:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2010-05-06 03:36:48 +00:00
|
|
|
defer r.Body.Close()
|
2009-11-25 01:50:06 +00:00
|
|
|
|
2010-05-26 00:24:44 +00:00
|
|
|
var b []byte
|
|
|
|
if b, err = ioutil.ReadAll(r.Body); err != nil {
|
|
|
|
return
|
2009-11-25 01:50:06 +00:00
|
|
|
}
|
|
|
|
|
2010-12-16 20:08:01 +00:00
|
|
|
return this.LoadString(string(b))
|
2009-11-25 01:50:06 +00:00
|
|
|
}
|
|
|
|
|
2010-10-10 18:04:58 +00:00
|
|
|
func (this *Document) LoadStream(r io.Reader) (err os.Error) {
|
2010-12-19 19:45:57 +00:00
|
|
|
var b []byte
|
|
|
|
if b, err = ioutil.ReadAll(r); err != nil {
|
|
|
|
return
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
2010-12-19 19:45:57 +00:00
|
|
|
return this.LoadString(string(b))
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// *****************************************************************************
|
|
|
|
// *** Satisfy ISaver interface
|
|
|
|
// *****************************************************************************
|
|
|
|
func (this *Document) SaveFile(path string) (err os.Error) {
|
2010-08-22 03:07:38 +00:00
|
|
|
var data string
|
|
|
|
if data, err = this.SaveString(); err != nil {
|
2009-11-23 04:16:27 +00:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2010-08-22 03:07:38 +00:00
|
|
|
return ioutil.WriteFile(path, []byte(data), 0600)
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (this *Document) SaveString() (s string, err os.Error) {
|
|
|
|
if this.SaveDocType {
|
|
|
|
s = fmt.Sprintf(`<?xml version="%s" encoding="%s" standalone="%s"?>`,
|
|
|
|
this.Version, this.Encoding, this.StandAlone)
|
|
|
|
}
|
|
|
|
|
2010-05-06 03:36:48 +00:00
|
|
|
s += this.Root.String()
|
|
|
|
return
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2010-10-10 18:04:58 +00:00
|
|
|
func (this *Document) SaveStream(w io.Writer) (err os.Error) {
|
2010-12-16 20:08:01 +00:00
|
|
|
var s string
|
|
|
|
if s, err = this.SaveString(); err != nil {
|
2009-11-23 04:16:27 +00:00
|
|
|
return
|
|
|
|
}
|
2010-12-16 20:08:01 +00:00
|
|
|
_, err = w.Write([]byte(s))
|
2010-05-06 03:36:48 +00:00
|
|
|
return
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
2011-01-18 20:31:56 +00:00
|
|
|
|
|
|
|
// Use libiconv to ensure we get UTF-8 encoded data. The Go Xml tokenizer will
|
|
|
|
// throw a tantrum if we give it anything else.
|
|
|
|
func (this *Document) correctEncoding(data string) (ret string, err os.Error) {
|
|
|
|
var cd *iconv.Iconv
|
|
|
|
var tok xml.Token
|
|
|
|
|
|
|
|
enc := "utf-8"
|
|
|
|
xp := xml.NewParser(strings.NewReader(data))
|
|
|
|
xp.Entity = this.Entity
|
|
|
|
|
|
|
|
loop:
|
|
|
|
for {
|
|
|
|
if tok, err = xp.Token(); err != nil {
|
|
|
|
if err == os.EOF {
|
|
|
|
break loop
|
|
|
|
}
|
2011-01-27 21:10:38 +00:00
|
|
|
|
2011-01-18 20:31:56 +00:00
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
|
|
|
|
switch tt := tok.(type) {
|
|
|
|
case xml.ProcInst:
|
|
|
|
if tt.Target == "xml" { // xml doctype
|
2011-01-27 21:10:38 +00:00
|
|
|
var pair []string
|
|
|
|
var entry string
|
|
|
|
|
|
|
|
list := strings.Split(string(tt.Inst), " ", -1)
|
|
|
|
for _, entry = range list {
|
|
|
|
if pair = strings.Split(entry, "=", -1); len(pair) < 2 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
switch pair[0] {
|
|
|
|
case "encoding":
|
|
|
|
enc = pair[1][1:len(pair[1])-1]
|
|
|
|
break loop
|
|
|
|
}
|
2011-01-18 20:31:56 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if enc == "utf-8" {
|
2011-01-27 21:10:38 +00:00
|
|
|
// Data already in utf-8 format. Nothing to do here.
|
2011-01-18 20:31:56 +00:00
|
|
|
return data, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if cd, err = iconv.Open("utf-8", enc); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
defer cd.Close()
|
|
|
|
return cd.Conv(data)
|
|
|
|
}
|