2011-03-19 13:50:46 +00:00
|
|
|
// This work is subject to the CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
|
|
|
|
// license. Its contents can be found at:
|
|
|
|
// http://creativecommons.org/publicdomain/zero/1.0/
|
2009-11-23 04:16:27 +00:00
|
|
|
|
2011-01-18 20:31:56 +00:00
|
|
|
/*
|
2009-11-23 04:16:27 +00:00
|
|
|
This package wraps the standard XML library and uses it to build a node tree of
|
|
|
|
any document you load. This allows you to look up nodes forwards and backwards,
|
2011-01-18 20:31:56 +00:00
|
|
|
as well as perform simple search queries.
|
2009-11-23 04:16:27 +00:00
|
|
|
|
|
|
|
Nodes now simply become collections and don't require you to read them in the
|
|
|
|
order in which the xml.Parser finds them.
|
|
|
|
|
2011-01-18 20:31:56 +00:00
|
|
|
The Document currently implements the following search functions which allow you to
|
2009-11-23 04:16:27 +00:00
|
|
|
look for specific nodes.
|
2010-05-06 03:36:48 +00:00
|
|
|
|
2011-01-18 20:31:56 +00:00
|
|
|
*xmlx.Document.SelectNode(namespace, name string) *Node;
|
|
|
|
*xmlx.Document.SelectNodes(namespace, name string) []*Node;
|
2012-07-29 19:28:03 +00:00
|
|
|
*xmlx.Document.SelectNodesRecursive(namespace, name string) []*Node;
|
2010-05-06 03:36:48 +00:00
|
|
|
|
2009-11-23 04:16:27 +00:00
|
|
|
SelectNode() returns the first, single node it finds matching the given name
|
2012-07-29 19:28:03 +00:00
|
|
|
and namespace. SelectNodes() returns a slice containing all the matching nodes
|
|
|
|
(without recursing into matching nodes). SelectNodesRecursive() returns a slice
|
|
|
|
of all matching nodes, including nodes inside other matching nodes.
|
2010-05-06 03:36:48 +00:00
|
|
|
|
2009-11-23 04:16:27 +00:00
|
|
|
Note that these search functions can be invoked on individual nodes as well.
|
|
|
|
This allows you to search only a subset of the entire document.
|
|
|
|
*/
|
|
|
|
package xmlx
|
|
|
|
|
2011-01-18 20:31:56 +00:00
|
|
|
import (
|
2011-11-09 13:56:55 +00:00
|
|
|
"bytes"
|
|
|
|
"encoding/xml"
|
2011-11-02 15:50:45 +00:00
|
|
|
"errors"
|
2011-11-09 13:56:55 +00:00
|
|
|
"fmt"
|
2011-01-18 20:31:56 +00:00
|
|
|
"io"
|
|
|
|
"io/ioutil"
|
2015-11-30 18:30:01 +00:00
|
|
|
"log"
|
2011-11-09 13:56:55 +00:00
|
|
|
"net/http"
|
|
|
|
"os"
|
2011-01-18 20:31:56 +00:00
|
|
|
"strings"
|
|
|
|
)
|
2009-11-23 04:16:27 +00:00
|
|
|
|
2012-02-29 10:27:52 +00:00
|
|
|
// CharsetFunc is the signature of a character encoding conversion routine.
// LoadStream installs it as the XML decoder's CharsetReader, which tells the
// decoder how to deal with input that is not encoded as UTF-8.
type CharsetFunc func(charset string, input io.Reader) (io.Reader, error)
|
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
// represents a single XML document.
|
2009-11-23 04:16:27 +00:00
|
|
|
type Document struct {
|
2012-02-29 10:42:39 +00:00
|
|
|
Version string // XML version
|
|
|
|
Encoding string // Encoding found in document. If absent, assumes UTF-8.
|
|
|
|
StandAlone string // Value of XML doctype's 'standalone' attribute.
|
|
|
|
Entity map[string]string // Mapping of custom entity conversions.
|
|
|
|
Root *Node // The document's root node.
|
|
|
|
SaveDocType bool // Whether not to include the XML doctype in saves.
|
2015-11-30 18:30:01 +00:00
|
|
|
|
|
|
|
useragent string // Used internally
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
// Create a new, empty XML document instance.
|
2009-11-23 04:16:27 +00:00
|
|
|
func New() *Document {
|
|
|
|
return &Document{
|
2012-02-29 10:42:39 +00:00
|
|
|
Version: "1.0",
|
|
|
|
Encoding: "utf-8",
|
|
|
|
StandAlone: "yes",
|
|
|
|
SaveDocType: true,
|
|
|
|
Entity: make(map[string]string),
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-11-23 17:28:44 +00:00
|
|
|
// This loads a rather massive table of non-conventional xml escape sequences.
|
|
|
|
// Needed to make the parser map them to characters properly. It is advised to
|
|
|
|
// set only those entities needed manually using the document.Entity map, but
|
|
|
|
// if need be, this method can be called to fill the map with the entire set
|
|
|
|
// defined on http://www.w3.org/TR/html4/sgml/entities.html
|
2010-08-22 03:07:38 +00:00
|
|
|
func (this *Document) LoadExtendedEntityMap() { loadNonStandardEntities(this.Entity) }
|
2009-11-23 17:28:44 +00:00
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
// Select a single node with the given namespace and name. Returns nil if no
|
|
|
|
// matching node was found.
|
2009-11-23 04:16:27 +00:00
|
|
|
func (this *Document) SelectNode(namespace, name string) *Node {
|
2010-05-06 03:36:48 +00:00
|
|
|
return this.Root.SelectNode(namespace, name)
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
// Select all nodes with the given namespace and name. Returns an empty slice
|
|
|
|
// if no matches were found.
|
2012-07-29 19:28:03 +00:00
|
|
|
// Select all nodes with the given namespace and name, without recursing
|
|
|
|
// into the children of those matches. Returns an empty slice if no matching
|
|
|
|
// node was found.
|
2009-11-23 04:16:27 +00:00
|
|
|
func (this *Document) SelectNodes(namespace, name string) []*Node {
|
2010-05-06 03:36:48 +00:00
|
|
|
return this.Root.SelectNodes(namespace, name)
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2015-02-03 21:30:24 +00:00
|
|
|
// Select all nodes directly under this document, with the given namespace
|
|
|
|
// and name. Returns an empty slice if no matches were found.
|
|
|
|
func (this *Document) SelectNodesDirect(namespace, name string) []*Node {
|
|
|
|
return this.Root.SelectNodesDirect(namespace, name)
|
|
|
|
}
|
|
|
|
|
2012-07-29 19:28:03 +00:00
|
|
|
// Select all nodes with the given namespace and name, also recursing into the
|
|
|
|
// children of those matches. Returns an empty slice if no matches were found.
|
|
|
|
func (this *Document) SelectNodesRecursive(namespace, name string) []*Node {
|
|
|
|
return this.Root.SelectNodesRecursive(namespace, name)
|
|
|
|
}
|
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
// Load the contents of this document from the supplied reader.
|
2012-02-29 10:21:35 +00:00
|
|
|
func (this *Document) LoadStream(r io.Reader, charset CharsetFunc) (err error) {
|
2012-01-27 10:51:02 +00:00
|
|
|
xp := xml.NewDecoder(r)
|
2010-05-06 03:36:48 +00:00
|
|
|
xp.Entity = this.Entity
|
2012-02-29 10:21:35 +00:00
|
|
|
xp.CharsetReader = charset
|
2009-11-23 16:50:29 +00:00
|
|
|
|
2010-05-06 03:36:48 +00:00
|
|
|
this.Root = NewNode(NT_ROOT)
|
|
|
|
ct := this.Root
|
2009-11-23 04:16:27 +00:00
|
|
|
|
2010-05-26 00:24:44 +00:00
|
|
|
var tok xml.Token
|
2010-08-22 03:07:38 +00:00
|
|
|
var t *Node
|
|
|
|
var doctype string
|
|
|
|
|
2009-11-23 04:16:27 +00:00
|
|
|
for {
|
2010-05-26 00:24:44 +00:00
|
|
|
if tok, err = xp.Token(); err != nil {
|
2011-11-02 15:50:45 +00:00
|
|
|
if err == io.EOF {
|
2010-05-26 00:24:44 +00:00
|
|
|
return nil
|
2009-11-24 13:37:17 +00:00
|
|
|
}
|
2010-05-26 00:24:44 +00:00
|
|
|
return err
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2010-05-26 00:24:44 +00:00
|
|
|
switch tt := tok.(type) {
|
|
|
|
case xml.SyntaxError:
|
2011-11-02 15:50:45 +00:00
|
|
|
return errors.New(tt.Error())
|
2010-05-26 00:24:44 +00:00
|
|
|
case xml.CharData:
|
2013-09-10 21:18:13 +00:00
|
|
|
t := NewNode(NT_TEXT)
|
|
|
|
t.Value = string([]byte(tt))
|
|
|
|
ct.AddChild(t)
|
2010-05-26 00:24:44 +00:00
|
|
|
case xml.Comment:
|
2010-05-06 03:36:48 +00:00
|
|
|
t := NewNode(NT_COMMENT)
|
2010-08-22 03:07:38 +00:00
|
|
|
t.Value = strings.TrimSpace(string([]byte(tt)))
|
2010-05-06 03:36:48 +00:00
|
|
|
ct.AddChild(t)
|
2010-05-26 00:24:44 +00:00
|
|
|
case xml.Directive:
|
2010-08-22 03:07:38 +00:00
|
|
|
t = NewNode(NT_DIRECTIVE)
|
|
|
|
t.Value = strings.TrimSpace(string([]byte(tt)))
|
2010-05-06 03:36:48 +00:00
|
|
|
ct.AddChild(t)
|
2010-05-26 00:24:44 +00:00
|
|
|
case xml.StartElement:
|
2010-08-22 03:07:38 +00:00
|
|
|
t = NewNode(NT_ELEMENT)
|
2010-05-26 00:24:44 +00:00
|
|
|
t.Name = tt.Name
|
2010-08-22 03:07:38 +00:00
|
|
|
t.Attributes = make([]*Attr, len(tt.Attr))
|
2011-05-11 15:44:09 +00:00
|
|
|
for i, v := range tt.Attr {
|
2010-08-22 03:07:38 +00:00
|
|
|
t.Attributes[i] = new(Attr)
|
2010-05-06 03:36:48 +00:00
|
|
|
t.Attributes[i].Name = v.Name
|
|
|
|
t.Attributes[i].Value = v.Value
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
2010-05-06 03:36:48 +00:00
|
|
|
ct.AddChild(t)
|
|
|
|
ct = t
|
2010-05-26 00:24:44 +00:00
|
|
|
case xml.ProcInst:
|
|
|
|
if tt.Target == "xml" { // xml doctype
|
2010-08-22 03:07:38 +00:00
|
|
|
doctype = strings.TrimSpace(string(tt.Inst))
|
2011-05-11 15:44:09 +00:00
|
|
|
if i := strings.Index(doctype, `standalone="`); i > -1 {
|
2010-08-22 03:07:38 +00:00
|
|
|
this.StandAlone = doctype[i+len(`standalone="`) : len(doctype)]
|
|
|
|
i = strings.Index(this.StandAlone, `"`)
|
|
|
|
this.StandAlone = this.StandAlone[0:i]
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
2010-05-26 00:24:44 +00:00
|
|
|
} else {
|
2010-08-22 03:07:38 +00:00
|
|
|
t = NewNode(NT_PROCINST)
|
2010-05-26 00:24:44 +00:00
|
|
|
t.Target = strings.TrimSpace(tt.Target)
|
|
|
|
t.Value = strings.TrimSpace(string(tt.Inst))
|
2010-05-06 03:36:48 +00:00
|
|
|
ct.AddChild(t)
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
2010-05-26 00:24:44 +00:00
|
|
|
case xml.EndElement:
|
|
|
|
if ct = ct.Parent; ct == nil {
|
|
|
|
return
|
|
|
|
}
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-05-06 03:36:48 +00:00
|
|
|
return
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
// Load the contents of this document from the supplied byte slice.
|
2012-02-29 10:21:35 +00:00
|
|
|
func (this *Document) LoadBytes(d []byte, charset CharsetFunc) (err error) {
|
|
|
|
return this.LoadStream(bytes.NewBuffer(d), charset)
|
2011-05-11 15:44:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Load the contents of this document from the supplied string.
|
2012-02-29 10:21:35 +00:00
|
|
|
func (this *Document) LoadString(s string, charset CharsetFunc) (err error) {
|
|
|
|
return this.LoadStream(strings.NewReader(s), charset)
|
2011-05-11 15:44:09 +00:00
|
|
|
}
|
2009-11-23 04:16:27 +00:00
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
// Load the contents of this document from the supplied file.
|
2012-02-29 10:21:35 +00:00
|
|
|
func (this *Document) LoadFile(filename string, charset CharsetFunc) (err error) {
|
2011-05-11 15:44:09 +00:00
|
|
|
var fd *os.File
|
|
|
|
if fd, err = os.Open(filename); err != nil {
|
2010-05-26 00:24:44 +00:00
|
|
|
return
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
defer fd.Close()
|
2012-02-29 10:21:35 +00:00
|
|
|
return this.LoadStream(fd, charset)
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2013-03-19 13:49:00 +00:00
|
|
|
// Load the contents of this document from the supplied uri using the specifed
|
|
|
|
// client.
|
|
|
|
func (this *Document) LoadUriClient(uri string, client *http.Client, charset CharsetFunc) (err error) {
|
2010-12-16 20:08:01 +00:00
|
|
|
var r *http.Response
|
2015-11-30 18:30:01 +00:00
|
|
|
|
|
|
|
req, err := http.NewRequest("GET", uri, nil)
|
|
|
|
if err != nil {
|
|
|
|
log.Fatalln(err) // TODO
|
|
|
|
}
|
|
|
|
req.Header.Set("User-Agent", this.useragent)
|
|
|
|
|
|
|
|
if r, err = client.Do(req); err != nil {
|
2009-11-25 01:50:06 +00:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2010-05-06 03:36:48 +00:00
|
|
|
defer r.Body.Close()
|
2012-02-29 10:21:35 +00:00
|
|
|
return this.LoadStream(r.Body, charset)
|
2009-11-25 01:50:06 +00:00
|
|
|
}
|
|
|
|
|
2013-03-19 13:49:00 +00:00
|
|
|
// Load the contents of this document from the supplied uri.
|
|
|
|
// (calls LoadUriClient with http.DefaultClient)
|
|
|
|
func (this *Document) LoadUri(uri string, charset CharsetFunc) (err error) {
|
|
|
|
return this.LoadUriClient(uri, http.DefaultClient, charset)
|
|
|
|
}
|
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
// Save the contents of this document to the supplied file.
|
2011-11-02 15:50:45 +00:00
|
|
|
func (this *Document) SaveFile(path string) error {
|
2011-05-11 15:44:09 +00:00
|
|
|
return ioutil.WriteFile(path, this.SaveBytes(), 0600)
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
// Save the contents of this document as a byte slice.
|
|
|
|
func (this *Document) SaveBytes() []byte {
|
|
|
|
var b bytes.Buffer
|
2009-11-23 04:16:27 +00:00
|
|
|
|
|
|
|
if this.SaveDocType {
|
2011-05-11 15:44:09 +00:00
|
|
|
b.WriteString(fmt.Sprintf(`<?xml version="%s" encoding="%s" standalone="%s"?>`,
|
|
|
|
this.Version, this.Encoding, this.StandAlone))
|
2012-11-25 22:46:56 +00:00
|
|
|
|
|
|
|
if len(IndentPrefix) > 0 {
|
|
|
|
b.WriteByte('\n')
|
|
|
|
}
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
b.Write(this.Root.Bytes())
|
|
|
|
return b.Bytes()
|
2009-11-23 04:16:27 +00:00
|
|
|
}
|
2011-01-18 20:31:56 +00:00
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
// Save the contents of this document as a string.
|
|
|
|
func (this *Document) SaveString() string { return string(this.SaveBytes()) }
|
2011-01-27 21:10:38 +00:00
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
// Alias for Document.SaveString(). This one is invoked by anything looking for
|
|
|
|
// the standard String() method (eg: fmt.Printf("%s\n", mydoc).
|
|
|
|
func (this *Document) String() string { return string(this.SaveBytes()) }
|
2011-01-18 20:31:56 +00:00
|
|
|
|
2011-05-11 15:44:09 +00:00
|
|
|
// Save the contents of this document to the supplied writer.
|
2011-11-02 15:50:45 +00:00
|
|
|
func (this *Document) SaveStream(w io.Writer) (err error) {
|
2011-05-11 15:44:09 +00:00
|
|
|
_, err = w.Write(this.SaveBytes())
|
|
|
|
return
|
2011-01-18 20:31:56 +00:00
|
|
|
}
|
2015-11-30 18:30:01 +00:00
|
|
|
|
|
|
|
// Set a custom user agent when making a new request.
|
|
|
|
func (this *Document) SetUserAgent(s string) {
|
|
|
|
this.useragent = s
|
|
|
|
}
|