2009-11-23 12:46:12 +00:00
|
|
|
/*
|
|
|
|
Author: jim teeuwen <jimteeuwen@gmail.com>
|
|
|
|
Dependencies: go-pkg-xmlx (http://github.com/jteeuwen/go-pkg-xmlx)
|
|
|
|
|
|
|
|
This package allows us to fetch Rss and Atom feeds from the internet.
|
|
|
|
They are parsed into an object tree which is a hybrid of both the RSS and Atom
|
|
|
|
standards.
|
|
|
|
|
|
|
|
Supported feeds are:
|
|
|
|
- Rss v0.91, 0.92 and 2.0
|
|
|
|
- Atom 1.0
|
|
|
|
|
|
|
|
The package allows us to maintain cache timeout management. This prevents us
|
2010-05-23 14:21:30 +00:00
|
|
|
from querying the servers for feed updates too often and risking IP bans. Apart
|
2009-11-23 12:46:12 +00:00
|
|
|
from setting a cache timeout manually, the package also optionally adheres to
|
|
|
|
the TTL, SkipDays and SkipHours values specified in the feeds themselves.
|
|
|
|
|
|
|
|
Note that the TTL, SkipDays and SkipHour fields are only part of the RSS spec.
|
|
|
|
For Atom feeds, we use the CacheTimeout in the Feed struct.
|
|
|
|
|
|
|
|
Because the object structure is a hybrid between both RSS and Atom specs, not
|
|
|
|
all fields will be filled when requesting either an RSS or Atom feed. I have
|
|
|
|
tried to create as many shared fields as possible, but some of them simply do
|
|
|
|
not occur in either the RSS or Atom spec.
|
|
|
|
*/
|
|
|
|
package feeder
|
|
|
|
|
2011-02-01 14:30:39 +00:00
|
|
|
import (
|
|
|
|
"os"
|
|
|
|
"time"
|
|
|
|
xmlx "github.com/jteeuwen/go-pkg-xmlx"
|
|
|
|
"fmt"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
)
|
2009-11-23 12:46:12 +00:00
|
|
|
|
2010-12-17 23:25:16 +00:00
|
|
|
// ChannelHandler is the callback type a Feed uses to report channels to the
// host. Fetch invokes it with the feed itself and the slice of channels that
// lie beyond the channel count recorded before the feed was rebuilt.
type ChannelHandler func(f *Feed, newchannels []*Channel)
|
|
|
|
// ItemHandler is the callback type a Feed stores for reporting new items of
// a given channel to the host.
// NOTE(review): the call site is not visible in this part of the file;
// presumably the rss/atom readers invoke it -- confirm.
type ItemHandler func(f *Feed, ch *Channel, newitems []*Item)
|
|
|
|
|
2009-11-23 12:46:12 +00:00
|
|
|
type Feed struct {
|
|
|
|
// Custom cache timeout in minutes.
|
2010-05-23 14:21:30 +00:00
|
|
|
CacheTimeout int
|
2009-11-23 12:46:12 +00:00
|
|
|
|
|
|
|
// Make sure we adhere to the cache timeout specified in the feed. If
|
|
|
|
// our CacheTimeout is higher than that, we will use that instead.
|
2010-05-23 14:21:30 +00:00
|
|
|
EnforceCacheLimit bool
|
2009-11-23 12:46:12 +00:00
|
|
|
|
|
|
|
// Type of feed. Rss, Atom, etc
|
2010-05-23 14:21:30 +00:00
|
|
|
Type string
|
2009-11-23 12:46:12 +00:00
|
|
|
|
|
|
|
// Version of the feed. Major and Minor.
|
2010-05-23 14:21:30 +00:00
|
|
|
Version [2]int
|
2009-11-23 12:46:12 +00:00
|
|
|
|
|
|
|
// Channels with content.
|
2010-12-17 23:25:16 +00:00
|
|
|
Channels []*Channel
|
2009-11-23 12:46:12 +00:00
|
|
|
|
|
|
|
// Url from which this feed was created.
|
2010-05-23 14:21:30 +00:00
|
|
|
Url string
|
2009-11-23 12:46:12 +00:00
|
|
|
|
2010-12-17 23:25:16 +00:00
|
|
|
// A notification function, used to notify the host when a new channel
|
|
|
|
// has been found.
|
|
|
|
chanhandler ChannelHandler
|
|
|
|
|
|
|
|
// A notification function, used to notify the host when a new item
|
|
|
|
// has been found for a given channel.
|
|
|
|
itemhandler ItemHandler
|
|
|
|
|
2009-11-23 12:46:12 +00:00
|
|
|
// Last time content was fetched. Used in conjunction with CacheTimeout
|
|
|
|
// to ensure we don't get content too often.
|
2010-05-23 14:21:30 +00:00
|
|
|
lastupdate int64
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
|
|
|
|
2010-12-17 23:25:16 +00:00
|
|
|
func New(cachetimeout int, enforcecachelimit bool, ch ChannelHandler, ih ItemHandler) *Feed {
|
2010-12-17 20:57:48 +00:00
|
|
|
v := new(Feed)
|
|
|
|
v.CacheTimeout = cachetimeout
|
|
|
|
v.EnforceCacheLimit = enforcecachelimit
|
|
|
|
v.Type = "none"
|
2010-12-17 23:25:16 +00:00
|
|
|
v.chanhandler = ch
|
|
|
|
v.itemhandler = ih
|
2010-12-17 20:57:48 +00:00
|
|
|
return v
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
|
|
|
|
2010-12-18 17:19:53 +00:00
|
|
|
// This returns a timestamp of the last time the feed was updated.
|
|
|
|
// The value is in seconds.
|
|
|
|
func (this *Feed) LastUpdate() int64 { return this.lastupdate }
|
|
|
|
|
2009-11-23 12:46:12 +00:00
|
|
|
func (this *Feed) Fetch(uri string) (err os.Error) {
|
2010-12-17 20:57:48 +00:00
|
|
|
if !this.CanUpdate() {
|
2010-05-23 14:21:30 +00:00
|
|
|
return
|
|
|
|
}
|
2009-11-23 12:46:12 +00:00
|
|
|
|
2010-05-23 14:21:30 +00:00
|
|
|
this.Url = uri
|
2009-11-23 12:46:12 +00:00
|
|
|
|
|
|
|
// Extract type and version of the feed so we can have the appropriate
|
|
|
|
// function parse it (rss 0.91, rss 0.92, rss 2, atom etc).
|
2010-05-23 14:21:30 +00:00
|
|
|
doc := xmlx.New()
|
2010-12-17 20:57:48 +00:00
|
|
|
if err = doc.LoadUri(uri); err != nil {
|
2010-05-23 14:21:30 +00:00
|
|
|
return
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
2010-05-23 14:21:30 +00:00
|
|
|
this.Type, this.Version = this.GetVersionInfo(doc)
|
2009-11-23 12:46:12 +00:00
|
|
|
|
2010-05-23 14:21:30 +00:00
|
|
|
if ok := this.testVersions(); !ok {
|
|
|
|
err = os.NewError(fmt.Sprintf("Unsupported feed: %s, version: %+v", this.Type, this.Version))
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2010-12-17 23:25:16 +00:00
|
|
|
chancount := len(this.Channels)
|
2010-05-23 14:21:30 +00:00
|
|
|
if err = this.buildFeed(doc); err != nil || len(this.Channels) == 0 {
|
|
|
|
return
|
|
|
|
}
|
2009-11-23 12:46:12 +00:00
|
|
|
|
2010-12-17 23:25:16 +00:00
|
|
|
// Notify host of new channels
|
|
|
|
if chancount != len(this.Channels) && this.chanhandler != nil {
|
|
|
|
this.chanhandler(this, this.Channels[chancount:])
|
|
|
|
}
|
|
|
|
|
2009-11-23 12:46:12 +00:00
|
|
|
// reset cache timeout values according to feed specified values (TTL)
|
2010-05-23 14:21:30 +00:00
|
|
|
if this.EnforceCacheLimit && this.CacheTimeout < this.Channels[0].TTL {
|
|
|
|
this.CacheTimeout = this.Channels[0].TTL
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
2010-12-17 23:25:16 +00:00
|
|
|
|
2010-05-23 14:21:30 +00:00
|
|
|
return
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
|
|
|
|
2010-12-17 20:57:48 +00:00
|
|
|
// This function returns true or false, depending on whether the CacheTimeout
|
|
|
|
// value has expired or not. Additionally, it will ensure that we adhere to the
|
|
|
|
// RSS spec's SkipDays and SkipHours values (if Feed.EnforceCacheLimit is set to
|
|
|
|
// true). If this function returns true, you can be sure that a fresh feed
|
|
|
|
// update will be performed.
|
|
|
|
func (this *Feed) CanUpdate() bool {
|
2009-11-23 12:46:12 +00:00
|
|
|
// Make sure we are not within the specified cache-limit.
|
|
|
|
// This ensures we don't request data too often.
|
2010-05-23 14:21:30 +00:00
|
|
|
utc := time.UTC()
|
|
|
|
if utc.Seconds()-this.lastupdate < int64(this.CacheTimeout*60) {
|
2009-11-23 12:46:12 +00:00
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// If skipDays or skipHours are set in the RSS feed, use these to see if
|
|
|
|
// we can update.
|
|
|
|
if len(this.Channels) == 0 && this.Type == "rss" {
|
|
|
|
if this.EnforceCacheLimit && len(this.Channels[0].SkipDays) > 0 {
|
2010-05-23 14:21:30 +00:00
|
|
|
for _, v := range this.Channels[0].SkipDays {
|
2009-11-23 12:46:12 +00:00
|
|
|
if v == utc.Weekday {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if this.EnforceCacheLimit && len(this.Channels[0].SkipHours) > 0 {
|
2010-05-23 14:21:30 +00:00
|
|
|
for _, v := range this.Channels[0].SkipHours {
|
2009-11-23 12:46:12 +00:00
|
|
|
if v == utc.Hour {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-05-23 14:21:30 +00:00
|
|
|
this.lastupdate = utc.Seconds()
|
2009-11-23 12:46:12 +00:00
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
func (this *Feed) buildFeed(doc *xmlx.Document) (err os.Error) {
|
|
|
|
switch this.Type {
|
2010-05-23 14:21:30 +00:00
|
|
|
case "rss":
|
|
|
|
err = this.readRss2(doc)
|
|
|
|
case "atom":
|
|
|
|
err = this.readAtom(doc)
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
func (this *Feed) testVersions() bool {
|
|
|
|
switch this.Type {
|
|
|
|
case "rss":
|
|
|
|
if this.Version[0] > 2 || (this.Version[0] == 2 && this.Version[1] > 0) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
case "atom":
|
|
|
|
if this.Version[0] > 1 || (this.Version[0] == 1 && this.Version[1] > 0) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
default:
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2010-05-23 14:21:30 +00:00
|
|
|
return true
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (this *Feed) GetVersionInfo(doc *xmlx.Document) (ftype string, fversion [2]int) {
|
2011-02-01 14:30:39 +00:00
|
|
|
var node *xmlx.Node
|
|
|
|
|
|
|
|
if node = doc.SelectNode("http://www.w3.org/2005/Atom", "feed"); node == nil {
|
2010-05-23 14:21:30 +00:00
|
|
|
goto rss
|
|
|
|
}
|
2011-02-01 14:30:39 +00:00
|
|
|
|
2010-05-23 14:21:30 +00:00
|
|
|
ftype = "atom"
|
|
|
|
fversion = [2]int{1, 0}
|
|
|
|
return
|
2009-11-23 12:46:12 +00:00
|
|
|
|
2010-05-23 14:21:30 +00:00
|
|
|
rss:
|
2011-02-01 14:30:39 +00:00
|
|
|
if node = doc.SelectNode("", "rss"); node != nil {
|
|
|
|
ftype = "rss"
|
|
|
|
version := node.GetAttr("", "version")
|
|
|
|
p := strings.Index(version, ".")
|
|
|
|
major, _ := strconv.Atoi(version[0:p])
|
|
|
|
minor, _ := strconv.Atoi(version[p+1 : len(version)])
|
|
|
|
fversion = [2]int{major, minor}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// issue#5: Some documents have an RDF root node instead of rss.
|
|
|
|
if node = doc.SelectNode("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "RDF"); node != nil {
|
|
|
|
ftype = "rss"
|
|
|
|
fversion = [2]int{1, 1}
|
|
|
|
return
|
2010-05-23 14:21:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
end:
|
|
|
|
ftype = "unknown"
|
|
|
|
fversion = [2]int{0, 0}
|
|
|
|
return
|
|
|
|
}
|