2009-11-23 12:46:12 +00:00
|
|
|
/*
Author: jim teeuwen <jimteeuwen@gmail.com>
Dependencies: go-pkg-xmlx (http://github.com/jteeuwen/go-pkg-xmlx)

This package allows us to fetch Rss and Atom feeds from the internet.
They are parsed into an object tree which is a hybrid of both the RSS and Atom
standards.

Supported feeds are:
	- Rss v0.91, 0.92 and 2.0
	- Atom 1.0

The package allows us to maintain cache timeout management. This prevents us
from querying the servers for feed updates too often and risking IP bans. Apart
from setting a cache timeout manually, the package also optionally adheres to
the TTL, SkipDays and SkipHours values specified in the feeds themselves.

Note that the TTL, SkipDays and SkipHours fields are only part of the RSS spec.
For Atom feeds, we use the CacheTimeout in the Feed struct.

Because the object structure is a hybrid between both RSS and Atom specs, not
all fields will be filled when requesting either an RSS or Atom feed. I have
tried to create as many shared fields as possible, but some of them simply do
not occur in either the RSS or Atom spec.
*/
|
|
|
|
package feeder
|
|
|
|
|
2011-02-01 14:30:39 +00:00
|
|
|
import (
|
|
|
|
"fmt"
|
2013-03-19 18:12:11 +00:00
|
|
|
xmlx "github.com/jteeuwen/go-pkg-xmlx"
|
2013-03-19 14:03:21 +00:00
|
|
|
"net/http"
|
2011-02-01 14:30:39 +00:00
|
|
|
"strconv"
|
|
|
|
"strings"
|
2011-12-07 12:46:06 +00:00
|
|
|
"time"
|
2011-02-01 14:30:39 +00:00
|
|
|
)
|
2009-11-23 12:46:12 +00:00
|
|
|
|
2014-09-25 12:27:05 +00:00
|
|
|
// UnsupportedFeedError is the error returned by the feed parsing path when the
// detected feed type/version combination is not one this package accepts.
type UnsupportedFeedError struct {
	// Type is the detected feed type (e.g. "rss", "atom" or "unknown").
	Type string

	// Version is the detected feed version as {major, minor}.
	Version [2]int
}

// Error implements the error interface. The message contains the feed type
// and the {major, minor} version pair.
func (err *UnsupportedFeedError) Error() string {
	return fmt.Sprintf("Unsupported feed: %s, version: %+v", err.Type, err.Version)
}
|
|
|
|
|
2014-10-04 11:45:05 +00:00
|
|
|
// ChannelHandlerFunc is an adapter that lets an ordinary function be used as a
// ChannelHandler: its ProcessChannels method simply calls the function.
type ChannelHandlerFunc func(f *Feed, newchannels []*Channel)
|
|
|
|
|
|
|
|
func (h ChannelHandlerFunc) ProcessChannels(f *Feed, newchannels []*Channel) {
|
2014-10-10 18:35:26 +00:00
|
|
|
if h != nil {
|
|
|
|
h(f, newchannels)
|
|
|
|
}
|
2014-10-04 11:45:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// ItemHandlerFunc is an adapter that lets an ordinary function be used as an
// ItemHandler: its ProcessItems method simply calls the function.
type ItemHandlerFunc func(f *Feed, ch *Channel, newitems []*Item)
|
|
|
|
|
|
|
|
func (h ItemHandlerFunc) ProcessItems(f *Feed, ch *Channel, newitems []*Item) {
|
2014-10-10 18:35:26 +00:00
|
|
|
if h != nil {
|
|
|
|
h(f, ch, newitems)
|
|
|
|
}
|
2014-10-04 11:45:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
type ChannelHandler interface {
|
|
|
|
ProcessChannels(f *Feed, newchannels []*Channel)
|
|
|
|
}
|
|
|
|
|
|
|
|
type ItemHandler interface {
|
|
|
|
ProcessItems(f *Feed, ch *Channel, newitems []*Item)
|
|
|
|
}
|
|
|
|
|
2009-11-23 12:46:12 +00:00
|
|
|
type Feed struct {
|
|
|
|
// Custom cache timeout in minutes.
|
2010-05-23 14:21:30 +00:00
|
|
|
CacheTimeout int
|
2009-11-23 12:46:12 +00:00
|
|
|
|
|
|
|
// Make sure we adhere to the cache timeout specified in the feed. If
|
|
|
|
// our CacheTimeout is higher than that, we will use that instead.
|
2010-05-23 14:21:30 +00:00
|
|
|
EnforceCacheLimit bool
|
2009-11-23 12:46:12 +00:00
|
|
|
|
|
|
|
// Type of feed. Rss, Atom, etc
|
2010-05-23 14:21:30 +00:00
|
|
|
Type string
|
2009-11-23 12:46:12 +00:00
|
|
|
|
|
|
|
// Version of the feed. Major and Minor.
|
2010-05-23 14:21:30 +00:00
|
|
|
Version [2]int
|
2009-11-23 12:46:12 +00:00
|
|
|
|
|
|
|
// Channels with content.
|
2010-12-17 23:25:16 +00:00
|
|
|
Channels []*Channel
|
2009-11-23 12:46:12 +00:00
|
|
|
|
|
|
|
// Url from which this feed was created.
|
2010-05-23 14:21:30 +00:00
|
|
|
Url string
|
2009-11-23 12:46:12 +00:00
|
|
|
|
2014-10-14 17:51:13 +00:00
|
|
|
// Database
|
|
|
|
database *database
|
|
|
|
|
|
|
|
// The channel handler
|
|
|
|
channelHandler ChannelHandler
|
|
|
|
|
|
|
|
// The item handler
|
|
|
|
itemHandler ItemHandler
|
2010-12-17 23:25:16 +00:00
|
|
|
|
2009-11-23 12:46:12 +00:00
|
|
|
// Last time content was fetched. Used in conjunction with CacheTimeout
|
|
|
|
// to ensure we don't get content too often.
|
2014-11-28 15:53:09 +00:00
|
|
|
lastupdate time.Time
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
|
|
|
|
2014-10-04 11:45:05 +00:00
|
|
|
// New is a helper function to stay semi-compatible with
|
2014-10-14 17:53:00 +00:00
|
|
|
// the old code. Includes the database handler to ensure
|
2014-10-04 11:48:45 +00:00
|
|
|
// that this approach is functionally identical to the
|
2014-11-26 15:10:39 +00:00
|
|
|
// old database/handlers version.
|
2014-10-04 11:45:05 +00:00
|
|
|
func New(cachetimeout int, enforcecachelimit bool, ch ChannelHandlerFunc, ih ItemHandlerFunc) *Feed {
|
2014-10-14 17:51:13 +00:00
|
|
|
db := NewDatabase()
|
|
|
|
f := NewWithHandlers(cachetimeout, enforcecachelimit, NewDatabaseChannelHandler(db, ch), NewDatabaseItemHandler(db, ih))
|
|
|
|
f.database = db
|
|
|
|
return f
|
2014-10-04 11:45:05 +00:00
|
|
|
}
|
|
|
|
|
2014-11-26 14:26:17 +00:00
|
|
|
// NewWithHandler creates a new feed with handlers.
|
|
|
|
// People should use this approach from now on.
|
2014-10-14 17:51:13 +00:00
|
|
|
func NewWithHandlers(cachetimeout int, enforcecachelimit bool, ch ChannelHandler, ih ItemHandler) *Feed {
|
2010-12-17 20:57:48 +00:00
|
|
|
v := new(Feed)
|
|
|
|
v.CacheTimeout = cachetimeout
|
|
|
|
v.EnforceCacheLimit = enforcecachelimit
|
|
|
|
v.Type = "none"
|
2014-10-14 17:51:13 +00:00
|
|
|
v.channelHandler = ch
|
|
|
|
v.itemHandler = ih
|
2010-12-17 20:57:48 +00:00
|
|
|
return v
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
|
|
|
|
2010-12-18 17:19:53 +00:00
|
|
|
// This returns a timestamp of the last time the feed was updated.
|
2014-11-28 15:53:09 +00:00
|
|
|
func (this *Feed) LastUpdate() time.Time {
|
|
|
|
return this.lastupdate
|
|
|
|
}
|
2010-12-18 17:19:53 +00:00
|
|
|
|
2012-02-29 10:31:01 +00:00
|
|
|
// Fetch retrieves the feed's latest content if necessary.
|
|
|
|
//
|
|
|
|
// The charset parameter overrides the xml decoder's CharsetReader.
|
|
|
|
// This allows us to specify a custom character encoding conversion
|
|
|
|
// routine when dealing with non-utf8 input. Supply 'nil' to use the
|
|
|
|
// default from Go's xml package.
|
2013-03-19 14:03:21 +00:00
|
|
|
//
|
|
|
|
// This is equivalent to calling FetchClient with http.DefaultClient
|
2012-02-29 10:31:01 +00:00
|
|
|
func (this *Feed) Fetch(uri string, charset xmlx.CharsetFunc) (err error) {
|
2013-03-19 14:03:21 +00:00
|
|
|
return this.FetchClient(uri, http.DefaultClient, charset)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fetch retrieves the feed's latest content if necessary.
|
|
|
|
//
|
|
|
|
// The charset parameter overrides the xml decoder's CharsetReader.
|
|
|
|
// This allows us to specify a custom character encoding conversion
|
|
|
|
// routine when dealing with non-utf8 input. Supply 'nil' to use the
|
|
|
|
// default from Go's xml package.
|
|
|
|
//
|
|
|
|
// The client parameter allows the use of arbitrary network connections, for
|
|
|
|
// example the Google App Engine "URL Fetch" service.
|
|
|
|
func (this *Feed) FetchClient(uri string, client *http.Client, charset xmlx.CharsetFunc) (err error) {
|
2010-12-17 20:57:48 +00:00
|
|
|
if !this.CanUpdate() {
|
2010-05-23 14:21:30 +00:00
|
|
|
return
|
|
|
|
}
|
2009-11-23 12:46:12 +00:00
|
|
|
|
2014-11-28 15:53:09 +00:00
|
|
|
this.lastupdate = time.Now().UTC()
|
2010-05-23 14:21:30 +00:00
|
|
|
this.Url = uri
|
|
|
|
doc := xmlx.New()
|
2012-02-29 10:31:01 +00:00
|
|
|
|
2013-03-19 14:03:21 +00:00
|
|
|
if err = doc.LoadUriClient(uri, client, charset); err != nil {
|
2010-05-23 14:21:30 +00:00
|
|
|
return
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
2013-03-28 03:05:27 +00:00
|
|
|
|
|
|
|
return this.makeFeed(doc)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fetch retrieves the feed's content from the []byte
|
|
|
|
//
|
|
|
|
// The charset parameter overrides the xml decoder's CharsetReader.
|
|
|
|
// This allows us to specify a custom character encoding conversion
|
|
|
|
// routine when dealing with non-utf8 input. Supply 'nil' to use the
|
|
|
|
// default from Go's xml package.
|
|
|
|
func (this *Feed) FetchBytes(uri string, content []byte, charset xmlx.CharsetFunc) (err error) {
|
|
|
|
this.Url = uri
|
|
|
|
|
|
|
|
doc := xmlx.New()
|
|
|
|
|
|
|
|
if err = doc.LoadBytes(content, charset); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
return this.makeFeed(doc)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (this *Feed) makeFeed(doc *xmlx.Document) (err error) {
|
|
|
|
// Extract type and version of the feed so we can have the appropriate
|
|
|
|
// function parse it (rss 0.91, rss 0.92, rss 2, atom etc).
|
2010-05-23 14:21:30 +00:00
|
|
|
this.Type, this.Version = this.GetVersionInfo(doc)
|
2009-11-23 12:46:12 +00:00
|
|
|
|
2010-05-23 14:21:30 +00:00
|
|
|
if ok := this.testVersions(); !ok {
|
2014-09-25 12:27:05 +00:00
|
|
|
return &UnsupportedFeedError{Type: this.Type, Version: this.Version}
|
2010-05-23 14:21:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if err = this.buildFeed(doc); err != nil || len(this.Channels) == 0 {
|
|
|
|
return
|
|
|
|
}
|
2009-11-23 12:46:12 +00:00
|
|
|
|
|
|
|
// reset cache timeout values according to feed specified values (TTL)
|
2010-05-23 14:21:30 +00:00
|
|
|
if this.EnforceCacheLimit && this.CacheTimeout < this.Channels[0].TTL {
|
|
|
|
this.CacheTimeout = this.Channels[0].TTL
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
2010-12-17 23:25:16 +00:00
|
|
|
|
2013-12-05 14:31:49 +00:00
|
|
|
this.notifyListeners()
|
|
|
|
|
2013-11-28 09:23:49 +00:00
|
|
|
return
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
|
|
|
|
2013-12-05 14:31:49 +00:00
|
|
|
func (this *Feed) notifyListeners() {
|
|
|
|
for _, channel := range this.Channels {
|
2014-10-14 17:51:13 +00:00
|
|
|
if len(channel.Items) > 0 && this.itemHandler != nil {
|
|
|
|
this.itemHandler.ProcessItems(this, channel, channel.Items)
|
2013-12-05 14:31:49 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-14 17:51:13 +00:00
|
|
|
if len(this.Channels) > 0 && this.channelHandler != nil {
|
|
|
|
this.channelHandler.ProcessChannels(this, this.Channels)
|
2013-12-05 14:31:49 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-12-17 20:57:48 +00:00
|
|
|
// This function returns true or false, depending on whether the CacheTimeout
|
|
|
|
// value has expired or not. Additionally, it will ensure that we adhere to the
|
|
|
|
// RSS spec's SkipDays and SkipHours values (if Feed.EnforceCacheLimit is set to
|
|
|
|
// true). If this function returns true, you can be sure that a fresh feed
|
|
|
|
// update will be performed.
|
|
|
|
func (this *Feed) CanUpdate() bool {
|
2009-11-23 12:46:12 +00:00
|
|
|
// Make sure we are not within the specified cache-limit.
|
|
|
|
// This ensures we don't request data too often.
|
2014-11-28 15:44:20 +00:00
|
|
|
if SecondsTillUpdate() > 0 {
|
2009-11-23 12:46:12 +00:00
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2014-11-28 15:44:20 +00:00
|
|
|
utc := time.Now().UTC()
|
|
|
|
|
2009-11-23 12:46:12 +00:00
|
|
|
// If skipDays or skipHours are set in the RSS feed, use these to see if
|
|
|
|
// we can update.
|
2013-06-05 17:43:16 +00:00
|
|
|
if len(this.Channels) == 1 && this.Type == "rss" {
|
2009-11-23 12:46:12 +00:00
|
|
|
if this.EnforceCacheLimit && len(this.Channels[0].SkipDays) > 0 {
|
2010-05-23 14:21:30 +00:00
|
|
|
for _, v := range this.Channels[0].SkipDays {
|
2011-12-02 11:50:42 +00:00
|
|
|
if time.Weekday(v) == utc.Weekday() {
|
2009-11-23 12:46:12 +00:00
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if this.EnforceCacheLimit && len(this.Channels[0].SkipHours) > 0 {
|
2010-05-23 14:21:30 +00:00
|
|
|
for _, v := range this.Channels[0].SkipHours {
|
2011-12-02 11:50:42 +00:00
|
|
|
if v == utc.Hour() {
|
2009-11-23 12:46:12 +00:00
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2011-05-09 19:11:50 +00:00
|
|
|
// Returns the number of seconds needed to elapse
|
|
|
|
// before the feed should update.
|
|
|
|
func (this *Feed) SecondsTillUpdate() int64 {
|
2011-12-02 11:50:42 +00:00
|
|
|
utc := time.Now().UTC()
|
2014-11-28 15:53:09 +00:00
|
|
|
elapsed := utc.Sub(this.lastupdate)
|
|
|
|
return int64(this.CacheTimeout*60) - int64(elapsed.Seconds())
|
2011-05-09 19:11:50 +00:00
|
|
|
}
|
|
|
|
|
2011-11-02 15:51:04 +00:00
|
|
|
func (this *Feed) buildFeed(doc *xmlx.Document) (err error) {
|
2009-11-23 12:46:12 +00:00
|
|
|
switch this.Type {
|
2010-05-23 14:21:30 +00:00
|
|
|
case "rss":
|
|
|
|
err = this.readRss2(doc)
|
|
|
|
case "atom":
|
|
|
|
err = this.readAtom(doc)
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
func (this *Feed) testVersions() bool {
|
|
|
|
switch this.Type {
|
|
|
|
case "rss":
|
|
|
|
if this.Version[0] > 2 || (this.Version[0] == 2 && this.Version[1] > 0) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
case "atom":
|
|
|
|
if this.Version[0] > 1 || (this.Version[0] == 1 && this.Version[1] > 0) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
default:
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2010-05-23 14:21:30 +00:00
|
|
|
return true
|
2009-11-23 12:46:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (this *Feed) GetVersionInfo(doc *xmlx.Document) (ftype string, fversion [2]int) {
|
2011-02-01 14:30:39 +00:00
|
|
|
var node *xmlx.Node
|
|
|
|
|
2014-11-26 15:21:39 +00:00
|
|
|
if node = doc.SelectNode("http://www.w3.org/2005/Atom", "feed"); node != nil {
|
|
|
|
ftype = "atom"
|
|
|
|
fversion = [2]int{1, 0}
|
|
|
|
return
|
2010-05-23 14:21:30 +00:00
|
|
|
}
|
2011-02-01 14:30:39 +00:00
|
|
|
|
|
|
|
if node = doc.SelectNode("", "rss"); node != nil {
|
|
|
|
ftype = "rss"
|
2014-09-28 11:30:58 +00:00
|
|
|
major := 0
|
|
|
|
minor := 0
|
2011-05-11 15:40:56 +00:00
|
|
|
version := node.As("", "version")
|
2011-02-01 14:30:39 +00:00
|
|
|
p := strings.Index(version, ".")
|
2014-09-28 11:30:58 +00:00
|
|
|
if p != -1 {
|
|
|
|
major, _ = strconv.Atoi(version[0:p])
|
|
|
|
minor, _ = strconv.Atoi(version[p+1 : len(version)])
|
|
|
|
}
|
2011-02-01 14:30:39 +00:00
|
|
|
fversion = [2]int{major, minor}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// issue#5: Some documents have an RDF root node instead of rss.
|
|
|
|
if node = doc.SelectNode("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "RDF"); node != nil {
|
|
|
|
ftype = "rss"
|
|
|
|
fversion = [2]int{1, 1}
|
|
|
|
return
|
2010-05-23 14:21:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ftype = "unknown"
|
|
|
|
fversion = [2]int{0, 0}
|
|
|
|
return
|
|
|
|
}
|