3 from urlparse import urlsplit
class BaseFeed:
    """Base class for all feeds; initialises the state every parser relies on.

    Attributes set by ``__init__``:
        uri: the feed URI exactly as passed in (used as identifier).
        hostname, port, path: connection elements derived from ``uri``.
        autoupdate: stored as passed in; not interpreted by this class.
        stripper: object whose ``strip(text)`` method is applied to feed text
            by the parsers.
        title: UTF-8 encoded ``uri`` until a real title has been read.
        description: feed description text.
        last_update: last seen feed timestamp, ``None`` until the first parse.
        last_ids: set of item identifiers that were already delivered.
        history: list of known items, newest first.
    """

    # Upper bound for the number of items the parsers keep in ``history``.
    MAX_HISTORY_ELEMENTS = 100

    def __init__(self, uri, autoupdate, stripper):
        """Split ``uri`` into its connection elements and reset the feed state.

        Args:
            uri: feed location; also serves as the feed's identifier.
            autoupdate: stored on the instance unchanged.
            stripper: object providing ``strip(text)``.
        """
        # Set URI (used as Identifier)
        self.uri = uri

        # Determine URI Elements
        remote = urlsplit(uri)
        self.hostname = remote.hostname
        # Without an explicit port fall back to the scheme's well-known one.
        self.port = remote.port or (443 if remote.scheme == "https" else 80)
        # An empty path is not a valid request target, so use the root; only
        # append the '?' separator when there actually is a query string.
        path = remote.path or "/"
        if remote.query:
            path = "?".join([path, remote.query])
        self.path = path
        print("[SimpleRSS] determined hostname: %s , port: %s , path: %s" % (
            self.hostname, self.port, self.path))

        # Set Autoupdate
        self.autoupdate = autoupdate

        # Set Stripper
        self.stripper = stripper

        # Initialize; the URI doubles as title until the feed supplies one
        self.title = uri.encode("UTF-8")
        # NOTE(review): the original initial description is not visible in
        # this excerpt. An empty string keeps UniversalFeed.gotDom (which
        # strips this attribute even when the feed has no description) safe;
        # restore the original placeholder text if there was one.
        self.description = ""
        self.last_update = None
        self.last_ids = set()
        self.history = []
class AtomFeed(BaseFeed):
    """Parses an Atom-Feed into expected format."""

    def gotDom(self, dom):
        """Return the new entries of a freshly downloaded Atom document.

        Args:
            dom: parsed feed document offering ``getElementsByTagName``.

        Returns:
            An empty list when the feed's first ``updated`` value equals the
            one seen on the previous call, otherwise the result of ``parse``.
        """
        # Try to read when feed was last updated; if the time equals the
        # known one return an empty list, else fetch new items
        try:
            updated = dom.getElementsByTagName("updated")[0].childNodes[0].data
        except Exception:
            # No usable timestamp: cannot short-circuit, parse every entry.
            pass
        else:
            if self.last_update == updated:
                return []
            self.last_update = updated

        # Explicit class call so subclasses mixing several parsers still get
        # the Atom implementation here.
        return AtomFeed.parse(self, dom.getElementsByTagName("entry"))

    def parse(self, items):
        """Convert ``entry`` elements into item tuples and record them.

        Args:
            items: iterable of ``entry`` DOM elements.

        Returns:
            List of ``(title, link, summary, enclosures)`` tuples for entries
            not seen before. Text fields are UTF-8 encoded; ``enclosures`` is
            a list of ``(href, type)`` tuples. The ids of returned entries are
            added to ``self.last_ids`` and the entries are prepended to
            ``self.history``, which is capped at ``MAX_HISTORY_ELEMENTS``.
        """
        new_items = []
        for item in items:
            # Try to read title, continue if none found
            try:
                title = self.stripper.strip(
                    item.getElementsByTagName("title")[0].childNodes[0].data)
            except Exception:
                continue

            # Try to read id, continue if none found (invalid feed, should
            # be handled differently) or to be excluded
            try:
                entry_id = item.getElementsByTagName("id")[0].childNodes[0].data
            except Exception:
                continue
            if entry_id in self.last_ids:
                continue

            # Read out enclosures and link
            enclosure = []
            link = ""
            for current in item.getElementsByTagName("link"):
                if current.getAttribute("rel") == "enclosure":
                    enclosure.append((
                        current.getAttribute("href").encode("UTF-8"),
                        current.getAttribute("type").encode("UTF-8"),
                    ))
                else:
                    # No Enclosure, assume its a link to the item
                    link = current.getAttribute("href")

            # Try to read summary, empty if none
            try:
                summary = self.stripper.strip(
                    item.getElementsByTagName("summary")[0].childNodes[0].data)
            except Exception:
                summary = ""

            # Update Lists
            new_items.append((
                title.encode("UTF-8"),
                link.encode("UTF-8"),
                summary.encode("UTF-8"),
                enclosure,
            ))
            self.last_ids.add(entry_id)

        # Append known Items to new Items and cut the result; the slice has
        # to be assigned back, a bare slice expression truncates nothing.
        self.history = (new_items + self.history)[:self.MAX_HISTORY_ELEMENTS]

        return new_items
class RSSFeed(BaseFeed):
    """Parses an RSS-Feed into expected format."""

    def gotDom(self, dom):
        """Return the new items of a freshly downloaded RSS document.

        Args:
            dom: parsed feed document offering ``getElementsByTagName``.

        Returns:
            An empty list when the feed's first ``lastBuildDate`` value equals
            the one seen on the previous call, otherwise the result of
            ``parse``.
        """
        # Try to read when feed was last updated; if the time equals the
        # known one return an empty list, else fetch new items
        try:
            updated = dom.getElementsByTagName("lastBuildDate")[0].childNodes[0].data
        except Exception:
            # No usable timestamp: cannot short-circuit, parse every item.
            pass
        else:
            if self.last_update == updated:
                return []
            self.last_update = updated

        # Explicit class call so subclasses mixing several parsers still get
        # the RSS implementation here.
        return RSSFeed.parse(self, dom.getElementsByTagName("item"))

    def parse(self, items):
        """Convert ``item`` elements into item tuples and record them.

        Args:
            items: iterable of ``item`` DOM elements.

        Returns:
            List of ``(title, link, summary, enclosures)`` tuples for items
            not seen before. Text fields are UTF-8 encoded; ``enclosures`` is
            a list of ``(url, type)`` tuples. The guids of returned items are
            added to ``self.last_ids`` and the items are prepended to
            ``self.history``, which is capped at ``MAX_HISTORY_ELEMENTS``.
        """
        new_items = []
        for item in items:
            # Try to read title, continue if none found
            try:
                title = self.stripper.strip(
                    item.getElementsByTagName("title")[0].childNodes[0].data)
            except Exception:
                continue

            # Try to read link, empty if none
            try:
                link = item.getElementsByTagName("link")[0].childNodes[0].data
            except Exception:
                link = ""

            # Try to read guid, link if none (RSS 1.0 or invalid RSS 2.0)
            try:
                guid = item.getElementsByTagName("guid")[0].childNodes[0].data
            except Exception:
                guid = link

            # Continue if item is to be excluded
            if guid in self.last_ids:
                continue

            # Try to read summary (description element), empty if none
            try:
                summary = self.stripper.strip(
                    item.getElementsByTagName("description")[0].childNodes[0].data)
            except Exception:
                summary = ""

            # Read out enclosures
            enclosure = [
                (
                    current.getAttribute("url").encode("UTF-8"),
                    current.getAttribute("type").encode("UTF-8"),
                )
                for current in item.getElementsByTagName("enclosure")
            ]

            # Update Lists
            new_items.append((
                title.encode("UTF-8"),
                link.encode("UTF-8"),
                summary.encode("UTF-8"),
                enclosure,
            ))
            self.last_ids.add(guid)

        # Append known Items to new Items and cut the result; the slice has
        # to be assigned back, a bare slice expression truncates nothing.
        self.history = (new_items + self.history)[:self.MAX_HISTORY_ELEMENTS]

        return new_items
# BaseFeed is inherited through RSSFeed and AtomFeed. Listing it in front of
# its own subclasses resolves attributes identically on old-style classes but
# makes the class statement fail (inconsistent MRO) on new-style classes.
class UniversalFeed(RSSFeed, AtomFeed):
    """Universal Feed which on first run determines its type and calls the correct parsing-functions"""

    def __init__(self, uri, autoupdate, stripper):
        """Initialise the shared feed state; the feed type is not known yet.

        Args:
            uri: feed location, passed on to ``BaseFeed.__init__``.
            autoupdate: passed on to ``BaseFeed.__init__``.
            stripper: object providing ``strip(text)``.
        """
        BaseFeed.__init__(self, uri, autoupdate, stripper)

        # "rss" or "atom" once detected by the first gotDom call
        self.type = None

    def gotDom(self, dom):
        """Detect the feed type on first use, then delegate to its parser.

        On the first call the feed's title and description are read from the
        document as well (best effort) and run through the stripper.

        Args:
            dom: parsed feed document offering ``documentElement`` and
                ``getElementsByTagName``.

        Returns:
            Whatever ``RSSFeed.gotDom`` or ``AtomFeed.gotDom`` returns.

        Raises:
            NotImplementedError: the document root is neither a versioned RSS
                element, an ``RDF`` element nor an Atom ``feed`` element.
        """
        if self.type is None:
            root = dom.documentElement
            # RSS 2.0 and the 0.9x versions (version attribute on the root)
            if root.getAttribute("version") in ["2.0", "0.94", "0.93", "0.92", "0.91"]:
                self.type = "rss"
                try:
                    self.title = dom.getElementsByTagName("channel")[0].getElementsByTagName("title")[0].childNodes[0].data
                    self.description = dom.getElementsByTagName("channel")[0].getElementsByTagName("description")[0].childNodes[0].data
                except Exception:
                    # Missing metadata is tolerated; keep the current values.
                    pass
            # RSS 1.0 (NS: http://www.w3.org/1999/02/22-rdf-syntax-ns#)
            elif root.localName == "RDF":
                self.type = "rss"
                try:
                    self.title = dom.getElementsByTagName("channel")[0].getElementsByTagName("title")[0].childNodes[0].data
                    self.description = dom.getElementsByTagName("channel")[0].getElementsByTagName("description")[0].childNodes[0].data
                except Exception:
                    pass
            # Atom (NS: http://www.w3.org/2005/Atom)
            elif root.localName == "feed":
                self.type = "atom"
                try:
                    self.title = dom.getElementsByTagName("title")[0].childNodes[0].data
                    self.description = dom.getElementsByTagName("subtitle")[0].childNodes[0].data
                except Exception:
                    pass
            else:
                # self.type stays None, so detection is retried on the next call
                raise NotImplementedError('Unsupported Feed: %s' % root.localName)

            self.title = self.stripper.strip(self.title).encode("UTF-8")
            self.description = self.stripper.strip(self.description).encode("UTF-8")

        # Call the parser classes explicitly: plain attribute lookup on self
        # would always pick the first base's implementation.
        if self.type == "rss":
            print("[SimpleRSS] type is rss")
            return RSSFeed.gotDom(self, dom)
        elif self.type == "atom":
            print("[SimpleRSS] type is atom")
            return AtomFeed.gotDom(self, dom)