4 """Base-class for all Feeds. Initializes needed Elements."""
# Intended upper bound on the number of items kept in `self.history`; it is the
# slice bound used at the end of both `parse` methods in this file.
# NOTE(review): those slice expressions discard their result, so as written the
# bound is never actually applied to `self.history`.
5 MAX_HISTORY_ELEMENTS = 100
# Store the per-feed settings handed in by the caller and set the initial state.
#   uri        -- feed address; per the comment below it doubles as the feed's
#                 identifier, and its UTF-8 encoding is used as the first title
#   autoupdate -- stored unchanged on the instance
#   stripper   -- stored unchanged; the feed classes in this file call its
#                 strip() and strip_readable() methods on text read from the feed
# NOTE(review): the embedded line numbers jump (8 -> 12 -> 15 -> 18 -> 20), so
# this listing omits lines of the method. The assignment announced by the
# "Set URI" comment and any initialisation of `last_ids` / `history` (both are
# used by the parse methods) are not visible here -- confirm in the full file.
7 def __init__(self, uri, autoupdate, stripper):
8 # Set URI (used as Identifier)
12 self.autoupdate = autoupdate
15 self.stripper = stripper
# The URI stands in as the title; UniversalFeed.gotDom() later overwrites it
# with the title read from the feed document.
18 self.title = uri.encode("UTF-8")
# No "last updated" stamp yet; the gotDom() methods compare against this value.
20 self.last_update = None
# Atom flavour of the feed parser: gotDom() checks the feed-level timestamp and
# hands the <entry> elements to parse(), which reads title, id, links and
# summary from each entry.
# NOTE(review): the embedded line numbers have gaps (e.g. 30 -> 32, 37 -> 43,
# 71 -> 77), so statements are missing from this listing: the try/except and
# `continue` lines implied by the "Try to ..." comments, the loop over `items`,
# the definitions of `new_items` / `link`, and any `return` of parse(). Confirm
# the notes below against the complete file.
24 class AtomFeed(BaseFeed):
25 """Parses an Atom-Feed into expected format."""
# dom: presumably an xml.dom (minidom-style) document -- only
# getElementsByTagName(), childNodes and .data are used on it here.
26 def gotDom(self, dom):
28 # Try to read when the feed was last updated; if the time equals the stored one return an empty list, else fetch new items
# Text of the first <updated> element found in the document.
29 updated = dom.getElementsByTagName("updated")[0].childNodes[0].data
30 if self.last_update == updated:
# Remember the new stamp for the comparison on the next call.
32 self.last_update = updated
# parse() is called through the class rather than via `self.parse`, so the
# Atom parser is used even on UniversalFeed, which inherits from both RSSFeed
# and AtomFeed.
35 return AtomFeed.parse(self, dom.getElementsByTagName("entry"))
# items: the <entry> elements passed in by gotDom().
37 def parse(self, items):
43 # Try to read title, continue if none found
45 title = self.stripper.strip(item.getElementsByTagName("title")[0].childNodes[0].data)
49 # Try to read id, continue if none found (invalid feed, should be handled differently) or to be excluded
# NOTE: the local name `id` shadows the builtin id() inside this method.
51 id = item.getElementsByTagName("id")[0].childNodes[0].data
# An id already present in self.last_ids marks an entry seen on an earlier run.
52 if id in self.last_ids:
57 # Read out enclosures and link
58 for current in item.getElementsByTagName("link"):
# rel="enclosure" links contribute an (href, type) pair, both UTF-8 encoded;
# the call that receives the pair is on a line missing from this listing.
60 if current.getAttribute("rel") == "enclosure":
62 current.getAttribute("href").encode("UTF-8"),
63 current.getAttribute("type").encode("UTF-8")
65 # No Enclosure, assume it's a link to the item
# If several non-enclosure links exist, the last one in the loop wins.
67 link = current.getAttribute("href")
69 # Try to read summary, empty if none
71 summary = self.stripper.strip_readable(item.getElementsByTagName("summary")[0].childNodes[0].data)
# Fields of the record built for this entry, UTF-8 encoded; the surrounding
# container expression (and any field on the omitted line 78) is not visible.
77 title.encode("UTF-8"),
79 summary.encode("UTF-8"),
84 # Prepend the new items to the known items and eventually cut the list
85 self.history = new_items + self.history
# NOTE(review): this is a bare expression statement -- the slice is built and
# thrown away, so self.history is NOT shortened and can grow without bound.
# Likely intended: self.history = self.history[:self.MAX_HISTORY_ELEMENTS]
86 self.history[:self.MAX_HISTORY_ELEMENTS]
# RSS flavour of the feed parser: gotDom() checks <lastBuildDate> and hands the
# <item> elements to parse(), which reads title, link, guid, description and
# enclosures from each item.
# NOTE(review): the embedded line numbers have gaps (e.g. 96 -> 98, 103 -> 108,
# 140 -> 145), so statements are missing from this listing: the try/except and
# `continue` lines implied by the "Try to ..." comments, the loop over `items`,
# the definition of `new_items`, and any `return` of parse(). Confirm the notes
# below against the complete file.
90 class RSSFeed(BaseFeed):
91 """Parses an RSS-Feed into expected format."""
# dom: presumably an xml.dom (minidom-style) document -- only
# getElementsByTagName(), childNodes and .data are used on it here.
92 def gotDom(self, dom):
93 # Try to read when the feed was last updated; if the time equals the stored one return an empty list, else fetch new items
# Text of the first <lastBuildDate> element found in the document.
95 updated = dom.getElementsByTagName("lastBuildDate")[0].childNodes[0].data
96 if self.last_update == updated:
# Remember the new stamp for the comparison on the next call.
98 self.last_update = updated
# parse() is called through the class rather than via `self.parse`, so the
# RSS parser is used even on UniversalFeed, which inherits from both RSSFeed
# and AtomFeed.
101 return RSSFeed.parse(self, dom.getElementsByTagName("item"))
# items: the <item> elements passed in by gotDom().
103 def parse(self, items):
108 # Try to read title, continue if none found
110 title = self.stripper.strip(item.getElementsByTagName("title")[0].childNodes[0].data)
114 # Try to read link, empty if none
116 link = item.getElementsByTagName("link")[0].childNodes[0].data
120 # Try to read guid, link if none (RSS 1.0 or invalid RSS 2.0)
# The fallback to `link` described above is on lines missing from this listing.
122 guid = item.getElementsByTagName("guid")[0].childNodes[0].data
126 # Continue if item is to be excluded
# A guid already present in self.last_ids marks an item seen on an earlier run.
127 if guid in self.last_ids:
130 # Try to read summary (description element), empty if none
132 summary = self.stripper.strip_readable(item.getElementsByTagName("description")[0].childNodes[0].data)
136 # Read out enclosures
# Each <enclosure> contributes a (url, type) pair, both UTF-8 encoded; the
# call that receives the pair is on a line missing from this listing.
137 for current in item.getElementsByTagName("enclosure"):
139 current.getAttribute("url").encode("UTF-8"),
140 current.getAttribute("type").encode("UTF-8")
# Fields of the record built for this item, UTF-8 encoded; the surrounding
# container expression is not visible here.
145 title.encode("UTF-8"),
146 link.encode("UTF-8"),
147 summary.encode("UTF-8"),
# Record the guid so the membership test above skips this item next time.
151 self.last_ids.add(guid)
153 # Prepend the new items to the known items and eventually cut the list
154 self.history = new_items + self.history
# NOTE(review): this is a bare expression statement -- the slice is built and
# thrown away, so self.history is NOT shortened and can grow without bound.
# Likely intended: self.history = self.history[:self.MAX_HISTORY_ELEMENTS]
155 self.history[:self.MAX_HISTORY_ELEMENTS]
# Feed that inspects the document root on its first gotDom() call to decide
# between RSS and Atom, then forwards to RSSFeed.gotDom or AtomFeed.gotDom.
# NOTE(review): BaseFeed is listed before its own subclasses RSSFeed and
# AtomFeed. For new-style classes (always the case on Python 3) no consistent
# C3 method resolution order exists for that base order and the class statement
# raises TypeError; it is only accepted for Python 2 classic classes. BaseFeed's
# class statement is not visible in this listing -- confirm which applies.
# NOTE(review): the embedded line numbers have gaps (162 -> 165, 168 -> 171,
# 188 -> 192), so statements are missing from this listing; in particular no
# assignment to `self.type` is visible although it is tested below.
159 class UniversalFeed(BaseFeed, RSSFeed, AtomFeed):
160 """Universal Feed which on first run determines its type and calls the correct parsing-functions"""
161 def __init__(self, uri, autoupdate, stripper):
# Reuse the base initialiser for uri / autoupdate / stripper handling.
162 BaseFeed.__init__(self, uri, autoupdate, stripper)
# dom: presumably an xml.dom (minidom-style) document; documentElement,
# getAttribute(), localName and getElementsByTagName() are used on it.
165 def gotDom(self, dom):
# Type detection only runs while no type has been determined yet.
166 if self.type is None:
# Root element carries one of the listed RSS 0.9x / 2.0 `version` values:
# title and description come from the first <channel>.
168 if dom.documentElement.getAttribute("version") in ["2.0", "0.94", "0.93", "0.92", "0.91"]:
171 self.title = dom.getElementsByTagName("channel")[0].getElementsByTagName("title")[0].childNodes[0].data
172 self.description = dom.getElementsByTagName("channel")[0].getElementsByTagName("description")[0].childNodes[0].data
175 # RSS 1.0 (NS: http://www.w3.org/1999/02/22-rdf-syntax-ns#)
176 elif dom.documentElement.localName == "RDF":
179 self.title = dom.getElementsByTagName("channel")[0].getElementsByTagName("title")[0].childNodes[0].data
180 self.description = dom.getElementsByTagName("channel")[0].getElementsByTagName("description")[0].childNodes[0].data
183 # Atom (NS: http://www.w3.org/2005/Atom)
184 elif dom.documentElement.localName == "feed":
# Atom: first <title> and first <subtitle> found anywhere in the document.
187 self.title = dom.getElementsByTagName("title")[0].childNodes[0].data
188 self.description = dom.getElementsByTagName("subtitle")[0].childNodes[0].data
# Any other root element is rejected.
# NOTE(review): `raise Class, message` is Python 2-only syntax;
# `raise NotImplementedError(message)` is valid on both Python 2 and 3.
192 raise NotImplementedError, 'Unsupported Feed: %s' % dom.documentElement.localName
# Clean the raw title / description with the stripper and store them UTF-8 encoded.
193 self.title = self.stripper.strip(self.title).encode("UTF-8")
194 self.description = self.stripper.strip_readable(self.description).encode("UTF-8")
# Dispatch through the explicit class so the matching parser is used.
# NOTE(review): the print statements below are Python 2-only syntax, and no
# branch is visible for a `self.type` other than "rss" / "atom" -- in that
# case the visible code returns nothing (None); confirm in the full file.
195 if self.type == "rss":
196 print "[SimpleRSS] type is rss"
197 return RSSFeed.gotDom(self, dom)
198 elif self.type == "atom":
199 print "[SimpleRSS] type is atom"
200 return AtomFeed.gotDom(self, dom)