2 from TagStrip import strip, strip_readable
3 from Components.Scanner import ScanFile
# Common state for every feed type; AtomFeed and RSSFeed further down name
# BaseFeed as their base class.
# NOTE(review): this listing carries stray leading line numbers and skips some
# numbered lines -- the `class BaseFeed` header and several attribute
# initialisers are not shown here. Presumably extraction damage; confirm
# against the original file before relying on this block.
6 """Base-class for all Feeds. Initializes needed Elements."""
# Upper bound used as the slice limit for `self.history` in the parse() methods.
7 MAX_HISTORY_ELEMENTS = 100
# uri:        feed address; also used as the initial title (see below).
# autoupdate: stored unchanged on the instance; its consumers are outside
#             this listing.
9 def __init__(self, uri, autoupdate):
10 # Set URI (used as Identifier)
14 self.autoupdate = autoupdate
# Placeholder title: the UTF-8 encoded URI. UniversalFeed.gotDom overwrites it
# with the title read from the feed once the feed type has been detected.
17 self.title = uri.encode("UTF-8")
# No update stamp recorded yet; the gotDom() methods compare the feed's stamp
# against this value to decide whether anything changed.
19 self.last_update = None
class AtomFeed(BaseFeed):
    """Parses an Atom-Feed into expected format.

    Every parsed entry becomes a tuple ``(title, link, summary, enclosures)``
    whose text members are UTF-8 encoded and whose last member is a list of
    ScanFile objects, one per ``<link rel="enclosure">``.
    """

    def gotDom(self, dom):
        """Parse a downloaded Atom document.

        dom -- DOM document of the feed.

        Returns the list of entries that were not known before. The list is
        empty when the first ``<updated>`` stamp of the document equals the
        one remembered from the previous call.
        """
        try:
            updated = dom.getElementsByTagName("updated")[0].childNodes[0].data
        except (IndexError, AttributeError):
            # No usable stamp in this document: parse all entries and rely on
            # the id check in parse() to filter out the known ones.
            pass
        else:
            if self.last_update == updated:
                return []
            self.last_update = updated

        return AtomFeed.parse(self, dom.getElementsByTagName("entry"))

    def parse(self, items):
        """Convert ``<entry>`` elements into item tuples.

        items -- iterable of ``<entry>`` DOM elements.

        Entries without a readable title or id, and entries whose id is
        already in ``self.last_ids``, are skipped. New entries are put in
        front of ``self.history``, which is then cut to MAX_HISTORY_ELEMENTS.

        Returns the list of newly found item tuples.
        """
        new_items = []

        for item in items:
            enclosure = []
            link = ""

            # Try to read title, continue if none found
            try:
                title = strip(item.getElementsByTagName("title")[0].childNodes[0].data)
            except (IndexError, AttributeError):
                continue

            # Try to read id, continue if none found (invalid feed) or if the
            # entry was already delivered
            try:
                entry_id = item.getElementsByTagName("id")[0].childNodes[0].data
            except (IndexError, AttributeError):
                continue
            if entry_id in self.last_ids:
                continue

            # Read out enclosures and link
            for current in item.getElementsByTagName("link"):
                if current.getAttribute("rel") == "enclosure":
                    href = current.getAttribute("href").encode("UTF-8")
                    mimetype = current.getAttribute("type").encode("UTF-8")
                    size = None
                    if current.hasAttribute("length"):
                        try:
                            # 1048576 = 1024 * 1024: the length attribute is
                            # converted to whole units of that size.
                            size = int(current.getAttribute("length")) // 1048576
                        except ValueError:
                            # Malformed length: keep the enclosure, drop the size.
                            size = None
                    enclosure.append(ScanFile(href, mimetype=mimetype, size=size, autodetect=False))
                else:
                    # No enclosure, assume it's a link to the item
                    link = current.getAttribute("href")

            # Try to read summary, empty if none
            try:
                summary = strip_readable(item.getElementsByTagName("summary")[0].childNodes[0].data)
            except (IndexError, AttributeError):
                summary = ""

            new_items.append((
                title.encode("UTF-8"),
                link.encode("UTF-8"),
                summary.encode("UTF-8"),
                enclosure,
            ))
            self.last_ids.add(entry_id)

        # Prepend the new items to the known ones and cut the list. The slice
        # result has to be assigned back, a bare slice expression is a no-op.
        self.history = (new_items + self.history)[:self.MAX_HISTORY_ELEMENTS]

        return new_items
class RSSFeed(BaseFeed):
    """Parses an RSS-Feed into expected format.

    Every parsed item becomes a tuple ``(title, link, summary, enclosures)``
    whose text members are UTF-8 encoded and whose last member is a list of
    ScanFile objects, one per ``<enclosure>`` element.
    """

    def gotDom(self, dom):
        """Parse a downloaded RSS document.

        dom -- DOM document of the feed.

        Returns the list of items that were not known before. The list is
        empty when the first ``<lastBuildDate>`` of the document equals the
        one remembered from the previous call.
        """
        try:
            updated = dom.getElementsByTagName("lastBuildDate")[0].childNodes[0].data
        except (IndexError, AttributeError):
            # No usable stamp in this document: parse all items and rely on
            # the guid check in parse() to filter out the known ones.
            pass
        else:
            if self.last_update == updated:
                return []
            self.last_update = updated

        return RSSFeed.parse(self, dom.getElementsByTagName("item"))

    def parse(self, items):
        """Convert ``<item>`` elements into item tuples.

        items -- iterable of ``<item>`` DOM elements.

        Items without a readable title, and items whose guid (or link, when
        there is no guid) is already in ``self.last_ids``, are skipped. New
        items are put in front of ``self.history``, which is then cut to
        MAX_HISTORY_ELEMENTS.

        Returns the list of newly found item tuples.
        """
        new_items = []

        for item in items:
            enclosure = []

            # Try to read title, continue if none found
            try:
                title = strip(item.getElementsByTagName("title")[0].childNodes[0].data)
            except (IndexError, AttributeError):
                continue

            # Try to read link, empty if none
            try:
                link = item.getElementsByTagName("link")[0].childNodes[0].data
            except (IndexError, AttributeError):
                link = ""

            # Try to read guid, link if none (RSS 1.0 or invalid RSS 2.0)
            try:
                guid = item.getElementsByTagName("guid")[0].childNodes[0].data
            except (IndexError, AttributeError):
                guid = link

            # Continue if item is to be excluded
            if guid in self.last_ids:
                continue

            # Try to read summary (description element), empty if none
            try:
                summary = strip_readable(item.getElementsByTagName("description")[0].childNodes[0].data)
            except (IndexError, AttributeError):
                summary = ""

            # Read out enclosures
            for current in item.getElementsByTagName("enclosure"):
                href = current.getAttribute("url").encode("UTF-8")
                mimetype = current.getAttribute("type").encode("UTF-8")
                size = None
                if current.hasAttribute("length"):
                    try:
                        # 1048576 = 1024 * 1024: the length attribute is
                        # converted to whole units of that size.
                        size = int(current.getAttribute("length")) // 1048576
                    except ValueError:
                        # Malformed length: keep the enclosure, drop the size.
                        size = None
                enclosure.append(ScanFile(href, mimetype=mimetype, size=size, autodetect=False))

            new_items.append((
                title.encode("UTF-8"),
                link.encode("UTF-8"),
                summary.encode("UTF-8"),
                enclosure,
            ))
            self.last_ids.add(guid)

        # Prepend the new items to the known ones and cut the list. The slice
        # result has to be assigned back, a bare slice expression is a no-op.
        self.history = (new_items + self.history)[:self.MAX_HISTORY_ELEMENTS]

        return new_items
# BaseFeed is reached through RSSFeed/AtomFeed. Naming it first in the base
# list only works with classic classes and fails with a TypeError as soon as
# the hierarchy is linearised with C3 (new-style classes / Python 3).
class UniversalFeed(RSSFeed, AtomFeed):
    """Universal Feed which on first run determines its type and calls the correct parsing-functions"""

    def __init__(self, uri, autoupdate):
        """Set up the shared feed state and mark the feed type as undetermined.

        uri        -- feed address, handed on to BaseFeed.
        autoupdate -- handed on to BaseFeed unchanged.
        """
        BaseFeed.__init__(self, uri, autoupdate)

        # None: not detected yet. Becomes "rss", "atom" or "unknown" in gotDom.
        self.type = None

    def gotDom(self, dom):
        """Parse a downloaded feed, detecting its flavour on the first call.

        dom -- DOM document of the feed.

        Returns whatever RSSFeed.gotDom / AtomFeed.gotDom return for the
        detected type. Raises NotImplementedError when the document element
        is neither RSS (0.91-2.0), RDF (RSS 1.0) nor an Atom ``feed``; after
        that the type stays "unknown" and later calls return None.
        """
        if self.type == "rss":
            print("[SimpleRSS] type is rss")
            return RSSFeed.gotDom(self, dom)
        elif self.type == "atom":
            print("[SimpleRSS] type is atom")
            return AtomFeed.gotDom(self, dom)
        elif self.type is None:
            root = dom.documentElement

            # RSS 2.0 and older (version attribute) or
            # RSS 1.0 (NS: http://www.w3.org/1999/02/22-rdf-syntax-ns#)
            if root.getAttribute("version") in ("2.0", "0.94", "0.93", "0.92", "0.91") \
                    or root.localName == "RDF":
                self.type = "rss"
                try:
                    channel = dom.getElementsByTagName("channel")[0]
                    self.title = channel.getElementsByTagName("title")[0].childNodes[0].data
                    self.description = channel.getElementsByTagName("description")[0].childNodes[0].data
                except (IndexError, AttributeError):
                    # Best effort: keep the values already on the instance.
                    pass
            # Atom (NS: http://www.w3.org/2005/Atom)
            elif root.localName == "feed":
                self.type = "atom"
                try:
                    self.title = dom.getElementsByTagName("title")[0].childNodes[0].data
                    self.description = dom.getElementsByTagName("subtitle")[0].childNodes[0].data
                except (IndexError, AttributeError):
                    # Best effort: keep the values already on the instance.
                    pass
            else:
                self.type = "unknown"
                raise NotImplementedError('Unsupported Feed: %s' % root.localName)

            # NOTE(review): self.description is not assigned in this class
            # before this point; presumably BaseFeed provides a default.
            self.title = strip(self.title).encode("UTF-8")
            self.description = strip_readable(self.description).encode("UTF-8")

            # Re-run function to parse dom
            return self.gotDom(dom)