1 from TagStrip import strip, strip_readable
2 from Components.Scanner import ScanFile
# XML namespace prefixes written in "{uri}" form, i.e. the way they appear at
# the start of a namespaced element's tag. gotFeed() compares feed.tag
# against each of them with startswith() to recognise the feed flavour.
# RDF syntax namespace
4 NS_RDF = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}"
# RSS 0.9 namespace
5 NS_RSS_09 = "{http://my.netscape.com/rdf/simple/0.9/}"
# RSS 1.0 namespace
6 NS_RSS_10 = "{http://purl.org/rss/1.0/}"
8 # based on http://effbot.org/zone/element-rss-wrapper.htm
10 def __init__(self, element, ns = ""):
11 self._element = element
14 def __getattr__(self, tag):
15 if tag.startswith('__'):
16 raise AttributeError(tag)
17 return self._element.findtext(self._ns + tag)
19 class RSSEntryWrapper(ElementWrapper):
20 def __getattr__(self, tag):
21 if tag == "enclosures":
23 for elem in self._element.findall(self._ns + 'enclosure'):
24 length = elem.get("length")
26 length = int(length) / 1048576
29 mimetype = elem.get("type"),
35 return self._element.findtext(self._ns + 'guid', self.title + self.link)
36 elif tag == "updated":
38 elif tag == "summary":
40 return ElementWrapper.__getattr__(self, tag)
42 class PEAEntryWrapper(ElementWrapper):
43 def __getattr__(self, tag):
45 for elem in self._element.findall(self._ns + tag):
46 if not elem.get("rel") == "enclosure":
47 return elem.get("href")
49 elif tag == "enclosures":
51 for elem in self._element.findall(self._ns + 'link'):
52 if elem.get("rel") == "enclosure":
53 length = elem.get("length")
55 length = int(length) / 1048576
58 mimetype = elem.get("type"),
63 return ElementWrapper.__getattr__(self, tag)
65 class RSSWrapper(ElementWrapper):
66 def __init__(self, channel, items, ns = ""):
68 ElementWrapper.__init__(self, channel, ns)
72 self.len = len(self)-1
83 return len(self._items)
def __getitem__(self, index):
    """Return the item stored at *index*, wrapped in an RSSEntryWrapper.

    The wrapper is created with this feed's namespace prefix (self._ns).
    Indexing errors from self._items propagate unchanged.
    """
    raw_item = self._items[index]
    return RSSEntryWrapper(raw_item, self._ns)
88 class RSS1Wrapper(RSSWrapper):
89 def __init__(self, feed, ns):
91 self, feed.find(ns + 'channel'),
92 feed.findall(ns + 'item'), ns
95 class RSS2Wrapper(RSSWrapper):
96 def __init__(self, feed, ns):
97 channel = feed.find("channel")
99 self, channel, channel.findall("item")
102 class PEAWrapper(RSSWrapper):
103 def __init__(self, feed, ns):
104 ns = feed.tag[:feed.tag.index("}")+1]
106 self, feed, feed.findall(ns + 'entry'), ns
def __getitem__(self, index):
    """Return the entry stored at *index*, wrapped in a PEAEntryWrapper.

    The wrapper is created with this feed's namespace prefix (self._ns).
    Indexing errors from self._items propagate unchanged.
    """
    raw_entry = self._items[index]
    return PEAEntryWrapper(raw_entry, self._ns)
112 def __getattr__(self, tag):
113 if tag == "description":
115 return ElementWrapper.__getattr__(self, tag)
118 """Base-class for all Feeds. Initializes needed Elements."""
119 MAX_HISTORY_ELEMENTS = 100
121 def __init__(self, uri, title = "", description = ""):
122 # Set URI (used as Identifier)
126 self.title = title or uri.encode("UTF-8")
127 self.description = description
131 return "<%s, \"%s\", \"%s\", %d items>" % (self.__class__, self.title, self.description, len(self.history))
133 class UniversalFeed(BaseFeed):
134 """Feed which can handle rdf, rss and atom feeds utilizing abstraction wrappers."""
135 def __init__(self, uri, autoupdate):
136 BaseFeed.__init__(self, uri)
139 self.autoupdate = autoupdate
142 self.last_update = None
143 self.last_ids = set()
147 def gotWrapper(self, wrapper):
148 updated = wrapper.updated
149 if updated and self.last_update == updated:
155 # Try to read title, continue if none found
156 title = strip(item.title)
160 # Try to read the id; `continue` (skip this item) if none is found (invalid feed or internal error) or if the id is to be excluded
162 if not id or id in ids:
168 # Try to read summary, empty if none
169 summary = strip_readable(item.summary or "")
172 self.history.insert(idx, (
173 title.encode("UTF-8"),
174 link.encode("UTF-8"),
175 summary.encode("UTF-8"),
182 # Trim the history to at most MAX_HISTORY_ELEMENTS entries
183 del self.history[self.MAX_HISTORY_ELEMENTS:]
185 return self.history[:idx]
187 def gotFeed(self, feed):
188 if self.wrapper is not None:
189 wrapper = self.wrapper(feed, self.ns)
191 if feed.tag == "rss":
192 self.wrapper = RSS2Wrapper
193 elif feed.tag.startswith(NS_RDF):
195 self.wrapper = RSS1Wrapper
196 elif feed.tag.startswith(NS_RSS_09):
198 self.wrapper = RSS1Wrapper
199 elif feed.tag.startswith(NS_RSS_10):
201 self.wrapper = RSS1Wrapper
202 elif feed.tag.endswith("feed"):
203 self.wrapper = PEAWrapper
205 raise NotImplementedError, 'Unsupported Feed: %s' % feed.tag
207 wrapper = self.wrapper(feed, self.ns)
209 self.title = strip(wrapper.title).encode("UTF-8")
210 self.description = strip_readable(wrapper.description or "").encode("UTF-8")
212 return self.gotWrapper(wrapper)