Fix missing return statement which prevented fetching new_items on first run
[enigma2-plugins.git] / simplerss / src / RSSFeed.py
1 from sets import Set
2 from TagStrip import strip, strip_readable
3 from Components.Scanner import ScanFile
4
class BaseFeed:
        """Common state holder every concrete feed class builds upon.

        Attributes set by the constructor:
          uri         -- address of the feed, also used to identify it
          autoupdate  -- the autoupdate flag exactly as passed in
          title       -- starts out as the UTF-8 encoded uri
          description -- starts out empty
          last_update -- last seen update marker of the feed, None initially
          last_ids    -- set of item identifiers which were already delivered
          history     -- list of known items, empty initially
        """

        # Intended upper bound for the number of entries kept in self.history
        MAX_HISTORY_ELEMENTS = 100

        def __init__(self, uri, autoupdate):
                # What the caller handed us: identifier and autoupdate flag
                self.uri, self.autoupdate = uri, autoupdate

                # Until something better is known the uri serves as title
                self.title = uri.encode("UTF-8")

                # Nothing was fetched so far
                self.history = []
                self.last_ids = set()
                self.last_update = None
                self.description = ""
22
class AtomFeed(BaseFeed):
        """Parses an Atom-Feed into expected format."""

        def gotDom(self, dom):
                """Return the entries of a freshly fetched document which are new.

                dom is the parsed feed document. If its first "updated" element
                carries the same text as on the previous call an empty list is
                returned, otherwise all "entry" elements are handed to parse().
                """
                try:
                        updated = dom.getElementsByTagName("updated")[0].childNodes[0].data
                except Exception:
                        # No readable timestamp: we can not tell if anything changed, so parse
                        pass
                else:
                        if self.last_update == updated:
                                return []
                        self.last_update = updated

                # Called through the class on purpose: on a subclass which also inherits
                # another parser (see UniversalFeed) self.parse might not be this one
                return AtomFeed.parse(self, dom.getElementsByTagName("entry"))

        def parse(self, items):
                """Convert "entry" elements into item tuples and remember them.

                items is an iterable of DOM elements. Entries without readable
                title or id, and entries whose id is in self.last_ids, are skipped.

                Returns the list of new items, each a tuple
                (title, link, summary, enclosures) with the three texts encoded
                as UTF-8 and enclosures being a list of ScanFile objects. The new
                items are also put in front of self.history, which is cut down to
                MAX_HISTORY_ELEMENTS, and their ids are added to self.last_ids.
                """
                new_items = []
                for item in items:
                        enclosure = []
                        link = ""

                        # Try to read title, continue if none found
                        try:
                                title = strip(item.getElementsByTagName("title")[0].childNodes[0].data)
                        except Exception:
                                continue

                        # Try to read id, continue if none found (invalid feed, should be handled differently)
                        try:
                                entry_id = item.getElementsByTagName("id")[0].childNodes[0].data
                        except Exception:
                                continue

                        # Item was already delivered earlier
                        if entry_id in self.last_ids:
                                continue

                        # Read out enclosures and link
                        for current in item.getElementsByTagName("link"):
                                # No Enclosure, assume its a link to the item (the last one wins)
                                if current.getAttribute("rel") != "enclosure":
                                        link = current.getAttribute("href")
                                        continue

                                href = current.getAttribute("href").encode("UTF-8")
                                mimetype = current.getAttribute("type").encode("UTF-8")

                                # length divided by 1048576 (rounded down), None if missing.
                                # A length which is no number must not abort the whole feed.
                                size = None
                                if current.hasAttribute("length"):
                                        try:
                                                size = int(current.getAttribute("length")) // 1048576
                                        except ValueError:
                                                size = None

                                # Workaround so PicturePlayer does not try to open these
                                if mimetype in ["image/jpeg", "image/png", "image/gif", "image/bmp"]:
                                        mimetype = None
                                enclosure.append(ScanFile(href, mimetype = mimetype, size = size, autodetect = False))

                        # Try to read summary, empty if none
                        try:
                                summary = strip_readable(item.getElementsByTagName("summary")[0].childNodes[0].data)
                        except Exception:
                                summary = ""

                        # Update Lists
                        new_items.append((
                                        title.encode("UTF-8"),
                                        link.encode("UTF-8"),
                                        summary.encode("UTF-8"),
                                        enclosure
                        ))
                        self.last_ids.add(entry_id)

                # Append known Items to new Items and cut the result. The cut has to be
                # applied to the list itself, a bare slice expression changes nothing.
                self.history = new_items + self.history
                del self.history[self.MAX_HISTORY_ELEMENTS:]

                return new_items
95
class RSSFeed(BaseFeed):
        """Parses an RSS-Feed into expected format."""

        def gotDom(self, dom):
                """Return the items of a freshly fetched document which are new.

                dom is the parsed feed document. If its first "lastBuildDate"
                element carries the same text as on the previous call an empty
                list is returned, otherwise all "item" elements go to parse().
                """
                try:
                        updated = dom.getElementsByTagName("lastBuildDate")[0].childNodes[0].data
                except Exception:
                        # No readable timestamp: we can not tell if anything changed, so parse
                        pass
                else:
                        if self.last_update == updated:
                                return []
                        self.last_update = updated

                # Called through the class on purpose: on a subclass which also inherits
                # another parser (see UniversalFeed) self.parse might not be this one
                return RSSFeed.parse(self, dom.getElementsByTagName("item"))

        def parse(self, items):
                """Convert "item" elements into item tuples and remember them.

                items is an iterable of DOM elements. Items without readable title
                and items whose guid (or link, if there is no guid) is in
                self.last_ids are skipped.

                Returns the list of new items, each a tuple
                (title, link, summary, enclosures) with the three texts encoded
                as UTF-8 and enclosures being a list of ScanFile objects. The new
                items are also put in front of self.history, which is cut down to
                MAX_HISTORY_ELEMENTS, and their guids are added to self.last_ids.
                """
                new_items = []
                for item in items:
                        enclosure = []

                        # Try to read title, continue if none found
                        try:
                                title = strip(item.getElementsByTagName("title")[0].childNodes[0].data)
                        except Exception:
                                continue

                        # Try to read link, empty if none
                        try:
                                link = item.getElementsByTagName("link")[0].childNodes[0].data
                        except Exception:
                                link = ""

                        # Try to read guid, link if none (RSS 1.0 or invalid RSS 2.0)
                        try:
                                guid = item.getElementsByTagName("guid")[0].childNodes[0].data
                        except Exception:
                                guid = link

                        # Continue if item is to be excluded
                        if guid in self.last_ids:
                                continue

                        # Try to read summary (description element), empty if none
                        try:
                                summary = strip_readable(item.getElementsByTagName("description")[0].childNodes[0].data)
                        except Exception:
                                summary = ""

                        # Read out enclosures
                        for current in item.getElementsByTagName("enclosure"):
                                href = current.getAttribute("url").encode("UTF-8")
                                mimetype = current.getAttribute("type").encode("UTF-8")

                                # length divided by 1048576 (rounded down), None if missing.
                                # A length which is no number must not abort the whole feed.
                                size = None
                                if current.hasAttribute("length"):
                                        try:
                                                size = int(current.getAttribute("length")) // 1048576
                                        except ValueError:
                                                size = None

                                # Workaround so PicturePlayer does not try to open these
                                if mimetype in ["image/jpeg", "image/png", "image/gif", "image/bmp"]:
                                        mimetype = None
                                enclosure.append(ScanFile(href, mimetype = mimetype, size = size, autodetect = False))

                        # Update Lists
                        new_items.append((
                                        title.encode("UTF-8"),
                                        link.encode("UTF-8"),
                                        summary.encode("UTF-8"),
                                        enclosure
                        ))
                        self.last_ids.add(guid)

                # Append known Items to new Items and cut the result. The cut has to be
                # applied to the list itself, a bare slice expression changes nothing.
                self.history = new_items + self.history
                del self.history[self.MAX_HISTORY_ELEMENTS:]

                return new_items
171
# BaseFeed is not listed as a base again: both parsers already derive from it,
# attribute lookup stays the same and the hierarchy remains resolvable should
# the classes ever become new-style.
class UniversalFeed(RSSFeed, AtomFeed):
        """Universal Feed which on first run determines its type and calls the correct parsing-functions"""

        # Values of the root elements "version" attribute which are handled as RSS
        _RSS_VERSIONS = ("2.0", "0.94", "0.93", "0.92", "0.91")

        def __init__(self, uri, autoupdate):
                BaseFeed.__init__(self, uri, autoupdate)

                # None until the first document was seen, then "rss", "atom" or "unknown"
                self.type = None

        def gotDom(self, dom):
                """Parse a fetched document with the parser matching the feed type.

                On the first call the type is determined from the document and
                title and description of the feed are read.

                Returns whatever the gotDom of the matching parser returns.
                Returns None if an earlier call already found the feed to be of an
                unsupported type.
                Raises NotImplementedError on the call which detects an
                unsupported type.
                """
                if self.type is None:
                        self._detectType(dom)

                if self.type == "rss":
                        print("[SimpleRSS] type is rss")
                        return RSSFeed.gotDom(self, dom)
                if self.type == "atom":
                        print("[SimpleRSS] type is atom")
                        return AtomFeed.gotDom(self, dom)

                # type is "unknown"
                return None

        def _detectType(self, dom):
                """Set self.type, self.title and self.description from the document."""
                root = dom.documentElement

                # RSS 2.0 and older, or RSS 1.0 (NS: http://www.w3.org/1999/02/22-rdf-syntax-ns#)
                if root.getAttribute("version") in self._RSS_VERSIONS or root.localName == "RDF":
                        self.type = "rss"
                        try:
                                channel = dom.getElementsByTagName("channel")[0]
                                self.title = channel.getElementsByTagName("title")[0].childNodes[0].data
                                self.description = channel.getElementsByTagName("description")[0].childNodes[0].data
                        except Exception:
                                # Best effort, keep what we have
                                pass
                # Atom (NS: http://www.w3.org/2005/Atom)
                elif root.localName == "feed":
                        self.type = "atom"
                        try:
                                self.title = dom.getElementsByTagName("title")[0].childNodes[0].data
                                self.description = dom.getElementsByTagName("subtitle")[0].childNodes[0].data
                        except Exception:
                                # Best effort, keep what we have
                                pass
                else:
                        self.type = "unknown"
                        raise NotImplementedError('Unsupported Feed: %s' % root.localName)

                self.title = strip(self.title).encode("UTF-8")
                self.description = strip_readable(self.description).encode("UTF-8")