allow bogus rss version 2.00
[enigma2-plugins.git] / simplerss / src / RSSFeed.py
1 from sets import Set
2 from TagStrip import strip, strip_readable
3 from Components.Scanner import ScanFile
4
class BaseFeed:
    """Common state shared by every feed implementation.

    Attributes set up by the constructor:
      uri          -- the feed address as given by the caller; doubles as identifier
      autoupdate   -- the autoupdate setting as given by the caller
      title        -- UTF-8 encoded uri, used as placeholder title
      description  -- empty string
      last_update  -- None
      last_ids     -- empty set
      history      -- empty list
    """

    # Upper bound for the number of entries kept in self.history
    MAX_HISTORY_ELEMENTS = 100

    def __init__(self, uri, autoupdate):
        """Store uri and autoupdate and reset all parsing state."""
        # Parsing state: every instance gets its own empty containers
        self.history = []
        self.last_ids = set()
        self.last_update = None

        # Descriptive texts: the encoded URI stands in as title
        self.description = ""
        self.title = uri.encode("UTF-8")

        # Caller supplied settings
        self.autoupdate = autoupdate
        self.uri = uri
22
class AtomFeed(BaseFeed):
    """Parses an Atom-Feed into expected format."""

    def gotDom(self, dom):
        """Return the entries of a parsed Atom document that were not seen before.

        dom -- DOM document of the feed (only getElementsByTagName is used here).

        The text of the first <updated> element is remembered in
        self.last_update. When it equals the remembered value, an empty list
        is returned without looking at the entries. When it cannot be read,
        the entries are parsed unconditionally.

        Returns the list produced by AtomFeed.parse.
        """
        try:
            updated = dom.getElementsByTagName("updated")[0].childNodes[0].data
        except Exception:
            # No readable timestamp: fall through and parse the entries
            pass
        else:
            if self.last_update == updated:
                return []
            self.last_update = updated
        return AtomFeed.parse(self, dom.getElementsByTagName("entry"))

    def parse(self, items):
        """Add unseen entries to the history and return them.

        items -- iterable of <entry> DOM elements.

        Entries without a readable <title> or <id>, and entries whose id is
        already in self.last_ids, are skipped. Every accepted entry is stored
        as a tuple (title, link, summary, enclosures): the three texts are
        UTF-8 encoded and enclosures is a list of ScanFile objects. New
        entries are inserted at the front of self.history in feed order and
        the history is then cut to MAX_HISTORY_ELEMENTS.

        Returns the leading slice of self.history holding the entries added
        by this call.
        """
        idx = 0
        for item in items:
            # Try to read title, continue if none found
            try:
                title = strip(item.getElementsByTagName("title")[0].childNodes[0].data)
            except Exception:
                continue

            # Try to read id, continue if none found (invalid feed) or already known
            try:
                entry_id = item.getElementsByTagName("id")[0].childNodes[0].data
            except Exception:
                continue
            if entry_id in self.last_ids:
                continue

            # Read out enclosures and link
            enclosure = []
            alternate = None  # last link that is rel="alternate" or has no rel
            fallback = ""     # last link of any other relation
            for current in item.getElementsByTagName("link"):
                rel = current.getAttribute("rel")
                if rel == "enclosure":
                    href = current.getAttribute("href").encode("UTF-8")
                    mimetype = current.getAttribute("type").encode("UTF-8")
                    size = None
                    if current.hasAttribute("length"):
                        # 1048576 = 1024 * 1024; presumably bytes -> MiB.
                        # A malformed length must not abort the whole parse.
                        try:
                            size = int(current.getAttribute("length")) // 1048576
                        except ValueError:
                            size = None

                    enclosure.append(ScanFile(href, mimetype=mimetype, size=size, autodetect=False))
                elif not rel or rel == "alternate":
                    # A missing rel means "alternate": this is the link to the item
                    alternate = current.getAttribute("href")
                else:
                    # Other relations (self, edit, replies, ...) are only used
                    # when the entry offers no alternate link at all
                    fallback = current.getAttribute("href")

            if alternate is not None:
                link = alternate
            else:
                link = fallback

            # Try to read summary, empty if none
            try:
                summary = strip_readable(item.getElementsByTagName("summary")[0].childNodes[0].data)
            except Exception:
                summary = ""

            # Update Lists
            self.history.insert(idx, (
                title.encode("UTF-8"),
                link.encode("UTF-8"),
                summary.encode("UTF-8"),
                enclosure,
            ))
            self.last_ids.add(entry_id)

            idx += 1

        # Eventually cut history
        del self.history[self.MAX_HISTORY_ELEMENTS:]

        return self.history[:idx]
93
class RSSFeed(BaseFeed):
    """Parses an RSS-Feed into expected format."""

    def gotDom(self, dom):
        """Return the items of a parsed RSS document that were not seen before.

        dom -- DOM document of the feed (only getElementsByTagName is used here).

        The text of the first <lastBuildDate> element is remembered in
        self.last_update. When it equals the remembered value, an empty list
        is returned without looking at the items. When it cannot be read,
        the items are parsed unconditionally.

        Returns the list produced by RSSFeed.parse.
        """
        try:
            updated = dom.getElementsByTagName("lastBuildDate")[0].childNodes[0].data
        except Exception:
            # No readable build date: fall through and parse the items
            pass
        else:
            if self.last_update == updated:
                return []
            self.last_update = updated
        return RSSFeed.parse(self, dom.getElementsByTagName("item"))

    def parse(self, items):
        """Add unseen items to the history and return them.

        items -- iterable of <item> DOM elements.

        Items without a readable <title> are skipped. An item is identified
        by its <guid>, falling back to its <link> and finally to its title.
        Items whose identifier is already in self.last_ids are skipped. Every
        accepted item is stored as a tuple (title, link, summary, enclosures):
        the three texts are UTF-8 encoded and enclosures is a list of ScanFile
        objects. New items are inserted at the front of self.history in feed
        order and the history is then cut to MAX_HISTORY_ELEMENTS.

        Returns the leading slice of self.history holding the items added by
        this call.
        """
        idx = 0
        for item in items:
            # Try to read title, continue if none found
            try:
                title = strip(item.getElementsByTagName("title")[0].childNodes[0].data)
            except Exception:
                continue

            # Try to read link, empty if none
            try:
                link = item.getElementsByTagName("link")[0].childNodes[0].data
            except Exception:
                link = ""

            # Try to read guid, link if none (RSS 1.0 or invalid RSS 2.0).
            # With neither guid nor link the title is used, so that such
            # items do not all share the empty string as identifier.
            try:
                guid = item.getElementsByTagName("guid")[0].childNodes[0].data
            except Exception:
                guid = link or title

            # Continue if item is to be excluded
            if guid in self.last_ids:
                continue

            # Try to read summary (description element), empty if none
            try:
                summary = strip_readable(item.getElementsByTagName("description")[0].childNodes[0].data)
            except Exception:
                summary = ""

            # Read out enclosures
            enclosure = []
            for current in item.getElementsByTagName("enclosure"):
                href = current.getAttribute("url").encode("UTF-8")
                mimetype = current.getAttribute("type").encode("UTF-8")
                size = None
                if current.hasAttribute("length"):
                    # 1048576 = 1024 * 1024; presumably bytes -> MiB.
                    # A malformed length must not abort the whole parse.
                    try:
                        size = int(current.getAttribute("length")) // 1048576
                    except ValueError:
                        size = None

                enclosure.append(ScanFile(href, mimetype=mimetype, size=size, autodetect=False))

            # Update Lists
            self.history.insert(idx, (
                title.encode("UTF-8"),
                link.encode("UTF-8"),
                summary.encode("UTF-8"),
                enclosure,
            ))
            self.last_ids.add(guid)

            idx += 1

        # Eventually cut history
        del self.history[self.MAX_HISTORY_ELEMENTS:]

        return self.history[:idx]
167
# BaseFeed is reached through RSSFeed/AtomFeed. Listing it explicitly in front
# of its own subclasses is redundant and yields an inconsistent method
# resolution order as soon as the classes are new-style.
class UniversalFeed(RSSFeed, AtomFeed):
    """Universal Feed which on first run determines its type and calls the correct parsing-functions"""

    def __init__(self, uri, autoupdate):
        """Initialise the common feed state; the feed type is not yet known."""
        BaseFeed.__init__(self, uri, autoupdate)
        # One of None (not detected yet), "rss", "atom" or "unknown"
        self.type = None

    def gotDom(self, dom):
        """Detect the feed type on first use and delegate to the matching parser.

        dom -- DOM document of the feed.

        On the first call the root element decides the type: a known RSS
        "version" attribute or an RDF root selects "rss", a "feed" root
        selects "atom". Title and description of the feed are then read
        (best effort), stripped and stored UTF-8 encoded before the document
        is handed to the parser.

        Returns the result of RSSFeed.gotDom / AtomFeed.gotDom, or None once
        the feed has been marked as "unknown".

        Raises NotImplementedError on the call that discovers an unsupported
        root element.
        """
        if self.type == "rss":
            print("[SimpleRSS] type is rss")
            return RSSFeed.gotDom(self, dom)
        if self.type == "atom":
            print("[SimpleRSS] type is atom")
            return AtomFeed.gotDom(self, dom)
        if self.type is not None:
            # Detection already failed on an earlier call; nothing to parse
            return None

        root = dom.documentElement
        # RSS 2.0 and predecessors carry a version attribute,
        # RSS 1.0 has an RDF root (NS: http://www.w3.org/1999/02/22-rdf-syntax-ns#)
        if root.getAttribute("version") in ("2.0", "2.00", "0.94", "0.93", "0.92", "0.91") \
                or root.localName == "RDF":
            self.type = "rss"
            description_tag = "description"
        # Atom (NS: http://www.w3.org/2005/Atom)
        elif root.localName == "feed":
            self.type = "atom"
            description_tag = "subtitle"
        else:
            self.type = "unknown"
            raise NotImplementedError('Unsupported Feed: %s' % root.localName)

        # Best effort: keep whatever was set before if the feed lacks these elements
        try:
            if self.type == "rss":
                header = dom.getElementsByTagName("channel")[0]
            else:
                header = dom
            self.title = header.getElementsByTagName("title")[0].childNodes[0].data
            self.description = header.getElementsByTagName(description_tag)[0].childNodes[0].data
        except Exception:
            pass

        title = strip(self.title)
        # When no title could be read, self.title is still the already
        # encoded URI from BaseFeed.__init__; only text needs encoding.
        if isinstance(title, type(u"")):
            title = title.encode("UTF-8")
        self.title = title
        self.description = strip_readable(self.description).encode("UTF-8")

        # Re-run function to parse dom
        return self.gotDom(dom)