Remove Workaround which removed mimetypes of Images.
[enigma2-plugins.git] / simplerss / src / RSSFeed.py
1 from sets import Set
2 from TagStrip import strip, strip_readable
3 from Components.Scanner import ScanFile
4
class BaseFeed:
        """Common state shared by every feed type.

        Holds the identifying URI, the auto-update flag and the bookkeeping
        containers that the concrete parsers fill in.
        """

        # Intended upper bound for the number of entries kept in `history`
        MAX_HISTORY_ELEMENTS = 100

        def __init__(self, uri, autoupdate):
                """Store the feed settings and reset all parser state.

                uri        -- address of the feed, doubles as its identifier
                autoupdate -- flag stored verbatim on the instance
                """
                # Bookkeeping starts out empty
                self.history = []
                self.last_ids = set()
                self.last_update = None

                # Until a feed supplies something better, show the URI as title
                self.description = ""
                self.title = uri.encode("UTF-8")

                # Settings handed in by the caller
                self.autoupdate = autoupdate
                self.uri = uri
22
class AtomFeed(BaseFeed):
        """Parses an Atom-Feed into expected format."""

        def gotDom(self, dom):
                """Return the not yet seen entries of an Atom document.

                dom -- parsed document offering getElementsByTagName()

                The text of the first "updated" element is compared with the
                value remembered from the previous call; when both are equal an
                empty list is returned without looking at the entries.
                """
                try:
                        updated = dom.getElementsByTagName("updated")[0].childNodes[0].data
                except Exception:
                        # No usable timestamp: we cannot short-circuit, parse everything
                        updated = None

                if updated is not None:
                        if self.last_update == updated:
                                return []
                        self.last_update = updated

                # Call through the class: subclasses may inherit several parse()
                # implementations and self.parse could resolve to another one
                return AtomFeed.parse(self, dom.getElementsByTagName("entry"))

        def parse(self, items):
                """Convert "entry" elements into item tuples.

                items -- iterable of DOM elements

                Returns a list of (title, link, summary, enclosures) tuples for
                the entries whose id has not been seen before; the three texts
                are UTF-8 encoded, enclosures is a list of ScanFile objects.
                Entries without title or id are skipped.  New items are put in
                front of self.history, which is then cut down to
                MAX_HISTORY_ELEMENTS entries.
                """
                new_items = []
                for item in items:
                        # Title is mandatory, skip the entry without one
                        try:
                                title = strip(item.getElementsByTagName("title")[0].childNodes[0].data)
                        except Exception:
                                continue

                        # So is the id (an entry without one is invalid)
                        try:
                                entry_id = item.getElementsByTagName("id")[0].childNodes[0].data
                        except Exception:
                                continue

                        # Already delivered earlier
                        if entry_id in self.last_ids:
                                continue

                        # Read out enclosures and link
                        enclosure = []
                        link = ""
                        for current in item.getElementsByTagName("link"):
                                if current.getAttribute("rel") != "enclosure":
                                        # No enclosure, assume it is a link to the item
                                        link = current.getAttribute("href")
                                        continue

                                href = current.getAttribute("href").encode("UTF-8")
                                mimetype = current.getAttribute("type").encode("UTF-8")

                                # Size in whole MiB; a missing or malformed length
                                # must not abort parsing of the complete feed
                                size = None
                                if current.hasAttribute("length"):
                                        try:
                                                size = int(current.getAttribute("length")) // 1048576
                                        except ValueError:
                                                size = None

                                enclosure.append(ScanFile(href, mimetype=mimetype, size=size, autodetect=False))

                        # Summary is optional
                        try:
                                summary = strip_readable(item.getElementsByTagName("summary")[0].childNodes[0].data)
                        except Exception:
                                summary = ""

                        new_items.append((
                                title.encode("UTF-8"),
                                link.encode("UTF-8"),
                                summary.encode("UTF-8"),
                                enclosure
                        ))
                        self.last_ids.add(entry_id)

                # Newest items first; the slice result has to be assigned,
                # otherwise the history is never shortened
                self.history = (new_items + self.history)[:self.MAX_HISTORY_ELEMENTS]

                return new_items
92
class RSSFeed(BaseFeed):
        """Parses an RSS-Feed into expected format."""

        def gotDom(self, dom):
                """Return the not yet seen items of an RSS document.

                dom -- parsed document offering getElementsByTagName()

                The text of the first "lastBuildDate" element is compared with
                the value remembered from the previous call; when both are equal
                an empty list is returned without looking at the items.
                """
                try:
                        updated = dom.getElementsByTagName("lastBuildDate")[0].childNodes[0].data
                except Exception:
                        # No usable timestamp: we cannot short-circuit, parse everything
                        updated = None

                if updated is not None:
                        if self.last_update == updated:
                                return []
                        self.last_update = updated

                # Call through the class: subclasses may inherit several parse()
                # implementations and self.parse could resolve to another one
                return RSSFeed.parse(self, dom.getElementsByTagName("item"))

        def parse(self, items):
                """Convert "item" elements into item tuples.

                items -- iterable of DOM elements

                Returns a list of (title, link, summary, enclosures) tuples for
                the items whose guid (or link, if there is no guid) has not been
                seen before; the three texts are UTF-8 encoded, enclosures is a
                list of ScanFile objects.  Items without title are skipped.  New
                items are put in front of self.history, which is then cut down
                to MAX_HISTORY_ELEMENTS entries.
                """
                new_items = []
                for item in items:
                        # Title is mandatory, skip the item without one
                        try:
                                title = strip(item.getElementsByTagName("title")[0].childNodes[0].data)
                        except Exception:
                                continue

                        # Link is optional
                        try:
                                link = item.getElementsByTagName("link")[0].childNodes[0].data
                        except Exception:
                                link = ""

                        # Identify by guid, use link if none (RSS 1.0 or invalid RSS 2.0)
                        try:
                                guid = item.getElementsByTagName("guid")[0].childNodes[0].data
                        except Exception:
                                guid = link

                        # Already delivered earlier
                        if guid in self.last_ids:
                                continue

                        # Summary (description element) is optional
                        try:
                                summary = strip_readable(item.getElementsByTagName("description")[0].childNodes[0].data)
                        except Exception:
                                summary = ""

                        # Read out enclosures
                        enclosure = []
                        for current in item.getElementsByTagName("enclosure"):
                                href = current.getAttribute("url").encode("UTF-8")
                                mimetype = current.getAttribute("type").encode("UTF-8")

                                # Size in whole MiB; a missing or malformed length
                                # must not abort parsing of the complete feed
                                size = None
                                if current.hasAttribute("length"):
                                        try:
                                                size = int(current.getAttribute("length")) // 1048576
                                        except ValueError:
                                                size = None

                                enclosure.append(ScanFile(href, mimetype=mimetype, size=size, autodetect=False))

                        new_items.append((
                                title.encode("UTF-8"),
                                link.encode("UTF-8"),
                                summary.encode("UTF-8"),
                                enclosure
                        ))
                        self.last_ids.add(guid)

                # Newest items first; the slice result has to be assigned,
                # otherwise the history is never shortened
                self.history = (new_items + self.history)[:self.MAX_HISTORY_ELEMENTS]

                return new_items
165
class UniversalFeed(RSSFeed, AtomFeed):
        """Universal Feed which on first run determines its type and calls the correct parsing-functions

        BaseFeed is reached through both parents, so it is not listed as a
        base of its own: naming it in front of its subclasses yields a base
        order that cannot be linearised for new-style classes.
        """

        # Values of the root element's "version" attribute treated as RSS
        RSS_VERSIONS = ("2.0", "0.94", "0.93", "0.92", "0.91")

        def __init__(self, uri, autoupdate):
                BaseFeed.__init__(self, uri, autoupdate)
                # None until the first document was inspected,
                # afterwards "rss", "atom" or "unknown"
                self.type = None

        def gotDom(self, dom):
                """Parse dom with the parser matching the feed type.

                On the first call the type is derived from the root element and
                title/description are taken from the document.

                Returns whatever the selected parser returns, or None if the
                feed was classified as unsupported by an earlier call.
                Raises NotImplementedError on the call that detects an
                unsupported root element.
                """
                if self.type is None:
                        self._detectType(dom)

                # Both parents define gotDom, so pick the implementation by name
                if self.type == "rss":
                        print("[SimpleRSS] type is rss")
                        return RSSFeed.gotDom(self, dom)
                if self.type == "atom":
                        print("[SimpleRSS] type is atom")
                        return AtomFeed.gotDom(self, dom)

                # Type "unknown": there is no parser to hand the document to
                return None

        def _detectType(self, dom):
                """Set self.type from the root element and read title/description."""
                root = dom.documentElement

                if root.getAttribute("version") in self.RSS_VERSIONS or root.localName == "RDF":
                        # RSS 0.9x/2.0 or RSS 1.0 (NS: http://www.w3.org/1999/02/22-rdf-syntax-ns#)
                        self.type = "rss"
                        try:
                                scope = dom.getElementsByTagName("channel")[0]
                        except Exception:
                                scope = None
                        description_tag = "description"
                elif root.localName == "feed":
                        # Atom (NS: http://www.w3.org/2005/Atom)
                        self.type = "atom"
                        scope = dom
                        description_tag = "subtitle"
                else:
                        self.type = "unknown"
                        raise NotImplementedError('Unsupported Feed: %s' % root.localName)

                if scope is None:
                        return

                # Only replace values the document really provides. The defaults
                # are already encoded and must not be encoded a second time.
                title = self._firstText(scope, "title")
                if title is not None:
                        self.title = strip(title).encode("UTF-8")

                description = self._firstText(scope, description_tag)
                if description is not None:
                        self.description = strip_readable(description).encode("UTF-8")

        def _firstText(self, node, tag):
                """Return the data of the first child of the first tag element below node, or None."""
                try:
                        return node.getElementsByTagName(tag)[0].childNodes[0].data
                except Exception:
                        return None