New Items displayed in Feeds,
[enigma2-plugins.git] / simplerss / src / RSSFeed.py
1 from sets import Set
2
3 from urlparse import urlsplit
4
class BaseFeed:
        """Base-class for all Feeds. Initializes needed Elements.

        Attributes set by the constructor:
        uri -- the feed address as given (used as identifier)
        hostname, port, path -- parts of the URI as split by urlsplit
        autoupdate, stripper -- stored unchanged from the arguments
        title -- the URI, UTF-8 encoded
        description -- empty string
        last_update -- None
        last_ids -- empty set
        history -- empty list
        """

        # Number of entries the parsing subclasses cut self.history down to
        MAX_HISTORY_ELEMENTS = 100

        def __init__(self, uri, autoupdate, stripper):
                """Store the feed settings and split the URI into its parts.

                uri -- address of the feed
                autoupdate -- stored as self.autoupdate
                stripper -- stored as self.stripper
                """
                # Set URI (used as Identifier)
                self.uri = uri

                # Determine URI Elements
                remote = urlsplit(uri)
                self.hostname = remote.hostname
                # NOTE(review): falls back to 80 whatever the scheme is, so
                # also for https - confirm this is what the fetching code wants
                self.port = remote.port or 80
                # Never leave the path empty and only add "?" for a real query
                self.path = remote.path or '/'
                if remote.query:
                        self.path = '?'.join([self.path, remote.query])
                print("[SimpleRSS] determined hostname: %s , port: %s , path: %s" % (self.hostname, self.port, self.path))

                # Set Autoupdate
                self.autoupdate = autoupdate

                # Set Stripper
                self.stripper = stripper

                # Initialize
                try:
                        self.title = uri.encode("UTF-8")
                except UnicodeError:
                        # Encoding a Python 2 byte string decodes it as ASCII
                        # first, which fails on non-ASCII bytes; keep it as it is
                        self.title = uri
                self.description = ""
                self.last_update = None
                self.last_ids = set()
                self.history = []
32
class AtomFeed(BaseFeed):
        """Parses an Atom-Feed into expected format."""

        def gotDom(self, dom):
                """Return the new entries found in a parsed Atom document.

                dom -- DOM document offering getElementsByTagName
                Returns an empty list if the first "updated" value equals the
                one remembered from the previous call, else the result of
                parse() for all "entry" elements.
                """
                # Try to read when feed was last updated, if time equals return empty list. else fetch new items
                try:
                        updated = dom.getElementsByTagName("updated")[0].childNodes[0].data
                except Exception:
                        # No readable timestamp, so just parse the entries
                        pass
                else:
                        if self.last_update == updated:
                                return []
                        self.last_update = updated
                # Name the class explicitly: UniversalFeed derives from RSSFeed
                # too, so self.parse could resolve to the RSS parser
                return AtomFeed.parse(self, dom.getElementsByTagName("entry"))

        def parse(self, items):
                """Convert "entry" elements into item tuples and remember them.

                items -- iterable of DOM elements
                Entries without a readable title or id are skipped, as are
                entries whose id is already in self.last_ids.
                Returns a list of (title, link, summary, enclosures) tuples for
                the entries not seen before; the three strings are UTF-8
                encoded and enclosures is a list of (href, type) tuples.
                The same tuples are put in front of self.history, which is then
                cut to MAX_HISTORY_ELEMENTS entries.
                """
                new_items = []
                for item in items:
                        enclosure = []
                        link = ""

                        # Try to read title, continue if none found
                        try:
                                title = self.stripper.strip(item.getElementsByTagName("title")[0].childNodes[0].data)
                        except Exception:
                                continue

                        # Try to read id, continue if none found (invalid feed, should be handled differently) or to be excluded
                        try:
                                entry_id = item.getElementsByTagName("id")[0].childNodes[0].data
                        except Exception:
                                continue
                        if entry_id in self.last_ids:
                                continue

                        # Read out enclosures and link
                        for current in item.getElementsByTagName("link"):
                                # Enclosure
                                if current.getAttribute("rel") == "enclosure":
                                        enclosure.append((
                                                        current.getAttribute("href").encode("UTF-8"),
                                                        current.getAttribute("type").encode("UTF-8")
                                        ))
                                # No Enclosure, assume its a link to the item
                                # (the last such element wins)
                                else:
                                        link = current.getAttribute("href")

                        # Try to read summary, empty if none
                        try:
                                summary = self.stripper.strip(item.getElementsByTagName("summary")[0].childNodes[0].data)
                        except Exception:
                                summary = ""

                        # Update Lists
                        new_items.append((
                                        title.encode("UTF-8"),
                                        link.encode("UTF-8"),
                                        summary.encode("UTF-8"),
                                        enclosure
                        ))
                        self.last_ids.add(entry_id)

                # Append known Items to new Items and cut the result; the slice
                # has to be assigned, on its own it changes nothing
                self.history = (new_items + self.history)[:self.MAX_HISTORY_ELEMENTS]

                return new_items
98
class RSSFeed(BaseFeed):
        """Parses an RSS-Feed into expected format."""

        def gotDom(self, dom):
                """Return the new items found in a parsed RSS document.

                dom -- DOM document offering getElementsByTagName
                Returns an empty list if the first "lastBuildDate" value equals
                the one remembered from the previous call, else the result of
                parse() for all "item" elements.
                """
                # Try to read when feed was last updated, if time equals return empty list. else fetch new items
                try:
                        updated = dom.getElementsByTagName("lastBuildDate")[0].childNodes[0].data
                except Exception:
                        # No readable build date, so just parse the items
                        pass
                else:
                        if self.last_update == updated:
                                return []
                        self.last_update = updated
                # Name the class explicitly: UniversalFeed derives from AtomFeed
                # too and must end up in this parser
                return RSSFeed.parse(self, dom.getElementsByTagName("item"))

        def parse(self, items):
                """Convert "item" elements into item tuples and remember them.

                items -- iterable of DOM elements
                Items without a readable title are skipped, as are items whose
                guid (or link, if there is no guid) is already in self.last_ids.
                Returns a list of (title, link, summary, enclosures) tuples for
                the items not seen before; the three strings are UTF-8 encoded
                and enclosures is a list of (url, type) tuples.
                The same tuples are put in front of self.history, which is then
                cut to MAX_HISTORY_ELEMENTS entries.
                """
                new_items = []
                for item in items:
                        enclosure = []

                        # Try to read title, continue if none found
                        try:
                                title = self.stripper.strip(item.getElementsByTagName("title")[0].childNodes[0].data)
                        except Exception:
                                continue

                        # Try to read link, empty if none
                        try:
                                link = item.getElementsByTagName("link")[0].childNodes[0].data
                        except Exception:
                                link = ""

                        # Try to read guid, link if none (RSS 1.0 or invalid RSS 2.0)
                        try:
                                guid = item.getElementsByTagName("guid")[0].childNodes[0].data
                        except Exception:
                                guid = link

                        # Continue if item is to be excluded
                        if guid in self.last_ids:
                                continue

                        # Try to read summary (description element), empty if none
                        try:
                                summary = self.stripper.strip(item.getElementsByTagName("description")[0].childNodes[0].data)
                        except Exception:
                                summary = ""

                        # Read out enclosures
                        for current in item.getElementsByTagName("enclosure"):
                                enclosure.append((
                                                current.getAttribute("url").encode("UTF-8"),
                                                current.getAttribute("type").encode("UTF-8")
                                ))

                        # Update Lists
                        new_items.append((
                                        title.encode("UTF-8"),
                                        link.encode("UTF-8"),
                                        summary.encode("UTF-8"),
                                        enclosure
                        ))

                        self.last_ids.add(guid)

                # Append known Items to new Items and cut the result; the slice
                # has to be assigned, on its own it changes nothing
                self.history = (new_items + self.history)[:self.MAX_HISTORY_ELEMENTS]

                return new_items
167
# BaseFeed is not listed as a base: both parents derive from it already, and
# naming a class in front of its own subclasses cannot be linearized once the
# classes are new-style.
class UniversalFeed(RSSFeed, AtomFeed):
        """Universal Feed which on first run determines its type and calls the correct parsing-functions"""

        def __init__(self, uri, autoupdate, stripper):
                """Set up the feed like BaseFeed does, with the type still unknown."""
                BaseFeed.__init__(self, uri, autoupdate, stripper)
                self.type = None

        def gotDom(self, dom):
                """Detect the feed type on the first call, then parse the document.

                dom -- DOM document with documentElement and getElementsByTagName
                On the first call the root element decides between "rss" and
                "atom", and title and description are read from the document
                where available, stripped and UTF-8 encoded.
                Returns whatever RSSFeed.gotDom or AtomFeed.gotDom returns.
                Raises NotImplementedError for any other root element; the
                type then stays undetermined.
                """
                if self.type is None:
                        root = dom.documentElement
                        # RSS 2.0 and older announce themselves by a version attribute,
                        # RSS 1.0 by its root (NS: http://www.w3.org/1999/02/22-rdf-syntax-ns#)
                        if root.getAttribute("version") in ("2.0", "0.94", "0.93", "0.92", "0.91") or root.localName == "RDF":
                                self.type = "rss"
                                description_tag = "description"
                        # Atom (NS: http://www.w3.org/2005/Atom)
                        elif root.localName == "feed":
                                self.type = "atom"
                                description_tag = "subtitle"
                        else:
                                raise NotImplementedError('Unsupported Feed: %s' % root.localName)

                        # Best effort: whatever cannot be read keeps its initial value
                        try:
                                if self.type == "rss":
                                        # RSS keeps the feed data below "channel"
                                        scope = dom.getElementsByTagName("channel")[0]
                                else:
                                        scope = dom
                                self.title = scope.getElementsByTagName("title")[0].childNodes[0].data
                                self.description = scope.getElementsByTagName(description_tag)[0].childNodes[0].data
                        except Exception:
                                pass

                        self.title = self.stripper.strip(self.title).encode("UTF-8")
                        self.description = self.stripper.strip(self.description).encode("UTF-8")
                if self.type == "rss":
                        print("[SimpleRSS] type is rss")
                        return RSSFeed.gotDom(self, dom)
                elif self.type == "atom":
                        print("[SimpleRSS] type is atom")
                        return AtomFeed.gotDom(self, dom)