Remove httpclient.py and rely on twisted-web
[enigma2-plugins.git] / simplerss / src / RSSFeed.py
1 from sets import Set
2
class BaseFeed:
    """Common state holder every concrete feed class builds upon.

    The constructor stores its three arguments and resets all bookkeeping
    attributes (title, description, update marker, seen ids, history).
    """

    # Used by the feed parsers as the slice bound for the history list
    MAX_HISTORY_ELEMENTS = 100

    def __init__(self, uri, autoupdate, stripper):
        # Keep what the caller handed in; the URI doubles as identifier
        self.uri, self.autoupdate, self.stripper = uri, autoupdate, stripper

        # As long as no real title is known the encoded URI stands in for it
        self.title = uri.encode("UTF-8")
        self.description = ""

        # Nothing fetched so far: no update marker, no seen ids, no items
        self.last_update = None
        self.last_ids, self.history = set(), []
23
class AtomFeed(BaseFeed):
    """Parses an Atom-Feed into expected format."""

    def gotDom(self, dom):
        """Return the not yet known entries of a parsed Atom document.

        If the first <updated> element carries the same value as on the
        previous call an empty list is returned; otherwise all <entry>
        elements are handed to parse() and its result is returned.
        """
        try:
            updated = dom.getElementsByTagName("updated")[0].childNodes[0].data
        except (IndexError, AttributeError):
            # No usable <updated> element: the shortcut is unavailable,
            # so simply parse all entries.
            pass
        else:
            if self.last_update == updated:
                return []
            self.last_update = updated

        # Called through the class on purpose: UniversalFeed inherits from
        # both RSSFeed and AtomFeed, so self.parse may be the other parser.
        return AtomFeed.parse(self, dom.getElementsByTagName("entry"))

    def parse(self, items):
        """Convert <entry> nodes into item tuples and remember them.

        Entries without a readable title or id are skipped, as are entries
        whose id is already contained in self.last_ids. Every new item is a
        tuple (title, link, summary, enclosures): the three texts are UTF-8
        encoded, enclosures is a list of (href, type) tuples. New items are
        put in front of self.history, which is then cut down to
        MAX_HISTORY_ELEMENTS entries.

        Returns the list of new items (possibly empty).
        """
        new_items = []
        for item in items:
            # Title is required, skip the entry if there is none
            try:
                raw_title = item.getElementsByTagName("title")[0].childNodes[0].data
            except (IndexError, AttributeError):
                continue
            title = self.stripper.strip(raw_title)

            # Id is required as well (feed is invalid without it)
            try:
                entry_id = item.getElementsByTagName("id")[0].childNodes[0].data
            except (IndexError, AttributeError):
                continue

            # Already delivered earlier
            if entry_id in self.last_ids:
                continue

            # Sort the <link> elements into enclosures and the item link;
            # if several non-enclosure links exist the last one wins.
            enclosure = []
            link = ""
            for current in item.getElementsByTagName("link"):
                href = current.getAttribute("href")
                if current.getAttribute("rel") == "enclosure":
                    enclosure.append((
                        href.encode("UTF-8"),
                        current.getAttribute("type").encode("UTF-8"),
                    ))
                else:
                    link = href

            # Summary is optional, empty if missing
            try:
                raw_summary = item.getElementsByTagName("summary")[0].childNodes[0].data
            except (IndexError, AttributeError):
                summary = ""
            else:
                summary = self.stripper.strip_readable(raw_summary)

            new_items.append((
                title.encode("UTF-8"),
                link.encode("UTF-8"),
                summary.encode("UTF-8"),
                enclosure,
            ))
            self.last_ids.add(entry_id)

        # Newest items first; the slice result has to be assigned, a bare
        # slice expression would leave the history untouched.
        self.history = (new_items + self.history)[:self.MAX_HISTORY_ELEMENTS]

        return new_items
89
class RSSFeed(BaseFeed):
    """Parses an RSS-Feed into expected format."""

    def gotDom(self, dom):
        """Return the not yet known items of a parsed RSS document.

        If the first <lastBuildDate> element carries the same value as on
        the previous call an empty list is returned; otherwise all <item>
        elements are handed to parse() and its result is returned.
        """
        try:
            updated = dom.getElementsByTagName("lastBuildDate")[0].childNodes[0].data
        except (IndexError, AttributeError):
            # No usable <lastBuildDate>: the shortcut is unavailable,
            # so simply parse all items.
            pass
        else:
            if self.last_update == updated:
                return []
            self.last_update = updated

        # Called through the class on purpose: UniversalFeed inherits from
        # both RSSFeed and AtomFeed, so self.parse may be the other parser.
        return RSSFeed.parse(self, dom.getElementsByTagName("item"))

    def parse(self, items):
        """Convert <item> nodes into item tuples and remember them.

        Items without a readable title are skipped, as are items whose
        identifier is already contained in self.last_ids. The identifier is
        the guid, falling back to the link and finally to the title. Every
        new item is a tuple (title, link, summary, enclosures): the three
        texts are UTF-8 encoded, enclosures is a list of (url, type) tuples.
        New items are put in front of self.history, which is then cut down
        to MAX_HISTORY_ELEMENTS entries.

        Returns the list of new items (possibly empty).
        """
        new_items = []
        for item in items:
            # Title is required, skip the item if there is none
            try:
                raw_title = item.getElementsByTagName("title")[0].childNodes[0].data
            except (IndexError, AttributeError):
                continue
            title = self.stripper.strip(raw_title)

            # Link is optional, empty if missing
            try:
                link = item.getElementsByTagName("link")[0].childNodes[0].data
            except (IndexError, AttributeError):
                link = ""

            # Identifier: guid, else link (RSS 1.0 or invalid RSS 2.0).
            # With neither present fall back to the title; an empty
            # identifier would make every later such item look known.
            try:
                guid = item.getElementsByTagName("guid")[0].childNodes[0].data
            except (IndexError, AttributeError):
                guid = link or title

            # Already delivered earlier
            if guid in self.last_ids:
                continue

            # Summary (description element) is optional, empty if missing
            try:
                raw_summary = item.getElementsByTagName("description")[0].childNodes[0].data
            except (IndexError, AttributeError):
                summary = ""
            else:
                summary = self.stripper.strip_readable(raw_summary)

            # Collect (url, type) for every enclosure
            enclosure = [
                (
                    current.getAttribute("url").encode("UTF-8"),
                    current.getAttribute("type").encode("UTF-8"),
                )
                for current in item.getElementsByTagName("enclosure")
            ]

            new_items.append((
                title.encode("UTF-8"),
                link.encode("UTF-8"),
                summary.encode("UTF-8"),
                enclosure,
            ))
            self.last_ids.add(guid)

        # Newest items first; the slice result has to be assigned, a bare
        # slice expression would leave the history untouched.
        self.history = (new_items + self.history)[:self.MAX_HISTORY_ELEMENTS]

        return new_items
158
# NOTE: listing BaseFeed ahead of its own subclasses only yields a valid
# method resolution order while these are old-style (classic) classes.
class UniversalFeed(BaseFeed, RSSFeed, AtomFeed):
    """Universal Feed which on first run determines its type and calls the correct parsing-functions"""

    # Values of the root element's "version" attribute treated as RSS
    RSS_VERSIONS = ("2.0", "0.94", "0.93", "0.92", "0.91")

    def __init__(self, uri, autoupdate, stripper):
        BaseFeed.__init__(self, uri, autoupdate, stripper)
        # "rss" or "atom" once detected, None until the first document
        self.type = None

    def gotDom(self, dom):
        """Detect the feed type if still unknown, then delegate parsing.

        On the first successful detection title and description are read
        from the document (best effort), stripped and UTF-8 encoded.

        Returns whatever RSSFeed.gotDom or AtomFeed.gotDom returns.
        Raises NotImplementedError if the root element is neither a known
        RSS version, an RDF document nor an Atom feed.
        """
        if self.type is None:
            root = dom.documentElement
            # RSS 2.0 and earlier (version attribute) or
            # RSS 1.0 (NS: http://www.w3.org/1999/02/22-rdf-syntax-ns#)
            if root.getAttribute("version") in self.RSS_VERSIONS or root.localName == "RDF":
                self.type = "rss"
                try:
                    self.title = dom.getElementsByTagName("channel")[0].getElementsByTagName("title")[0].childNodes[0].data
                    self.description = dom.getElementsByTagName("channel")[0].getElementsByTagName("description")[0].childNodes[0].data
                except (IndexError, AttributeError):
                    # Missing channel metadata: keep what is set already
                    pass
            # Atom (NS: http://www.w3.org/2005/Atom)
            elif root.localName == "feed":
                self.type = "atom"
                try:
                    self.title = dom.getElementsByTagName("title")[0].childNodes[0].data
                    self.description = dom.getElementsByTagName("subtitle")[0].childNodes[0].data
                except (IndexError, AttributeError):
                    # Missing feed metadata: keep what is set already
                    pass
            else:
                raise NotImplementedError('Unsupported Feed: %s' % root.localName)

            # NOTE(review): if no title was read above, self.title may still
            # be the value assigned by BaseFeed.__init__ - confirm the
            # stripper copes with that.
            self.title = self.stripper.strip(self.title).encode("UTF-8")
            self.description = self.stripper.strip_readable(self.description).encode("UTF-8")

        # Parent implementations are addressed explicitly because both
        # RSSFeed and AtomFeed define gotDom/parse.
        if self.type == "rss":
            print("[SimpleRSS] type is rss")
            return RSSFeed.gotDom(self, dom)
        elif self.type == "atom":
            print("[SimpleRSS] type is atom")
            return AtomFeed.gotDom(self, dom)