adding possibility to use a proxy
[enigma2-plugins.git] / simplerss / src / Feed.py
1 from sets import Set
2
class Feed:
    """State and parser for a single RSS or Atom feed.

    A Feed is handed already-parsed DOM documents through gotDom(). The
    first document decides the feed type and supplies title/description;
    every document is then scanned for entries that have not been seen
    before.

    Attributes:
        uri: the feed location as passed to the constructor.
        autoupdate: flag passed to the constructor; only stored here.
        type: None until detected, afterwards "rss" or "atom".
        title: UTF-8 encoded feed title; the encoded URI until a title
            could be read from the feed.
        description: UTF-8 encoded feed description, "" if none was read.
        stripper: object exposing strip(text), applied to titles,
            descriptions and summaries.
        last_update: value of the feed's last "updated" marker, or None.
        last_ids: set of ids (guid/link/atom id) of entries already seen.
        history: newest-first list of entry tuples, capped at
            MAX_HISTORY_ELEMENTS.

    Entries are tuples ``(title, link, summary, enclosures)`` where the
    three strings are UTF-8 encoded and ``enclosures`` is a list of
    ``(url, mime_type)`` tuples, also UTF-8 encoded.
    """

    # Upper bound on the number of entries kept in ``history``.
    MAX_HISTORY_ELEMENTS = 100

    # Root "version" attribute values that identify an RSS 0.9x/2.0 feed.
    _RSS_VERSIONS = ("2.0", "0.94", "0.93", "0.92", "0.91")

    def __init__(self, uri, autoupdate, stripper):
        """Create an empty feed for *uri*; nothing is fetched or parsed."""
        self.uri = uri
        self.autoupdate = autoupdate
        self.type = None
        self.title = uri.encode("UTF-8")
        self.description = ""
        self.stripper = stripper
        self.last_update = None
        self.last_ids = set()
        self.history = []

    def _read_text(self, node, tag, strip=False):
        """Return the text of the first *tag* element below *node*.

        Only the first child node of that element is looked at. When
        *strip* is true the text is passed through the stripper. Returns
        None if the element is missing, empty, has no text as first child
        or the stripper fails; feeds are frequently malformed, so this is
        deliberately best-effort.
        """
        try:
            text = node.getElementsByTagName(tag)[0].childNodes[0].data
            if strip:
                text = self.stripper.strip(text)
        except Exception:
            return None
        return text

    def _remember(self, new_items):
        """Put *new_items* in front of the history and cap its length.

        Returns *new_items* unchanged (it is NOT truncated).
        """
        combined = new_items + self.history
        # The slice result has to be assigned; a bare slice expression
        # would leave the history growing without bound.
        self.history = combined[:self.MAX_HISTORY_ELEMENTS]
        return new_items

    def gotDom(self, dom):
        """Process a freshly parsed feed document.

        On the first call the feed type is detected and title/description
        are read. Returns the list of new entries, which is empty when
        nothing changed.

        Raises:
            NotImplementedError: the document is neither RSS nor Atom.
        """
        if self.type is None:
            root = dom.documentElement
            # RSS 0.9x/2.0 is recognised by its version attribute, RSS 1.0
            # by its RDF root element (NS:
            # http://www.w3.org/1999/02/22-rdf-syntax-ns#); both keep their
            # metadata inside <channel>.
            if (root.getAttribute("version") in self._RSS_VERSIONS
                    or root.localName == "RDF"):
                self.type = "rss"
                try:
                    container = dom.getElementsByTagName("channel")[0]
                except IndexError:
                    container = None
                description_tag = "description"
            # Atom (NS: http://www.w3.org/2005/Atom)
            elif root.localName == "feed":
                self.type = "atom"
                container = dom
                description_tag = "subtitle"
            else:
                raise NotImplementedError(
                    'Unsupported Feed: %s' % root.localName)

            # Only overwrite the defaults with values actually found in
            # the feed: the fallback title is already UTF-8 encoded and
            # must not be encoded a second time.
            if container is not None:
                title = self._read_text(container, "title")
                if title is not None:
                    self.title = self.stripper.strip(title).encode("UTF-8")
                description = self._read_text(container, description_tag)
                if description is not None:
                    self.description = self.stripper.strip(
                        description).encode("UTF-8")

        if self.type == "rss":
            print("[SimpleRSS] type is rss")
            return self.gotRSSDom(dom)
        elif self.type == "atom":
            print("[SimpleRSS] type is atom")
            return self.gotAtomDom(dom)

    def gotRSSDom(self, dom):
        """Return the new entries of an RSS document.

        If the feed carries a <lastBuildDate> equal to the one seen on
        the previous call, the items are not scanned and [] is returned.
        """
        updated = self._read_text(dom, "lastBuildDate")
        if updated is not None:
            if self.last_update == updated:
                return []
            self.last_update = updated
        return self.parseRSS(dom.getElementsByTagName("item"))

    def parseRSS(self, items):
        """Convert RSS <item> elements into entry tuples.

        Items without a readable title and items whose id is already in
        ``last_ids`` are skipped. Returns the list of new entries and
        records them in ``history`` and ``last_ids``.
        """
        new_items = []
        for item in items:
            # An item without a title is skipped
            title = self._read_text(item, "title", strip=True)
            if title is None:
                continue

            # Link is optional
            link = self._read_text(item, "link")
            if link is None:
                link = ""

            # The guid identifies the item; fall back to the link
            # (RSS 1.0 or invalid RSS 2.0)
            guid = self._read_text(item, "guid")
            if guid is None:
                guid = link

            # Already delivered earlier
            if guid in self.last_ids:
                continue

            # Summary (description element) is optional
            summary = self._read_text(item, "description", strip=True)
            if summary is None:
                summary = ""

            enclosure = [
                (current.getAttribute("url").encode("UTF-8"),
                 current.getAttribute("type").encode("UTF-8"))
                for current in item.getElementsByTagName("enclosure")
            ]

            new_items.append((title.encode("UTF-8"), link.encode("UTF-8"),
                              summary.encode("UTF-8"), enclosure))
            self.last_ids.add(guid)

        return self._remember(new_items)

    def gotAtomDom(self, dom):
        """Return the new entries of an Atom document.

        If the first <updated> element of the document equals the one
        seen on the previous call, the entries are not scanned and [] is
        returned.
        """
        updated = self._read_text(dom, "updated")
        if updated is not None:
            if self.last_update == updated:
                return []
            self.last_update = updated
        return self.parseAtom(dom.getElementsByTagName("entry"))

    def parseAtom(self, items):
        """Convert Atom <entry> elements into entry tuples.

        Entries without a readable title or id and entries whose id is
        already in ``last_ids`` are skipped. Returns the list of new
        entries and records them in ``history`` and ``last_ids``.
        """
        new_items = []
        for item in items:
            # An entry without a title is skipped
            title = self._read_text(item, "title", strip=True)
            if title is None:
                continue

            # An entry without an id is invalid and skipped as well
            entry_id = self._read_text(item, "id")
            if entry_id is None or entry_id in self.last_ids:
                continue

            # <link rel="enclosure"> elements are attachments; any other
            # <link> is assumed to point to the entry itself (last wins)
            enclosure = []
            link = ""
            for current in item.getElementsByTagName("link"):
                if current.getAttribute("rel") == "enclosure":
                    enclosure.append(
                        (current.getAttribute("href").encode("UTF-8"),
                         current.getAttribute("type").encode("UTF-8")))
                else:
                    link = current.getAttribute("href")

            # Summary is optional
            summary = self._read_text(item, "summary", strip=True)
            if summary is None:
                summary = ""

            new_items.append((title.encode("UTF-8"), link.encode("UTF-8"),
                              summary.encode("UTF-8"), enclosure))
            self.last_ids.add(entry_id)

        return self._remember(new_items)