Fix stupid Typo in TagStrip
[enigma2-plugins.git] / simplerss / src / TagStrip.py
1 # -*- coding: utf-8 -*-
2
3 from re import sub
4
class TagStrip():
    """Strip HTML tags from a string and decode a small set of entities.

    Only the entities listed in ``convertables`` are decoded; any other
    entity is left in the text untouched.
    """

    # (entity, replacement) pairs, applied in order by strip().
    # Replacements are written as escapes so the table does not depend on
    # the encoding this file is stored or transported in.
    convertables = [
        # ISO-8859-1 (most common), numeric and named forms
        ("&#228;", u"\u00e4"),   # a-umlaut
        ("&auml;", u"\u00e4"),
        ("&#252;", u"\u00fc"),   # u-umlaut
        ("&uuml;", u"\u00fc"),
        ("&#246;", u"\u00f6"),   # o-umlaut
        ("&ouml;", u"\u00f6"),
        ("&#196;", u"\u00c4"),   # A-umlaut
        ("&Auml;", u"\u00c4"),
        ("&#220;", u"\u00dc"),   # U-umlaut
        ("&Uuml;", u"\u00dc"),
        ("&#214;", u"\u00d6"),   # O-umlaut
        ("&Ouml;", u"\u00d6"),
        ("&#223;", u"\u00df"),   # sharp s
        ("&szlig;", u"\u00df"),

        # Rarely used entities
        ("&#8230;", u"..."),     # horizontal ellipsis
        ("&#8211;", u"-"),       # en dash
        ("&#160;", u" "),        # non-breaking space

        # Common entities
        ("&lt;", u"<"),
        ("&gt;", u">"),
        ("&nbsp;", u" "),
        ("&quot;", u"\""),

        # Ampersand forms go last: decoding them earlier would let text
        # such as "&amp;lt;" be decoded a second time into "<".
        ("&#38;", u"&"),
        ("&#038;", u"&"),
        ("&amp;", u"&"),
    ]

    def strip_readable(self, html):
        """Strip tags but keep a little structure for plain-text display.

        ``<p>``/``</p>`` become newlines, runs of two or more whitespace
        characters collapse to a single space, ``<li>`` becomes ``-`` and
        ``</li>`` a newline. The result is then passed through strip().
        """
        # Replace <p> and </p> with newline
        html = sub(r'</?p>', u"\n", html)

        # Collapse runs of whitespace (this also folds adjacent newlines)
        html = sub(r'\s\s+', u' ', html)

        # Replace <li> by - and </li> by newline
        html = html.replace('<li>', u"-")
        html = html.replace('</li>', u"\n")

        # And 'normal' stripping
        return self.strip(html)

    def strip(self, html):
        """Return *html* without tags, with known entities decoded.

        Leading and trailing whitespace is removed from the result.
        """
        # Strip enclosed tags; [^>] (unlike '.') also matches newlines, so
        # tags whose attributes are wrapped over several lines are removed.
        html = sub(r'<[^>]*>', '', html)

        # Convert html entities
        for escaped, unescaped in self.convertables:
            html = html.replace(escaped, unescaped)

        # Return result with leading/trailing whitespaces removed
        return html.strip()