New Items displayed in Feeds,
[enigma2-plugins.git] / simplerss / src / TagStrip.py
1 # -*- coding: utf-8 -*-
2
3 from re import sub
4
class TagStrip():
    """Strip HTML tags from a string and decode a fixed set of entities.

    Only the entities listed in ``convertables`` are decoded; any other
    entity is left in the text unchanged.
    """

    # (entity, replacement) pairs, applied in order by strip().
    # Keys are spelled out as entities and replacements as unicode escapes
    # so the table stays intact regardless of how the file is stored or
    # rendered.
    convertables = [
        # ISO-8859-1 (most common): named and numeric form of each character
        ("&auml;", u"\xe4"),
        ("&#228;", u"\xe4"),
        ("&uuml;", u"\xfc"),
        ("&#252;", u"\xfc"),
        ("&ouml;", u"\xf6"),
        ("&#246;", u"\xf6"),
        ("&Auml;", u"\xc4"),
        ("&#196;", u"\xc4"),
        ("&Uuml;", u"\xdc"),
        ("&#220;", u"\xdc"),
        ("&Ouml;", u"\xd6"),
        ("&#214;", u"\xd6"),
        ("&szlig;", u"\xdf"),
        ("&#223;", u"\xdf"),

        # Rarely used entities
        ("&#8230;", u"..."),
        ("&#8211;", u"-"),
        ("&#160;", u" "),

        # Common entities
        ("&lt;", u"<"),
        ("&gt;", u">"),
        ("&nbsp;", u" "),
        ("&quot;", u"\""),

        # Ampersand forms must stay last: decoding them earlier would turn
        # an escaped entity such as "&amp;lt;" into "&lt;" and then into "<".
        ("&#38;", u"&"),
        ("&amp;", u"&"),
    ]

    def strip(self, html):
        """Return *html* with tags removed and known entities decoded.

        Opening and closing paragraph tags become newlines, every other
        tag is dropped, the entities in ``convertables`` are replaced, and
        leading/trailing whitespace is removed from the result.
        """
        # Replace <p> and </p> (any case, with or without attributes) with
        # a newline. The \s guard keeps <pre>, <param>, ... from matching.
        html = sub(r'(?i)</?p(?:\s[^>]*)?>', u"\n", html)

        # Strip remaining tags. [^>]* also matches newlines, so a tag that
        # is wrapped over several lines is removed as well.
        html = sub(r'<[^>]*>', '', html)

        # Convert html entities
        for escaped, unescaped in self.convertables:
            html = html.replace(escaped, unescaped)

        # Return result with leading/trailing whitespaces removed
        return html.strip()