Add a TagStripper to replace German umlauts and strip HTML tags,
[enigma2-plugins.git] / simplerss / src / TagStrip.py
1 # -*- coding: utf-8 -*-
2
3 import re
4
class TagStrip():
    """Unescape a fixed set of HTML entities and remove simple HTML markup.

    ``convertables`` maps an escaped entity (numeric or named form) to the
    text that replaces it.  ``strip`` applies that table and then removes
    element tags while keeping the text they enclose.
    """

    convertables = {
        # German umlauts and sharp s, each in numeric and named form.
        "&#228;": u"\xe4",
        "&auml;": u"\xe4",
        "&#252;": u"\xfc",
        "&uuml;": u"\xfc",
        "&#246;": u"\xf6",
        "&ouml;": u"\xf6",
        "&#196;": u"\xc4",
        "&Auml;": u"\xc4",
        "&#220;": u"\xdc",
        "&Uuml;": u"\xdc",
        "&#214;": u"\xd6",
        "&Ouml;": u"\xd6",
        "&#223;": u"\xdf",
        "&szlig;": u"\xdf",
        # Numeric entities that are rendered as plain ASCII.
        "&#38;": u"&",
        "&#8230;": u"...",
        "&#8211;": u"-",
        "&#160;": u" ",
        # Named htmlspecialchars.
        "&lt;": u"<",
        "&gt;": u">",
        "&nbsp;": u" ",
        "&amp;": u"&",
        "&quot;": u"\"",
    }

    # <tag attr="...">CONTENT</tag>: only the tag name is captured, so the
    # closing tag is found even when the opening tag carries attributes.
    # DOTALL lets CONTENT span several lines.
    _element = re.compile(
        r"<(?P<tag>[A-Za-z][\w:.-]*)(?:\s[^<>]*)?>(?P<content>.*?)</(?P=tag)\s*>",
        re.DOTALL | re.IGNORECASE,
    )

    # <tag /> or <tag/>: [^<>] keeps the match inside a single tag so an
    # unrelated earlier "<" cannot swallow the text in between.
    _empty_element = re.compile(r"<[A-Za-z][^<>]*/>")

    def strip(self, html):
        """Return ``html`` with known entities unescaped and tags removed.

        Entities are unescaped before tags are removed, so markup that was
        delivered in escaped form (``&lt;b&gt;``) is stripped as well.
        Elements keep their content, empty elements are dropped.  Tags
        without a matching closing tag (e.g. a bare ``<br>``) are left
        untouched.  Empty input is returned unchanged.
        """
        if not html:
            return html

        # Convert htmlspecialchars in one pass.  A single substitution never
        # rescans its own output, so "&amp;lt;" becomes "&lt;" and not "<",
        # independent of the dictionary's iteration order.
        table = self.convertables
        if table:
            keys = sorted(table, key=len, reverse=True)
            entity = re.compile("|".join(re.escape(key) for key in keys))
            html = entity.sub(lambda match: table[match.group(0)], html)

        # Strip everything of form <a /> (elements without content).  This
        # runs first so that a self-closing tag is not mistaken for the
        # opening tag of an element.
        html = self._empty_element.sub("", html)

        # Strip everything of form <a>CONTENT</a>, but keep CONTENT.  One
        # substitution only removes the outermost tags, so repeat until the
        # text stops changing to unwrap nested elements.
        previous = None
        while previous != html:
            previous = html
            html = self._element.sub(r"\g<content>", html)

        return html