Add a TagStripper to replace German umlauts and strip HTML tags.
[enigma2-plugins.git] / simplerss / src / TagStrip.py
1 # -*- coding: utf-8 -*-
2
3 import re
4
class TagStrip():
    """Decode a small set of HTML entities and strip simple HTML markup.

    ``convertables`` maps an escaped entity (named or numeric) to the text
    it should be replaced with.  ``strip`` applies that table and then
    removes paired elements (keeping their content) and self-closing
    elements.
    """

    # NOTE(review): the keys are written as HTML entities.  Keys spelled as
    # the literal target character (e.g. "ä" -> u"ä") would be no-ops and
    # would collide as duplicate dict keys, so each umlaut is listed in its
    # named and its numeric entity form instead.
    convertables = {
        "&auml;": u"ä",
        "&#228;": u"ä",
        "&uuml;": u"ü",
        "&#252;": u"ü",
        "&ouml;": u"ö",
        "&#246;": u"ö",
        "&Auml;": u"Ä",
        "&#196;": u"Ä",
        "&Uuml;": u"Ü",
        "&#220;": u"Ü",
        "&Ouml;": u"Ö",
        "&#214;": u"Ö",
        "&szlig;": u"ß",
        "&#223;": u"ß",
        "&#38;": u"&",
        "&#8230;": u"...",
        "&#8211;": u"-",
        "&#160;": u" ",

        "&lt;": u"<",
        "&gt;": u">",
        "&nbsp;": u" ",
        "&amp;": u"&",
        "&quot;": u"\"",
    }

    # <tag ...>CONTENT</tag>: the tag name is captured without its
    # attributes so that the closing tag can be matched against it.
    # DOTALL lets CONTENT span line breaks.
    _paired_element = re.compile(
        r'<(?P<tag>[^\s<>/]+)(?:\s[^<>]*)?>(?P<content>.*?)</(?P=tag)\s*>',
        re.DOTALL | re.IGNORECASE,
    )

    # <tag ... /> or <tag/>: must start with a tag name and must not run
    # across another angle bracket, so a stray "<" in text is left alone.
    _empty_element = re.compile(r'<[^\s<>/][^<>]*/>')

    def strip(self, html):
        """Return ``html`` with known entities decoded and tags removed.

        Entities listed in ``convertables`` are decoded first, then
        self-closing elements are dropped and paired elements are replaced
        by their content.  Empty or ``None`` input is returned unchanged.
        """
        if not html:
            return html

        # Convert htmlspecialchars in a single pass so that the output of
        # one replacement is never decoded again ("&amp;lt;" -> "&lt;",
        # not "<"), independent of dict ordering.
        table = self.convertables
        if table:
            # Longest keys first so a key can never shadow a longer one.
            keys = sorted(table, key=len, reverse=True)
            entity = re.compile("|".join(re.escape(key) for key in keys))
            html = entity.sub(lambda match: table[match.group(0)], html)

        # Strip everything of form <a /> (elements without content).
        # Done first so such a tag is never mistaken for an opening tag.
        html = self._empty_element.sub('', html)

        # Strip everything of form <a>CONTENT</a>, but keep CONTENT
        # (elements with content).  Repeat until stable so nested elements
        # are unwrapped too; every substitution shortens the string, so
        # the loop terminates.
        previous = None
        while previous != html:
            previous = html
            html = self._paired_element.sub(r'\g<content>', html)

        return html