fix Depends (twisted-web)
[enigma2-plugins.git] / emailclient / src / TagStrip.py
1 # -*- coding: utf-8 -*-
2
3 from re import sub
4
class TagStrip(object):
    """Strip HTML tags from a string and convert common entities to text.

    Two entry points are offered:

    * ``strip(html)`` removes every tag and decodes the known entities.
    * ``strip_readable(html)`` first turns a few structural tags
      (``<br>``, ``<p>``, ``<ul>``, ``<ol>``, ``<li>``) into newlines and
      dashes so the result stays readable, then delegates to ``strip``.

    Tag names are matched case-insensitively.  Entities that are not listed
    in ``entities`` are left in the text unchanged.
    """

    # (escaped, unescaped) pairs.  Non-ASCII replacements are written as
    # escape sequences so the table survives any re-encoding of this file.
    entities = [
        # ISO-8859-1 (most common)
        ("&#228;", u"\xe4"),   # a with diaeresis
        ("&auml;", u"\xe4"),
        ("&#252;", u"\xfc"),   # u with diaeresis
        ("&uuml;", u"\xfc"),
        ("&#246;", u"\xf6"),   # o with diaeresis
        ("&ouml;", u"\xf6"),
        ("&#196;", u"\xc4"),   # A with diaeresis
        ("&Auml;", u"\xc4"),
        ("&#220;", u"\xdc"),   # U with diaeresis
        ("&Uuml;", u"\xdc"),
        ("&#214;", u"\xd6"),   # O with diaeresis
        ("&Ouml;", u"\xd6"),
        ("&#223;", u"\xdf"),   # sharp s
        ("&szlig;", u"\xdf"),

        # Rarely used entities
        ("&#8230;", u"..."),   # horizontal ellipsis
        ("&#8211;", u"-"),     # en dash
        ("&#160;", u" "),      # no-break space
        ("&#34;", u"\""),
        ("&#38;", u"&"),
        ("&#39;", u"'"),
        ("&#60;", u"<"),
        ("&#62;", u">"),

        # Common entities
        ("&lt;", u"<"),
        ("&gt;", u">"),
        ("&nbsp;", u" "),
        ("&amp;", u"&"),
        ("&quot;", u"\""),
        ("&apos;", u"'"),
    ]

    def strip_readable(self, html):
        """Return *html* as plain text, keeping a readable line structure.

        Line breaks, paragraphs and lists are converted to newlines (list
        items additionally get a leading ``-``) before all remaining tags
        are removed and entities are decoded by ``strip``.
        """
        # Newlines are rendered as whitespace in html
        html = html.replace('\n', ' ')

        # Multiple whitespaces are rendered as a single one
        html = sub(r'\s\s+', ' ', html)

        # Replace <br>, <br/> and <br /> by newlines
        html = sub(r'(?i)<br\s*/?>', '\n', html)

        # Replace <p>, <ul>, <ol> and end of these tags by newline.
        # The optional group requires whitespace after the tag name so that
        # e.g. <pre> is not mistaken for <p>.
        html = sub(r'(?i)</?(p|ul|ol)(\s+[^>]*)?>', '\n', html)

        # Replace <li> by - and </li> by newline
        html = sub(r'(?i)<li(\s+[^>]*)?>', '-', html)
        html = sub(r'(?i)</li\s*>', '\n', html)

        # And 'normal' stripping
        return self.strip(html)

    def strip(self, html):
        """Return *html* with all tags removed and known entities decoded.

        Leading and trailing whitespace is removed from the result.
        """
        # Strip enclosed tags; [^>] also matches newlines, so a tag whose
        # attributes are spread over several lines is removed as well.
        html = sub(r'<[^>]*>', '', html)

        # Convert html entities in a single pass.  Replacing them one after
        # another would decode twice ("&amp;lt;" -> "&lt;" -> "<").
        table = dict(self.entities)
        html = sub(
            r'&#?[0-9A-Za-z]+;',
            lambda match: table.get(match.group(0), match.group(0)),
            html,
        )

        # Return result with leading/trailing whitespaces removed
        return html.strip()