SeriesPlugin 2.4.3: Changed handling of special char encoding
enigma2-plugins.git: seriesplugin/src/Identifiers/Fernsehserien.py
# -*- coding: utf-8 -*-
# by betonme @2012

import os, sys
import json
import re
import math

from sys import maxint

from Components.config import config
from Tools.BoundFunction import boundFunction

# Imports
from urllib import urlencode
from urllib2 import urlopen

from time import time
from datetime import datetime, timedelta

# Internal
from Plugins.Extensions.SeriesPlugin.IdentifierBase import IdentifierBase
from Plugins.Extensions.SeriesPlugin.Logger import splog

from bs4 import BeautifulSoup
from HTMLParser import HTMLParser

#import codecs
#utf8_encoder = codecs.getencoder("utf-8")


# Constants
SERIESLISTURL = "http://www.fernsehserien.de/suche?"
EPISODEIDURL = 'http://www.fernsehserien.de%s/sendetermine/%s'
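# EPISODEIDURL: the first %s takes the series path returned by the search (the
# JSON 'id' field, e.g. "/greys-anatomy" - path shown for illustration only),
# the second %s takes either a relative page number (0 = current page, negative
# values page back in time) or "jahr-<YYYY>" to jump to a specific year.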

Headers = {
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': '',
    'Accept-Language': 'de-DE,de;q=0.8,en-US;q=0.6,en;q=0.4',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'www.fernsehserien.de',
    'Pragma': 'no-cache'
}
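# Browser-like request headers: compression is disabled (empty Accept-Encoding)
# and caching is suppressed, so getPage() receives plain, uncompressed HTML.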

CompiledRegexpNonASCII = re.compile('\xe2\x80.')
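# Matches the three-byte UTF-8 sequences starting with 0xe2 0x80 (general
# punctuation such as dashes and typographic quotes); str_to_utf8() below uses
# it to strip whatever its explicit replace() calls do not map.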


def str_to_utf8(s):
    # Convert a unicode or byte string into a plain UTF-8 byte string
    splog("WL: str_to_utf8: s: ", repr(s))
    #unicode_str = s.decode('unicode-escape')
    #splog("WL: str_to_utf8: s: ", repr(unicode_str))
    ## Python 2.x can't convert the special chars natively
    #utf8_str = utf8_encoder(unicode_str)[0]
    #splog("WL: str_to_utf8: s: ", repr(utf8_str))
    #return utf8_str  #.decode("utf-8").encode("ascii", "ignore")
    if type(s) == unicode:
        # The default case should end up here
        try:
            s = s.encode('utf-8')
            splog("WL: str_to_utf8 encode utf8: s: ", repr(s))
        except:
            s = s.encode('utf-8', 'ignore')
            splog("WL: str_to_utf8 except encode utf8 ignore: s: ", repr(s))
    else:
        try:
            s = s.decode('utf-8')
            s = s.encode('utf-8')  # already valid UTF-8: re-encode so the byte replaces below still apply
            splog("WL: str_to_utf8 decode utf8: s: ", repr(s))
        except:
            try:
                s = unicode(s, 'ISO-8859-1')
                s = s.encode('utf-8')
                splog("WL: str_to_utf8 decode ISO-8859-1: s: ", repr(s))
            except:
                try:
                    s = unicode(s, 'cp1252')
                    s = s.encode('utf-8')
                    splog("WL: str_to_utf8 decode cp1252: s: ", repr(s))
                except:
                    s = unicode(s, 'ISO-8859-1', 'ignore')
                    s = s.encode('utf-8')
                    splog("WL: str_to_utf8 decode ISO-8859-1 ignore: s: ", repr(s))
    s = s.replace('\xe2\x80\x93','-').replace('\xe2\x80\x99',"'").replace('\xc3\x9f','')
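    # The explicit replaces map common punctuation to ASCII, e.g. the right
    # single quote U+2019, so u'Grey\u2019s Anatomy' comes out as "Grey's Anatomy",
    # while the sharp s (0xc3 0x9f) is simply dropped; any remaining
    # 0xe2 0x80 .. sequence is removed by the regexp substitution below.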
    return CompiledRegexpNonASCII.sub('', s)


class FSParser(HTMLParser):
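    # Legacy screen-scraping parser for the schedule table; parseNextPage()
    # below now uses BeautifulSoup instead (the FSParser calls there are
    # commented out), so this class appears to be kept only as a fallback.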
    def __init__(self):
        HTMLParser.__init__(self)
        # Hint: xpath from Firebug without tbody elements
        xpath = '/html/body/div[2]/div[2]/div/table/tr[3]/td/div/table[2]/tr/td'

        self.xpath = [ e for e in xpath.split('/') if e ]
        self.xpath.reverse()

        self.lookfor = self.xpath.pop()
        self.waitforendtag = 0

        self.start = False
        self.table = False
        self.tr = False
        self.td = False
        self.data = []
        self.list = []

    def handle_starttag(self, tag, attributes):
        if self.waitforendtag == 0:
            if tag == self.lookfor:
                if self.xpath:
                    self.lookfor = self.xpath.pop()
                    s = self.lookfor.split('[')
                    if len(s) == 2:
                        self.lookfor = s[0]
                        self.waitforendtag = int( s[1].split(']')[0] ) - 1
                else:
                    self.start = True

        if self.start and tag == 'table':
            self.table = True

        if self.table:
            if tag == 'td':
                self.td = True
            elif tag == 'tr':
                self.tr = True

    def handle_endtag(self, tag):
        if self.table:
            if tag == 'td':
                self.td = False
            elif tag == 'tr':
                self.tr = False
                self.list.append(self.data)
                self.data = []

        if tag == 'table':
            self.table = False

        if tag == self.lookfor:
            if self.waitforendtag > 0: self.waitforendtag -= 1

    def handle_data(self, data):
        if self.tr and self.td:
            self.data.append(data)


class Fernsehserien(IdentifierBase):
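    # Identifier flow: getEpisode() asks getSeries() for candidate series ids
    # (JSON autocomplete search), then walks the "sendetermine" schedule pages
    # via getNextPage() until a broadcast matching the requested begin time and
    # channel is found.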
    def __init__(self):
        IdentifierBase.__init__(self)

    @classmethod
    def knowsElapsed(cls):
        return True

    @classmethod
    def knowsToday(cls):
        return True

    @classmethod
    def knowsFuture(cls):
        return True

    def getEpisode(self, name, begin, end=None, service=None):
        # On success: return a single (season, episode, title, series) tuple
        # On failure: return an empty list, a message string or None

        self.begin = begin
        self.year = begin.year
        self.end = end
        self.service = service

        self.series = ""
        self.first = None
        self.last = None
        self.page = 0

        self.td_max_time_drift = timedelta(seconds=self.max_time_drift)

        self.knownids = []
        self.returnvalue = None

        # Check preconditions
        if not name:
            splog(_("Skip Fernsehserien: No show name specified"))
            return _("Skip Fernsehserien: No show name specified")
        if not begin:
            splog(_("Skip Fernsehserien: No begin timestamp specified"))
            return _("Skip Fernsehserien: No begin timestamp specified")

        if self.begin > datetime.now():
            self.future = True
        else:
            self.future = False
        splog("Fernsehserien getEpisode future", self.future)
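        # Note: both loops below use Python's while/else - each else branch only
        # runs when its loop finishes without returning a result (all candidate
        # ids tried, or no alternative series name left).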

        while name:
            ids = self.getSeries(name)

            while ids:
                idserie = ids.pop()

                if idserie and len(idserie) == 2:
                    id, idname = idserie

                    # Handle encodings
                    self.series = str_to_utf8(idname)

                    #self.page = 0
                    if self.future:
                        self.page = 0
                    else:
                        if self.actual_year == self.year:
                            #if self.begin > self.now-timedelta(seconds=3600):
                            self.page = 0
                            #else:
                            #    self.page = -1
                        else:
                            self.page = 0

                            year_url = EPISODEIDURL % (id, '')
                            #/sendetermine/jahr-2014
                            response = urlopen( year_url+"jahr-"+str(self.year) )
                            # redirect url, e.g. http://www.fernsehserien.de/criminal-intent-verbrechen-im-visier/sendetermine/-14
                            redirect_url = response.geturl()

                            # Page number for the requested year, taken from the redirect target
                            self.page = int( redirect_url.replace(year_url,'') )

                    self.first = None
                    self.last = None

                    while self.page is not None:
                        result = self.getNextPage(id)
                        if result:
                            return result

            else:
                name = self.getAlternativeSeries(name)

        else:
            return ( self.returnvalue or _("No matching series found") )

    def getSeries(self, name):
        parameter = urlencode({ 'term' : re.sub("[^a-zA-Z0-9*]", " ", name) })
        url = SERIESLISTURL + parameter
        data = self.getPage(url, Headers)

        if data and isinstance(data, basestring):
            data = self.parseSeries(data)
            self.doCacheList(url, data)

        if data and isinstance(data, list):
            splog("Fernsehserien ids", data)
            return self.filterKnownIds(data)

    def parseSeries(self, data):
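        # The suggest endpoint returns a JSON list of {'id': <series path>,
        # 'value': <display name>} entries; person entries are skipped and the
        # list is reversed, so that ids.pop() in getEpisode() presumably starts
        # with the first (best matching) suggestion.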
        serieslist = []
        for line in json.loads(data):
            id = line['id']
            idname = line['value']
            splog(id, idname)
            if not idname.endswith("/person"):
                serieslist.append( ( id, idname ) )
        serieslist.reverse()
        return serieslist

    def parseNextPage(self, data):
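        # Returns the rows of the "sendetermine" table. A full row looks like
        # [weekday, date, time span, channel, season, episode, title] (see the
        # sample rows in getNextPage()); short 5-cell continuation rows only
        # carry additional episode/title text and are merged into the previous row.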
        trs = []

        #parser = FSParser()
        #parser.feed(data)
        #return parser.list

        # Handle malformed HTML issues
        data = data.replace('\\"','"')  # target=\"_blank\"
        data = data.replace('\'+\'','') # document.write('<scr'+'ipt

        soup = BeautifulSoup(data)

        table = soup.find('table', 'sendetermine')
        if table:
            for trnode in table.find_all('tr'):
                # TODO skip the first header row
                tdnodes = trnode and trnode.find_all('td')

                if tdnodes:
                    # Filter for known rows
                    #if len(tdnodes) == 7 and len(tdnodes[2].string) >= 15:

                    if len(tdnodes) >= 6 and tdnodes[2].string and len(tdnodes[2].string) >= 15:
                        tds = []
                        for tdnode in tdnodes:
                            tds.append(tdnode.string or "")
                        trs.append( tds )
                    # This row belongs to the previous one
                    elif trs and len(tdnodes) == 5:
                        #if trs[-1][5] and tdnodes[3].string:
                        trs[-1][5] += ' ' + (tdnodes[3].string or "")
                        #if trs[-1][6] and tdnodes[4].string:
                        trs[-1][6] += ' ' + (tdnodes[4].string or "")
                    #else:
                    #    splog( "tdnodes", len(tdnodes), tdnodes )

                #else:
                #    splog( "tdnodes", tdnodes )

        #splog(trs)
        return trs

    def getNextPage(self, id):
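        # Fetch and scan one schedule page. self.page selects the page (0 is the
        # current one); if the requested begin time lies outside this page's time
        # span, the page counter is moved one step towards the past or future and
        # the caller retries. self.page is set to None when paging cannot continue.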
        url = EPISODEIDURL % (id, self.page)
        data = self.getPage(url, Headers)

        if data and isinstance(data, basestring):
            splog("getNextPage: basestring")
            data = self.parseNextPage(data)
            self.doCacheList(url, data)

        if data and isinstance(data, list):
            splog("getNextPage: list")

            trs = data
            # trs[x] = [None, u'31.10.2012', u'20:15\u201321:15 Uhr', u'ProSieben', u'8.', u'15', u'Richtungswechsel']

            yepisode = None
            ydelta = maxint

            #first = trs[0][2]
            #last = trs[-1][2]
            #print first[0:5]
            #print last[6:11]

            # trs[0] = first row, [2] = time span column, [0:5] = start time
            first = datetime.strptime( trs[0][2][0:5] + trs[0][1], "%H:%M%d.%m.%Y" )

            # trs[-1] = last row, [2] = time span column, [6:11] would be the end time
            #last = datetime.strptime( trs[-1][2][6:11] + trs[-1][1], "%H:%M%d.%m.%Y" )
            # Problem with midnight wrap-around, so use the start time here as well
            # Sa 30.11.2013 23:35 - 01:30 Uhr ProSieben 46 3. 13 Showdown 3
            last = datetime.strptime( trs[-1][2][0:5] + trs[-1][1], "%H:%M%d.%m.%Y" )

            #first = first - self.td_max_time_drift
            #last = last + self.td_max_time_drift


            if self.page != 0:
                new_page = (self.first != first or self.last != last)
                splog("getNextPage: first_on_prev_page, first, last_on_prev_page, last, if: ", self.first, first, self.last, last, new_page)
                self.first = first
                self.last = last
            else:
                new_page = True

            if new_page:
                test_future_timespan = ( (first-self.td_max_time_drift) <= self.begin and self.begin <= (last+self.td_max_time_drift) )
                test_past_timespan = ( (first+self.td_max_time_drift) >= self.begin and self.begin >= (last-self.td_max_time_drift) )

                splog("first_on_page, self.begin, last_on_page, if, if:", first, self.begin, last, test_future_timespan, test_past_timespan )
                if ( test_future_timespan or test_past_timespan ):
                    # Search this page for a matching datetime
                    for tds in trs:
                        if tds and len(tds) >= 6:  #7:
                            # Grey's Anatomy
                            # [None, u'31.10.2012', u'20:15\u201321:15 Uhr', u'ProSieben', u'8.', u'15', u'Richtungswechsel']
                            #
                            # Gute Zeiten
                            # [None, u'20.11.2012', u'06:40\u201307:20 Uhr', u'NDR', None, u'4187', u'Folge 4187']
                            # [None, u'01.12.2012', u'10:45\u201313:15 Uhr', u'RTL', None, u'5131', u'Folge 5131']
                            # [None, u'\xa0', None, u'5132', u'Folge 5132']
                            # [None, u'\xa0', None, u'5133', u'Folge 5133']
                            # [None, u'\xa0', None, u'5134', u'Folge 5134']
                            # [None, u'\xa0', None, u'5135', u'Folge 5135']

                            # Wahnfried
                            # [u'Sa', u'26.12.1987', u'\u2013', u'So', u'27.12.1987', u'1Plus', None]

                            # First part: date, times, channel
                            xdate, xbegin = tds[1:3]
                            #splog( "tds", tds )

                            #xend = xbegin[6:11]
                            xbegin = xbegin[0:5]
                            xbegin = datetime.strptime( xbegin+xdate, "%H:%M%d.%m.%Y" )
                            #xend = datetime.strptime( xend+xdate, "%H:%M%d.%m.%Y" )
                            #print "xbegin", xbegin

                            #Py2.6
                            delta = abs(self.begin - xbegin)
                            delta = delta.seconds + delta.days * 24 * 3600
                            #Py2.7: delta = abs(self.begin - xbegin).total_seconds()
                            splog(self.begin, xbegin, delta, self.max_time_drift)

                            if delta <= self.max_time_drift:

                                if self.compareChannels(self.service, tds[3]):

                                    if delta < ydelta:

                                        splog( "tds", len(tds), tds )
                                        if len(tds) >= 10:
                                            # Second part: season, episode, title(s)
                                            xseason = tds[7] or "1"
                                            xepisode = tds[8]
                                            xtitle = " ".join(tds[10:])  # Use all available titles
                                        elif len(tds) >= 7:
                                            # Second part: season, episode, title(s)
                                            xseason = tds[4]
                                            xepisode = tds[5]
                                            if xseason and xseason.find(".") != -1:
                                                xseason = xseason[:-1]
                                                xtitle = " ".join(tds[6:])  # Use all available titles
                                            else:
                                                xseason = "1"
                                                xtitle = " ".join(tds[6:])  # Use all available titles
                                        elif len(tds) == 6:
                                            xseason = "0"
                                            xepisode = "0"
                                            xtitle = tds[5]
                                        if xseason and xepisode and xtitle and self.series:

                                            # Handle encodings
                                            xtitle = str_to_utf8(xtitle)

                                            yepisode = (xseason, xepisode, xtitle, self.series)
                                            ydelta = delta

                                    else: #if delta >= ydelta:
                                        break

                                else:
                                    self.returnvalue = _("Check the channel name")

                            elif yepisode:
                                break

                    if yepisode:
                        return ( yepisode )

                else:
                    # TODO: calculate the next page directly from the first/last row datetimes
                    if not self.future:
                        if first > self.begin:
                            self.page -= 1
                            return

                    else:
                        if self.begin > last:
                            self.page += 1
                            return

        self.page = None
        return
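
# Illustrative usage sketch (not part of the plugin; in practice the SeriesPlugin
# core instantiates this identifier and supplies the service reference):
#   fs = Fernsehserien()
#   result = fs.getEpisode("Der Tatortreiniger", datetime(2013, 11, 30, 23, 35))
#   # result is a (season, episode, title, series) tuple on success,
#   # otherwise a message string, an empty list or None.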