SeriesPlugin 2.2.2: Bulk update
[enigma2-plugins.git] / seriesplugin / src / Identifiers / Fernsehserien.py
1 # -*- coding: utf-8 -*-
2 # by betonme @2012
3
4 import os, sys
5 import json
6 import re
7 import math
8
9 from sys import maxint
10
11 from Components.config import config
12 from Tools.BoundFunction import boundFunction
13
14 # Imports
15 from urllib import urlencode
16
17 from time import time
18 from datetime import datetime, timedelta
19
20 # Internal
21 from Plugins.Extensions.SeriesPlugin.IdentifierBase import IdentifierBase
22 from Plugins.Extensions.SeriesPlugin.Logger import splog
23
24 from bs4 import BeautifulSoup
25 from HTMLParser import HTMLParser
26
27 import codecs
28 utf8_encoder = codecs.getencoder("utf-8")
29
30
# Constants
# Series search endpoint; the urlencoded query string is appended directly.
SERIESLISTURL = "http://www.fernsehserien.de/suche?"
# Broadcast schedule of one series; filled with (series id, page number).
EPISODEIDURL = 'http://www.fernsehserien.de%s/sendetermine/%d'

# HTTP request headers passed along with every page request in this module.
Headers = {
        # Client identification
        'User-Agent': 'Mozilla/5.0',
        'Host': 'www.fernsehserien.de',
        'Connection': 'keep-alive',
        # Content negotiation
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        # NOTE(review): left empty, presumably to avoid compressed responses - confirm
        'Accept-Encoding': '',
        'Accept-Language': 'de-DE,de;q=0.8,en-US;q=0.6,en;q=0.4',
        # Ask for uncached content
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
}
46
47
def str_to_utf8(s):
        """Encode *s* with the module-level UTF-8 encoder and return the result.

        The value is logged (as ``repr``) before and after the conversion.
        """
        splog("FS: str_to_utf8: s: ", repr(s))
        # The encoder returns a (data, consumed_length) pair; only the data is used.
        # Python 2.x cannot convert the special characters natively.
        encoded, _consumed = utf8_encoder(s)
        splog("FS: str_to_utf8: s: ", repr(encoded))
        return encoded
55
56
class FSParser(HTMLParser):
        """HTMLParser that walks a fixed element path and collects table cell text.

        The hard-coded path is consumed step by step: each time the awaited
        tag opens, the next path step becomes the awaited tag. A step with an
        index such as ``div[2]`` first waits for ``index - 1`` closing tags
        of that name. Once the path is exhausted, ``start`` is set, and the
        text of every ``td`` inside a ``tr`` of any table opened afterwards
        is gathered per row into ``list``.

        Not instantiated anywhere in the visible part of this file.
        """

        def __init__(self):
                HTMLParser.__init__(self)
                # Hint: xpath from Firebug without tbody elements
                xpath = '/html/body/div[2]/div[2]/div/table/tr[3]/td/div/table[2]/tr/td'

                # Used as a stack: the next path step is always at the end.
                self.xpath = [step for step in reversed(xpath.split('/')) if step]

                self.lookfor = self.xpath.pop()
                self.waitforendtag = 0

                # Parser state flags
                self.start = False
                self.table = False
                self.tr = False
                self.td = False

                # Cells of the row currently being read / all finished rows
                self.data = []
                self.list = []

        def handle_starttag(self, tag, attributes):
                if self.waitforendtag == 0 and tag == self.lookfor:
                        if not self.xpath:
                                # Path completely matched
                                self.start = True
                        else:
                                step = self.xpath.pop()
                                pieces = step.split('[')
                                if len(pieces) == 2:
                                        # "name[index]": skip index - 1 siblings first
                                        step = pieces[0]
                                        self.waitforendtag = int(pieces[1].split(']')[0]) - 1
                                self.lookfor = step

                if tag == 'table' and self.start:
                        self.table = True

                if not self.table:
                        return
                if tag == 'td':
                        self.td = True
                elif tag == 'tr':
                        self.tr = True

        def handle_endtag(self, tag):
                if self.table:
                        if tag == 'tr':
                                # Row finished: store its cells and start a fresh row
                                self.tr = False
                                self.list.append(self.data)
                                self.data = []
                        elif tag == 'td':
                                self.td = False

                if tag == 'table':
                        self.table = False

                # Count down the siblings that have to be skipped
                if tag == self.lookfor and self.waitforendtag > 0:
                        self.waitforendtag -= 1

        def handle_data(self, data):
                if self.td and self.tr:
                        self.data.append(data)
115
116
class Fernsehserien(IdentifierBase):
        """Episode identifier backed by the broadcast schedule of fernsehserien.de.

        A show name is resolved to candidate series ids, then the schedule
        pages of each candidate are scanned for a broadcast whose start time
        and channel match the requested recording.
        """

        def __init__(self):
                IdentifierBase.__init__(self)

        @classmethod
        def knowsElapsed(cls):
                """Report that past broadcasts can be identified."""
                return True

        @classmethod
        def knowsToday(cls):
                """Report that broadcasts of the current day can be identified."""
                return True

        @classmethod
        def knowsFuture(cls):
                """Report that upcoming broadcasts can be identified."""
                return True

        def getEpisode(self, name, begin, end=None, service=None):
                """Look up the episode of show *name* broadcast at *begin*.

                name    -- show name used for the series search
                begin   -- broadcast start; compared against datetime objects
                end     -- stored on the instance, not evaluated in this class
                service -- channel, handed to compareChannels() for each row

                On success: returns a (season, episode, title, series) tuple.
                On failure: returns a translated message string.
                """
                self.begin = begin
                self.end = end
                self.service = service

                self.series = ""
                self.first = None
                self.last = None
                self.page = 0

                self.td_max_time_drift = timedelta(seconds=self.max_time_drift)

                self.knownids = []
                self.returnvalue = None

                # Check preconditions
                if not name:
                        message = _("Skip Fernsehserien: No show name specified")
                        splog(message)
                        return message
                if not begin:
                        message = _("Skip Fernsehserien: No begin timestamp specified")
                        splog(message)
                        return message

                self.future = self.begin > datetime.now()
                splog("Fernsehserien getEpisode future", self.future)

                while name:
                        ids = self.getSeries(name)

                        while ids:
                                idserie = ids.pop()
                                if not idserie or len(idserie) != 2:
                                        continue
                                seriesid, idname = idserie

                                # Handle encodings
                                self.series = str_to_utf8(idname)

                                # Every candidate series starts on page 0 with no
                                # remembered page bounds.
                                self.page = 0
                                self.first = None
                                self.last = None

                                # getNextPage() moves self.page and sets it to None
                                # when there is nothing left to look at.
                                while self.page is not None:
                                        result = self.getNextPage(seriesid)
                                        if result:
                                                return result

                        # All candidates exhausted: retry with an alternative name;
                        # a falsy alternative ends the search.
                        name = self.getAlternativeSeries(name)

                return self.returnvalue or _("No matching series found")

        def getSeries(self, name):
                """Return the candidate (id, name) pairs for show *name*.

                The search result is either a raw string, which is parsed and
                handed to doCacheList(), or already a list (presumably served
                from that cache - confirm in IdentifierBase). Returns the list
                after filterKnownIds(), or None when nothing usable came back.
                """
                # Everything except letters, digits and '*' becomes a blank.
                parameter = urlencode({'term': re.sub("[^a-zA-Z0-9*]", " ", name)})
                url = SERIESLISTURL + parameter
                data = self.getPage(url, Headers)

                if data and isinstance(data, basestring):
                        data = self.parseSeries(data)
                        self.doCacheList(url, data)

                if data and isinstance(data, list):
                        splog("Fernsehserien ids", data)
                        return self.filterKnownIds(data)

                return None

        def parseSeries(self, data):
                """Parse the JSON search response into a list of (id, name) pairs.

                The order of the response is reversed; as getEpisode() pops
                candidates from the end, the first response entry is tried
                first.
                """
                serieslist = []
                for line in json.loads(data):
                        seriesid = line['id']
                        idname = line['value']
                        splog(seriesid, idname)
                        # NOTE(review): the "/person" suffix is tested on the
                        # 'value' field, not on 'id' - confirm this is intended.
                        if not idname.endswith("/person"):
                                serieslist.append((seriesid, idname))
                serieslist.reverse()
                return serieslist

        def parseNextPage(self, data):
                """Extract the rows of the 'sendetermine' table from a schedule page.

                Returns a list of rows, each a list with the string of every
                cell ("" where a cell has no single string). A 5-cell row is
                treated as continuation of the previous row and merged into it.
                """
                trs = []

                # Handle malformed HTML issues
                data = data.replace('\\"', '"')  # target=\"_blank\"
                data = data.replace('\'+\'', '')  # document.write('<scr'+'ipt

                soup = BeautifulSoup(data)

                table = soup.find('table', 'sendetermine')
                if not table:
                        return trs

                for trnode in table.find_all('tr'):
                        tdnodes = trnode and trnode.find_all('td')
                        if not tdnodes:
                                continue

                        # Filter for known rows: the third cell has to hold a
                        # time span of at least 15 characters.
                        if len(tdnodes) >= 6 and tdnodes[2].string and len(tdnodes[2].string) >= 15:
                                trs.append([tdnode.string or "" for tdnode in tdnodes])

                        # This row belongs to the previous one
                        elif trs and len(tdnodes) == 5:
                                previous = trs[-1]
                                for target, source in ((5, 3), (6, 4)):
                                        # Accepted rows may have only six cells, so
                                        # index 6 does not always exist.
                                        if target < len(previous):
                                                previous[target] += ' ' + (tdnodes[source].string or "")

                return trs

        @staticmethod
        def _parseBegin(tds):
                """Return the broadcast start of row *tds* as datetime, or None if unparsable."""
                try:
                        # tds[1] is the date, tds[2] starts with the begin time "HH:MM".
                        # Only the begin time is used: the end time may already
                        # belong to the next day, e.g. 23:35 - 01:30.
                        return datetime.strptime(tds[2][0:5] + tds[1], "%H:%M%d.%m.%Y")
                except (IndexError, TypeError, ValueError):
                        splog("Fernsehserien: cannot parse date and time of row", tds)
                        return None

        @staticmethod
        def _extractEpisode(tds):
                """Return (season, episode, title) from row *tds* (at least six cells)."""
                if len(tds) >= 10:
                        # Use all available titles
                        return (tds[7] or "1", tds[8], " ".join(tds[10:]))

                if len(tds) >= 7:
                        # e.g. [.., u'ProSieben', u'8.', u'15', u'Richtungswechsel']
                        xseason = tds[4]
                        if xseason and xseason.find(".") != -1:
                                # Strip the trailing dot of "8."
                                xseason = xseason[:-1]
                        else:
                                xseason = "1"
                        # Use all available titles
                        return (xseason, tds[5], " ".join(tds[6:]))

                # Six cells: no season and episode information
                return ("0", "0", tds[5])

        def _findEpisode(self, trs):
                """Return the episode tuple of the row closest to self.begin, or None.

                Only rows within max_time_drift seconds whose channel is
                accepted by compareChannels() are considered.
                """
                yepisode = None
                ydelta = None  # smallest accepted time difference so far

                for tds in trs:
                        if not tds or len(tds) < 6:
                                continue

                        xbegin = self._parseBegin(tds)
                        if xbegin is None:
                                continue

                        # Py2.6 has no timedelta.total_seconds()
                        delta = abs(self.begin - xbegin)
                        delta = delta.seconds + delta.days * 24 * 3600
                        splog(self.begin, xbegin, delta, self.max_time_drift)

                        if delta > self.max_time_drift:
                                # Outside the window again after a hit: done
                                if yepisode:
                                        break
                                continue

                        if not self.compareChannels(self.service, tds[3]):
                                self.returnvalue = _("Check the channel name")
                                continue

                        if ydelta is not None and delta >= ydelta:
                                # The difference no longer shrinks: keep the stored hit
                                break

                        splog("tds", len(tds), tds)
                        xseason, xepisode, xtitle = self._extractEpisode(tds)
                        if xseason and xepisode and xtitle and self.series:
                                # Handle encodings
                                xtitle = str_to_utf8(xtitle)

                                yepisode = (xseason, xepisode, xtitle, self.series)
                                ydelta = delta

                return yepisode

        def getNextPage(self, id):
                """Scan schedule page self.page of series *id* for the wanted broadcast.

                id -- series id as used in the schedule URL (the name shadows
                      the builtin but is kept for compatibility)

                Returns the (season, episode, title, series) tuple on a match,
                otherwise None. As side effect self.page is moved to the page
                to visit next, or set to None when the search for this series
                is over.
                """
                url = EPISODEIDURL % (id, self.page)
                data = self.getPage(url, Headers)

                if data and isinstance(data, basestring):
                        splog("getNextPage: basestring")
                        data = self.parseNextPage(data)
                        self.doCacheList(url, data)

                if data and isinstance(data, list):
                        splog("getNextPage: list")

                        trs = data
                        # trs[x] = [None, u'31.10.2012', u'20:15\u201321:15 Uhr', u'ProSieben', u'8.', u'15', u'Richtungswechsel']

                        # Time range covered by this page: begin of first and last row
                        first = self._parseBegin(trs[0])
                        last = self._parseBegin(trs[-1])
                        if first is None or last is None:
                                self.page = None
                                return None

                        if self.page != 0:
                                # Identical bounds mean the same page was delivered again
                                new_page = (self.first != first or self.last != last)
                                splog("getNextPage: first_on_prev_page, first, last_on_prev_page, last, if: ", self.first, first, self.last, last, new_page)
                        else:
                                new_page = True

                        # Remember the bounds of every page, page 0 included, so
                        # that a repetition is noticed on the very next request.
                        self.first = first
                        self.last = last

                        if new_page:
                                drift = self.td_max_time_drift
                                test_future_timespan = (first - drift) <= self.begin <= (last + drift)
                                test_past_timespan = (first + drift) >= self.begin >= (last - drift)

                                splog("first_on_page, self.begin, last_on_page, if, if:", first, self.begin, last, test_future_timespan, test_past_timespan)
                                if test_future_timespan or test_past_timespan:
                                        # Search in page for matching datetime
                                        yepisode = self._findEpisode(trs)
                                        if yepisode:
                                                return yepisode

                                # TODO calculate next page : use firstrow lastrow datetime
                                elif not self.future:
                                        if first > self.begin:
                                                self.page -= 1
                                                return None

                                elif self.begin > last:
                                        self.page += 1
                                        return None

                self.page = None
                return None