youtube-dl/youtube_dl/extractor/arte.py

import re
import socket

from .common import InfoExtractor
from ..utils import (
    compat_http_client,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_request,

    ExtractorError,
    unified_strdate,
)

class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles "Plus7" catch-up videos hosted on videos.arte.tv (French and
    German sections).  Live stream pages (see ``_LIVE_URL``) are recognised
    but rejected with an ExtractorError, because live extraction is not
    implemented yet.
    """

    # URLs handled by this extractor: any page below /fr/videos/ or
    # /de/videos/ on videos.arte.tv.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Matched against the last path component of the URL to detect live pages.
    _LIVE_URL = r'index-[0-9]+\.html$'
    # Used to pull the language code out of a URL; mirrors _VALID_URL.
    _LANG_RE = r'videos\.arte\.tv/(fr|de)/videos/'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download ``url`` and return the raw response body.

        The body is returned exactly as ``read()`` delivers it (a byte
        string); callers that need text must decode it themselves.

        Raises ExtractorError if the download fails or if the URL is
        rejected as malformed.
        """
        try:
            # Request() is built inside the try block because on Python 3
            # it already raises ValueError for malformed URLs (on Python 2
            # that only happens later, in urlopen()).
            request = compat_urllib_request.Request(url)
            self.report_download_webpage(url)
            response = compat_urllib_request.urlopen(request)
            try:
                webpage = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download ``url`` and extract named values with a single regex.

        ``regex`` is searched (with ``regexFlags``) in the downloaded page.
        ``matchTuples`` is an iterable of ``(group, key, error_message)``
        triples: for each one, the regex group ``group`` is stored under
        ``key`` in the returned dict.

        Raises ExtractorError if the regex does not match at all, or with
        ``error_message`` if one of the requested groups did not take part
        in the match.
        """
        page = self.fetch_webpage(url)
        # read() yields a byte string; the patterns used by this extractor
        # are text patterns, and Python 3 refuses to mix the two.  Decode
        # once here so that every extracted value is text.  Undecodable
        # bytes are replaced rather than aborting the whole extraction.
        if isinstance(page, bytes):
            page = page.decode('utf-8', 'replace')

        mobj = re.search(regex, page, regexFlags)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        info = {}
        for (i, key, err) in matchTuples:
            value = mobj.group(i)
            if value is None:
                raise ExtractorError(err)
            info[key] = value

        return info

    # TODO implement Live Stream
    # def extractLiveStream(self, url):
    #     video_lang = url.split('/')[-4]
    #     info = self.grep_webpage(
    #         url,
    #         r'src="(.*?/videothek_js.*?\.js)',
    #         0,
    #         [
    #             (1, 'url', u'Invalid URL: %s' % url)
    #         ]
    #     )
    #     http_host = url.split('/')[2]
    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
    #     info = self.grep_webpage(
    #         next_url,
    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
    #             '(http://.*?\.swf).*?' +
    #             '(rtmp://.*?)\'',
    #         re.DOTALL,
    #         [
    #             (1, 'path',   u'could not extract video path: %s' % url),
    #             (2, 'player', u'could not extract video player: %s' % url),
    #             (3, 'url',    u'could not extract video url: %s' % url)
    #         ]
    #     )
    #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve a Plus7 video page to its metadata and HD stream URL.

        Three documents are fetched in turn:

        1. the video page, which names a "videoref" file URL in the
           ``movie`` player parameter;
        2. the videoref file, which holds one ``<video lang=... ref=...>``
           entry per language;
        3. the referenced per-language XML, which carries the video id,
           name, date and the ``quality="hd"`` stream URL.

        Returns the info dict for the video.  Raises ExtractorError if any
        of the documents cannot be fetched or does not have the expected
        content.
        """
        # Prefer reading the language straight from the URL structure; fall
        # back to the positional lookup for URLs that do not contain the
        # expected host/path prefix.
        lang_match = re.search(self._LANG_RE, url)
        if lang_match is not None:
            video_lang = lang_match.group(1)
        else:
            video_lang = url.split('/')[-3]

        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info['url'])

        info = self.grep_webpage(
            next_url,
            # The language is escaped so it can only ever match literally.
            r'<video lang="%s" ref="(http[^\'"&]*)' % re.escape(video_lang),
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info['url'])

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info['id'],
            'url':          compat_urllib_parse.unquote(info['url']),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info['date']),
            # grep_webpage already returns decoded text, so no further
            # decoding is needed (or possible on Python 3) here.
            'title':        info['title'],
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Extract the video behind ``url``.

        Returns a one-element list holding the info dict of the Plus7
        video.  Raises ExtractorError for live stream pages, which are not
        supported yet.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # TODO: dispatch to extractLiveStream(url) once it is implemented.
            raise ExtractorError(u'Arte live streams are not yet supported, sorry')

        return [self.extractPlus7Stream(url)]
Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00			`import re`
			`import socket`

			`from .common import InfoExtractor`
			`from ..utils import (`
			`compat_http_client,`
			`compat_str,`
			`compat_urllib_error,`
			`compat_urllib_parse,`
			`compat_urllib_request,`

			`ExtractorError,`
			`unified_strdate,`
			`)`

			`class ArteTvIE(InfoExtractor):`
			`"""arte.tv information extractor."""`

			`_VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'`
			`_LIVE_URL = r'index-[0-9]+\.html$'`

			`IE_NAME = u'arte.tv'`

			`def fetch_webpage(self, url):`
			`request = compat_urllib_request.Request(url)`
			`try:`
			`self.report_download_webpage(url)`
			`webpage = compat_urllib_request.urlopen(request).read()`
			`except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:`
			`raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))`
			`except ValueError as err:`
			`raise ExtractorError(u'Invalid URL: %s' % url)`
			`return webpage`

			`def grep_webpage(self, url, regex, regexFlags, matchTuples):`
			`page = self.fetch_webpage(url)`
			`mobj = re.search(regex, page, regexFlags)`
			`info = {}`

			`if mobj is None:`
			`raise ExtractorError(u'Invalid URL: %s' % url)`

			`for (i, key, err) in matchTuples:`
			`if mobj.group(i) is None:`
			`raise ExtractorError(err)`
			`else:`
			`info[key] = mobj.group(i)`

			`return info`

[arte] Mark dead code as such 2013-06-23 20:26:35 +02:00			`# TODO implement Live Stream`
			`# def extractLiveStream(self, url):`
			`# video_lang = url.split('/')[-4]`
			`# info = self.grep_webpage(`
			`# url,`
			`# r'src="(.*?/videothek_js.*?\.js)',`
			`# 0,`
			`# [`
			`# (1, 'url', u'Invalid URL: %s' % url)`
			`# ]`
			`# )`
			`# http_host = url.split('/')[2]`
			`# next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))`
			`# info = self.grep_webpage(`
			`# next_url,`
			`# r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +`
			`# '(http://.*?\.swf).*?' +`
			`# '(rtmp://.*?)\'',`
			`# re.DOTALL,`
			`# [`
			`# (1, 'path', u'could not extract video path: %s' % url),`
			`# (2, 'player', u'could not extract video player: %s' % url),`
			`# (3, 'url', u'could not extract video url: %s' % url)`
			`# ]`
			`# )`
			`# video_url = u'%s/%s' % (info.get('url'), info.get('path'))`
Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00
			`def extractPlus7Stream(self, url):`
			`video_lang = url.split('/')[-3]`
			`info = self.grep_webpage(`
			`url,`
			`r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',`
			`0,`
			`[`
			`(1, 'url', u'Invalid URL: %s' % url)`
			`]`
			`)`
			`next_url = compat_urllib_parse.unquote(info.get('url'))`
			`info = self.grep_webpage(`
			`next_url,`
			`r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,`
			`0,`
			`[`
			`(1, 'url', u'Could not find <video> tag: %s' % url)`
			`]`
			`)`
			`next_url = compat_urllib_parse.unquote(info.get('url'))`

			`info = self.grep_webpage(`
			`next_url,`
			`r'<video id="(.*?)".*?>.*?' +`
			`'<name>(.*?)</name>.*?' +`
			`'<dateVideo>(.*?)</dateVideo>.*?' +`
			`'<url quality="hd">(.*?)</url>',`
			`re.DOTALL,`
			`[`
			`(1, 'id', u'could not extract video id: %s' % url),`
			`(2, 'title', u'could not extract video title: %s' % url),`
			`(3, 'date', u'could not extract video date: %s' % url),`
			`(4, 'url', u'could not extract video url: %s' % url)`
			`]`
			`)`

			`return {`
			`'id': info.get('id'),`
			`'url': compat_urllib_parse.unquote(info.get('url')),`
			`'uploader': u'arte.tv',`
			`'upload_date': unified_strdate(info.get('date')),`
			`'title': info.get('title').decode('utf-8'),`
			`'ext': u'mp4',`
			`'format': u'NA',`
			`'player_url': None,`
			`}`

			`def _real_extract(self, url):`
			`video_id = url.split('/')[-1]`
			`self.report_extraction(video_id)`

			`if re.search(self._LIVE_URL, video_id) is not None:`
[arte] Mark dead code as such 2013-06-23 20:26:35 +02:00			`raise ExtractorError(u'Arte live streams are not yet supported, sorry')`
			`# self.extractLiveStream(url)`
			`# return`
Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00			`else:`
			`info = self.extractPlus7Stream(url)`

			`return [info]`