From aa016336a8023dfd2955c648519a083de3137705 Mon Sep 17 00:00:00 2001 From: Petar Kukolj Date: Wed, 20 Sep 2017 03:02:02 +0200 Subject: [PATCH] [LibriVox] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/librivox.py | 48 ++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 youtube_dl/extractor/librivox.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ab95c8575..eeaa4b8aa 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -527,6 +527,7 @@ from .leeco import ( LetvCloudIE, ) from .libraryofcongress import LibraryOfCongressIE +from .librivox import LibriVoxIE from .libsyn import LibsynIE from .lifenews import ( LifeNewsIE, diff --git a/youtube_dl/extractor/librivox.py b/youtube_dl/extractor/librivox.py new file mode 100644 index 000000000..ea5bffc1e --- /dev/null +++ b/youtube_dl/extractor/librivox.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + orderedSet +) + + +class LibriVoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?librivox\.org/(?P(?P(?:[^\-]*\-)+[^\-]*)\-by\-(?P<author>(-.*\-)*[^/]*))/?' + _TESTS = [{ + 'url': 'https://librivox.org/the-art-of-war-by-sun-tzu/', + 'info_dict': { + 'id': 'the-art-of-war-by-sun-tzu', + 'title': 'The Art Of War by Sun Tzu' + }, + 'playlist_mincount': 7 + }, { + 'url': 'https://librivox.org/alexander-the-great-by-jacob-abbott/', + 'info_dict': { + 'id': 'alexander-the-great-by-jacob-abbott', + 'title': 'Alexander The Great by Jacob Abbott' + }, + 'playlist_mincount': 12 + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + book_title = mobj.group('title').replace('-', ' ').strip().title() + author = mobj.group('author').replace('-', ' ').strip().title() + + info = { + 'id': video_id, + '_type': 'playlist', + 'title': book_title + ' by ' + author + } + + webpage = self._download_webpage(url, video_id) + + links = orderedSet(re.findall(r'<a href="(https?://(?:www\.)?archive\.org/download/[^/]*/([^\.]*(?<!(?:64kb)))\.mp3)".*>(.*)</a>', webpage)) + info['entries'] = [self.url_result(link[0], video_id=link[1], video_title=link[2]) for link in links] + + return info