From aa016336a8023dfd2955c648519a083de3137705 Mon Sep 17 00:00:00 2001
From: Petar Kukolj <petarkukolj3@yahoo.com>
Date: Wed, 20 Sep 2017 03:02:02 +0200
Subject: [PATCH] [LibriVox] Add new extractor

---
 youtube_dl/extractor/extractors.py |  1 +
 youtube_dl/extractor/librivox.py   | 48 ++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 youtube_dl/extractor/librivox.py
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index ab95c8575..eeaa4b8aa 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -527,6 +527,7 @@ from .leeco import (
     LetvCloudIE,
 )
 from .libraryofcongress import LibraryOfCongressIE
+from .librivox import LibriVoxIE
 from .libsyn import LibsynIE
 from .lifenews import (
     LifeNewsIE,
diff --git a/youtube_dl/extractor/librivox.py b/youtube_dl/extractor/librivox.py
new file mode 100644
index 000000000..ea5bffc1e
--- /dev/null
+++ b/youtube_dl/extractor/librivox.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    orderedSet
+)
+
+
+class LibriVoxIE(InfoExtractor):
+    """Playlist extractor for librivox.org book pages (``<title>-by-<author>`` slugs)."""
+    # The title group is greedy, so the slug is split at its LAST '-by-'.
+    # One-word titles match too; '/', '?' and '#' never end up in the id.
+    _VALID_URL = r'https?://(?:www\.)?librivox\.org/(?P<id>(?P<title>[^/?#]+)-by-(?P<author>[^/?#]+))/?'
+    _TESTS = [{
+        'url': 'https://librivox.org/the-art-of-war-by-sun-tzu/',
+        'info_dict': {
+            'id': 'the-art-of-war-by-sun-tzu',
+            'title': 'The Art Of War by Sun Tzu'
+        },
+        'playlist_mincount': 7
+    }, {
+        'url': 'https://librivox.org/alexander-the-great-by-jacob-abbott/',
+        'info_dict': {
+            'id': 'alexander-the-great-by-jacob-abbott',
+            'title': 'Alexander The Great by Jacob Abbott'
+        },
+        'playlist_mincount': 12
+    }, {
+        'url': 'https://librivox.org/walden-by-henry-david-thoreau/',
+        'only_matching': True
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist dict with one entry per archive.org MP3 link found on the page."""
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        book_title = mobj.group('title').replace('-', ' ').strip().title()
+        author = mobj.group('author').replace('-', ' ').strip().title()
+        webpage = self._download_webpage(url, video_id)
+        # '[^>]*' and the lazy '(.*?)' keep every hit inside a single <a>...</a>;
+        # the lookbehind skips files whose name ends in '64kb'.
+        links = orderedSet(re.findall(r'<a href="(https?://(?:www\.)?archive\.org/download/[^/]*/([^\.]*(?<!(?:64kb)))\.mp3)"[^>]*>(.*?)</a>', webpage))
+        info = {'_type': 'playlist', 'id': video_id, 'title': book_title + ' by ' + author}
+        info['entries'] = [self.url_result(link[0], video_id=link[1], video_title=link[2]) for link in links]
+        return info