1
0
mirror of https://codeberg.org/polarisfm/youtube-dl synced 2024-12-27 08:47:55 +01:00

Merge pull request #1 from rg3/master

update
This commit is contained in:
tsia 2019-03-03 13:56:59 +01:00 committed by GitHub
commit 07cc388fcc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
79 changed files with 2561 additions and 858 deletions

View File

@ -6,8 +6,8 @@
---
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.10**
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.03.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.03.01**
### Before submitting an *issue* make sure you have:
- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
[debug] youtube-dl version 2019.01.10
[debug] youtube-dl version 2019.03.01
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}

View File

@ -9,7 +9,6 @@ python:
- "3.6"
- "pypy"
- "pypy3"
sudo: false
env:
- YTDL_TEST_SET=core
- YTDL_TEST_SET=download

View File

@ -339,7 +339,7 @@ Incorrect:
'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4'
```
### Use safe conversion functions
### Use convenience conversion and parsing functions
Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well.
@ -347,6 +347,8 @@ Use `url_or_none` for safe URL processing.
Use `try_get` for safe metadata extraction from parsed JSON.
Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction.
Explore [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions.
#### More examples

172
ChangeLog
View File

@ -1,3 +1,175 @@
version 2019.03.01
Core
+ [downloader/external] Add support for rate limit and retries for wget
* [downloader/external] Fix infinite retries for curl (#19303)
Extractors
* [npo] Fix extraction (#20084)
* [francetv:site] Extend video id regex (#20029, #20071)
+ [periscope] Extract width and height (#20015)
* [servus] Fix extraction (#19297)
* [bbccouk] Make subtitles non fatal (#19651)
* [metacafe] Fix family filter bypass (#19287)
version 2019.02.18
Extractors
* [tvp:website] Fix and improve extraction
+ [tvp] Detect unavailable videos
* [tvp] Fix description extraction and make thumbnail optional
+ [linuxacademy] Add support for linuxacademy.com (#12207)
* [bilibili] Update keys (#19233)
* [udemy] Extend URL regular expressions (#14330, #15883)
* [udemy] Update User-Agent and detect captcha (#14713, #15839, #18126)
* [noovo] Fix extraction (#19230)
* [rai] Relax URL regular expression (#19232)
+ [vshare] Pass Referer to download request (#19205, #19221)
+ [openload] Add support for oload.live (#19222)
* [imgur] Use video id as title fallback (#18590)
+ [twitch] Add new source format detection approach (#19193)
* [tvplayhome] Fix video id extraction (#19190)
* [tvplayhome] Fix episode metadata extraction (#19190)
* [rutube:embed] Fix extraction (#19163)
+ [rutube:embed] Add support private videos (#19163)
+ [soundcloud] Extract more metadata
+ [trunews] Add support for trunews.com (#19153)
+ [linkedin:learning] Extract chapter_number and chapter_id (#19162)
version 2019.02.08
Core
* [utils] Improve JSON-LD regular expression (#18058)
* [YoutubeDL] Fallback to ie_key of matching extractor while making
download archive id when no explicit ie_key is provided (#19022)
Extractors
+ [malltv] Add support for mall.tv (#18058, #17856)
+ [spankbang:playlist] Add support for playlists (#19145)
* [spankbang] Extend URL regular expression
* [trutv] Fix extraction (#17336)
* [toutv] Fix authentication (#16398, #18700)
* [pornhub] Fix tags and categories extraction (#13720, #19135)
* [pornhd] Fix formats extraction
+ [pornhd] Extract like count (#19123, #19125)
* [radiocanada] Switch to the new media requests (#19115)
+ [teachable] Add support for courses.workitdaily.com (#18871)
- [vporn] Remove extractor (#16276)
+ [soundcloud:pagedplaylist] Add ie and title to entries (#19022, #19086)
+ [drtuber] Extract duration (#19078)
* [soundcloud] Fix paged playlists extraction, add support for albums and update client id
* [soundcloud] Update client id
* [drtv] Improve preference (#19079)
+ [openload] Add support for openload.pw and oload.pw (#18930)
+ [openload] Add support for oload.info (#19073)
* [crackle] Authorize media detail request (#16931)
version 2019.01.30.1
Core
* [postprocessor/ffmpeg] Fix avconv processing broken in #19025 (#19067)
version 2019.01.30
Core
* [postprocessor/ffmpeg] Do not copy Apple TV chapter tracks while embedding
subtitles (#19024, #19042)
* [postprocessor/ffmpeg] Disable "Last message repeated" messages (#19025)
Extractors
* [yourporn] Fix extraction and extract duration (#18815, #18852, #19061)
* [drtv] Improve extraction (#19039)
+ Add support for EncryptedUri videos
+ Extract more metadata
* Fix subtitles extraction
+ [fox] Add support for locked videos using cookies (#19060)
* [fox] Fix extraction for free videos (#19060)
+ [zattoo] Add support for tv.salt.ch (#19059)
version 2019.01.27
Core
+ [extractor/common] Extract season in _json_ld
* [postprocessor/ffmpeg] Fallback to ffmpeg/avconv for audio codec detection
(#681)
Extractors
* [vice] Fix extraction for locked videos (#16248)
+ [wakanim] Detect DRM protected videos
+ [wakanim] Add support for wakanim.tv (#14374)
* [usatoday] Fix extraction for videos with custom brightcove partner id
(#18990)
* [drtv] Fix extraction (#18989)
* [nhk] Extend URL regular expression (#18968)
* [go] Fix Adobe Pass requests for Disney Now (#18901)
+ [openload] Add support for oload.club (#18969)
version 2019.01.24
Core
* [YoutubeDL] Fix negation for string operators in format selection (#18961)
version 2019.01.23
Core
* [utils] Fix urljoin for paths with non-http(s) schemes
* [extractor/common] Improve jwplayer relative URL handling (#18892)
+ [YoutubeDL] Add negation support for string comparisons in format selection
expressions (#18600, #18805)
* [extractor/common] Improve HLS video-only format detection (#18923)
Extractors
* [crunchyroll] Extend URL regular expression (#18955)
* [pornhub] Bypass scrape detection (#4822, #5930, #7074, #10175, #12722,
#17197, #18338 #18842, #18899)
+ [vrv] Add support for authentication (#14307)
* [videomore:season] Fix extraction
* [videomore] Improve extraction (#18908)
+ [tnaflix] Pass Referer in metadata request (#18925)
* [radiocanada] Relax DRM check (#18608, #18609)
* [vimeo] Fix video password verification for videos protected by
Referer HTTP header
+ [hketv] Add support for hkedcity.net (#18696)
+ [streamango] Add support for fruithosts.net (#18710)
+ [instagram] Add support for tags (#18757)
+ [odnoklassniki] Detect paid videos (#18876)
* [ted] Correct acodec for HTTP formats (#18923)
* [cartoonnetwork] Fix extraction (#15664, #17224)
* [vimeo] Fix extraction for password protected player URLs (#18889)
version 2019.01.17
Extractors
* [youtube] Extend JS player signature function name regular expressions
(#18890, #18891, #18893)
version 2019.01.16
Core
+ [test/helper] Add support for maxcount and count collection len checkers
* [downloader/hls] Fix uplynk ad skipping (#18824)
* [postprocessor/ffmpeg] Improve ffmpeg version parsing (#18813)
Extractors
* [youtube] Skip unsupported adaptive stream type (#18804)
+ [youtube] Extract DASH formats from player response (#18804)
* [funimation] Fix extraction (#14089)
* [skylinewebcams] Fix extraction (#18853)
+ [curiositystream] Add support for non app URLs
+ [bitchute] Check formats (#18833)
* [wistia] Extend URL regular expression (#18823)
+ [playplustv] Add support for playplus.com (#18789)
version 2019.01.10
Core

View File

@ -667,7 +667,7 @@ The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, `
- `asr`: Audio sampling rate in Hertz
- `fps`: Frame rate
Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begins with), `$=` (ends with), `*=` (contains) and following string meta fields:
Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends with), `*=` (contains) and following string meta fields:
- `ext`: File extension
- `acodec`: Name of the audio codec in use
- `vcodec`: Name of the video codec in use
@ -675,6 +675,8 @@ Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begin
- `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`)
- `format_id`: A short description of the format
Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain).
Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the video hoster.
Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s.
@ -1211,7 +1213,7 @@ Incorrect:
'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4'
```
### Use safe conversion functions
### Use convenience conversion and parsing functions
Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well.
@ -1219,6 +1221,8 @@ Use `url_or_none` for safe URL processing.
Use `try_get` for safe metadata extraction from parsed JSON.
Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction.
Explore [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions.
#### More examples

View File

@ -361,6 +361,7 @@
- **hitbox**
- **hitbox:live**
- **HitRecord**
- **hketv**: 香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau
- **HornBunny**
- **HotNewHipHop**
- **hotstar**
@ -386,6 +387,7 @@
- **IndavideoEmbed**
- **InfoQ**
- **Instagram**
- **instagram:tag**: Instagram hashtag search
- **instagram:user**: Instagram user profile
- **Internazionale**
- **InternetVideoArchive**
@ -456,6 +458,7 @@
- **LineTV**
- **linkedin:learning**
- **linkedin:learning:course**
- **LinuxAcademy**
- **LiTV**
- **LiveLeak**
- **LiveLeakEmbed**
@ -474,6 +477,7 @@
- **mailru:music**: Музыка@Mail.Ru
- **mailru:music:search**: Музыка@Mail.Ru
- **MakerTV**
- **MallTV**
- **mangomolo:live**
- **mangomolo:video**
- **ManyVids**
@ -544,6 +548,7 @@
- **MyVisionTV**
- **n-tv.de**
- **natgeo:video**
- **NationalGeographicTV**
- **Naver**
- **NBA**
- **NBC**
@ -774,6 +779,7 @@
- **safari:api**
- **safari:course**: safaribooksonline.com online courses
- **SAKTV**
- **SaltTV**
- **Sapo**: SAPO Vídeos
- **savefrom.net**
- **SBS**: sbs.com.au
@ -823,6 +829,7 @@
- **southpark.nl**
- **southparkstudios.dk**
- **SpankBang**
- **SpankBangPlaylist**
- **Spankwire**
- **Spiegel**
- **Spiegel:Article**: Articles on spiegel.de
@ -909,6 +916,7 @@
- **ToypicsUser**: Toypics user profile
- **TrailerAddict** (Currently broken)
- **Trilulilu**
- **TruNews**
- **TruTV**
- **Tube8**
- **TubiTv**
@ -1053,7 +1061,6 @@
- **Voot**
- **VoxMedia**
- **VoxMediaVolume**
- **Vporn**
- **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
- **Vrak**
- **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be
@ -1067,6 +1074,7 @@
- **VVVVID**
- **VyboryMos**
- **Vzaar**
- **Wakanim**
- **Walla**
- **WalyTV**
- **washingtonpost**

View File

@ -153,15 +153,27 @@ def expect_value(self, got, expected, field):
isinstance(got, compat_str),
'Expected field %s to be a unicode object, but got value %r of type %r' % (field, got, type(got)))
got = 'md5:' + md5(got)
elif isinstance(expected, compat_str) and expected.startswith('mincount:'):
elif isinstance(expected, compat_str) and re.match(r'^(?:min|max)?count:\d+', expected):
self.assertTrue(
isinstance(got, (list, dict)),
'Expected field %s to be a list or a dict, but it is of type %s' % (
field, type(got).__name__))
expected_num = int(expected.partition(':')[2])
assertGreaterEqual(
op, _, expected_num = expected.partition(':')
expected_num = int(expected_num)
if op == 'mincount':
assert_func = assertGreaterEqual
msg_tmpl = 'Expected %d items in field %s, but only got %d'
elif op == 'maxcount':
assert_func = assertLessEqual
msg_tmpl = 'Expected maximum %d items in field %s, but got %d'
elif op == 'count':
assert_func = assertEqual
msg_tmpl = 'Expected exactly %d items in field %s, but got %d'
else:
assert False
assert_func(
self, len(got), expected_num,
'Expected %d items in field %s, but only got %d' % (expected_num, field, len(got)))
msg_tmpl % (expected_num, field, len(got)))
return
self.assertEqual(
expected, got,
@ -237,6 +249,20 @@ def assertGreaterEqual(self, got, expected, msg=None):
self.assertTrue(got >= expected, msg)
def assertLessEqual(self, got, expected, msg=None):
if not (got <= expected):
if msg is None:
msg = '%r not less than or equal to %r' % (got, expected)
self.assertTrue(got <= expected, msg)
def assertEqual(self, got, expected, msg=None):
if not (got == expected):
if msg is None:
msg = '%r not equal to %r' % (got, expected)
self.assertTrue(got == expected, msg)
def expect_warnings(ydl, warnings_re):
real_warning = ydl.report_warning

View File

@ -61,6 +61,7 @@ class TestInfoExtractor(unittest.TestCase):
<meta content='Foo' property=og:foobar>
<meta name="og:test1" content='foo > < bar'/>
<meta name="og:test2" content="foo >//< bar"/>
<meta property=og-test3 content='Ill-formatted opengraph'/>
'''
self.assertEqual(ie._og_search_title(html), 'Foo')
self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
@ -69,6 +70,7 @@ class TestInfoExtractor(unittest.TestCase):
self.assertEqual(ie._og_search_property('foobar', html), 'Foo')
self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar')
self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar')
self.assertEqual(ie._og_search_property('test3', html), 'Ill-formatted opengraph')
self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar')
self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True)
self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True)
@ -497,7 +499,64 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'width': 1280,
'height': 720,
}]
)
),
(
# https://github.com/rg3/youtube-dl/issues/18923
# https://www.ted.com/talks/boris_hesser_a_grassroots_healthcare_revolution_in_africa
'ted_18923',
'http://hls.ted.com/talks/31241.m3u8',
[{
'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b',
'format_id': '600k-Audio',
'vcodec': 'none',
}, {
'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b',
'format_id': '68',
'vcodec': 'none',
}, {
'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b',
'format_id': '163',
'acodec': 'none',
'width': 320,
'height': 180,
}, {
'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b',
'format_id': '481',
'acodec': 'none',
'width': 512,
'height': 288,
}, {
'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b',
'format_id': '769',
'acodec': 'none',
'width': 512,
'height': 288,
}, {
'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b',
'format_id': '984',
'acodec': 'none',
'width': 512,
'height': 288,
}, {
'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b',
'format_id': '1255',
'acodec': 'none',
'width': 640,
'height': 360,
}, {
'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b',
'format_id': '1693',
'acodec': 'none',
'width': 853,
'height': 480,
}, {
'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b',
'format_id': '2462',
'acodec': 'none',
'width': 1280,
'height': 720,
}]
),
]
for m3u8_file, m3u8_url, expected_formats in _TEST_CASES:

View File

@ -239,6 +239,76 @@ class TestFormatSelection(unittest.TestCase):
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], 'vid-vcodec-dot')
def test_format_selection_string_ops(self):
formats = [
{'format_id': 'abc-cba', 'ext': 'mp4', 'url': TEST_URL},
{'format_id': 'zxc-cxz', 'ext': 'webm', 'url': TEST_URL},
]
info_dict = _make_result(formats)
# equals (=)
ydl = YDL({'format': '[format_id=abc-cba]'})
ydl.process_ie_result(info_dict.copy())
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], 'abc-cba')
# does not equal (!=)
ydl = YDL({'format': '[format_id!=abc-cba]'})
ydl.process_ie_result(info_dict.copy())
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], 'zxc-cxz')
ydl = YDL({'format': '[format_id!=abc-cba][format_id!=zxc-cxz]'})
self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
# starts with (^=)
ydl = YDL({'format': '[format_id^=abc]'})
ydl.process_ie_result(info_dict.copy())
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], 'abc-cba')
# does not start with (!^=)
ydl = YDL({'format': '[format_id!^=abc]'})
ydl.process_ie_result(info_dict.copy())
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], 'zxc-cxz')
ydl = YDL({'format': '[format_id!^=abc][format_id!^=zxc]'})
self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
# ends with ($=)
ydl = YDL({'format': '[format_id$=cba]'})
ydl.process_ie_result(info_dict.copy())
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], 'abc-cba')
# does not end with (!$=)
ydl = YDL({'format': '[format_id!$=cba]'})
ydl.process_ie_result(info_dict.copy())
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], 'zxc-cxz')
ydl = YDL({'format': '[format_id!$=cba][format_id!$=cxz]'})
self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
# contains (*=)
ydl = YDL({'format': '[format_id*=bc-cb]'})
ydl.process_ie_result(info_dict.copy())
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], 'abc-cba')
# does not contain (!*=)
ydl = YDL({'format': '[format_id!*=bc-cb]'})
ydl.process_ie_result(info_dict.copy())
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], 'zxc-cxz')
ydl = YDL({'format': '[format_id!*=abc][format_id!*=zxc]'})
self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
ydl = YDL({'format': '[format_id!*=-]'})
self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
def test_youtube_format_selection(self):
order = [
'38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13',

View File

@ -29,6 +29,16 @@ class TestYoutubeDLCookieJar(unittest.TestCase):
tf.close()
os.remove(tf.name)
def test_strip_httponly_prefix(self):
cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt')
cookiejar.load(ignore_discard=True, ignore_expires=True)
def assert_cookie_has_value(key):
self.assertEqual(cookiejar._cookies['www.foobar.foobar']['/'][key].value, key + '_VALUE')
assert_cookie_has_value('HTTPONLY_COOKIE')
assert_cookie_has_value('JS_ACCESSIBLE_COOKIE')
if __name__ == '__main__':
unittest.main()

View File

@ -507,6 +507,8 @@ class TestUtil(unittest.TestCase):
self.assertEqual(urljoin('http://foo.de/', ''), None)
self.assertEqual(urljoin('http://foo.de/', ['foobar']), None)
self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt')
self.assertEqual(urljoin('http://foo.de/a/b/c.txt', 'rtmp://foo.de'), 'rtmp://foo.de')
self.assertEqual(urljoin(None, 'rtmp://foo.de'), 'rtmp://foo.de')
def test_url_or_none(self):
self.assertEqual(url_or_none(None), None)

View File

@ -0,0 +1,6 @@
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.
#HttpOnly_www.foobar.foobar FALSE / TRUE 2147483647 HTTPONLY_COOKIE HTTPONLY_COOKIE_VALUE
www.foobar.foobar FALSE / TRUE 2147483647 JS_ACCESSIBLE_COOKIE JS_ACCESSIBLE_COOKIE_VALUE

28
test/testdata/m3u8/ted_18923.m3u8 vendored Normal file
View File

@ -0,0 +1,28 @@
#EXTM3U
#EXT-X-VERSION:4
#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1255659,PROGRAM-ID=1,CODECS="avc1.42c01e,mp4a.40.2",RESOLUTION=640x360
/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b
#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=163154,PROGRAM-ID=1,CODECS="avc1.42c00c,mp4a.40.2",RESOLUTION=320x180
/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b
#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=481701,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288
/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b
#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=769968,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288
/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b
#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=984037,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288
/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b
#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1693925,PROGRAM-ID=1,CODECS="avc1.4d401f,mp4a.40.2",RESOLUTION=853x480
/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b
#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=2462469,PROGRAM-ID=1,CODECS="avc1.640028,mp4a.40.2",RESOLUTION=1280x720
/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b
#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=68101,PROGRAM-ID=1,CODECS="mp4a.40.2",DEFAULT=YES
/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b
#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=74298,PROGRAM-ID=1,CODECS="avc1.42c00c",RESOLUTION=320x180,URI="/videos/BorisHesser_2018S/video/64k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=216200,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/180k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=304717,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/320k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=350933,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/450k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=495850,PROGRAM-ID=1,CODECS="avc1.42c01e",RESOLUTION=640x360,URI="/videos/BorisHesser_2018S/video/600k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=810750,PROGRAM-ID=1,CODECS="avc1.4d401f",RESOLUTION=853x480,URI="/videos/BorisHesser_2018S/video/950k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=1273700,PROGRAM-ID=1,CODECS="avc1.640028",RESOLUTION=1280x720,URI="/videos/BorisHesser_2018S/video/1500k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="600k",LANGUAGE="en",NAME="Audio",AUTOSELECT=YES,DEFAULT=YES,URI="/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b",BANDWIDTH=614400

View File

@ -82,6 +82,7 @@ from .utils import (
sanitize_url,
sanitized_Request,
std_headers,
str_or_none,
subtitles_filename,
UnavailableVideoError,
url_basename,
@ -1063,21 +1064,24 @@ class YoutubeDL(object):
if not m:
STR_OPERATORS = {
'=': operator.eq,
'!=': operator.ne,
'^=': lambda attr, value: attr.startswith(value),
'$=': lambda attr, value: attr.endswith(value),
'*=': lambda attr, value: value in attr,
}
str_operator_rex = re.compile(r'''(?x)
\s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
\s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
\s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?
\s*(?P<value>[a-zA-Z0-9._-]+)
\s*$
''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
m = str_operator_rex.search(filter_spec)
if m:
comparison_value = m.group('value')
op = STR_OPERATORS[m.group('op')]
str_op = STR_OPERATORS[m.group('op')]
if m.group('negation'):
op = lambda attr, value: not str_op(attr, value)
else:
op = str_op
if not m:
raise ValueError('Invalid filter specification %r' % filter_spec)
@ -2057,15 +2061,24 @@ class YoutubeDL(object):
self.report_warning('Unable to remove downloaded original file')
def _make_archive_id(self, info_dict):
video_id = info_dict.get('id')
if not video_id:
return
# Future-proof against any change in case
# and backwards compatibility with prior versions
extractor = info_dict.get('extractor_key')
extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
if extractor is None:
if 'id' in info_dict:
extractor = info_dict.get('ie_key') # key in a playlist
if extractor is None:
return None # Incomplete video information
return extractor.lower() + ' ' + info_dict['id']
url = str_or_none(info_dict.get('url'))
if not url:
return
# Try to find matching extractor for the URL and take its ie_key
for ie in self._ies:
if ie.suitable(url):
extractor = ie.ie_key()
break
else:
return
return extractor.lower() + ' ' + video_id
def in_download_archive(self, info_dict):
fn = self.params.get('download_archive')
@ -2073,7 +2086,7 @@ class YoutubeDL(object):
return False
vid_id = self._make_archive_id(info_dict)
if vid_id is None:
if not vid_id:
return False # Incomplete video information
try:

View File

@ -121,7 +121,11 @@ class CurlFD(ExternalFD):
cmd += self._valueless_option('--silent', 'noprogress')
cmd += self._valueless_option('--verbose', 'verbose')
cmd += self._option('--limit-rate', 'ratelimit')
cmd += self._option('--retry', 'retries')
retry = self._option('--retry', 'retries')
if len(retry) == 2:
if retry[1] in ('inf', 'infinite'):
retry[1] = '2147483647'
cmd += retry
cmd += self._option('--max-filesize', 'max_filesize')
cmd += self._option('--interface', 'source_address')
cmd += self._option('--proxy', 'proxy')
@ -160,6 +164,12 @@ class WgetFD(ExternalFD):
cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
for key, val in info_dict['http_headers'].items():
cmd += ['--header', '%s: %s' % (key, val)]
cmd += self._option('--limit-rate', 'ratelimit')
retry = self._option('--tries', 'retries')
if len(retry) == 2:
if retry[1] in ('inf', 'infinite'):
retry[1] = '0'
cmd += retry
cmd += self._option('--bind-address', 'source_address')
cmd += self._option('--proxy', 'proxy')
cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate')

View File

@ -1,8 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import itertools
import re
import xml
from .common import InfoExtractor
from ..utils import (
@ -17,6 +18,7 @@ from ..utils import (
parse_iso8601,
try_get,
unescapeHTML,
url_or_none,
urlencode_postdata,
urljoin,
)
@ -310,7 +312,13 @@ class BBCCoUkIE(InfoExtractor):
def _get_subtitles(self, media, programme_id):
subtitles = {}
for connection in self._extract_connections(media):
captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
cc_url = url_or_none(connection.get('href'))
if not cc_url:
continue
captions = self._download_xml(
cc_url, programme_id, 'Downloading captions', fatal=False)
if not isinstance(captions, xml.etree.ElementTree.Element):
continue
lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
subtitles[lang] = [
{

View File

@ -93,8 +93,8 @@ class BiliBiliIE(InfoExtractor):
}]
}]
_APP_KEY = '84956560bc028eb7'
_BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e'
_APP_KEY = 'iVGUTjsxvpLeuDCf'
_BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'
def _report_error(self, result):
if 'message' in result:

View File

@ -1,20 +1,19 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .turner import TurnerBaseIE
from ..utils import int_or_none
class CartoonNetworkIE(TurnerBaseIE):
_VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P<id>[^/?#]+)-(?:clip|episode)\.html'
_TEST = {
'url': 'http://www.cartoonnetwork.com/video/teen-titans-go/starfire-the-cat-lady-clip.html',
'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html',
'info_dict': {
'id': '8a250ab04ed07e6c014ef3f1e2f9016c',
'id': '6e3375097f63874ebccec7ef677c1c3845fa850e',
'ext': 'mp4',
'title': 'Starfire the Cat Lady',
'description': 'Robin decides to become a cat so that Starfire will finally love him.',
'title': 'How to Draw Upgrade',
'description': 'md5:2061d83776db7e8be4879684eefe8c0f',
},
'params': {
# m3u8 download
@ -25,18 +24,39 @@ class CartoonNetworkIE(TurnerBaseIE):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
id_type, video_id = re.search(r"_cnglobal\.cvp(Video|Title)Id\s*=\s*'([^']+)';", webpage).groups()
query = ('id' if id_type == 'Video' else 'titleId') + '=' + video_id
return self._extract_cvp_info(
'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?networkName=CN2&' + query, video_id, {
'secure': {
'media_src': 'http://androidhls-secure.cdn.turner.com/toon/big',
'tokenizer_src': 'https://token.vgtf.net/token/token_mobile',
},
}, {
def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False):
metadata_re = ''
if content_re:
metadata_re = r'|video_metadata\.content_' + content_re
return self._search_regex(
r'(?:_cnglobal\.currentVideo\.%s%s)\s*=\s*"(%s)";' % (global_re, metadata_re, value_re),
webpage, name, fatal=fatal)
media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True)
title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True)
info = self._extract_ngtv_info(
media_id, {'networkId': 'cartoonnetwork'}, {
'url': url,
'site_name': 'CartoonNetwork',
'auth_required': self._search_regex(
r'_cnglobal\.cvpFullOrPreviewAuth\s*=\s*(true|false);',
webpage, 'auth required', default='false') == 'true',
'auth_required': find_field('authType', 'auth type') != 'unauth',
})
series = find_field(
'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage)
info.update({
'id': media_id,
'display_id': display_id,
'title': title,
'description': self._html_search_meta('description', webpage),
'series': series,
'episode': title,
})
for field in ('season', 'episode'):
field_name = field + 'Number'
info[field + '_number'] = int_or_none(find_field(
field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage))
return info

View File

@ -1058,7 +1058,7 @@ class InfoExtractor(object):
@staticmethod
def _og_regexes(prop):
content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
% {'prop': re.escape(prop)})
template = r'<meta[^>]+?%s[^>]+?%s'
return [
@ -1249,7 +1249,10 @@ class InfoExtractor(object):
info['title'] = episode_name
part_of_season = e.get('partOfSeason')
if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
info.update({
'season': unescapeHTML(part_of_season.get('name')),
'season_number': int_or_none(part_of_season.get('seasonNumber')),
})
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
info['series'] = unescapeHTML(part_of_series.get('name'))
@ -1596,6 +1599,7 @@ class InfoExtractor(object):
# References:
# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
# 2. https://github.com/rg3/youtube-dl/issues/12211
# 3. https://github.com/rg3/youtube-dl/issues/18923
# We should try extracting formats only from master playlists [1, 4.3.4],
# i.e. playlists that describe available qualities. On the other hand
@ -1667,11 +1671,16 @@ class InfoExtractor(object):
rendition = stream_group[0]
return rendition.get('NAME') or stream_group_id
# parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
# chance to detect video only formats when EXT-X-STREAM-INF tags
# precede EXT-X-MEDIA tags in HLS manifest such as [3].
for line in m3u8_doc.splitlines():
if line.startswith('#EXT-X-MEDIA:'):
extract_media(line)
for line in m3u8_doc.splitlines():
if line.startswith('#EXT-X-STREAM-INF:'):
last_stream_inf = parse_m3u8_attributes(line)
elif line.startswith('#EXT-X-MEDIA:'):
extract_media(line)
elif line.startswith('#') or not line.strip():
continue
else:
@ -2624,7 +2633,7 @@ class InfoExtractor(object):
'id': this_video_id,
'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
'description': video_data.get('description'),
'thumbnail': self._proto_relative_url(video_data.get('image')),
'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
'timestamp': int_or_none(video_data.get('pubdate')),
'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
'subtitles': subtitles,
@ -2651,12 +2660,9 @@ class InfoExtractor(object):
for source in jwplayer_sources_data:
if not isinstance(source, dict):
continue
source_url = self._proto_relative_url(source.get('file'))
if not source_url:
continue
if base_url:
source_url = compat_urlparse.urljoin(base_url, source_url)
if source_url in urls:
source_url = urljoin(
base_url, self._proto_relative_url(source.get('file')))
if not source_url or source_url in urls:
continue
urls.append(source_url)
source_type = source.get('type') or ''

View File

@ -1,7 +1,10 @@
# coding: utf-8
from __future__ import unicode_literals, division
import hashlib
import hmac
import re
import time
from .common import InfoExtractor
from ..compat import compat_HTTPError
@ -74,13 +77,16 @@ class CrackleIE(InfoExtractor):
for country in countries:
try:
# Authorization generation algorithm is reverse engineered from:
# https://www.sonycrackle.com/static/js/main.ea93451f.chunk.js
media_detail_url = 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s?disableProtocols=true' % (video_id, country)
timestamp = time.strftime('%Y%m%d%H%M', time.gmtime())
h = hmac.new(b'IGSLUQCBDFHEOIFM', '|'.join([media_detail_url, timestamp]).encode(), hashlib.sha1).hexdigest().upper()
media = self._download_json(
'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s'
% (video_id, country), video_id,
'Downloading media JSON as %s' % country,
'Unable to download media JSON', query={
'disableProtocols': 'true',
'format': 'json'
media_detail_url, video_id, 'Downloading media JSON as %s' % country,
'Unable to download media JSON', headers={
'Accept': 'application/json',
'Authorization': '|'.join([h, timestamp, '117', '1']),
})
except ExtractorError as e:
# 401 means geo restriction, trying next country

View File

@ -56,22 +56,11 @@ class CrunchyrollBaseIE(InfoExtractor):
if username is None:
return
self._download_webpage(
'https://www.crunchyroll.com/?a=formhandler',
None, 'Logging in', 'Wrong login info',
data=urlencode_postdata({
'formname': 'RpcApiUser_Login',
'next_url': 'https://www.crunchyroll.com/acct/membership',
'name': username,
'password': password,
}))
'''
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
def is_logged(webpage):
return '<title>Redirecting' in webpage
return 'href="/logout"' in webpage
# Already logged in
if is_logged(login_page):
@ -110,7 +99,6 @@ class CrunchyrollBaseIE(InfoExtractor):
raise ExtractorError('Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
'''
def _real_initialize(self):
self._login()
@ -144,7 +132,7 @@ class CrunchyrollBaseIE(InfoExtractor):
class CrunchyrollIE(CrunchyrollBaseIE, VRVIE):
IE_NAME = 'crunchyroll'
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
_TESTS = [{
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
'info_dict': {
@ -269,6 +257,9 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE):
}, {
'url': 'http://www.crunchyroll.com/media-723735',
'only_matching': True,
}, {
'url': 'https://www.crunchyroll.com/en-gb/mob-psycho-100/episode-2-urban-legends-encountering-rumors-780921',
'only_matching': True,
}]
_FORMAT_IDS = {

View File

@ -4,7 +4,9 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
NO_DEFAULT,
parse_duration,
str_to_int,
)
@ -65,6 +67,9 @@ class DrTuberIE(InfoExtractor):
})
self._sort_formats(formats)
duration = int_or_none(video_data.get('duration')) or parse_duration(
video_data.get('duration_format'))
title = self._html_search_regex(
(r'<h1[^>]+class=["\']title[^>]+>([^<]+)',
r'<title>([^<]+)\s*@\s+DrTuber',
@ -103,4 +108,5 @@ class DrTuberIE(InfoExtractor):
'comment_count': comment_count,
'categories': categories,
'age_limit': self._rta_search(webpage),
'duration': duration,
}

View File

@ -1,15 +1,25 @@
# coding: utf-8
from __future__ import unicode_literals
import binascii
import hashlib
import re
from .common import InfoExtractor
from ..aes import aes_cbc_decrypt
from ..compat import compat_urllib_parse_unquote
from ..utils import (
bytes_to_intlist,
ExtractorError,
int_or_none,
intlist_to_bytes,
float_or_none,
mimetype2ext,
parse_iso8601,
remove_end,
str_or_none,
unified_timestamp,
update_url_query,
url_or_none,
)
@ -20,23 +30,31 @@ class DRTVIE(InfoExtractor):
IE_NAME = 'drtv'
_TESTS = [{
'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10',
'md5': '7ae17b4e18eb5d29212f424a7511c184',
'md5': '25e659cccc9a2ed956110a299fdf5983',
'info_dict': {
'id': 'klassen-darlig-taber-10',
'ext': 'mp4',
'title': 'Klassen - Dårlig taber (10)',
'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa',
'timestamp': 1471991907,
'upload_date': '20160823',
'timestamp': 1539085800,
'upload_date': '20181009',
'duration': 606.84,
'series': 'Klassen',
'season': 'Klassen I',
'season_number': 1,
'season_id': 'urn:dr:mu:bundle:57d7e8216187a4031cfd6f6b',
'episode': 'Episode 10',
'episode_number': 10,
'release_year': 2016,
},
'expected_warnings': ['Unable to download f4m manifest'],
}, {
# embed
'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang',
'info_dict': {
'id': 'christiania-pusher-street-ryddes-drdkrjpo',
'id': 'urn:dr:mu:programcard:57c926176187a50a9c6e83c6',
'ext': 'mp4',
'title': 'LIVE Christianias rydning af Pusher Street er i gang',
'title': 'christiania pusher street ryddes drdkrjpo',
'description': 'md5:2a71898b15057e9b97334f61d04e6eb5',
'timestamp': 1472800279,
'upload_date': '20160902',
@ -45,17 +63,18 @@ class DRTVIE(InfoExtractor):
'params': {
'skip_download': True,
},
'expected_warnings': ['Unable to download f4m manifest'],
}, {
# with SignLanguage formats
'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder',
'info_dict': {
'id': 'historien-om-danmark-stenalder',
'ext': 'mp4',
'title': 'Historien om Danmark: Stenalder (1)',
'title': 'Historien om Danmark: Stenalder',
'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a',
'timestamp': 1490401996,
'upload_date': '20170325',
'duration': 3502.04,
'timestamp': 1546628400,
'upload_date': '20190104',
'duration': 3502.56,
'formats': 'mincount:20',
},
'params': {
@ -74,20 +93,26 @@ class DRTVIE(InfoExtractor):
video_id = self._search_regex(
(r'data-(?:material-identifier|episode-slug)="([^"]+)"',
r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'),
webpage, 'video id')
r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'),
webpage, 'video id', default=None)
programcard = self._download_json(
'http://www.dr.dk/mu/programcard/expanded/%s' % video_id,
video_id, 'Downloading video JSON')
data = programcard['Data'][0]
if not video_id:
video_id = compat_urllib_parse_unquote(self._search_regex(
r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)',
webpage, 'urn'))
title = remove_end(self._og_search_title(
webpage, default=None), ' | TV | DR') or data['Title']
data = self._download_json(
'https://www.dr.dk/mu-online/api/1.4/programcard/%s' % video_id,
video_id, 'Downloading video JSON', query={'expanded': 'true'})
title = str_or_none(data.get('Title')) or re.sub(
r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '',
self._og_search_title(webpage))
description = self._og_search_description(
webpage, default=None) or data.get('Description')
timestamp = parse_iso8601(data.get('CreatedTime'))
timestamp = unified_timestamp(
data.get('PrimaryBroadcastStartTime') or data.get('SortDateTime'))
thumbnail = None
duration = None
@ -97,24 +122,62 @@ class DRTVIE(InfoExtractor):
formats = []
subtitles = {}
for asset in data['Assets']:
assets = []
primary_asset = data.get('PrimaryAsset')
if isinstance(primary_asset, dict):
assets.append(primary_asset)
secondary_assets = data.get('SecondaryAssets')
if isinstance(secondary_assets, list):
for secondary_asset in secondary_assets:
if isinstance(secondary_asset, dict):
assets.append(secondary_asset)
def hex_to_bytes(hex):
return binascii.a2b_hex(hex.encode('ascii'))
def decrypt_uri(e):
n = int(e[2:10], 16)
a = e[10 + n:]
data = bytes_to_intlist(hex_to_bytes(e[10:10 + n]))
key = bytes_to_intlist(hashlib.sha256(
('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest())
iv = bytes_to_intlist(hex_to_bytes(a))
decrypted = aes_cbc_decrypt(data, key, iv)
return intlist_to_bytes(
decrypted[:-decrypted[-1]]).decode('utf-8').split('?')[0]
for asset in assets:
kind = asset.get('Kind')
if kind == 'Image':
thumbnail = asset.get('Uri')
thumbnail = url_or_none(asset.get('Uri'))
elif kind in ('VideoResource', 'AudioResource'):
duration = float_or_none(asset.get('DurationInMilliseconds'), 1000)
restricted_to_denmark = asset.get('RestrictedToDenmark')
asset_target = asset.get('Target')
for link in asset.get('Links', []):
uri = link.get('Uri')
if not uri:
encrypted_uri = link.get('EncryptedUri')
if not encrypted_uri:
continue
try:
uri = decrypt_uri(encrypted_uri)
except Exception:
self.report_warning(
'Unable to decrypt EncryptedUri', video_id)
continue
uri = url_or_none(uri)
if not uri:
continue
target = link.get('Target')
format_id = target or ''
preference = None
if asset_target in ('SpokenSubtitles', 'SignLanguage'):
if asset_target in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'):
preference = -1
format_id += '-%s' % asset_target
elif asset_target == 'Default':
preference = 1
else:
preference = None
if target == 'HDS':
f4m_formats = self._extract_f4m_formats(
uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
@ -140,19 +203,22 @@ class DRTVIE(InfoExtractor):
'vcodec': 'none' if kind == 'AudioResource' else None,
'preference': preference,
})
subtitles_list = asset.get('SubtitlesList')
if isinstance(subtitles_list, list):
LANGS = {
'Danish': 'da',
}
for subs in subtitles_list:
if not subs.get('Uri'):
continue
lang = subs.get('Language') or 'da'
subtitles.setdefault(LANGS.get(lang, lang), []).append({
'url': subs['Uri'],
'ext': mimetype2ext(subs.get('MimeType')) or 'vtt'
})
subtitles_list = asset.get('SubtitlesList') or asset.get('Subtitleslist')
if isinstance(subtitles_list, list):
LANGS = {
'Danish': 'da',
}
for subs in subtitles_list:
if not isinstance(subs, dict):
continue
sub_uri = url_or_none(subs.get('Uri'))
if not sub_uri:
continue
lang = subs.get('Language') or 'da'
subtitles.setdefault(LANGS.get(lang, lang), []).append({
'url': sub_uri,
'ext': mimetype2ext(subs.get('MimeType')) or 'vtt'
})
if not formats and restricted_to_denmark:
self.raise_geo_restricted(
@ -170,6 +236,13 @@ class DRTVIE(InfoExtractor):
'duration': duration,
'formats': formats,
'subtitles': subtitles,
'series': str_or_none(data.get('SeriesTitle')),
'season': str_or_none(data.get('SeasonTitle')),
'season_number': int_or_none(data.get('SeasonNumber')),
'season_id': str_or_none(data.get('SeasonUrn')),
'episode': str_or_none(data.get('EpisodeTitle')),
'episode_number': int_or_none(data.get('EpisodeNumber')),
'release_year': int_or_none(data.get('ProductionYear')),
}

View File

@ -29,7 +29,8 @@ class ESPNIE(OnceIE):
(?:
.*?\?.*?\bid=|
/_/id/
)
)|
[^/]+/video/
)
)|
(?:www\.)espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/
@ -94,6 +95,9 @@ class ESPNIE(OnceIE):
}, {
'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets',
'only_matching': True,
}, {
'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings',
'only_matching': True,
}]
def _real_extract(self, url):

View File

@ -456,6 +456,7 @@ from .hellporno import HellPornoIE
from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE
from .hgtv import HGTVComShowIE
from .hketv import HKETVIE
from .hidive import HiDiveIE
from .historicfilms import HistoricFilmsIE
from .hitbox import HitboxIE, HitboxLiveIE
@ -498,7 +499,11 @@ from .ina import InaIE
from .inc import IncIE
from .indavideo import IndavideoEmbedIE
from .infoq import InfoQIE
from .instagram import InstagramIE, InstagramUserIE
from .instagram import (
InstagramIE,
InstagramUserIE,
InstagramTagIE,
)
from .internazionale import InternazionaleIE
from .internetvideoarchive import InternetVideoArchiveIE
from .iprima import IPrimaIE
@ -592,6 +597,7 @@ from .linkedin import (
LinkedInLearningIE,
LinkedInLearningCourseIE,
)
from .linuxacademy import LinuxAcademyIE
from .litv import LiTVIE
from .liveleak import (
LiveLeakIE,
@ -618,6 +624,7 @@ from .mailru import (
MailRuMusicSearchIE,
)
from .makertv import MakerTVIE
from .malltv import MallTVIE
from .mangomolo import (
MangomoloVideoIE,
MangomoloLiveIE,
@ -691,7 +698,10 @@ from .myvi import (
MyviEmbedIE,
)
from .myvidster import MyVidsterIE
from .nationalgeographic import NationalGeographicVideoIE
from .nationalgeographic import (
NationalGeographicVideoIE,
NationalGeographicTVIE,
)
from .naver import NaverIE
from .nba import NBAIE
from .nbc import (
@ -1054,7 +1064,10 @@ from .southpark import (
SouthParkEsIE,
SouthParkNlIE
)
from .spankbang import SpankBangIE
from .spankbang import (
SpankBangIE,
SpankBangPlaylistIE,
)
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
@ -1163,6 +1176,7 @@ from .toutv import TouTvIE
from .toypics import ToypicsUserIE, ToypicsIE
from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
from .trunews import TruNewsIE
from .trutv import TruTVIE
from .tube8 import Tube8IE
from .tubitv import TubiTvIE
@ -1208,7 +1222,7 @@ from .tvnow import (
from .tvp import (
TVPEmbedIE,
TVPIE,
TVPSeriesIE,
TVPWebsiteIE,
)
from .tvplay import (
TVPlayIE,
@ -1358,7 +1372,6 @@ from .voxmedia import (
VoxMediaVolumeIE,
VoxMediaIE,
)
from .vporn import VpornIE
from .vrt import VRTIE
from .vrak import VrakIE
from .vrv import (
@ -1372,6 +1385,7 @@ from .vuclip import VuClipIE
from .vvvvid import VVVVIDIE
from .vyborymos import VyboryMosIE
from .vzaar import VzaarIE
from .wakanim import WakanimIE
from .walla import WallaIE
from .washingtonpost import (
WashingtonPostIE,
@ -1495,6 +1509,7 @@ from .zattoo import (
QuantumTVIE,
QuicklineIE,
QuicklineLiveIE,
SaltTVIE,
SAKTVIE,
VTXTVIE,
WalyTVIE,

View File

@ -1,22 +1,25 @@
# coding: utf-8
from __future__ import unicode_literals
# import json
# import uuid
import json
import uuid
from .adobepass import AdobePassIE
from ..compat import (
compat_str,
compat_urllib_parse_unquote,
)
from ..utils import (
int_or_none,
parse_age_limit,
parse_duration,
try_get,
unified_timestamp,
update_url_query,
)
class FOXIE(AdobePassIE):
_VALID_URL = r'https?://(?:www\.)?(?:fox\.com|nationalgeographic\.com/tv)/watch/(?P<id>[\da-fA-F]+)'
_VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)'
_TESTS = [{
# clip
'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/',
@ -31,6 +34,7 @@ class FOXIE(AdobePassIE):
'upload_date': '20170901',
'creator': 'FOX',
'series': 'Gotham',
'age_limit': 14,
},
'params': {
'skip_download': True,
@ -43,61 +47,49 @@ class FOXIE(AdobePassIE):
# episode, geo-restricted, tv provided required
'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/',
'only_matching': True,
}, {
'url': 'https://www.nationalgeographic.com/tv/watch/f690e05ebbe23ab79747becd0cc223d1/',
'only_matching': True,
}]
# _access_token = None
_HOME_PAGE_URL = 'https://www.fox.com/'
_API_KEY = 'abdcbed02c124d393b39e818a4312055'
_access_token = None
# def _call_api(self, path, video_id, data=None):
# headers = {
# 'X-Api-Key': '238bb0a0c2aba67922c48709ce0c06fd',
# }
# if self._access_token:
# headers['Authorization'] = 'Bearer ' + self._access_token
# return self._download_json(
# 'https://api2.fox.com/v2.0/' + path, video_id, data=data, headers=headers)
def _call_api(self, path, video_id, data=None):
headers = {
'X-Api-Key': self._API_KEY,
}
if self._access_token:
headers['Authorization'] = 'Bearer ' + self._access_token
return self._download_json(
'https://api2.fox.com/v2.0/' + path,
video_id, data=data, headers=headers)
# def _real_initialize(self):
# self._access_token = self._call_api(
# 'login', None, json.dumps({
# 'deviceId': compat_str(uuid.uuid4()),
# }).encode())['accessToken']
def _real_initialize(self):
if not self._access_token:
mvpd_auth = self._get_cookies(self._HOME_PAGE_URL).get('mvpd-auth')
if mvpd_auth:
self._access_token = (self._parse_json(compat_urllib_parse_unquote(
mvpd_auth.value), None, fatal=False) or {}).get('accessToken')
if not self._access_token:
self._access_token = self._call_api(
'login', None, json.dumps({
'deviceId': compat_str(uuid.uuid4()),
}).encode())['accessToken']
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json(
'https://api.fox.com/fbc-content/v1_5/video/%s' % video_id,
video_id, headers={
'apikey': 'abdcbed02c124d393b39e818a4312055',
'Content-Type': 'application/json',
'Referer': url,
})
# video = self._call_api('vodplayer/' + video_id, video_id)
video = self._call_api('vodplayer/' + video_id, video_id)
title = video['name']
release_url = video['videoRelease']['url']
# release_url = video['url']
data = try_get(
video, lambda x: x['trackingData']['properties'], dict) or {}
rating = video.get('contentRating')
if data.get('authRequired'):
resource = self._get_mvpd_resource(
'fbc-fox', title, video.get('guid'), rating)
release_url = update_url_query(
release_url, {
'auth': self._extract_mvpd_auth(
url, video_id, 'fbc-fox', resource)
})
release_url = video['url']
m3u8_url = self._download_json(release_url, video_id)['playURL']
formats = self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls')
self._sort_formats(formats)
data = try_get(
video, lambda x: x['trackingData']['properties'], dict) or {}
duration = int_or_none(video.get('durationInSeconds')) or int_or_none(
video.get('duration')) or parse_duration(video.get('duration'))
timestamp = unified_timestamp(video.get('datePublished'))
@ -123,7 +115,7 @@ class FOXIE(AdobePassIE):
'description': video.get('description'),
'duration': duration,
'timestamp': timestamp,
'age_limit': parse_age_limit(rating),
'age_limit': parse_age_limit(video.get('contentRating')),
'creator': creator,
'series': series,
'season_number': int_or_none(video.get('seasonNumber')),

View File

@ -271,7 +271,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
catalogue = None
video_id = self._search_regex(
r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1',
r'(?:data-main-video\s*=|videoId\s*:)\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
webpage, 'video id', default=None, group='id')
if not video_id:

View File

@ -1,6 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
import random
import string
from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
@ -87,7 +90,7 @@ class FunimationIE(InfoExtractor):
video_id = title_data.get('id') or self._search_regex([
r"KANE_customdimensions.videoID\s*=\s*'(\d+)';",
r'<iframe[^>]+src="/player/(\d+)"',
r'<iframe[^>]+src="/player/(\d+)',
], webpage, 'video_id', default=None)
if not video_id:
player_url = self._html_search_meta([
@ -108,8 +111,10 @@ class FunimationIE(InfoExtractor):
if self._TOKEN:
headers['Authorization'] = 'Token %s' % self._TOKEN
sources = self._download_json(
'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id,
video_id, headers=headers)['items']
'https://www.funimation.com/api/showexperience/%s/' % video_id,
video_id, headers=headers, query={
'pinst_id': ''.join([random.choice(string.digits + string.ascii_letters) for _ in range(8)]),
})['items']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
error = self._parse_json(e.cause.read(), video_id)['errors'][0]

View File

@ -25,15 +25,15 @@ class GoIE(AdobePassIE):
},
'watchdisneychannel': {
'brand': '004',
'requestor_id': 'Disney',
'resource_id': 'Disney',
},
'watchdisneyjunior': {
'brand': '008',
'requestor_id': 'DisneyJunior',
'resource_id': 'DisneyJunior',
},
'watchdisneyxd': {
'brand': '009',
'requestor_id': 'DisneyXD',
'resource_id': 'DisneyXD',
}
}
_VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))'\
@ -130,8 +130,8 @@ class GoIE(AdobePassIE):
'device': '001',
}
if video_data.get('accesslevel') == '1':
requestor_id = site_info['requestor_id']
resource = self._get_mvpd_resource(
requestor_id = site_info.get('requestor_id', 'DisneyChannels')
resource = site_info.get('resource_id') or self._get_mvpd_resource(
requestor_id, title, video_id, None)
auth = self._extract_mvpd_auth(
url, video_id, requestor_id, resource)

View File

@ -0,0 +1,191 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
clean_html,
ExtractorError,
int_or_none,
merge_dicts,
parse_count,
str_or_none,
try_get,
unified_strdate,
urlencode_postdata,
urljoin,
)
class HKETVIE(InfoExtractor):
IE_NAME = 'hketv'
IE_DESC = '香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau'
_GEO_BYPASS = False
_GEO_COUNTRIES = ['HK']
_VALID_URL = r'https?://(?:www\.)?hkedcity\.net/etv/resource/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://www.hkedcity.net/etv/resource/2932360618',
'md5': 'f193712f5f7abb208ddef3c5ea6ed0b7',
'info_dict': {
'id': '2932360618',
'ext': 'mp4',
'title': '喜閱一生(共享閱讀樂) (中、英文字幕可供選擇)',
'description': 'md5:d5286d05219ef50e0613311cbe96e560',
'upload_date': '20181024',
'duration': 900,
'subtitles': 'count:2',
},
'skip': 'Geo restricted to HK',
}, {
'url': 'https://www.hkedcity.net/etv/resource/972641418',
'md5': '1ed494c1c6cf7866a8290edad9b07dc9',
'info_dict': {
'id': '972641418',
'ext': 'mp4',
'title': '衣冠楚楚 (天使系列之一)',
'description': 'md5:10bb3d659421e74f58e5db5691627b0f',
'upload_date': '20070109',
'duration': 907,
'subtitles': {},
},
'params': {
'geo_verification_proxy': '<HK proxy here>',
},
'skip': 'Geo restricted to HK',
}]
_CC_LANGS = {
'中文(繁體中文)': 'zh-Hant',
'中文(简体中文)': 'zh-Hans',
'English': 'en',
'Bahasa Indonesia': 'id',
'\u0939\u093f\u0928\u094d\u0926\u0940': 'hi',
'\u0928\u0947\u092a\u093e\u0932\u0940': 'ne',
'Tagalog': 'tl',
'\u0e44\u0e17\u0e22': 'th',
'\u0627\u0631\u062f\u0648': 'ur',
}
_FORMAT_HEIGHTS = {
'SD': 360,
'HD': 720,
}
_APPS_BASE_URL = 'https://apps.hkedcity.net'
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = (
self._html_search_meta(
('ed_title', 'search.ed_title'), webpage, default=None) or
self._search_regex(
r'data-favorite_title_(?:eng|chi)=(["\'])(?P<id>(?:(?!\1).)+)\1',
webpage, 'title', default=None, group='url') or
self._html_search_regex(
r'<h1>([^<]+)</h1>', webpage, 'title', default=None) or
self._og_search_title(webpage)
)
file_id = self._search_regex(
r'post_var\[["\']file_id["\']\s*\]\s*=\s*(.+?);',
webpage, 'file ID')
curr_url = self._search_regex(
r'post_var\[["\']curr_url["\']\s*\]\s*=\s*"(.+?)";',
webpage, 'curr URL')
data = {
'action': 'get_info',
'curr_url': curr_url,
'file_id': file_id,
'video_url': file_id,
}
response = self._download_json(
self._APPS_BASE_URL + '/media/play/handler.php', video_id,
data=urlencode_postdata(data),
headers=merge_dicts({
'Content-Type': 'application/x-www-form-urlencoded'},
self.geo_verification_headers()))
result = response['result']
if not response.get('success') or not response.get('access'):
error = clean_html(response.get('access_err_msg'))
if 'Video streaming is not available in your country' in error:
self.raise_geo_restricted(
msg=error, countries=self._GEO_COUNTRIES)
else:
raise ExtractorError(error, expected=True)
formats = []
width = int_or_none(result.get('width'))
height = int_or_none(result.get('height'))
playlist0 = result['playlist'][0]
for fmt in playlist0['sources']:
file_url = urljoin(self._APPS_BASE_URL, fmt.get('file'))
if not file_url:
continue
# If we ever wanted to provide the final resolved URL that
# does not require cookies, albeit with a shorter lifespan:
# urlh = self._downloader.urlopen(file_url)
# resolved_url = urlh.geturl()
label = fmt.get('label')
h = self._FORMAT_HEIGHTS.get(label)
w = h * width // height if h and width and height else None
formats.append({
'format_id': label,
'ext': fmt.get('type'),
'url': file_url,
'width': w,
'height': h,
})
self._sort_formats(formats)
subtitles = {}
tracks = try_get(playlist0, lambda x: x['tracks'], list) or []
for track in tracks:
if not isinstance(track, dict):
continue
track_kind = str_or_none(track.get('kind'))
if not track_kind or not isinstance(track_kind, compat_str):
continue
if track_kind.lower() not in ('captions', 'subtitles'):
continue
track_url = urljoin(self._APPS_BASE_URL, track.get('file'))
if not track_url:
continue
track_label = track.get('label')
subtitles.setdefault(self._CC_LANGS.get(
track_label, track_label), []).append({
'url': self._proto_relative_url(track_url),
'ext': 'srt',
})
# Likes
emotion = self._download_json(
'https://emocounter.hkedcity.net/handler.php', video_id,
data=urlencode_postdata({
'action': 'get_emotion',
'data[bucket_id]': 'etv',
'data[identifier]': video_id,
}),
headers={'Content-Type': 'application/x-www-form-urlencoded'},
fatal=False) or {}
like_count = int_or_none(try_get(
emotion, lambda x: x['data']['emotion_data'][0]['count']))
return {
'id': video_id,
'title': title,
'description': self._html_search_meta(
'description', webpage, fatal=False),
'upload_date': unified_strdate(self._html_search_meta(
'ed_date', webpage, fatal=False), day_first=False),
'duration': int_or_none(result.get('length')),
'formats': formats,
'subtitles': subtitles,
'thumbnail': urljoin(self._APPS_BASE_URL, result.get('image')),
'view_count': parse_count(result.get('view_count')),
'like_count': like_count,
}

View File

@ -27,6 +27,10 @@ class ImgurIE(InfoExtractor):
}, {
'url': 'https://i.imgur.com/crGpqCV.mp4',
'only_matching': True,
}, {
# no title
'url': 'https://i.imgur.com/jxBXAMC.gifv',
'only_matching': True,
}]
def _real_extract(self, url):
@ -87,7 +91,7 @@ class ImgurIE(InfoExtractor):
return {
'id': video_id,
'formats': formats,
'title': self._og_search_title(webpage),
'title': self._og_search_title(webpage, default=video_id),
}

View File

@ -227,44 +227,37 @@ class InstagramIE(InfoExtractor):
}
class InstagramUserIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
IE_DESC = 'Instagram user profile'
IE_NAME = 'instagram:user'
_TEST = {
'url': 'https://instagram.com/porsche',
'info_dict': {
'id': 'porsche',
'title': 'porsche',
},
'playlist_count': 5,
'params': {
'extract_flat': True,
'skip_download': True,
'playlistend': 5,
}
}
class InstagramPlaylistIE(InfoExtractor):
# A superclass for handling any kind of query based on GraphQL which
# results in a playlist.
_gis_tmpl = None
_gis_tmpl = None # used to cache GIS request type
def _entries(self, data):
def _parse_graphql(self, webpage, item_id):
# Reads a webpage and returns its GraphQL data.
return self._parse_json(
self._search_regex(
r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'),
item_id)
def _extract_graphql(self, data, url):
# Parses GraphQL queries containing videos and generates a playlist.
def get_count(suffix):
return int_or_none(try_get(
node, lambda x: x['edge_media_' + suffix]['count']))
uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
uploader_id = self._match_id(url)
csrf_token = data['config']['csrf_token']
rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'
self._set_cookie('instagram.com', 'ig_pr', '1')
cursor = ''
for page_num in itertools.count(1):
variables = json.dumps({
'id': uploader_id,
variables = {
'first': 12,
'after': cursor,
})
}
variables.update(self._query_vars_for(data))
variables = json.dumps(variables)
if self._gis_tmpl:
gis_tmpls = [self._gis_tmpl]
@ -276,21 +269,26 @@ class InstagramUserIE(InfoExtractor):
'%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
]
# try all of the ways to generate a GIS query, and not only use the
# first one that works, but cache it for future requests
for gis_tmpl in gis_tmpls:
try:
media = self._download_json(
json_data = self._download_json(
'https://www.instagram.com/graphql/query/', uploader_id,
'Downloading JSON page %d' % page_num, headers={
'X-Requested-With': 'XMLHttpRequest',
'X-Instagram-GIS': hashlib.md5(
('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(),
}, query={
'query_hash': '42323d64886122307be10013ad2dcc44',
'query_hash': self._QUERY_HASH,
'variables': variables,
})['data']['user']['edge_owner_to_timeline_media']
})
media = self._parse_timeline_from(json_data)
self._gis_tmpl = gis_tmpl
break
except ExtractorError as e:
# if it's an error caused by a bad query, and there are
# more GIS templates to try, ignore it and keep trying
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
if gis_tmpl != gis_tmpls[-1]:
continue
@ -348,14 +346,80 @@ class InstagramUserIE(InfoExtractor):
break
def _real_extract(self, url):
username = self._match_id(url)
user_or_tag = self._match_id(url)
webpage = self._download_webpage(url, user_or_tag)
data = self._parse_graphql(webpage, user_or_tag)
webpage = self._download_webpage(url, username)
data = self._parse_json(
self._search_regex(
r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'),
username)
self._set_cookie('instagram.com', 'ig_pr', '1')
return self.playlist_result(
self._entries(data), username, username)
self._extract_graphql(data, url), user_or_tag, user_or_tag)
class InstagramUserIE(InstagramPlaylistIE):
_VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
IE_DESC = 'Instagram user profile'
IE_NAME = 'instagram:user'
_TEST = {
'url': 'https://instagram.com/porsche',
'info_dict': {
'id': 'porsche',
'title': 'porsche',
},
'playlist_count': 5,
'params': {
'extract_flat': True,
'skip_download': True,
'playlistend': 5,
}
}
_QUERY_HASH = '42323d64886122307be10013ad2dcc44',
@staticmethod
def _parse_timeline_from(data):
# extracts the media timeline data from a GraphQL result
return data['data']['user']['edge_owner_to_timeline_media']
@staticmethod
def _query_vars_for(data):
# returns a dictionary of variables to add to the timeline query based
# on the GraphQL of the original page
return {
'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
}
class InstagramTagIE(InstagramPlaylistIE):
_VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
IE_DESC = 'Instagram hashtag search'
IE_NAME = 'instagram:tag'
_TEST = {
'url': 'https://instagram.com/explore/tags/lolcats',
'info_dict': {
'id': 'lolcats',
'title': 'lolcats',
},
'playlist_count': 50,
'params': {
'extract_flat': True,
'skip_download': True,
'playlistend': 50,
}
}
_QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314',
@staticmethod
def _parse_timeline_from(data):
# extracts the media timeline data from a GraphQL result
return data['data']['hashtag']['edge_hashtag_to_media']
@staticmethod
def _query_vars_for(data):
# returns a dictionary of variables to add to the timeline query based
# on the GraphQL of the original page
return {
'tag_name':
data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
}

View File

@ -1,12 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
get_element_by_class,
parse_duration,
strip_or_none,
unified_strdate,
)
@ -21,7 +23,9 @@ class LibsynIE(InfoExtractor):
'id': '6385796',
'ext': 'mp3',
'title': "Champion Minded - Developing a Growth Mindset",
'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
# description fetched using another request:
# http://html5-player.libsyn.com/embed/getitemdetails?item_id=6385796
# 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
'upload_date': '20180320',
'thumbnail': 're:^https?://.*',
},
@ -38,22 +42,36 @@ class LibsynIE(InfoExtractor):
}]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
url = m.group('mainurl')
url, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, video_id)
podcast_title = self._search_regex(
r'<h3>([^<]+)</h3>', webpage, 'podcast title', default=None)
if podcast_title:
podcast_title = podcast_title.strip()
episode_title = self._search_regex(
r'(?:<div class="episode-title">|<h4>)([^<]+)</', webpage, 'episode title')
if episode_title:
episode_title = episode_title.strip()
data = self._parse_json(self._search_regex(
r'var\s+playlistItem\s*=\s*({.+?});',
webpage, 'JSON data block'), video_id)
episode_title = data.get('item_title') or get_element_by_class('episode-title', webpage)
if not episode_title:
self._search_regex(
[r'data-title="([^"]+)"', r'<title>(.+?)</title>'],
webpage, 'episode title')
episode_title = episode_title.strip()
podcast_title = strip_or_none(clean_html(self._search_regex(
r'<h3>([^<]+)</h3>', webpage, 'podcast title',
default=None) or get_element_by_class('podcast-title', webpage)))
title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title
formats = []
for k, format_id in (('media_url_libsyn', 'libsyn'), ('media_url', 'main'), ('download_link', 'download')):
f_url = data.get(k)
if not f_url:
continue
formats.append({
'url': f_url,
'format_id': format_id,
})
description = self._html_search_regex(
r'<p\s+id="info_text_body">(.+?)</p>', webpage,
'description', default=None)
@ -61,27 +79,15 @@ class LibsynIE(InfoExtractor):
# Strip non-breaking and normal spaces
description = description.replace('\u00A0', ' ').strip()
release_date = unified_strdate(self._search_regex(
r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False))
data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block')
data = json.loads(data_json)
formats = [{
'url': data['media_url'],
'format_id': 'main',
}, {
'url': data['media_url_libsyn'],
'format_id': 'libsyn',
}]
thumbnail = data.get('thumbnail_url')
duration = parse_duration(data.get('duration'))
r'<div class="release_date">Released: ([^<]+)<',
webpage, 'release date', default=None) or data.get('release_date'))
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'thumbnail': data.get('thumbnail_url'),
'upload_date': release_date,
'duration': duration,
'duration': parse_duration(data.get('duration')),
'formats': formats,
}

View File

@ -34,12 +34,15 @@ class LinkedInLearningBaseIE(InfoExtractor):
'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value,
}, query=query)['elements'][0]
def _get_video_id(self, urn, course_slug, video_slug):
def _get_urn_id(self, video_data):
urn = video_data.get('urn')
if urn:
mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn)
if mobj:
return mobj.group(1)
return '%s/%s' % (course_slug, video_slug)
def _get_video_id(self, video_data, course_slug, video_slug):
return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug)
def _real_initialize(self):
email, password = self._get_login_info()
@ -123,7 +126,7 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr'))
return {
'id': self._get_video_id(video_data.get('urn'), course_slug, video_slug),
'id': self._get_video_id(video_data, course_slug, video_slug),
'title': title,
'formats': formats,
'thumbnail': video_data.get('defaultThumbnail'),
@ -154,18 +157,21 @@ class LinkedInLearningCourseIE(LinkedInLearningBaseIE):
course_data = self._call_api(course_slug, 'chapters,description,title')
entries = []
for chapter in course_data.get('chapters', []):
for chapter_number, chapter in enumerate(course_data.get('chapters', []), 1):
chapter_title = chapter.get('title')
chapter_id = self._get_urn_id(chapter)
for video in chapter.get('videos', []):
video_slug = video.get('slug')
if not video_slug:
continue
entries.append({
'_type': 'url_transparent',
'id': self._get_video_id(video.get('urn'), course_slug, video_slug),
'id': self._get_video_id(video, course_slug, video_slug),
'title': video.get('title'),
'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug),
'chapter': chapter_title,
'chapter_number': chapter_number,
'chapter_id': chapter_id,
'ie_key': LinkedInLearningIE.ie_key(),
})

View File

@ -0,0 +1,174 @@
from __future__ import unicode_literals
import json
import random
import re
from .common import InfoExtractor
from ..compat import (
compat_b64decode,
compat_HTTPError,
compat_str,
)
from ..utils import (
ExtractorError,
orderedSet,
unescapeHTML,
urlencode_postdata,
urljoin,
)
class LinuxAcademyIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:www\.)?linuxacademy\.com/cp/
(?:
courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
modules/view/id/(?P<course_id>\d+)
)
'''
_TESTS = [{
'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154',
'info_dict': {
'id': '1498-2',
'ext': 'mp4',
'title': "Introduction to the Practitioner's Brief",
},
'params': {
'skip_download': True,
},
'skip': 'Requires Linux Academy account credentials',
}, {
'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
'only_matching': True,
}, {
'url': 'https://linuxacademy.com/cp/modules/view/id/154',
'info_dict': {
'id': '154',
'title': 'AWS Certified Cloud Practitioner',
'description': 'md5:039db7e60e4aac9cf43630e0a75fa834',
},
'playlist_count': 41,
'skip': 'Requires Linux Academy account credentials',
}]
_AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
_ORIGIN_URL = 'https://linuxacademy.com'
_CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
_NETRC_MACHINE = 'linuxacademy'
def _real_initialize(self):
self._login()
def _login(self):
username, password = self._get_login_info()
if username is None:
return
def random_string():
return ''.join([
random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
for _ in range(32)])
webpage, urlh = self._download_webpage_handle(
self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
'client_id': self._CLIENT_ID,
'response_type': 'token id_token',
'redirect_uri': self._ORIGIN_URL,
'scope': 'openid email user_impersonation profile',
'audience': self._ORIGIN_URL,
'state': random_string(),
'nonce': random_string(),
})
login_data = self._parse_json(
self._search_regex(
r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
'login info', group='value'), None,
transform_source=lambda x: compat_b64decode(x).decode('utf-8')
)['extraParams']
login_data.update({
'client_id': self._CLIENT_ID,
'redirect_uri': self._ORIGIN_URL,
'tenant': 'lacausers',
'connection': 'Username-Password-Authentication',
'username': username,
'password': password,
'sso': 'true',
})
login_state_url = compat_str(urlh.geturl())
try:
login_page = self._download_webpage(
'https://login.linuxacademy.com/usernamepassword/login', None,
'Downloading login page', data=json.dumps(login_data).encode(),
headers={
'Content-Type': 'application/json',
'Origin': 'https://login.linuxacademy.com',
'Referer': login_state_url,
})
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
error = self._parse_json(e.cause.read(), None)
message = error.get('description') or error['code']
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, message), expected=True)
raise
callback_page, urlh = self._download_webpage_handle(
'https://login.linuxacademy.com/login/callback', None,
'Downloading callback page',
data=urlencode_postdata(self._hidden_inputs(login_page)),
headers={
'Content-Type': 'application/x-www-form-urlencoded',
'Origin': 'https://login.linuxacademy.com',
'Referer': login_state_url,
})
access_token = self._search_regex(
r'access_token=([^=&]+)', compat_str(urlh.geturl()),
'access token')
self._download_webpage(
'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
% access_token, None, 'Downloading token validation page')
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)
webpage = self._download_webpage(url, item_id)
# course path
if course_id:
entries = [
self.url_result(
urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key())
for lesson_url in orderedSet(re.findall(
r'<a[^>]+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)',
webpage))]
title = unescapeHTML(self._html_search_regex(
(r'class=["\']course-title["\'][^>]*>(?P<value>[^<]+)',
r'var\s+title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'),
webpage, 'title', default=None, group='value'))
description = unescapeHTML(self._html_search_regex(
r'var\s+description\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
webpage, 'description', default=None, group='value'))
return self.playlist_result(entries, course_id, title, description)
# single video path
info = self._extract_jwplayer_data(
webpage, item_id, require_title=False, m3u8_id='hls',)
title = self._search_regex(
(r'>Lecture\s*:\s*(?P<value>[^<]+)',
r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
'title', group='value')
info.update({
'id': item_id,
'title': title,
})
return info

View File

@ -0,0 +1,53 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import merge_dicts
class MallTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?mall\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
'md5': '1c4a37f080e1f3023103a7b43458e518',
'info_dict': {
'id': 't0zzt0',
'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
'ext': 'mp4',
'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?',
'description': 'md5:25fc0ec42a72ba602b602c683fa29deb',
'duration': 216,
'timestamp': 1538870400,
'upload_date': '20181007',
'view_count': int,
}
}, {
'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(
url, display_id, headers=self.geo_verification_headers())
SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b'
video_id = self._search_regex(
SOURCE_RE, webpage, 'video id', group='id')
media = self._parse_html5_media_entries(
url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id,
m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0]
info = self._search_json_ld(webpage, video_id, default={})
return merge_dicts(media, info, {
'id': video_id,
'display_id': display_id,
'title': self._og_search_title(webpage, default=None) or display_id,
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
})

View File

@ -1,12 +1,13 @@
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse,
compat_urllib_parse_unquote,
compat_urllib_parse_urlencode,
)
from ..utils import (
determine_ext,
@ -144,7 +145,7 @@ class MetacafeIE(InfoExtractor):
headers = {
# Disable family filter
'Cookie': 'user=%s; ' % compat_urllib_parse_urlencode({'ffilter': False})
'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False}))
}
# AnyClip videos require the flashversion cookie so that we get the link

View File

@ -1,6 +1,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from .fox import FOXIE
from ..utils import (
smuggle_url,
url_basename,
@ -58,3 +59,24 @@ class NationalGeographicVideoIE(InfoExtractor):
{'force_smil_url': True}),
'id': guid,
}
class NationalGeographicTVIE(FOXIE):
_VALID_URL = r'https?://(?:www\.)?nationalgeographic\.com/tv/watch/(?P<id>[\da-fA-F]+)'
_TESTS = [{
'url': 'https://www.nationalgeographic.com/tv/watch/6a875e6e734b479beda26438c9f21138/',
'info_dict': {
'id': '6a875e6e734b479beda26438c9f21138',
'ext': 'mp4',
'title': 'Why Nat Geo? Valley of the Boom',
'description': 'The lives of prominent figures in the tech world, including their friendships, rivalries, victories and failures.',
'timestamp': 1542662458,
'upload_date': '20181119',
'age_limit': 14,
},
'params': {
'skip_download': True,
},
}]
_HOME_PAGE_URL = 'https://www.nationalgeographic.com/tv/'
_API_KEY = '238bb0a0c2aba67922c48709ce0c06fd'

View File

@ -5,8 +5,8 @@ from ..utils import ExtractorError
class NhkVodIE(InfoExtractor):
_VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/vod/(?P<id>[^/]+/[^/?#&]+)'
_TEST = {
_VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/(?:vod|ondemand)/(?P<id>[^/]+/[^/?#&]+)'
_TESTS = [{
# Videos available only for a limited period of time. Visit
# http://www3.nhk.or.jp/nhkworld/en/vod/ for working samples.
'url': 'http://www3.nhk.or.jp/nhkworld/en/vod/tokyofashion/20160815',
@ -19,7 +19,10 @@ class NhkVodIE(InfoExtractor):
'episode': 'The Kimono as Global Fashion',
},
'skip': 'Videos available only for a limited period of time',
}
}, {
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
'only_matching': True,
}]
_API_URL = 'http://api.nhk.or.jp/nhkworld/vodesdlist/v1/all/all/all.json?apikey=EJfK8jdS57GqlupFgAfAAwr573q01y6k'
def _real_extract(self, url):

View File

@ -57,7 +57,8 @@ class NoovoIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
bc_url = BrightcoveNewIE._extract_url(self, webpage)
brightcove_id = self._search_regex(
r'data-video-id=["\'](\d+)', webpage, 'brightcove id')
data = self._parse_json(
self._search_regex(
@ -89,7 +90,10 @@ class NoovoIE(InfoExtractor):
return {
'_type': 'url_transparent',
'ie_key': BrightcoveNewIE.ie_key(),
'url': smuggle_url(bc_url, {'geo_countries': ['CA']}),
'url': smuggle_url(
self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
{'geo_countries': ['CA']}),
'id': brightcove_id,
'title': title,
'description': description,
'series': series,

View File

@ -12,11 +12,16 @@ from ..utils import (
ExtractorError,
fix_xml_ampersands,
int_or_none,
merge_dicts,
orderedSet,
parse_duration,
qualities,
str_or_none,
strip_jsonp,
unified_strdate,
unified_timestamp,
url_or_none,
urlencode_postdata,
)
@ -176,9 +181,118 @@ class NPOIE(NPOBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
return self._get_info(video_id)
try:
return self._get_info(url, video_id)
except ExtractorError:
return self._get_old_info(video_id)
def _get_info(self, video_id):
def _get_info(self, url, video_id):
token = self._download_json(
'https://www.npostart.nl/api/token', video_id,
'Downloading token', headers={
'Referer': url,
'X-Requested-With': 'XMLHttpRequest',
})['token']
player = self._download_json(
'https://www.npostart.nl/player/%s' % video_id, video_id,
'Downloading player JSON', data=urlencode_postdata({
'autoplay': 0,
'share': 1,
'pageUrl': url,
'hasAdConsent': 0,
'_token': token,
}))
player_token = player['token']
format_urls = set()
formats = []
for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'):
streams = self._download_json(
'https://start-player.npo.nl/video/%s/streams' % video_id,
video_id, 'Downloading %s profile JSON' % profile, fatal=False,
query={
'profile': profile,
'quality': 'npo',
'tokenId': player_token,
'streamType': 'broadcast',
})
if not streams:
continue
stream = streams.get('stream')
if not isinstance(stream, dict):
continue
stream_url = url_or_none(stream.get('src'))
if not stream_url or stream_url in format_urls:
continue
format_urls.add(stream_url)
if stream.get('protection') is not None:
continue
stream_type = stream.get('type')
stream_ext = determine_ext(stream_url)
if stream_type == 'application/dash+xml' or stream_ext == 'mpd':
formats.extend(self._extract_mpd_formats(
stream_url, video_id, mpd_id='dash', fatal=False))
elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
stream_url, video_id, ext='mp4',
entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
elif '.ism/Manifest' in stream_url:
formats.extend(self._extract_ism_formats(
stream_url, video_id, ism_id='mss', fatal=False))
else:
formats.append({
'url': stream_url,
})
self._sort_formats(formats)
info = {
'id': video_id,
'title': video_id,
'formats': formats,
}
embed_url = url_or_none(player.get('embedUrl'))
if embed_url:
webpage = self._download_webpage(
embed_url, video_id, 'Downloading embed page', fatal=False)
if webpage:
video = self._parse_json(
self._search_regex(
r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video',
default='{}'), video_id)
if video:
title = video.get('episodeTitle')
subtitles = {}
subtitles_list = video.get('subtitles')
if isinstance(subtitles_list, list):
for cc in subtitles_list:
cc_url = url_or_none(cc.get('src'))
if not cc_url:
continue
lang = str_or_none(cc.get('language')) or 'nl'
subtitles.setdefault(lang, []).append({
'url': cc_url,
})
return merge_dicts({
'title': title,
'description': video.get('description'),
'thumbnail': url_or_none(
video.get('still_image_url') or video.get('orig_image_url')),
'duration': int_or_none(video.get('duration')),
'timestamp': unified_timestamp(video.get('broadcastDate')),
'creator': video.get('channel'),
'series': video.get('title'),
'episode': title,
'episode_number': int_or_none(video.get('episodeNumber')),
'subtitles': subtitles,
}, info)
return info
def _get_old_info(self, video_id):
metadata = self._download_json(
'http://e.omroep.nl/metadata/%s' % video_id,
video_id,
@ -280,7 +394,7 @@ class NPOIE(NPOBaseIE):
# JSON
else:
video_url = stream_info.get('url')
if not video_url or video_url in urls:
if not video_url or 'vodnotavailable.' in video_url or video_url in urls:
continue
urls.add(video_url)
if determine_ext(video_url) == 'm3u8':

View File

@ -115,6 +115,10 @@ class OdnoklassnikiIE(InfoExtractor):
}, {
'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#',
'only_matching': True,
}, {
# Paid video
'url': 'https://ok.ru/video/954886983203',
'only_matching': True,
}]
def _real_extract(self, url):
@ -244,6 +248,11 @@ class OdnoklassnikiIE(InfoExtractor):
'ext': 'flv',
})
if not formats:
payment_info = metadata.get('paymentInfo')
if payment_info:
raise ExtractorError('This video is paid, subscribe to download it', expected=True)
self._sort_formats(formats)
info['formats'] = formats

View File

@ -248,8 +248,8 @@ class OpenloadIE(InfoExtractor):
(?P<host>
(?:www\.)?
(?:
openload\.(?:co|io|link)|
oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun)
openload\.(?:co|io|link|pw)|
oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|pw|live)
)
)/
(?:f|embed)/
@ -334,6 +334,21 @@ class OpenloadIE(InfoExtractor):
}, {
'url': 'https://oload.fun/f/gb6G1H4sHXY',
'only_matching': True,
}, {
'url': 'https://oload.club/f/Nr1L-aZ2dbQ',
'only_matching': True,
}, {
'url': 'https://oload.info/f/5NEAbI2BDSk',
'only_matching': True,
}, {
'url': 'https://openload.pw/f/WyKgK8s94N0',
'only_matching': True,
}, {
'url': 'https://oload.pw/f/WyKgK8s94N0',
'only_matching': True,
}, {
'url': 'https://oload.live/f/-Z58UZ-GR4M',
'only_matching': True,
}]
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

View File

@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_iso8601,
unescapeHTML,
)
@ -75,6 +76,14 @@ class PeriscopeIE(PeriscopeBaseIE):
'url': broadcast[image],
} for image in ('image_url', 'image_url_small') if broadcast.get(image)]
width = int_or_none(broadcast.get('width'))
height = int_or_none(broadcast.get('height'))
def add_width_and_height(f):
for key, val in (('width', width), ('height', height)):
if not f.get(key):
f[key] = val
video_urls = set()
formats = []
for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'):
@ -83,16 +92,21 @@ class PeriscopeIE(PeriscopeBaseIE):
continue
video_urls.add(video_url)
if format_id != 'rtmp':
formats.extend(self._extract_m3u8_formats(
m3u8_formats = self._extract_m3u8_formats(
video_url, token, 'mp4',
entry_protocol='m3u8_native'
if state in ('ended', 'timed_out') else 'm3u8',
m3u8_id=format_id, fatal=False))
m3u8_id=format_id, fatal=False)
if len(m3u8_formats) == 1:
add_width_and_height(m3u8_formats[0])
formats.extend(m3u8_formats)
continue
formats.append({
rtmp_format = {
'url': video_url,
'ext': 'flv' if format_id == 'rtmp' else 'mp4',
})
}
add_width_and_height(rtmp_format)
formats.append(rtmp_format)
self._sort_formats(formats)
return {

View File

@ -4,9 +4,11 @@ import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
js_to_json,
urljoin,
)
@ -14,7 +16,7 @@ class PornHdIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?'
_TESTS = [{
'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
'md5': 'c8b964b1f0a4b5f7f28ae3a5c9f86ad5',
'md5': '87f1540746c1d32ec7a2305c12b96b25',
'info_dict': {
'id': '9864',
'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
@ -23,6 +25,7 @@ class PornHdIE(InfoExtractor):
'description': 'md5:3748420395e03e31ac96857a8f125b2b',
'thumbnail': r're:^https?://.*\.jpg',
'view_count': int,
'like_count': int,
'age_limit': 18,
}
}, {
@ -37,6 +40,7 @@ class PornHdIE(InfoExtractor):
'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
'thumbnail': r're:^https?://.*\.jpg',
'view_count': int,
'like_count': int,
'age_limit': 18,
},
'skip': 'Not available anymore',
@ -65,12 +69,14 @@ class PornHdIE(InfoExtractor):
formats = []
for format_id, video_url in sources.items():
video_url = urljoin(url, video_url)
if not video_url:
continue
height = int_or_none(self._search_regex(
r'^(\d+)[pP]', format_id, 'height', default=None))
formats.append({
'url': video_url,
'ext': determine_ext(video_url, 'mp4'),
'format_id': format_id,
'height': height,
})
@ -85,6 +91,11 @@ class PornHdIE(InfoExtractor):
r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage,
'thumbnail', fatal=False, group='url')
like_count = int_or_none(self._search_regex(
(r'(\d+)\s*</11[^>]+>(?:&nbsp;|\s)*\blikes',
r'class=["\']save-count["\'][^>]*>\s*(\d+)'),
webpage, 'like count', fatal=False))
return {
'id': video_id,
'display_id': display_id,
@ -92,6 +103,7 @@ class PornHdIE(InfoExtractor):
'description': description,
'thumbnail': thumbnail,
'view_count': view_count,
'like_count': like_count,
'formats': formats,
'age_limit': 18,
}

View File

@ -10,11 +10,12 @@ from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_str,
compat_urllib_request,
)
from .openload import PhantomJSwrapper
from ..utils import (
ExtractorError,
int_or_none,
js_to_json,
orderedSet,
remove_quotes,
str_to_int,
@ -22,7 +23,29 @@ from ..utils import (
)
class PornHubIE(InfoExtractor):
class PornHubBaseIE(InfoExtractor):
def _download_webpage_handle(self, *args, **kwargs):
def dl(*args, **kwargs):
return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
webpage, urlh = dl(*args, **kwargs)
if any(re.search(p, webpage) for p in (
r'<body\b[^>]+\bonload=["\']go\(\)',
r'document\.cookie\s*=\s*["\']RNKEY=',
r'document\.location\.reload\(true\)')):
url_or_request = args[0]
url = (url_or_request.get_full_url()
if isinstance(url_or_request, compat_urllib_request.Request)
else url_or_request)
phantom = PhantomJSwrapper(self, required_version='2.0')
phantom.get(url, html=webpage)
webpage, urlh = dl(*args, **kwargs)
return webpage, urlh
class PornHubIE(PornHubBaseIE):
IE_DESC = 'PornHub and Thumbzilla'
_VALID_URL = r'''(?x)
https?://
@ -279,14 +302,12 @@ class PornHubIE(InfoExtractor):
comment_count = self._extract_count(
r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
page_params = self._parse_json(self._search_regex(
r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
webpage, 'page parameters', group='data', default='{}'),
video_id, transform_source=js_to_json, fatal=False)
tags = categories = None
if page_params:
tags = page_params.get('tags', '').split(',')
categories = page_params.get('categories', '').split(',')
def extract_list(meta_key):
div = self._search_regex(
r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
% meta_key, webpage, meta_key, default=None)
if div:
return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div)
return {
'id': video_id,
@ -301,13 +322,13 @@ class PornHubIE(InfoExtractor):
'comment_count': comment_count,
'formats': formats,
'age_limit': 18,
'tags': tags,
'categories': categories,
'tags': extract_list('tags'),
'categories': extract_list('categories'),
'subtitles': subtitles,
}
class PornHubPlaylistBaseIE(InfoExtractor):
class PornHubPlaylistBaseIE(PornHubBaseIE):
def _extract_entries(self, webpage, host):
# Only process container div with main playlist content skipping
# drop-down menu that uses similar pattern for videos (see

View File

@ -4,16 +4,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
xpath_text,
find_xpath_attr,
determine_ext,
ExtractorError,
int_or_none,
unified_strdate,
xpath_element,
ExtractorError,
determine_protocol,
unsmuggle_url,
)
@ -49,107 +45,79 @@ class RadioCanadaIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
},
{
# with protectionType but not actually DRM protected
'url': 'radiocanada:toutv:140872',
'info_dict': {
'id': '140872',
'title': 'Épisode 1',
'series': 'District 31',
},
'only_matching': True,
}
]
_GEO_COUNTRIES = ['CA']
_access_token = None
_claims = None
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
app_code, video_id = re.match(self._VALID_URL, url).groups()
metadata = self._download_xml(
'http://api.radio-canada.ca/metaMedia/v1/index.ashx',
video_id, note='Downloading metadata XML', query={
def _call_api(self, path, video_id=None, app_code=None, query=None):
if not query:
query = {}
query.update({
'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb',
'output': 'json',
})
if video_id:
query.update({
'appCode': app_code,
'idMedia': video_id,
})
if self._access_token:
query['access_token'] = self._access_token
try:
return self._download_json(
'https://services.radio-canada.ca/media/' + path, video_id, query=query)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422):
data = self._parse_json(e.cause.read().decode(), None)
error = data.get('error_description') or data['errorMessage']['text']
raise ExtractorError(error, expected=True)
raise
def _extract_info(self, app_code, video_id):
metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas']
def get_meta(name):
el = find_xpath_attr(metadata, './/Meta', 'name', name)
return el.text if el is not None else None
for meta in metas:
if meta.get('name') == name:
text = meta.get('text')
if text:
return text
# protectionType does not necessarily mean the video is DRM protected (see
# https://github.com/rg3/youtube-dl/pull/18609).
if get_meta('protectionType'):
raise ExtractorError('This video is DRM protected.', expected=True)
self.report_warning('This video is probably DRM protected.')
device_types = ['ipad']
if not smuggled_data:
device_types.append('flash')
device_types.append('android')
formats = []
error = None
# TODO: extract f4m formats
# f4m formats can be extracted using flashhd device_type but they produce unplayable file
for device_type in device_types:
validation_url = 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx'
query = {
'appCode': app_code,
'idMedia': video_id,
'connectionType': 'broadband',
'multibitrate': 'true',
'deviceType': device_type,
}
if smuggled_data:
validation_url = 'https://services.radio-canada.ca/media/validation/v2/'
query.update(smuggled_data)
else:
query.update({
# paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction
'paysJ391wsHjbOJwvCs26toz': 'CA',
'bypasslock': 'NZt5K62gRqfc',
})
v_data = self._download_xml(validation_url, video_id, note='Downloading %s XML' % device_type, query=query, fatal=False)
v_url = xpath_text(v_data, 'url')
if not v_url:
continue
if v_url == 'null':
error = xpath_text(v_data, 'message')
continue
ext = determine_ext(v_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
v_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
v_url, video_id, f4m_id='hds', fatal=False))
else:
ext = determine_ext(v_url)
bitrates = xpath_element(v_data, 'bitrates')
for url_e in bitrates.findall('url'):
tbr = int_or_none(url_e.get('bitrate'))
if not tbr:
continue
f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url)
protocol = determine_protocol({'url': f_url})
f = {
'format_id': '%s-%d' % (protocol, tbr),
'url': f_url,
'ext': 'flv' if protocol == 'rtmp' else ext,
'protocol': protocol,
'width': int_or_none(url_e.get('width')),
'height': int_or_none(url_e.get('height')),
'tbr': tbr,
}
mobj = re.match(r'(?P<url>rtmp://[^/]+/[^/]+)/(?P<playpath>[^?]+)(?P<auth>\?.+)', f_url)
if mobj:
f.update({
'url': mobj.group('url') + mobj.group('auth'),
'play_path': mobj.group('playpath'),
})
formats.append(f)
if protocol == 'rtsp':
base_url = self._search_regex(
r'rtsp://([^?]+)', f_url, 'base url', default=None)
if base_url:
base_url = 'http://' + base_url
formats.extend(self._extract_m3u8_formats(
base_url + '/playlist.m3u8', video_id, 'mp4',
'm3u8_native', m3u8_id='hls', fatal=False))
formats.extend(self._extract_f4m_formats(
base_url + '/manifest.f4m', video_id,
f4m_id='hds', fatal=False))
if not formats and error:
query = {
'connectionType': 'hd',
'deviceType': 'ipad',
'multibitrate': 'true',
}
if self._claims:
query['claims'] = self._claims
v_data = self._call_api('validation/v2/', video_id, app_code, query)
v_url = v_data.get('url')
if not v_url:
error = v_data['message']
if error == "Le contenu sélectionné n'est pas disponible dans votre pays":
raise self.raise_geo_restricted(error, self._GEO_COUNTRIES)
if error == 'Le contenu sélectionné est disponible seulement en premium':
self.raise_login_required(error)
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, error), expected=True)
formats = self._extract_m3u8_formats(v_url, video_id, 'mp4')
self._sort_formats(formats)
subtitles = {}
@ -174,11 +142,14 @@ class RadioCanadaIE(InfoExtractor):
'formats': formats,
}
def _real_extract(self, url):
return self._extract_info(*re.match(self._VALID_URL, url).groups())
class RadioCanadaAudioVideoIE(InfoExtractor):
'radiocanada:audiovideo'
_VALID_URL = r'https?://ici\.radio-canada\.ca/audio-video/media-(?P<id>[0-9]+)'
_TEST = {
_VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
'info_dict': {
'id': '7527184',
@ -191,7 +162,10 @@ class RadioCanadaAudioVideoIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
}
}, {
'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam',
'only_matching': True,
}]
def _real_extract(self, url):
return self.url_result('radiocanada:medianet:%s' % self._match_id(url))

View File

@ -288,7 +288,7 @@ class RaiPlayPlaylistIE(InfoExtractor):
class RaiIE(RaiBaseIE):
_VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
_VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
_TESTS = [{
# var uniquename = "ContentItem-..."
# data-id="ContentItem-..."
@ -375,6 +375,9 @@ class RaiIE(RaiBaseIE):
# Direct MMS URL
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html',
'only_matching': True,
}, {
'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html',
'only_matching': True,
}]
def _extract_from_content_id(self, content_id, url):

View File

@ -21,7 +21,17 @@ from ..utils import (
class RutubeBaseIE(InfoExtractor):
def _extract_video(self, video, video_id=None, require_title=True):
def _download_api_info(self, video_id, query=None):
if not query:
query = {}
query['format'] = 'json'
return self._download_json(
'http://rutube.ru/api/video/%s/' % video_id,
video_id, 'Downloading video JSON',
'Unable to download video JSON', query=query)
@staticmethod
def _extract_info(video, video_id=None, require_title=True):
title = video['title'] if require_title else video.get('title')
age_limit = video.get('is_adult')
@ -32,7 +42,7 @@ class RutubeBaseIE(InfoExtractor):
category = try_get(video, lambda x: x['category']['name'])
return {
'id': video.get('id') or video_id,
'id': video.get('id') or video_id if video_id else video['id'],
'title': title,
'description': video.get('description'),
'thumbnail': video.get('thumbnail_url'),
@ -47,6 +57,42 @@ class RutubeBaseIE(InfoExtractor):
'is_live': bool_or_none(video.get('is_livestream')),
}
def _download_and_extract_info(self, video_id, query=None):
return self._extract_info(
self._download_api_info(video_id, query=query), video_id)
def _download_api_options(self, video_id, query=None):
if not query:
query = {}
query['format'] = 'json'
return self._download_json(
'http://rutube.ru/api/play/options/%s/' % video_id,
video_id, 'Downloading options JSON',
'Unable to download options JSON',
headers=self.geo_verification_headers(), query=query)
def _extract_formats(self, options, video_id):
formats = []
for format_id, format_url in options['video_balancer'].items():
ext = determine_ext(format_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
format_url, video_id, f4m_id=format_id, fatal=False))
else:
formats.append({
'url': format_url,
'format_id': format_id,
})
self._sort_formats(formats)
return formats
def _download_and_extract_formats(self, video_id, query=None):
return self._extract_formats(
self._download_api_options(video_id, query=query), video_id)
class RutubeIE(RutubeBaseIE):
IE_NAME = 'rutube'
@ -55,13 +101,13 @@ class RutubeIE(RutubeBaseIE):
_TESTS = [{
'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
'md5': '79938ade01294ef7e27574890d0d3769',
'md5': '1d24f180fac7a02f3900712e5a5764d6',
'info_dict': {
'id': '3eac3b4561676c17df9132a9a1e62e3e',
'ext': 'flv',
'ext': 'mp4',
'title': 'Раненный кенгуру забежал в аптеку',
'description': 'http://www.ntdtv.ru ',
'duration': 80,
'duration': 81,
'uploader': 'NTDRussian',
'uploader_id': '29790',
'timestamp': 1381943602,
@ -94,39 +140,12 @@ class RutubeIE(RutubeBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json(
'http://rutube.ru/api/video/%s/?format=json' % video_id,
video_id, 'Downloading video JSON')
info = self._extract_video(video, video_id)
options = self._download_json(
'http://rutube.ru/api/play/options/%s/?format=json' % video_id,
video_id, 'Downloading options JSON',
headers=self.geo_verification_headers())
formats = []
for format_id, format_url in options['video_balancer'].items():
ext = determine_ext(format_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
format_url, video_id, f4m_id=format_id, fatal=False))
else:
formats.append({
'url': format_url,
'format_id': format_id,
})
self._sort_formats(formats)
info['formats'] = formats
info = self._download_and_extract_info(video_id)
info['formats'] = self._download_and_extract_formats(video_id)
return info
class RutubeEmbedIE(InfoExtractor):
class RutubeEmbedIE(RutubeBaseIE):
IE_NAME = 'rutube:embed'
IE_DESC = 'Rutube embedded videos'
_VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)'
@ -135,7 +154,7 @@ class RutubeEmbedIE(InfoExtractor):
'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
'info_dict': {
'id': 'a10e53b86e8f349080f718582ce4c661',
'ext': 'flv',
'ext': 'mp4',
'timestamp': 1387830582,
'upload_date': '20131223',
'uploader_id': '297833',
@ -149,16 +168,26 @@ class RutubeEmbedIE(InfoExtractor):
}, {
'url': 'http://rutube.ru/play/embed/8083783',
'only_matching': True,
}, {
# private video
'url': 'https://rutube.ru/play/embed/10631925?p=IbAigKqWd1do4mjaM5XLIQ',
'only_matching': True,
}]
def _real_extract(self, url):
embed_id = self._match_id(url)
webpage = self._download_webpage(url, embed_id)
canonical_url = self._html_search_regex(
r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage,
'Canonical URL')
return self.url_result(canonical_url, RutubeIE.ie_key())
# Query may contain private videos token and should be passed to API
# requests (see #19163)
query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
options = self._download_api_options(embed_id, query)
video_id = options['effective_video']
formats = self._extract_formats(options, video_id)
info = self._download_and_extract_info(video_id, query)
info.update({
'extractor_key': 'Rutube',
'formats': formats,
})
return info
class RutubePlaylistBaseIE(RutubeBaseIE):
@ -181,7 +210,7 @@ class RutubePlaylistBaseIE(RutubeBaseIE):
video_url = url_or_none(result.get('video_url'))
if not video_url:
continue
entry = self._extract_video(result, require_title=False)
entry = self._extract_info(result, require_title=False)
entry.update({
'_type': 'url',
'url': video_url,

View File

@ -1,31 +1,44 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class ServusIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?servus\.com/(?:at|de)/p/[^/]+/(?P<id>AA-\w+|\d+-\d+)'
_VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P<id>[aA]{2}-\w+|\d+-\d+)'
_TESTS = [{
'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/',
'md5': '046dee641cda1c4cabe13baef3be2c1c',
'md5': '3e1dd16775aa8d5cbef23628cfffc1f4',
'info_dict': {
'id': 'AA-1T6VBU5PW1W12',
'ext': 'mp4',
'title': 'Die Grünen aus Volkssicht',
'description': 'md5:052b5da1cb2cd7d562ef1f19be5a5cba',
'thumbnail': r're:^https?://.*\.jpg$',
'title': 'Die Grünen aus Sicht des Volkes',
'description': 'md5:1247204d85783afe3682644398ff2ec4',
'thumbnail': r're:^https?://.*\.jpg',
}
}, {
'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/',
'only_matching': True,
}, {
'url': 'https://www.servus.com/tv/videos/aa-1t6vbu5pw1w12/',
'only_matching': True,
}, {
'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video_id = self._match_id(url).upper()
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage)
title = self._search_regex(
(r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'),
webpage, 'title', default=None,
group='title') or self._og_search_title(webpage)
title = re.sub(r'\s*-\s*Servus TV\s*$', '', title)
description = self._og_search_description(webpage)
thumbnail = self._og_search_thumbnail(webpage)

View File

@ -61,7 +61,8 @@ class SixPlayIE(InfoExtractor):
quality_key = qualities(['lq', 'sd', 'hq', 'hd'])
formats = []
subtitles = {}
for asset in clip_data['assets']:
assets = clip_data.get('assets') or []
for asset in assets:
asset_url = asset.get('full_physical_path')
protocol = asset.get('protocol')
if not asset_url or protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264' or asset_url in urls:

View File

@ -26,7 +26,7 @@ class SkylineWebcamsIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
stream_url = self._search_regex(
r'url\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage,
r'(?:url|source)\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage,
'stream url', group='url')
title = self._og_search_title(webpage)

View File

@ -16,8 +16,10 @@ from ..compat import (
from ..utils import (
ExtractorError,
int_or_none,
unified_strdate,
try_get,
unified_timestamp,
update_url_query,
url_or_none,
)
@ -34,7 +36,7 @@ class SoundcloudIE(InfoExtractor):
(?:(?:(?:www\.|m\.)?soundcloud\.com/
(?!stations/track)
(?P<uploader>[\w\d-]+)/
(?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
(?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
(?P<title>[\w\d-]+)/?
(?P<token>[^?]+?)?(?:[?].*)?$)
|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
@ -50,12 +52,17 @@ class SoundcloudIE(InfoExtractor):
'info_dict': {
'id': '62986583',
'ext': 'mp3',
'upload_date': '20121011',
'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
'uploader': 'E.T. ExTerrestrial Music',
'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
'timestamp': 1349920598,
'upload_date': '20121011',
'duration': 143,
'license': 'all-rights-reserved',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
}
},
# not streamable song
@ -67,9 +74,14 @@ class SoundcloudIE(InfoExtractor):
'title': 'Goldrushed',
'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
'uploader': 'The Royal Concept',
'timestamp': 1337635207,
'upload_date': '20120521',
'duration': 227,
'duration': 30,
'license': 'all-rights-reserved',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
},
'params': {
# rtmp
@ -84,11 +96,16 @@ class SoundcloudIE(InfoExtractor):
'id': '123998367',
'ext': 'mp3',
'title': 'Youtube - Dl Test Video \'\' Ä↭',
'uploader': 'jaimeMF',
'description': 'test chars: \"\'/\\ä↭',
'uploader': 'jaimeMF',
'timestamp': 1386604920,
'upload_date': '20131209',
'duration': 9,
'license': 'all-rights-reserved',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
},
},
# private link (alt format)
@ -99,11 +116,16 @@ class SoundcloudIE(InfoExtractor):
'id': '123998367',
'ext': 'mp3',
'title': 'Youtube - Dl Test Video \'\' Ä↭',
'uploader': 'jaimeMF',
'description': 'test chars: \"\'/\\ä↭',
'uploader': 'jaimeMF',
'timestamp': 1386604920,
'upload_date': '20131209',
'duration': 9,
'license': 'all-rights-reserved',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
},
},
# downloadable song
@ -116,9 +138,14 @@ class SoundcloudIE(InfoExtractor):
'title': 'Bus Brakes',
'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',
'uploader': 'oddsamples',
'timestamp': 1389232924,
'upload_date': '20140109',
'duration': 17,
'license': 'cc-by-sa',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
},
},
# private link, downloadable format
@ -131,9 +158,14 @@ class SoundcloudIE(InfoExtractor):
'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]',
'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366',
'uploader': 'Ori Uplift Music',
'timestamp': 1504206263,
'upload_date': '20170831',
'duration': 7449,
'license': 'all-rights-reserved',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
},
},
# no album art, use avatar pic for thumbnail
@ -146,10 +178,15 @@ class SoundcloudIE(InfoExtractor):
'title': 'Sideways (Prod. Mad Real)',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'uploader': 'garyvee',
'timestamp': 1488152409,
'upload_date': '20170226',
'duration': 207,
'thumbnail': r're:https?://.*\.jpg',
'license': 'all-rights-reserved',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
},
'params': {
'skip_download': True,
@ -157,7 +194,7 @@ class SoundcloudIE(InfoExtractor):
},
]
_CLIENT_ID = 'LvWovRaJZlWCHql0bISuum8Bd2KX79mb'
_CLIENT_ID = 'NmW1FlPaiL94ueEu7oziOWjYEzZzQDcK'
@staticmethod
def _extract_urls(webpage):
@ -175,22 +212,33 @@ class SoundcloudIE(InfoExtractor):
def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
track_id = compat_str(info['id'])
title = info['title']
name = full_title or track_id
if quiet:
self.report_extraction(name)
thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url')
if isinstance(thumbnail, compat_str):
thumbnail = thumbnail.replace('-large', '-t500x500')
username = try_get(info, lambda x: x['user']['username'], compat_str)
def extract_count(key):
return int_or_none(info.get('%s_count' % key))
result = {
'id': track_id,
'uploader': info.get('user', {}).get('username'),
'upload_date': unified_strdate(info.get('created_at')),
'title': info['title'],
'uploader': username,
'timestamp': unified_timestamp(info.get('created_at')),
'title': title,
'description': info.get('description'),
'thumbnail': thumbnail,
'duration': int_or_none(info.get('duration'), 1000),
'webpage_url': info.get('permalink_url'),
'license': info.get('license'),
'view_count': extract_count('playback'),
'like_count': extract_count('favoritings'),
'comment_count': extract_count('comment'),
'repost_count': extract_count('reposts'),
'genre': info.get('genre'),
}
formats = []
query = {'client_id': self._CLIENT_ID}
@ -368,7 +416,6 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
_API_BASE = 'https://api.soundcloud.com'
_API_V2_BASE = 'https://api-v2.soundcloud.com'
def _extract_playlist(self, base_url, playlist_id, playlist_title):
@ -389,21 +436,30 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
next_href, playlist_id, 'Downloading track page %s' % (i + 1))
collection = response['collection']
if not collection:
break
def resolve_permalink_url(candidates):
if not isinstance(collection, list):
collection = []
# Empty collection may be returned, in this case we proceed
# straight to next_href
def resolve_entry(candidates):
for cand in candidates:
if isinstance(cand, dict):
permalink_url = cand.get('permalink_url')
entry_id = self._extract_id(cand)
if permalink_url and permalink_url.startswith('http'):
return permalink_url, entry_id
if not isinstance(cand, dict):
continue
permalink_url = url_or_none(cand.get('permalink_url'))
if not permalink_url:
continue
return self.url_result(
permalink_url,
ie=SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
video_id=self._extract_id(cand),
video_title=cand.get('title'))
for e in collection:
permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
if permalink_url:
entries.append(self.url_result(permalink_url, video_id=entry_id))
entry = resolve_entry((e, e.get('track'), e.get('playlist')))
if entry:
entries.append(entry)
next_href = response.get('next_href')
if not next_href:
@ -429,46 +485,53 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
(?:(?:www|m)\.)?soundcloud\.com/
(?P<user>[^/]+)
(?:/
(?P<rsrc>tracks|sets|reposts|likes|spotlight)
(?P<rsrc>tracks|albums|sets|reposts|likes|spotlight)
)?
/?(?:[?#].*)?$
'''
IE_NAME = 'soundcloud:user'
_TESTS = [{
'url': 'https://soundcloud.com/the-akashic-chronicler',
'url': 'https://soundcloud.com/soft-cell-official',
'info_dict': {
'id': '114582580',
'title': 'The Akashic Chronicler (All)',
'id': '207965082',
'title': 'Soft Cell (All)',
},
'playlist_mincount': 74,
'playlist_mincount': 28,
}, {
'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
'url': 'https://soundcloud.com/soft-cell-official/tracks',
'info_dict': {
'id': '114582580',
'title': 'The Akashic Chronicler (Tracks)',
'id': '207965082',
'title': 'Soft Cell (Tracks)',
},
'playlist_mincount': 37,
'playlist_mincount': 27,
}, {
'url': 'https://soundcloud.com/the-akashic-chronicler/sets',
'url': 'https://soundcloud.com/soft-cell-official/albums',
'info_dict': {
'id': '114582580',
'title': 'The Akashic Chronicler (Playlists)',
'id': '207965082',
'title': 'Soft Cell (Albums)',
},
'playlist_mincount': 1,
}, {
'url': 'https://soundcloud.com/jcv246/sets',
'info_dict': {
'id': '12982173',
'title': 'Jordi / cv (Playlists)',
},
'playlist_mincount': 2,
}, {
'url': 'https://soundcloud.com/the-akashic-chronicler/reposts',
'url': 'https://soundcloud.com/jcv246/reposts',
'info_dict': {
'id': '114582580',
'title': 'The Akashic Chronicler (Reposts)',
'id': '12982173',
'title': 'Jordi / cv (Reposts)',
},
'playlist_mincount': 7,
'playlist_mincount': 6,
}, {
'url': 'https://soundcloud.com/the-akashic-chronicler/likes',
'url': 'https://soundcloud.com/clalberg/likes',
'info_dict': {
'id': '114582580',
'title': 'The Akashic Chronicler (Likes)',
'id': '11817582',
'title': 'clalberg (Likes)',
},
'playlist_mincount': 321,
'playlist_mincount': 5,
}, {
'url': 'https://soundcloud.com/grynpyret/spotlight',
'info_dict': {
@ -479,10 +542,11 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
}]
_BASE_URL_MAP = {
'all': '%s/profile/soundcloud:users:%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_BASE,
'all': '%s/stream/users/%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'albums': '%s/users/%%s/albums' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'reposts': '%s/profile/soundcloud:users:%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'reposts': '%s/stream/users/%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
}
@ -490,6 +554,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
_TITLE_MAP = {
'all': 'All',
'tracks': 'Tracks',
'albums': 'Albums',
'sets': 'Playlists',
'reposts': 'Reposts',
'likes': 'Likes',

View File

@ -5,14 +5,17 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
orderedSet,
parse_duration,
parse_resolution,
str_to_int,
url_or_none,
urlencode_postdata,
)
class SpankBangIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|m|[a-z]{2})\.)?spankbang\.com/(?P<id>[\da-z]+)/video'
_VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/(?:video|play|embed)\b'
_TESTS = [{
'url': 'http://spankbang.com/3vvn/video/fantasy+solo',
'md5': '1cc433e1d6aa14bc376535b8679302f7',
@ -41,29 +44,71 @@ class SpankBangIE(InfoExtractor):
# 4k
'url': 'https://spankbang.com/1vwqx/video/jade+kush+solo+4k',
'only_matching': True,
}, {
'url': 'https://m.spankbang.com/3vvn/play/fantasy+solo/480p/',
'only_matching': True,
}, {
'url': 'https://m.spankbang.com/3vvn/play',
'only_matching': True,
}, {
'url': 'https://spankbang.com/2y3td/embed/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id, headers={
'Cookie': 'country=US'
})
webpage = self._download_webpage(
url.replace('/%s/embed' % video_id, '/%s/video' % video_id),
video_id, headers={'Cookie': 'country=US'})
if re.search(r'<[^>]+\bid=["\']video_removed', webpage):
raise ExtractorError(
'Video %s is not available' % video_id, expected=True)
formats = []
for mobj in re.finditer(
r'stream_url_(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2',
webpage):
format_id, format_url = mobj.group('id', 'url')
def extract_format(format_id, format_url):
f_url = url_or_none(format_url)
if not f_url:
return
f = parse_resolution(format_id)
f.update({
'url': format_url,
'url': f_url,
'format_id': format_id,
})
formats.append(f)
STREAM_URL_PREFIX = 'stream_url_'
for mobj in re.finditer(
r'%s(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2'
% STREAM_URL_PREFIX, webpage):
extract_format(mobj.group('id', 'url'))
if not formats:
stream_key = self._search_regex(
r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
webpage, 'stream key', group='value')
sb_csrf_session = self._get_cookies(
'https://spankbang.com')['sb_csrf_session'].value
stream = self._download_json(
'https://spankbang.com/api/videos/stream', video_id,
'Downloading stream JSON', data=urlencode_postdata({
'id': stream_key,
'data': 0,
'sb_csrf_session': sb_csrf_session,
}), headers={
'Referer': url,
'X-CSRFToken': sb_csrf_session,
})
for format_id, format_url in stream.items():
if format_id.startswith(STREAM_URL_PREFIX):
extract_format(
format_id[len(STREAM_URL_PREFIX):], format_url)
self._sort_formats(formats)
title = self._html_search_regex(
@ -94,3 +139,33 @@ class SpankBangIE(InfoExtractor):
'formats': formats,
'age_limit': age_limit,
}
class SpankBangPlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/[^/]+'
_TEST = {
'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties',
'info_dict': {
'id': 'ug0k',
'title': 'Big Ass Titties',
},
'playlist_mincount': 50,
}
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(
url, playlist_id, headers={'Cookie': 'country=US; mobile=on'})
entries = [self.url_result(
'https://spankbang.com/%s/video' % video_id,
ie=SpankBangIE.ie_key(), video_id=video_id)
for video_id in orderedSet(re.findall(
r'<a[^>]+\bhref=["\']/?([\da-z]+)/play/', webpage))]
title = self._html_search_regex(
r'<h1>([^<]+)\s+playlist</h1>', webpage, 'playlist title',
fatal=False)
return self.playlist_result(entries, playlist_id, title)

View File

@ -14,7 +14,7 @@ from ..utils import (
class StreamangoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)'
_VALID_URL = r'https?://(?:www\.)?(?:streamango\.com|fruithosts\.net)/(?:f|embed)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4',
'md5': 'e992787515a182f55e38fc97588d802a',
@ -38,6 +38,9 @@ class StreamangoIE(InfoExtractor):
}, {
'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4',
'only_matching': True,
}, {
'url': 'https://fruithosts.net/f/mreodparcdcmspsm/w1f1_r4lph_2018_brrs_720p_latino_mp4',
'only_matching': True,
}]
def _real_extract(self, url):

View File

@ -27,6 +27,7 @@ class TeachableBaseIE(InfoExtractor):
'market.saleshacker.com': 'saleshacker',
'learnability.org': 'learnability',
'edurila.com': 'edurila',
'courses.workitdaily.com': 'workitdaily',
}
_VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(re.escape(site) for site in _SITES.keys()))

View File

@ -265,6 +265,8 @@ class TEDIE(InfoExtractor):
'format_id': m3u8_format['format_id'].replace('hls', 'http'),
'protocol': 'http',
})
if f.get('acodec') == 'none':
del f['acodec']
formats.append(f)
audio_download = talk_info.get('audioDownload')

View File

@ -96,7 +96,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
cfg_xml = self._download_xml(
cfg_url, display_id, 'Downloading metadata',
transform_source=fix_xml_ampersands)
transform_source=fix_xml_ampersands, headers={'Referer': url})
formats = []

View File

@ -3,22 +3,19 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .radiocanada import RadioCanadaIE
from ..utils import (
int_or_none,
js_to_json,
urlencode_postdata,
extract_attributes,
smuggle_url,
int_or_none,
merge_dicts,
urlencode_postdata,
)
class TouTvIE(InfoExtractor):
class TouTvIE(RadioCanadaIE):
_NETRC_MACHINE = 'toutv'
IE_NAME = 'tou.tv'
_VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/S[0-9]+[EC][0-9]+)?)'
_access_token = None
_claims = None
_TESTS = [{
'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17',
@ -46,18 +43,14 @@ class TouTvIE(InfoExtractor):
email, password = self._get_login_info()
if email is None:
return
state = 'http://ici.tou.tv/'
webpage = self._download_webpage(state, None, 'Downloading homepage')
toutvlogin = self._parse_json(self._search_regex(
r'(?s)toutvlogin\s*=\s*({.+?});', webpage, 'toutvlogin'), None, js_to_json)
authorize_url = toutvlogin['host'] + '/auth/oauth/v2/authorize'
login_webpage = self._download_webpage(
authorize_url, None, 'Downloading login page', query={
'client_id': toutvlogin['clientId'],
'redirect_uri': 'https://ici.tou.tv/login/loginCallback',
'https://services.radio-canada.ca/auth/oauth/v2/authorize',
None, 'Downloading login page', query={
'client_id': '4dd36440-09d5-4468-8923-b6d91174ad36',
'redirect_uri': 'https://ici.tou.tv/logincallback',
'response_type': 'token',
'scope': 'media-drmt openid profile email id.write media-validation.read.privileged',
'state': state,
'scope': 'id.write media-validation.read',
'state': '/',
})
def extract_form_url_and_data(wp, default_form_url, form_spec_re=''):
@ -86,12 +79,7 @@ class TouTvIE(InfoExtractor):
self._access_token = self._search_regex(
r'access_token=([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
urlh.geturl(), 'access token')
self._claims = self._download_json(
'https://services.radio-canada.ca/media/validation/v2/getClaims',
None, 'Extracting Claims', query={
'token': self._access_token,
'access_token': self._access_token,
})['claims']
self._claims = self._call_api('validation/v2/getClaims')['claims']
def _real_extract(self, url):
path = self._match_id(url)
@ -102,19 +90,10 @@ class TouTvIE(InfoExtractor):
self.report_warning('This video is probably DRM protected.', path)
video_id = metadata['IdMedia']
details = metadata['Details']
title = details['OriginalTitle']
video_url = 'radiocanada:%s:%s' % (metadata.get('AppCode', 'toutv'), video_id)
if self._access_token and self._claims:
video_url = smuggle_url(video_url, {
'access_token': self._access_token,
'claims': self._claims,
})
return {
'_type': 'url_transparent',
'url': video_url,
return merge_dicts({
'id': video_id,
'title': title,
'title': details.get('OriginalTitle'),
'thumbnail': details.get('ImageUrl'),
'duration': int_or_none(details.get('LengthInSeconds')),
}
}, self._extract_info(metadata.get('AppCode', 'toutv'), video_id))

View File

@ -0,0 +1,75 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
dict_get,
float_or_none,
int_or_none,
unified_timestamp,
update_url_query,
url_or_none,
)
class TruNewsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?trunews\.com/stream/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'https://www.trunews.com/stream/will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech',
'md5': 'a19c024c3906ff954fac9b96ce66bb08',
'info_dict': {
'id': '5c5a21e65d3c196e1c0020cc',
'display_id': 'will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech',
'ext': 'mp4',
'title': "Will Democrats Stage a Circus During President Trump's State of the Union Speech?",
'description': 'md5:c583b72147cc92cf21f56a31aff7a670',
'duration': 3685,
'timestamp': 1549411440,
'upload_date': '20190206',
},
'add_ie': ['Zype'],
}
def _real_extract(self, url):
display_id = self._match_id(url)
video = self._download_json(
'https://api.zype.com/videos', display_id, query={
'app_key': 'PUVKp9WgGUb3-JUw6EqafLx8tFVP6VKZTWbUOR-HOm__g4fNDt1bCsm_LgYf_k9H',
'per_page': 1,
'active': 'true',
'friendly_title': display_id,
})['response'][0]
zype_id = video['_id']
thumbnails = []
thumbnails_list = video.get('thumbnails')
if isinstance(thumbnails_list, list):
for thumbnail in thumbnails_list:
if not isinstance(thumbnail, dict):
continue
thumbnail_url = url_or_none(thumbnail.get('url'))
if not thumbnail_url:
continue
thumbnails.append({
'url': thumbnail_url,
'width': int_or_none(thumbnail.get('width')),
'height': int_or_none(thumbnail.get('height')),
})
return {
'_type': 'url_transparent',
'url': update_url_query(
'https://player.zype.com/embed/%s.js' % zype_id,
{'api_key': 'X5XnahkjCwJrT_l5zUqypnaLEObotyvtUKJWWlONxDoHVjP8vqxlArLV8llxMbyt'}),
'ie_key': 'Zype',
'id': zype_id,
'display_id': display_id,
'title': video.get('title'),
'description': dict_get(video, ('description', 'ott_description', 'short_description')),
'duration': int_or_none(video.get('duration')),
'timestamp': unified_timestamp(video.get('published_at')),
'average_rating': float_or_none(video.get('rating')),
'view_count': int_or_none(video.get('request_count')),
'thumbnails': thumbnails,
}

View File

@ -4,44 +4,72 @@ from __future__ import unicode_literals
import re
from .turner import TurnerBaseIE
from ..utils import (
int_or_none,
parse_iso8601,
)
class TruTVIE(TurnerBaseIE):
_VALID_URL = r'https?://(?:www\.)?trutv\.com(?:(?P<path>/shows/[^/]+/videos/[^/?#]+?)\.html|/full-episodes/[^/]+/(?P<id>\d+))'
_VALID_URL = r'https?://(?:www\.)?trutv\.com/(?:shows|full-episodes)/(?P<series_slug>[0-9A-Za-z-]+)/(?:videos/(?P<clip_slug>[0-9A-Za-z-]+)|(?P<id>\d+))'
_TEST = {
'url': 'http://www.trutv.com/shows/10-things/videos/you-wont-believe-these-sports-bets.html',
'md5': '2cdc844f317579fed1a7251b087ff417',
'url': 'https://www.trutv.com/shows/the-carbonaro-effect/videos/sunlight-activated-flower.html',
'info_dict': {
'id': '/shows/10-things/videos/you-wont-believe-these-sports-bets',
'id': 'f16c03beec1e84cd7d1a51f11d8fcc29124cc7f1',
'ext': 'mp4',
'title': 'You Won\'t Believe These Sports Bets',
'description': 'Jamie Lee sits down with a bookie to discuss the bizarre world of illegal sports betting.',
'upload_date': '20130305',
}
'title': 'Sunlight-Activated Flower',
'description': "A customer is stunned when he sees Michael's sunlight-activated flower.",
},
'params': {
# m3u8 download
'skip_download': True,
},
}
def _real_extract(self, url):
path, video_id = re.match(self._VALID_URL, url).groups()
auth_required = False
if path:
data_src = 'http://www.trutv.com/video/cvp/v2/xml/content.xml?id=%s.xml' % path
series_slug, clip_slug, video_id = re.match(self._VALID_URL, url).groups()
if video_id:
path = 'episode'
display_id = video_id
else:
webpage = self._download_webpage(url, video_id)
video_id = self._search_regex(
r"TTV\.TVE\.episodeId\s*=\s*'([^']+)';",
webpage, 'video id', default=video_id)
auth_required = self._search_regex(
r'TTV\.TVE\.authRequired\s*=\s*(true|false);',
webpage, 'auth required', default='false') == 'true'
data_src = 'http://www.trutv.com/tveverywhere/services/cvpXML.do?titleId=' + video_id
return self._extract_cvp_info(
data_src, path, {
'secure': {
'media_src': 'http://androidhls-secure.cdn.turner.com/trutv/big',
'tokenizer_src': 'http://www.trutv.com/tveverywhere/processors/services/token_ipadAdobe.do',
},
}, {
path = 'series/clip'
display_id = clip_slug
data = self._download_json(
'https://api.trutv.com/v2/web/%s/%s/%s' % (path, series_slug, display_id),
display_id)
video_data = data['episode'] if video_id else data['info']
media_id = video_data['mediaId']
title = video_data['title'].strip()
info = self._extract_ngtv_info(
media_id, {}, {
'url': url,
'site_name': 'truTV',
'auth_required': auth_required,
'auth_required': video_data.get('isAuthRequired'),
})
thumbnails = []
for image in video_data.get('images', []):
image_url = image.get('srcUrl')
if not image_url:
continue
thumbnails.append({
'url': image_url,
'width': int_or_none(image.get('width')),
'height': int_or_none(image.get('height')),
})
info.update({
'id': media_id,
'display_id': display_id,
'title': title,
'description': video_data.get('description'),
'thumbnails': thumbnails,
'timestamp': parse_iso8601(video_data.get('publicationDate')),
'series': video_data.get('showTitle'),
'season_number': int_or_none(video_data.get('seasonNum')),
'episode_number': int_or_none(video_data.get('episodeNum')),
})
return info

View File

@ -1,14 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
clean_html,
get_element_by_attribute,
determine_ext,
ExtractorError,
get_element_by_attribute,
orderedSet,
)
@ -19,12 +21,12 @@ class TVPIE(InfoExtractor):
_TESTS = [{
'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
'md5': '8aa518c15e5cc32dfe8db400dc921fbb',
'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
'info_dict': {
'id': '194536',
'ext': 'mp4',
'title': 'Czas honoru, I seria odc. 13',
'description': 'md5:381afa5bca72655fe94b05cfe82bf53d',
'title': 'Czas honoru, odc. 13 Władek',
'description': 'md5:437f48b93558370b031740546b696e24',
},
}, {
'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
@ -45,6 +47,7 @@ class TVPIE(InfoExtractor):
'title': 'Wiadomości, 28.09.2017, 19:30',
'description': 'Wydanie główne codziennego serwisu informacyjnego.'
},
'skip': 'HTTP Error 404: Not Found',
}, {
'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
'only_matching': True,
@ -75,8 +78,10 @@ class TVPIE(InfoExtractor):
return {
'_type': 'url_transparent',
'url': 'tvp:' + video_id,
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(
webpage, default=None) or self._html_search_meta(
'description', webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'ie_key': 'TVPEmbed',
}
@ -87,6 +92,15 @@ class TVPEmbedIE(InfoExtractor):
_VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)'
_TESTS = [{
'url': 'tvp:194536',
'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
'info_dict': {
'id': '194536',
'ext': 'mp4',
'title': 'Czas honoru, odc. 13 Władek',
},
}, {
# not available
'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268',
'md5': '8c9cd59d16edabf39331f93bf8a766c7',
'info_dict': {
@ -94,6 +108,7 @@ class TVPEmbedIE(InfoExtractor):
'ext': 'mp4',
'title': 'Panorama, 07.12.2015, 15:40',
},
'skip': 'Transmisja została zakończona lub materiał niedostępny',
}, {
'url': 'tvp:22670268',
'only_matching': True,
@ -105,10 +120,13 @@ class TVPEmbedIE(InfoExtractor):
webpage = self._download_webpage(
'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
error_massage = get_element_by_attribute('class', 'msg error', webpage)
if error_massage:
error = self._html_search_regex(
r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>',
webpage, 'error', default=None) or clean_html(
get_element_by_attribute('class', 'msg error', webpage))
if error:
raise ExtractorError('%s said: %s' % (
self.IE_NAME, clean_html(error_massage)), expected=True)
self.IE_NAME, clean_html(error)), expected=True)
title = self._search_regex(
r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
@ -180,48 +198,55 @@ class TVPEmbedIE(InfoExtractor):
}
class TVPSeriesIE(InfoExtractor):
class TVPWebsiteIE(InfoExtractor):
IE_NAME = 'tvp:series'
_VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$'
_VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)'
_TESTS = [{
'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem',
# series
'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video',
'info_dict': {
'title': 'Ogniem i mieczem',
'id': '4278026',
'id': '38678312',
},
'playlist_count': 4,
'playlist_count': 115,
}, {
'url': 'http://vod.tvp.pl/audycje/podroze/boso-przez-swiat',
# film
'url': 'https://vod.tvp.pl/website/gloria,35139666',
'info_dict': {
'title': 'Boso przez świat',
'id': '9329207',
'id': '36637049',
'ext': 'mp4',
'title': 'Gloria, Gloria',
},
'playlist_count': 86,
'params': {
'skip_download': True,
},
'add_ie': ['TVPEmbed'],
}, {
'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312',
'only_matching': True,
}]
def _entries(self, display_id, playlist_id):
url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id)
for page_num in itertools.count(1):
page = self._download_webpage(
url, display_id, 'Downloading page %d' % page_num,
query={'page': page_num})
video_ids = orderedSet(re.findall(
r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id,
page))
if not video_ids:
break
for video_id in video_ids:
yield self.url_result(
'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(),
video_id=video_id)
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id, tries=5)
title = self._html_search_regex(
r'(?s) id=[\'"]path[\'"]>(?:.*? / ){2}(.*?)</span>', webpage, 'series')
playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id')
playlist = self._download_webpage(
'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend'
'edId=0&sort=&page=0&pageSize=10000' % playlist_id, display_id, tries=5,
note='Downloading playlist')
videos_paths = re.findall(
'(?s)class="shortTitle">.*?href="(/[^"]+)', playlist)
entries = [
self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key())
for v_path in videos_paths]
return {
'_type': 'playlist',
'id': playlist_id,
'display_id': display_id,
'title': title,
'entries': entries,
}
mobj = re.match(self._VALID_URL, url)
display_id, playlist_id = mobj.group('display_id', 'id')
return self.playlist_result(
self._entries(display_id, playlist_id), playlist_id)

View File

@ -493,10 +493,9 @@ class TVPlayHomeIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
video_id = self._search_regex(
r'data-asset-id\s*=\s*["\'](\d{5,7})\b', webpage, 'video id',
default=None)
r'data-asset-id\s*=\s*["\'](\d{5,})\b', webpage, 'video id')
if video_id:
if len(video_id) < 8:
return self.url_result(
'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id)
@ -537,8 +536,9 @@ class TVPlayHomeIE(InfoExtractor):
r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number',
default=None))
episode = self._search_regex(
r'(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'episode',
default=None, group='value')
(r'\bepisode\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
r'data-subtitle\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
'episode', default=None, group='value')
episode_number = int_or_none(self._search_regex(
r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number',
default=None))

View File

@ -136,7 +136,12 @@ class TwitchBaseIE(InfoExtractor):
source = next(f for f in formats if f['format_id'] == 'Source')
source['preference'] = 10
except StopIteration:
pass # No Source stream present
for f in formats:
if '/chunked/' in f['url']:
f.update({
'source_preference': 10,
'format_note': 'Source',
})
self._sort_formats(formats)

View File

@ -29,7 +29,7 @@ class UdemyIE(InfoExtractor):
IE_NAME = 'udemy'
_VALID_URL = r'''(?x)
https?://
www\.udemy\.com/
(?:[^/]+\.)?udemy\.com/
(?:
[^#]+\#/lecture/|
lecture/view/?\?lectureId=|
@ -64,6 +64,9 @@ class UdemyIE(InfoExtractor):
# only outputs rendition
'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0',
'only_matching': True,
}, {
'url': 'https://wipro.udemy.com/java-tutorial/#/lecture/172757',
'only_matching': True,
}]
def _extract_course_info(self, webpage, video_id):
@ -123,10 +126,22 @@ class UdemyIE(InfoExtractor):
def _download_webpage_handle(self, *args, **kwargs):
headers = kwargs.get('headers', {}).copy()
headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4'
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
kwargs['headers'] = headers
return super(UdemyIE, self)._download_webpage_handle(
ret = super(UdemyIE, self)._download_webpage_handle(
*args, **compat_kwargs(kwargs))
if not ret:
return ret
webpage, _ = ret
if any(p in webpage for p in (
'>Please verify you are a human',
'Access to this page has been denied because we believe you are using automation tools to browse the website',
'"_pxCaptcha"')):
raise ExtractorError(
'Udemy asks you to solve a CAPTCHA. Login with browser, '
'solve CAPTCHA, then export cookies and pass cookie file to '
'youtube-dl with --cookies.', expected=True)
return ret
def _download_json(self, url_or_request, *args, **kwargs):
headers = {
@ -403,8 +418,14 @@ class UdemyIE(InfoExtractor):
class UdemyCourseIE(UdemyIE):
IE_NAME = 'udemy:course'
_VALID_URL = r'https?://(?:www\.)?udemy\.com/(?P<id>[^/?#&]+)'
_TESTS = []
_VALID_URL = r'https?://(?:[^/]+\.)?udemy\.com/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.udemy.com/java-tutorial/',
'only_matching': True,
}, {
'url': 'https://wipro.udemy.com/java-tutorial/',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):

View File

@ -3,21 +3,23 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
get_element_by_attribute,
parse_duration,
try_get,
update_url_query,
ExtractorError,
)
from ..compat import compat_str
class USATodayIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?usatoday\.com/(?:[^/]+/)*(?P<id>[^?/#]+)'
_TEST = {
_TESTS = [{
# Brightcove Partner ID = 29906170001
'url': 'http://www.usatoday.com/media/cinematic/video/81729424/us-france-warn-syrian-regime-ahead-of-new-peace-talks/',
'md5': '4d40974481fa3475f8bccfd20c5361f8',
'md5': '033587d2529dc3411a1ab3644c3b8827',
'info_dict': {
'id': '81729424',
'id': '4799374959001',
'ext': 'mp4',
'title': 'US, France warn Syrian regime ahead of new peace talks',
'timestamp': 1457891045,
@ -25,8 +27,20 @@ class USATodayIE(InfoExtractor):
'uploader_id': '29906170001',
'upload_date': '20160313',
}
}
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/29906170001/38a9eecc-bdd8-42a3-ba14-95397e48b3f8_default/index.html?videoId=%s'
}, {
# ui-video-data[asset_metadata][items][brightcoveaccount] = 28911775001
'url': 'https://www.usatoday.com/story/tech/science/2018/08/21/yellowstone-supervolcano-eruption-stop-worrying-its-blow/973633002/',
'info_dict': {
'id': '5824495846001',
'ext': 'mp4',
'title': 'Yellowstone more likely to crack rather than explode',
'timestamp': 1534790612,
'description': 'md5:3715e7927639a4f16b474e9391687c62',
'uploader_id': '28911775001',
'upload_date': '20180820',
}
}]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
def _real_extract(self, url):
display_id = self._match_id(url)
@ -35,10 +49,11 @@ class USATodayIE(InfoExtractor):
if not ui_video_data:
raise ExtractorError('no video on the webpage', expected=True)
video_data = self._parse_json(ui_video_data, display_id)
item = try_get(video_data, lambda x: x['asset_metadata']['items'], dict) or {}
return {
'_type': 'url_transparent',
'url': self.BRIGHTCOVE_URL_TEMPLATE % video_data['brightcove_id'],
'url': self.BRIGHTCOVE_URL_TEMPLATE % (item.get('brightcoveaccount', '29906170001'), item.get('brightcoveid') or video_data['brightcove_id']),
'id': compat_str(video_data['id']),
'title': video_data['title'],
'thumbnail': video_data.get('thumbnail'),

View File

@ -94,7 +94,6 @@ class ViceIE(AdobePassIE):
'url': 'https://www.viceland.com/en_us/video/thursday-march-1-2018/5a8f2d7ff1cdb332dd446ec1',
'only_matching': True,
}]
_PREPLAY_HOST = 'vms.vice'
@staticmethod
def _extract_urls(webpage):
@ -158,9 +157,8 @@ class ViceIE(AdobePassIE):
})
try:
host = 'www.viceland' if is_locked else self._PREPLAY_HOST
preplay = self._download_json(
'https://%s.com/%s/video/preplay/%s' % (host, locale, video_id),
'https://vms.vice.com/%s/video/preplay/%s' % (locale, video_id),
video_id, query=query)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401):

View File

@ -4,8 +4,14 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
orderedSet,
parse_duration,
str_or_none,
unified_strdate,
url_or_none,
xpath_element,
xpath_text,
)
@ -13,7 +19,19 @@ from ..utils import (
class VideomoreIE(InfoExtractor):
IE_NAME = 'videomore'
_VALID_URL = r'videomore:(?P<sid>\d+)$|https?://videomore\.ru/(?:(?:embed|[^/]+/[^/]+)/|[^/]+\?.*\btrack_id=)(?P<id>\d+)(?:[/?#&]|\.(?:xml|json)|$)'
_VALID_URL = r'''(?x)
videomore:(?P<sid>\d+)$|
https?://(?:player\.)?videomore\.ru/
(?:
(?:
embed|
[^/]+/[^/]+
)/|
[^/]*\?.*?\btrack_id=
)
(?P<id>\d+)
(?:[/?#&]|\.(?:xml|json)|$)
'''
_TESTS = [{
'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617',
'md5': '44455a346edc0d509ac5b5a5b531dc35',
@ -79,6 +97,9 @@ class VideomoreIE(InfoExtractor):
}, {
'url': 'videomore:367617',
'only_matching': True,
}, {
'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=',
'only_matching': True,
}]
@staticmethod
@ -136,7 +157,7 @@ class VideomoreIE(InfoExtractor):
class VideomoreVideoIE(InfoExtractor):
IE_NAME = 'videomore:video'
_VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)[/?#&]*$'
_VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)(?:/*|[?#&].*?)$'
_TESTS = [{
# single video with og:video:iframe
'url': 'http://videomore.ru/elki_3',
@ -176,6 +197,9 @@ class VideomoreVideoIE(InfoExtractor):
'params': {
'skip_download': True,
},
}, {
'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so',
'only_matching': True,
}]
@classmethod
@ -196,13 +220,16 @@ class VideomoreVideoIE(InfoExtractor):
r'track-id=["\'](\d+)',
r'xcnt_product_id\s*=\s*(\d+)'), webpage, 'video id')
video_url = 'videomore:%s' % video_id
else:
video_id = None
return self.url_result(video_url, VideomoreIE.ie_key())
return self.url_result(
video_url, ie=VideomoreIE.ie_key(), video_id=video_id)
class VideomoreSeasonIE(InfoExtractor):
IE_NAME = 'videomore:season'
_VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)[/?#&]*$'
_VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$'
_TESTS = [{
'url': 'http://videomore.ru/molodezhka/sezon_promo',
'info_dict': {
@ -210,8 +237,16 @@ class VideomoreSeasonIE(InfoExtractor):
'title': 'Молодежка Промо',
},
'playlist_mincount': 12,
}, {
'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return (False if (VideomoreIE.suitable(url) or VideomoreVideoIE.suitable(url))
else super(VideomoreSeasonIE, cls).suitable(url))
def _real_extract(self, url):
display_id = self._match_id(url)
@ -219,9 +254,54 @@ class VideomoreSeasonIE(InfoExtractor):
title = self._og_search_title(webpage)
entries = [
self.url_result(item) for item in re.findall(
r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"'
% display_id, webpage)]
data = self._parse_json(
self._html_search_regex(
r'\bclass=["\']seasons-tracks["\'][^>]+\bdata-custom-data=(["\'])(?P<value>{.+?})\1',
webpage, 'data', default='{}', group='value'),
display_id, fatal=False)
entries = []
if data:
episodes = data.get('episodes')
if isinstance(episodes, list):
for ep in episodes:
if not isinstance(ep, dict):
continue
ep_id = int_or_none(ep.get('id'))
ep_url = url_or_none(ep.get('url'))
if ep_id:
e = {
'url': 'videomore:%s' % ep_id,
'id': compat_str(ep_id),
}
elif ep_url:
e = {'url': ep_url}
else:
continue
e.update({
'_type': 'url',
'ie_key': VideomoreIE.ie_key(),
'title': str_or_none(ep.get('title')),
'thumbnail': url_or_none(ep.get('image')),
'duration': parse_duration(ep.get('duration')),
'episode_number': int_or_none(ep.get('number')),
'upload_date': unified_strdate(ep.get('date')),
})
entries.append(e)
if not entries:
entries = [
self.url_result(
'videomore:%s' % video_id, ie=VideomoreIE.ie_key(),
video_id=video_id)
for video_id in orderedSet(re.findall(
r':(?:id|key)=["\'](\d+)["\']', webpage))]
if not entries:
entries = [
self.url_result(item) for item in re.findall(
r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"'
% display_id, webpage)]
return self.playlist_result(entries, display_id, title)

View File

@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
import base64
import json
import re
import itertools
@ -392,6 +393,22 @@ class VimeoIE(VimeoBaseInfoExtractor):
'skip_download': True,
},
},
{
'url': 'http://player.vimeo.com/video/68375962',
'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7',
'info_dict': {
'id': '68375962',
'ext': 'mp4',
'title': 'youtube-dl password protected test video',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128',
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
},
'params': {
'videopassword': 'youtube-dl',
},
},
{
'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741',
'only_matching': True,
@ -418,6 +435,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
'url': 'https://vimeo.com/160743502/abd0e13fb4',
'only_matching': True,
}
# https://gettingthingsdone.com/workflowmap/
# vimeo embed with check-password page protected by Referer header
]
@staticmethod
@ -448,18 +467,22 @@ class VimeoIE(VimeoBaseInfoExtractor):
urls = VimeoIE._extract_urls(url, webpage)
return urls[0] if urls else None
def _verify_player_video_password(self, url, video_id):
def _verify_player_video_password(self, url, video_id, headers):
password = self._downloader.params.get('videopassword')
if password is None:
raise ExtractorError('This video is protected by a password, use the --video-password option')
data = urlencode_postdata({'password': password})
pass_url = url + '/check-password'
password_request = sanitized_Request(pass_url, data)
password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
password_request.add_header('Referer', url)
return self._download_json(
password_request, video_id,
'Verifying the password', 'Wrong password')
data = urlencode_postdata({
'password': base64.b64encode(password.encode()),
})
headers = merge_dicts(headers, {
'Content-Type': 'application/x-www-form-urlencoded',
})
checked = self._download_json(
url + '/check-password', video_id,
'Verifying the password', data=data, headers=headers)
if checked is False:
raise ExtractorError('Wrong video password', expected=True)
return checked
def _real_initialize(self):
self._login()
@ -479,7 +502,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
orig_url = url
if mobj.group('pro') or mobj.group('player'):
if mobj.group('pro'):
# some videos require portfolio_id to be present in player url
# https://github.com/rg3/youtube-dl/issues/20070
url = self._extract_url(url, self._download_webpage(url, video_id))
elif mobj.group('player'):
url = 'https://player.vimeo.com/video/' + video_id
elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
url = 'https://vimeo.com/' + video_id
@ -572,7 +599,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
cause=e)
else:
if config.get('view') == 4:
config = self._verify_player_video_password(redirect_url, video_id)
config = self._verify_player_video_password(redirect_url, video_id, headers)
vod = config.get('video', {}).get('vod', {})

View File

@ -1,123 +0,0 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
parse_duration,
str_to_int,
urljoin,
)
class VpornIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?vporn\.com/[^/]+/(?P<display_id>[^/]+)/(?P<id>\d+)'
_TESTS = [
{
'url': 'http://www.vporn.com/masturbation/violet-on-her-th-birthday/497944/',
'md5': 'facf37c1b86546fa0208058546842c55',
'info_dict': {
'id': '497944',
'display_id': 'violet-on-her-th-birthday',
'ext': 'mp4',
'title': 'Violet on her 19th birthday',
'description': 'Violet dances in front of the camera which is sure to get you horny.',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'kileyGrope',
'categories': ['Masturbation', 'Teen'],
'duration': 393,
'age_limit': 18,
'view_count': int,
},
'skip': 'video removed',
},
{
'url': 'http://www.vporn.com/female/hana-shower/523564/',
'md5': 'ced35a4656198a1664cf2cda1575a25f',
'info_dict': {
'id': '523564',
'display_id': 'hana-shower',
'ext': 'mp4',
'title': 'Hana Shower',
'description': 'Hana showers at the bathroom.',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Hmmmmm',
'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female', '720p'],
'duration': 588,
'age_limit': 18,
'view_count': int,
}
},
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
webpage = self._download_webpage(url, display_id)
errmsg = 'This video has been deleted due to Copyright Infringement or by the account owner!'
if errmsg in webpage:
raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True)
title = self._html_search_regex(
r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip()
description = self._html_search_regex(
r'class="(?:descr|description_txt)">(.*?)</div>',
webpage, 'description', fatal=False)
thumbnail = urljoin('http://www.vporn.com', self._html_search_regex(
r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description',
default=None))
uploader = self._html_search_regex(
r'(?s)Uploaded by:.*?<a href="/user/[^"]+"[^>]*>(.+?)</a>',
webpage, 'uploader', fatal=False)
categories = re.findall(r'<a href="/cat/[^"]+"[^>]*>([^<]+)</a>', webpage)
duration = parse_duration(self._search_regex(
r'Runtime:\s*</span>\s*(\d+ min \d+ sec)',
webpage, 'duration', fatal=False))
view_count = str_to_int(self._search_regex(
r'class="views">([\d,\.]+) [Vv]iews<',
webpage, 'view count', fatal=False))
comment_count = str_to_int(self._html_search_regex(
r"'Comments \(([\d,\.]+)\)'",
webpage, 'comment count', default=None))
formats = []
for video in re.findall(r'flashvars\.videoUrl([^=]+?)\s*=\s*"(https?://[^"]+)"', webpage):
video_url = video[1]
fmt = {
'url': video_url,
'format_id': video[0],
}
m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)_(?P<vbr>\d+)k\.mp4$', video_url)
if m:
fmt.update({
'width': int(m.group('width')),
'height': int(m.group('height')),
'vbr': int(m.group('vbr')),
})
formats.append(fmt)
self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'categories': categories,
'duration': duration,
'view_count': view_count,
'comment_count': comment_count,
'age_limit': 18,
'formats': formats,
}

View File

@ -11,10 +11,12 @@ import time
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_urllib_parse_urlencode,
compat_urllib_parse,
)
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
)
@ -24,29 +26,41 @@ class VRVBaseIE(InfoExtractor):
_API_DOMAIN = None
_API_PARAMS = {}
_CMS_SIGNING = {}
_TOKEN = None
_TOKEN_SECRET = ''
def _call_api(self, path, video_id, note, data=None):
# https://tools.ietf.org/html/rfc5849#section-3
base_url = self._API_DOMAIN + '/core/' + path
encoded_query = compat_urllib_parse_urlencode({
'oauth_consumer_key': self._API_PARAMS['oAuthKey'],
'oauth_nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]),
'oauth_signature_method': 'HMAC-SHA1',
'oauth_timestamp': int(time.time()),
'oauth_version': '1.0',
})
query = [
('oauth_consumer_key', self._API_PARAMS['oAuthKey']),
('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])),
('oauth_signature_method', 'HMAC-SHA1'),
('oauth_timestamp', int(time.time())),
]
if self._TOKEN:
query.append(('oauth_token', self._TOKEN))
encoded_query = compat_urllib_parse_urlencode(query)
headers = self.geo_verification_headers()
if data:
data = json.dumps(data).encode()
headers['Content-Type'] = 'application/json'
method = 'POST' if data else 'GET'
base_string = '&'.join([method, compat_urllib_parse.quote(base_url, ''), compat_urllib_parse.quote(encoded_query, '')])
base_string = '&'.join([
'POST' if data else 'GET',
compat_urllib_parse.quote(base_url, ''),
compat_urllib_parse.quote(encoded_query, '')])
oauth_signature = base64.b64encode(hmac.new(
(self._API_PARAMS['oAuthSecret'] + '&').encode('ascii'),
(self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'),
base_string.encode(), hashlib.sha1).digest()).decode()
encoded_query += '&oauth_signature=' + compat_urllib_parse.quote(oauth_signature, '')
return self._download_json(
'?'.join([base_url, encoded_query]), video_id,
note='Downloading %s JSON metadata' % note, headers=headers, data=data)
try:
return self._download_json(
'?'.join([base_url, encoded_query]), video_id,
note='Downloading %s JSON metadata' % note, headers=headers, data=data)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
raise ExtractorError(json.loads(e.cause.read().decode())['message'], expected=True)
raise
def _call_cms(self, path, video_id, note):
if not self._CMS_SIGNING:
@ -55,19 +69,22 @@ class VRVBaseIE(InfoExtractor):
self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING,
note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers())
def _set_api_params(self, webpage, video_id):
if not self._API_PARAMS:
self._API_PARAMS = self._parse_json(self._search_regex(
r'window\.__APP_CONFIG__\s*=\s*({.+?})</script>',
webpage, 'api config'), video_id)['cxApiParams']
self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co')
def _get_cms_resource(self, resource_key, video_id):
return self._call_api(
'cms_resource', video_id, 'resource path', data={
'resource_key': resource_key,
})['__links__']['cms_resource']['href']
def _real_initialize(self):
webpage = self._download_webpage(
'https://vrv.co/', None, headers=self.geo_verification_headers())
self._API_PARAMS = self._parse_json(self._search_regex(
[
r'window\.__APP_CONFIG__\s*=\s*({.+?})(?:</script>|;)',
r'window\.__APP_CONFIG__\s*=\s*({.+})'
], webpage, 'app config'), None)['cxApiParams']
self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co')
class VRVIE(VRVBaseIE):
IE_NAME = 'vrv'
@ -86,6 +103,22 @@ class VRVIE(VRVBaseIE):
'skip_download': True,
},
}]
_NETRC_MACHINE = 'vrv'
def _real_initialize(self):
super(VRVIE, self)._real_initialize()
email, password = self._get_login_info()
if email is None:
return
token_credentials = self._call_api(
'authenticate/by:credentials', None, 'Token Credentials', data={
'email': email,
'password': password,
})
self._TOKEN = token_credentials['oauth_token']
self._TOKEN_SECRET = token_credentials['oauth_token_secret']
def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang):
if not url or stream_format not in ('hls', 'dash'):
@ -116,28 +149,16 @@ class VRVIE(VRVBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
url, video_id,
headers=self.geo_verification_headers())
media_resource = self._parse_json(self._search_regex(
[
r'window\.__INITIAL_STATE__\s*=\s*({.+?})(?:</script>|;)',
r'window\.__INITIAL_STATE__\s*=\s*({.+})'
], webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {}
video_data = media_resource.get('json')
if not video_data:
self._set_api_params(webpage, video_id)
episode_path = self._get_cms_resource(
'cms:/episodes/' + video_id, video_id)
video_data = self._call_cms(episode_path, video_id, 'video')
episode_path = self._get_cms_resource(
'cms:/episodes/' + video_id, video_id)
video_data = self._call_cms(episode_path, video_id, 'video')
title = video_data['title']
streams_json = media_resource.get('streams', {}).get('json', {})
if not streams_json:
self._set_api_params(webpage, video_id)
streams_path = video_data['__links__']['streams']['href']
streams_json = self._call_cms(streams_path, video_id, 'streams')
streams_path = video_data['__links__'].get('streams', {}).get('href')
if not streams_path:
self.raise_login_required()
streams_json = self._call_cms(streams_path, video_id, 'streams')
audio_locale = streams_json.get('audio_locale')
formats = []
@ -202,11 +223,7 @@ class VRVSeriesIE(VRVBaseIE):
def _real_extract(self, url):
series_id = self._match_id(url)
webpage = self._download_webpage(
url, series_id,
headers=self.geo_verification_headers())
self._set_api_params(webpage, series_id)
seasons_path = self._get_cms_resource(
'cms:/seasons?series_id=' + series_id, series_id)
seasons_data = self._call_cms(seasons_path, series_id, 'seasons')

View File

@ -48,7 +48,7 @@ class VShareIE(InfoExtractor):
webpage = self._download_webpage(
'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
video_id)
video_id, headers={'Referer': url})
title = self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title')

View File

@ -0,0 +1,66 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
merge_dicts,
urljoin,
)
class WakanimIE(InfoExtractor):
_VALID_URL = r'https://(?:www\.)?wakanim\.tv/[^/]+/v2/catalogue/episode/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/2997/the-asterisk-war-omu-staffel-1-episode-02-omu',
'info_dict': {
'id': '2997',
'ext': 'mp4',
'title': 'Episode 02',
'description': 'md5:2927701ea2f7e901de8bfa8d39b2852d',
'series': 'The Asterisk War (OmU.)',
'season_number': 1,
'episode': 'Episode 02',
'episode_number': 2,
},
'params': {
'format': 'bestvideo',
'skip_download': True,
},
}, {
# DRM Protected
'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/7843/sword-art-online-alicization-omu-arc-2-folge-15-omu',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
m3u8_url = urljoin(url, self._search_regex(
r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 url',
group='url'))
# https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls
encryption = self._search_regex(
r'encryption%3D(c(?:enc|bc(?:s-aapl)?))',
m3u8_url, 'encryption', default=None)
if encryption and encryption in ('cenc', 'cbcs-aapl'):
raise ExtractorError('This video is DRM protected.', expected=True)
formats = self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls')
info = self._search_json_ld(webpage, video_id, default={})
title = self._search_regex(
(r'<h1[^>]+\bclass=["\']episode_h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
r'<span[^>]+\bclass=["\']episode_title["\'][^>]*>(?P<title>[^<]+)'),
webpage, 'title', default=None, group='title')
return merge_dicts(info, {
'id': video_id,
'title': title,
'formats': formats,
})

View File

@ -1,7 +1,10 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import urljoin
from ..utils import (
parse_duration,
urljoin,
)
class YourPornIE(InfoExtractor):
@ -14,7 +17,11 @@ class YourPornIE(InfoExtractor):
'ext': 'mp4',
'title': 'md5:c9f43630bd968267672651ba905a7d35',
'thumbnail': r're:^https?://.*\.jpg$',
'age_limit': 18
'duration': 165,
'age_limit': 18,
},
'params': {
'skip_download': True,
},
}
@ -27,17 +34,21 @@ class YourPornIE(InfoExtractor):
self._search_regex(
r'data-vnfo=(["\'])(?P<data>{.+?})\1', webpage, 'data info',
group='data'),
video_id)[video_id]).replace('/cdn/', '/cdn3/')
video_id)[video_id]).replace('/cdn/', '/cdn4/')
title = (self._search_regex(
r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title',
default=None) or self._og_search_description(webpage)).strip()
thumbnail = self._og_search_thumbnail(webpage)
duration = parse_duration(self._search_regex(
r'duration\s*:\s*<[^>]+>([\d:]+)', webpage, 'duration',
default=None))
return {
'id': video_id,
'url': video_url,
'title': title,
'thumbnail': thumbnail,
'age_limit': 18
'duration': duration,
'age_limit': 18,
}

View File

@ -352,6 +352,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?:www\.)?yourepeat\.com/|
tube\.majestyc\.net/|
(?:www\.)?invidio\.us/|
(?:www\.)?invidious\.snopyta\.org/|
(?:www\.)?invidious\.kabi\.tk/|
(?:www\.)?vid\.wxzm\.sx/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
@ -498,7 +501,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
'upload_date': '20121002',
'license': 'Standard YouTube License',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
@ -527,7 +529,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Icona Pop',
'uploader_id': 'IconaPop',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
'license': 'Standard YouTube License',
'creator': 'Icona Pop',
'track': 'I Love It (feat. Charli XCX)',
'artist': 'Icona Pop',
@ -540,14 +541,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': '07FYdnEawAQ',
'ext': 'mp4',
'upload_date': '20130703',
'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',
'alt_title': 'Tunnel Vision',
'description': 'md5:64249768eec3bc4276236606ea996373',
'description': 'md5:07dab3356cde4199048e4c7cd93471e1',
'duration': 419,
'uploader': 'justintimberlakeVEVO',
'uploader_id': 'justintimberlakeVEVO',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
'license': 'Standard YouTube License',
'creator': 'Justin Timberlake',
'track': 'Tunnel Vision',
'artist': 'Justin Timberlake',
@ -566,7 +566,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'SET India',
'uploader_id': 'setindia',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
'license': 'Standard YouTube License',
'age_limit': 18,
}
},
@ -581,7 +580,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'phihag',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
'upload_date': '20121002',
'license': 'Standard YouTube License',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
@ -605,7 +603,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
'description': '',
'uploader': '8KVIDEO',
'license': 'Standard YouTube License',
'title': 'UHDTV TEST 8K VIDEO.mp4'
},
'params': {
@ -620,13 +617,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'info_dict': {
'id': 'IB3lcPjvWLA',
'ext': 'm4a',
'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
'description': 'md5:1900ed86ee514927b9e00fbead6969a5',
'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
'duration': 244,
'uploader': 'AfrojackVEVO',
'uploader_id': 'AfrojackVEVO',
'upload_date': '20131011',
'license': 'Standard YouTube License',
},
'params': {
'youtube_include_dash_manifest': True,
@ -640,13 +636,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'nfWlot6h_JM',
'ext': 'm4a',
'title': 'Taylor Swift - Shake It Off',
'alt_title': 'Shake It Off',
'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
'description': 'md5:bec2185232c05479482cb5a9b82719bf',
'duration': 242,
'uploader': 'TaylorSwiftVEVO',
'uploader_id': 'TaylorSwiftVEVO',
'upload_date': '20140818',
'license': 'Standard YouTube License',
'creator': 'Taylor Swift',
},
'params': {
@ -662,10 +656,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'duration': 219,
'upload_date': '20100909',
'uploader': 'TJ Kirk',
'uploader': 'Amazing Atheist',
'uploader_id': 'TheAmazingAtheist',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
'license': 'Standard YouTube License',
'title': 'Burning Everyone\'s Koran',
'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
}
@ -683,7 +676,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'WitcherGame',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
'upload_date': '20140605',
'license': 'Standard YouTube License',
'age_limit': 18,
},
},
@ -692,7 +684,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
'info_dict': {
'id': '6kLq3WMV1nU',
'ext': 'webm',
'ext': 'mp4',
'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
'duration': 246,
@ -700,7 +692,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'LloydVEVO',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
'upload_date': '20110629',
'license': 'Standard YouTube License',
'age_limit': 18,
},
},
@ -718,7 +709,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'creator': 'deadmau5',
'description': 'md5:12c56784b8032162bb936a5f76d55360',
'uploader': 'deadmau5',
'license': 'Standard YouTube License',
'title': 'Deadmau5 - Some Chords (HD)',
'alt_title': 'Some Chords',
},
@ -736,7 +726,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150827',
'uploader_id': 'olympic',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
'license': 'Standard YouTube License',
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
'uploader': 'Olympic',
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
@ -758,7 +747,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
'uploader': '孫ᄋᄅ',
'license': 'Standard YouTube License',
'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
},
},
@ -792,7 +780,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'dorappi2000',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
'uploader': 'dorappi2000',
'license': 'Standard YouTube License',
'formats': 'mincount:31',
},
'skip': 'not actual anymore',
@ -808,7 +795,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Airtek',
'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
'license': 'Standard YouTube License',
'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
},
'params': {
@ -881,6 +867,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'params': {
'skip_download': True,
},
'skip': 'This video is not available.',
},
{
# Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
@ -917,7 +904,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'IronSoulElf',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
'uploader': 'IronSoulElf',
'license': 'Standard YouTube License',
'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
'track': 'Dark Walk - Position Music',
'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
@ -1021,13 +1007,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'iqKdEhx-dD4',
'ext': 'mp4',
'title': 'Isolation - Mind Field (Ep 1)',
'description': 'md5:25b78d2f64ae81719f5c96319889b736',
'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
'duration': 2085,
'upload_date': '20170118',
'uploader': 'Vsauce',
'uploader_id': 'Vsauce',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
'license': 'Standard YouTube License',
'series': 'Mind Field',
'season_number': 1,
'episode_number': 1,
@ -1053,7 +1038,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'New Century Foundation',
'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
'license': 'Standard YouTube License',
},
'params': {
'skip_download': True,
@ -1081,6 +1065,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# DRM protected
'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
'only_matching': True,
},
{
# Video with unsupported adaptive stream type formats
'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
'info_dict': {
'id': 'Z4Vy8R84T1U',
'ext': 'mp4',
'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'duration': 433,
'upload_date': '20130923',
'uploader': 'Amelia Putri Harwita',
'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
'formats': 'maxcount:10',
},
'params': {
'skip_download': True,
'youtube_include_dash_manifest': False,
},
}
]
@ -1197,8 +1201,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
funcname = self._search_regex(
(r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\(',
r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
jscode, 'Initial JS player signature function name', group='sig')
@ -1545,6 +1549,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if dash_mpd and dash_mpd[0] not in dash_mpds:
dash_mpds.append(dash_mpd[0])
def add_dash_mpd_pr(pl_response):
dash_mpd = url_or_none(try_get(
pl_response, lambda x: x['streamingData']['dashManifestUrl'],
compat_str))
if dash_mpd and dash_mpd not in dash_mpds:
dash_mpds.append(dash_mpd)
is_live = None
view_count = None
@ -1602,6 +1613,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if isinstance(pl_response, dict):
player_response = pl_response
if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
add_dash_mpd_pr(player_response)
# We also try looking in get_video_info since it may contain different dashmpd
# URL that points to a DASH manifest with possibly different itag set (some itags
# are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
@ -1633,6 +1645,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
pl_response = get_video_info.get('player_response', [None])[0]
if isinstance(pl_response, dict):
player_response = pl_response
add_dash_mpd_pr(player_response)
add_dash_mpd(get_video_info)
if view_count is None:
view_count = extract_view_count(get_video_info)
@ -1818,6 +1831,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
url_data = compat_parse_qs(url_data_str)
if 'itag' not in url_data or 'url' not in url_data:
continue
stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
# Unsupported FORMAT_STREAM_TYPE_OTF
if stream_type == 3:
continue
format_id = url_data['itag'][0]
url = url_data['url'][0]

View File

@ -420,3 +420,14 @@ class EinsUndEinsTVIE(ZattooIE):
'url': 'https://www.1und1.tv/watch/abc/123-abc',
'only_matching': True,
}]
class SaltTVIE(ZattooIE):
_NETRC_MACHINE = 'salttv'
_HOST = 'tv.salt.ch'
_VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
_TESTS = [{
'url': 'https://tv.salt.ch/watch/abc/123-abc',
'only_matching': True,
}]

View File

@ -9,9 +9,6 @@ import re
from .common import AudioConversionError, PostProcessor
from ..compat import (
compat_subprocess_get_DEVNULL,
)
from ..utils import (
encodeArgument,
encodeFilename,
@ -165,27 +162,45 @@ class FFmpegPostProcessor(PostProcessor):
return self._paths[self.probe_basename]
def get_audio_codec(self, path):
if not self.probe_available:
raise PostProcessingError('ffprobe or avprobe not found. Please install one.')
if not self.probe_available and not self.available:
raise PostProcessingError('ffprobe/avprobe and ffmpeg/avconv not found. Please install one.')
try:
cmd = [
encodeFilename(self.probe_executable, True),
encodeArgument('-show_streams'),
encodeFilename(self._ffmpeg_filename_argument(path), True)]
if self.probe_available:
cmd = [
encodeFilename(self.probe_executable, True),
encodeArgument('-show_streams')]
else:
cmd = [
encodeFilename(self.executable, True),
encodeArgument('-i')]
cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True))
if self._downloader.params.get('verbose', False):
self._downloader.to_screen('[debug] %s command line: %s' % (self.basename, shell_quote(cmd)))
handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE, stdin=subprocess.PIPE)
output = handle.communicate()[0]
if handle.wait() != 0:
self._downloader.to_screen(
'[debug] %s command line: %s' % (self.basename, shell_quote(cmd)))
handle = subprocess.Popen(
cmd, stderr=subprocess.PIPE,
stdout=subprocess.PIPE, stdin=subprocess.PIPE)
stdout_data, stderr_data = handle.communicate()
expected_ret = 0 if self.probe_available else 1
if handle.wait() != expected_ret:
return None
except (IOError, OSError):
return None
audio_codec = None
for line in output.decode('ascii', 'ignore').split('\n'):
if line.startswith('codec_name='):
audio_codec = line.split('=')[1].strip()
elif line.strip() == 'codec_type=audio' and audio_codec is not None:
return audio_codec
output = (stdout_data if self.probe_available else stderr_data).decode('ascii', 'ignore')
if self.probe_available:
audio_codec = None
for line in output.split('\n'):
if line.startswith('codec_name='):
audio_codec = line.split('=')[1].strip()
elif line.strip() == 'codec_type=audio' and audio_codec is not None:
return audio_codec
else:
# Stream #FILE_INDEX:STREAM_INDEX[STREAM_ID](LANGUAGE): CODEC_TYPE: CODEC_NAME
mobj = re.search(
r'Stream\s*#\d+:\d+(?:\[0x[0-9a-f]+\])?(?:\([a-z]{3}\))?:\s*Audio:\s*([0-9a-z]+)',
output)
if mobj:
return mobj.group(1)
return None
def run_ffmpeg_multiple_files(self, input_paths, out_path, opts):
@ -202,10 +217,13 @@ class FFmpegPostProcessor(PostProcessor):
encodeArgument('-i'),
encodeFilename(self._ffmpeg_filename_argument(path), True)
])
cmd = ([encodeFilename(self.executable, True), encodeArgument('-y')] +
files_cmd +
[encodeArgument(o) for o in opts] +
[encodeFilename(self._ffmpeg_filename_argument(out_path), True)])
cmd = [encodeFilename(self.executable, True), encodeArgument('-y')]
# avconv does not have repeat option
if self.basename == 'ffmpeg':
cmd += [encodeArgument('-loglevel'), encodeArgument('repeat+info')]
cmd += (files_cmd +
[encodeArgument(o) for o in opts] +
[encodeFilename(self._ffmpeg_filename_argument(out_path), True)])
if self._downloader.params.get('verbose', False):
self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd))
@ -392,6 +410,9 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
# Don't copy the existing subtitles, we may be running the
# postprocessor a second time
'-map', '-0:s',
# Don't copy Apple TV chapters track, bin_data (see #19042, #19024,
# https://trac.ffmpeg.org/ticket/6016)
'-map', '-0:d',
]
if information['ext'] == 'mp4':
opts += ['-c:s', 'mov_text']

View File

@ -184,7 +184,7 @@ DATE_FORMATS_MONTH_FIRST.extend([
])
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
def preferredencoding():
@ -1141,6 +1141,8 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
_HTTPONLY_PREFIX = '#HttpOnly_'
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
# Store session cookies with `expires` set to 0 instead of an empty
# string
@ -1150,7 +1152,21 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires)
def load(self, filename=None, ignore_discard=False, ignore_expires=False):
compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires)
"""Load cookies from a file."""
if filename is None:
if self.filename is not None:
filename = self.filename
else:
raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
cf = io.StringIO()
with open(filename) as f:
for line in f:
if line.startswith(self._HTTPONLY_PREFIX):
line = line[len(self._HTTPONLY_PREFIX):]
cf.write(compat_str(line))
cf.seek(0)
self._really_load(cf, filename, ignore_discard, ignore_expires)
# Session cookies are denoted by either `expires` field set to
# an empty string or 0. MozillaCookieJar only recognizes the former
# (see [1]). So we need force the latter to be recognized as session
@ -1868,7 +1884,7 @@ def urljoin(base, path):
path = path.decode('utf-8')
if not isinstance(path, compat_str) or not path:
return None
if re.match(r'^(?:https?:)?//', path):
if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
return path
if isinstance(base, bytes):
base = base.decode('utf-8')

View File

@ -1,3 +1,3 @@
from __future__ import unicode_literals
__version__ = '2019.01.10'
__version__ = '2019.03.01'