diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7ee704e48..c2bd5d8ae 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,12 +6,13 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.09** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections - [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones +- [ ] Checked that provided video/audio/playlist URLs (if any) are alive and playable in a browser ### What is the purpose of your *issue*? - [ ] Bug report (encountered problems with youtube-dl) @@ -35,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.08.09 +[debug] youtube-dl version 2018.05.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE_tmpl.md b/.github/ISSUE_TEMPLATE_tmpl.md index 26f61d3b4..8edbd5a0f 100644 --- a/.github/ISSUE_TEMPLATE_tmpl.md +++ b/.github/ISSUE_TEMPLATE_tmpl.md @@ -12,6 +12,7 @@ ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections - [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones +- [ ] Checked that provided video/audio/playlist URLs (if any) are alive and playable in a browser ### What is the purpose of your *issue*? - [ ] Bug report (encountered problems with youtube-dl) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 46fa26f02..ba4ca7553 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -9,6 +9,7 @@ ### Before submitting a *pull request* make sure you have: - [ ] At least skimmed through [adding new extractor tutorial](https://github.com/rg3/youtube-dl#adding-support-for-a-new-site) and [youtube-dl coding conventions](https://github.com/rg3/youtube-dl#youtube-dl-coding-conventions) sections - [ ] [Searched](https://github.com/rg3/youtube-dl/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests +- [ ] Checked the code with [flake8](https://pypi.python.org/pypi/flake8) ### In order to be accepted and merged into youtube-dl each piece of code must be in public domain or released under [Unlicense](http://unlicense.org/). Check one of the following options: - [ ] I am the original author of this code and I am willing to release it under [Unlicense](http://unlicense.org/) diff --git a/.gitignore b/.gitignore index a5b585f43..fbf7cecb2 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ cover/ updates_key.pem *.egg-info *.srt +*.ttml *.sbv *.vtt *.flv diff --git a/.travis.yml b/.travis.yml index f41e11137..92f326860 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,16 +7,21 @@ python: - "3.4" - "3.5" - "3.6" + - "pypy" + - "pypy3" sudo: false env: - YTDL_TEST_SET=core - YTDL_TEST_SET=download +matrix: + include: + - env: JYTHON=true; YTDL_TEST_SET=core + - env: JYTHON=true; YTDL_TEST_SET=download + fast_finish: true + allow_failures: + - env: YTDL_TEST_SET=download + - env: JYTHON=true; YTDL_TEST_SET=core + - env: JYTHON=true; YTDL_TEST_SET=download +before_install: + - if [ "$JYTHON" == "true" ]; then ./devscripts/install_jython.sh; export PATH="$HOME/jython/bin:$PATH"; fi script: ./devscripts/run_tests.sh -notifications: - email: - - filippo.valsorda@gmail.com - - yasoob.khld@gmail.com -# irc: -# channels: -# - "irc.freenode.org#youtube-dl" -# skip_join: true diff --git a/AUTHORS b/AUTHORS index 053159cc3..eaf96d79d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -223,3 +223,19 @@ Jan Kundrát Giuseppe Fabiano Örn Guðjónsson Parmjit Virk +Genki Sky +Ľuboš Katrinec +Corey Nicholson +Ashutosh Chaudhary +John Dong +Tatsuyuki Ishi +Daniel Weber +Kay Bouché +Yang Hongbo +Lei Wang +Petr Novák +Leonardo Taccari +Martin Weinelt +Surya Oktafendri +TingPing +Alexandre Macabies diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d606eab0e..333acee80 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,7 +3,7 @@ $ youtube-dl -v [debug] System config: [] [debug] User config: [] -[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] +[debug] Command-line args: [u'-v', u'https://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 [debug] youtube-dl version 2015.12.06 [debug] Git HEAD: 135392e @@ -34,7 +34,7 @@ For bug reports, this means that your report should contain the *complete* outpu If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). -**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `http://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `http://www.youtube.com/`) is *not* an example URL. +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL. ### Are you using the latest version? @@ -70,7 +70,7 @@ It may sound strange, but some bug reports we receive are completely unrelated t # DEVELOPER INSTRUCTIONS -Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution. +Most users do not need to build youtube-dl and can [download the builds](https://rg3.github.io/youtube-dl/download.html) or get them from their distribution. To run youtube-dl as a developer, you don't need to build anything either. Simply execute @@ -82,6 +82,8 @@ To run the test, simply invoke your favorite test runner, or execute a test file python test/test_download.py nosetests +See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases. + If you want to create a build of youtube-dl yourself, you'll need * python @@ -118,7 +120,7 @@ After you have ensured this site is distributing its content legally, you can fo class YourExtractorIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' _TEST = { - 'url': 'http://yourextractor.com/watch/42', + 'url': 'https://yourextractor.com/watch/42', 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { 'id': '42', @@ -149,10 +151,10 @@ After you have ensured this site is distributing its content legally, you can fo } ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py diff --git a/ChangeLog b/ChangeLog index b28ade446..916b8edb8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,943 @@ +version 2018.05.01 + +Core +* [downloader/fragment] Restart download if .ytdl file is corrupt (#16312) ++ [extractor/common] Extract interaction statistic ++ [utils] Add merge_dicts ++ [extractor/common] Add _download_json_handle + +Extractors +* [kaltura] Improve iframe embeds detection (#16337) ++ [udemy] Extract outputs renditions (#16289, #16291, #16320, #16321, #16334, + #16335) ++ [zattoo] Add support for zattoo.com and mobiltv.quickline.com (#14668, #14676) +* [yandexmusic] Convert release_year to int +* [udemy] Override _download_webpage_handle instead of _download_webpage +* [xiami] Override _download_webpage_handle instead of _download_webpage +* [yandexmusic] Override _download_webpage_handle instead of _download_webpage +* [youtube] Correctly disable polymer on all requests (#16323, #16326) +* [generic] Prefer enclosures over links in RSS feeds (#16189) ++ [redditr] Add support for old.reddit.com URLs (#16274) +* [nrktv] Update API host (#16324) ++ [imdb] Extract all formats (#16249) ++ [vimeo] Extract JSON-LD (#16295) +* [funk:channel] Improve extraction (#16285) + + +version 2018.04.25 + +Core +* [utils] Fix match_str for boolean meta fields ++ [Makefile] Add support for pandoc 2 and disable smart extension (#16251) +* [YoutubeDL] Fix typo in media extension compatibility checker (#16215) + +Extractors ++ [openload] Recognize IPv6 stream URLs (#16136, #16137, #16205, #16246, + #16250) ++ [twitch] Extract is_live according to status (#16259) +* [pornflip] Relax URL regular expression (#16258) +- [etonline] Remove extractor (#16256) +* [breakcom] Fix extraction (#16254) ++ [youtube] Add ability to authenticate with cookies +* [youtube:feed] Implement lazy playlist extraction (#10184) ++ [svt] Add support for TV channel live streams (#15279, #15809) +* [ccma] Fix video extraction (#15931) +* [rentv] Fix extraction (#15227) ++ [nick] Add support for nickjr.nl (#16230) +* [extremetube] Fix metadata extraction ++ [keezmovies] Add support for generic embeds (#16134, #16154) +* [nexx] Extract new azure URLs (#16223) +* [cbssports] Fix extraction (#16217) +* [kaltura] Improve embeds detection (#16201) +* [instagram:user] Fix extraction (#16119) +* [cbs] Skip DRM asset types (#16104) + + +version 2018.04.16 + +Extractors +* [smotri:broadcast] Fix extraction (#16180) ++ [picarto] Add support for picarto.tv (#6205, #12514, #15276, #15551) +* [vine:user] Fix extraction (#15514, #16190) +* [pornhub] Relax URL regular expression (#16165) +* [cbc:watch] Re-acquire device token when expired (#16160) ++ [fxnetworks] Add support for https theplatform URLs (#16125, #16157) ++ [instagram:user] Add request signing (#16119) ++ [twitch] Add support for mobile URLs (#16146) + + +version 2018.04.09 + +Core +* [YoutubeDL] Do not save/restore console title while simulate (#16103) +* [extractor/common] Relax JSON-LD context check (#16006) + +Extractors ++ [generic] Add support for tube8 embeds ++ [generic] Add support for share-videos.se embeds (#16089, #16115) +* [odnoklassniki] Extend URL regular expression (#16081) +* [steam] Bypass mature content check (#16113) ++ [acast] Extract more metadata +* [acast] Fix extraction (#16118) +* [instagram:user] Fix extraction (#16119) +* [drtuber] Fix title extraction (#16107, #16108) +* [liveleak] Extend URL regular expression (#16117) ++ [openload] Add support for oload.xyz +* [openload] Relax stream URL regular expression +* [openload] Fix extraction (#16099) ++ [svtplay:series] Add support for season URLs ++ [svtplay:series] Add support for series (#11130, #16059) + + +version 2018.04.03 + +Extractors ++ [tvnow] Add support for shows (#15837) +* [dramafever] Fix authentication (#16067) +* [afreecatv] Use partial view only when necessary (#14450) ++ [afreecatv] Add support for authentication (#14450) ++ [nationalgeographic] Add support for new URL schema (#16001, #16054) +* [xvideos] Fix thumbnail extraction (#15978, #15979) +* [medialaan] Fix vod id (#16038) ++ [openload] Add support for oload.site (#16039) +* [naver] Fix extraction (#16029) +* [dramafever] Partially switch to API v5 (#16026) +* [abc:iview] Unescape title and series meta fields (#15994) +* [videa] Extend URL regular expression (#16003) + + +version 2018.03.26.1 + +Core ++ [downloader/external] Add elapsed time to progress hook (#10876) +* [downloader/external,fragment] Fix download finalization when writing file + to stdout (#10809, #10876, #15799) + +Extractors +* [vrv] Fix extraction on python2 (#15928) +* [afreecatv] Update referrer (#15947) ++ [24video] Add support for 24video.sexy (#15973) +* [crackle] Bypass geo restriction +* [crackle] Fix extraction (#15969) ++ [lenta] Add support for lenta.ru (#15953) ++ [instagram:user] Add pagination (#15934) +* [youku] Update ccode (#15939) +* [libsyn] Adapt to new page structure + + +version 2018.03.20 + +Core +* [extractor/common] Improve thumbnail extraction for HTML5 entries +* Generalize XML manifest processing code and improve XSPF parsing ++ [extractor/common] Add _download_xml_handle ++ [extractor/common] Add support for relative URIs in _parse_xspf (#15794) + +Extractors ++ [7plus] Extract series metadata (#15862, #15906) +* [9now] Bypass geo restriction (#15920) +* [cbs] Skip unavailable assets (#13490, #13506, #15776) ++ [canalc2] Add support for HTML5 videos (#15916, #15919) ++ [ceskatelevize] Add support for iframe embeds (#15918) ++ [prosiebensat1] Add support for galileo.tv (#15894) ++ [generic] Add support for xfileshare embeds (#15879) +* [bilibili] Switch to v2 playurl API +* [bilibili] Fix and improve extraction (#15048, #15430, #15622, #15863) +* [heise] Improve extraction (#15496, #15784, #15026) +* [instagram] Fix user videos extraction (#15858) + + +version 2018.03.14 + +Extractors +* [soundcloud] Update client id (#15866) ++ [tennistv] Add support for tennistv.com ++ [line] Add support for tv.line.me (#9427) +* [xnxx] Fix extraction (#15817) +* [njpwworld] Fix authentication (#15815) + + +version 2018.03.10 + +Core +* [downloader/hls] Skip uplynk ad fragments (#15748) + +Extractors +* [pornhub] Don't override session cookies (#15697) ++ [raywenderlich] Add support for videos.raywenderlich.com (#15251) +* [funk] Fix extraction and rework extractors (#15792) +* [nexx] Restore reverse engineered approach ++ [heise] Add support for kaltura embeds (#14961, #15728) ++ [tvnow] Extract series metadata (#15774) +* [ruutu] Continue formats extraction on NOT-USED URLs (#15775) +* [vrtnu] Use redirect URL for building video JSON URL (#15767, #15769) +* [vimeo] Modernize login code and improve error messaging +* [archiveorg] Fix extraction (#15770, #15772) ++ [hidive] Add support for hidive.com (#15494) +* [afreecatv] Detect deleted videos +* [afreecatv] Fix extraction (#15755) +* [vice] Fix extraction and rework extractors (#11101, #13019, #13622, #13778) ++ [vidzi] Add support for vidzi.si (#15751) +* [npo] Fix typo + + +version 2018.03.03 + +Core ++ [utils] Add parse_resolution +Revert respect --prefer-insecure while updating + +Extractors ++ [yapfiles] Add support for yapfiles.ru (#15726, #11085) +* [spankbang] Fix formats extraction (#15727) +* [adn] Fix extraction (#15716) ++ [toggle] Extract DASH and ISM formats (#15721) ++ [nickelodeon] Add support for nickelodeon.com.tr (#15706) +* [npo] Validate and filter format URLs (#15709) + + +version 2018.02.26 + +Extractors +* [udemy] Use custom User-Agent (#15571) + + +version 2018.02.25 + +Core +* [postprocessor/embedthumbnail] Skip embedding when there aren't any + thumbnails (#12573) +* [extractor/common] Improve jwplayer subtitles extraction (#15695) + +Extractors ++ [vidlii] Add support for vidlii.com (#14472, #14512, #14779) ++ [streamango] Capture and output error messages +* [streamango] Fix extraction (#14160, #14256) ++ [telequebec] Add support for emissions (#14649, #14655) ++ [telequebec:live] Add support for live streams (#15688) ++ [mailru:music] Add support for mail.ru/music (#15618) +* [aenetworks] Switch to akamai HLS formats (#15612) +* [ytsearch] Fix flat title extraction (#11260, #15681) + + +version 2018.02.22 + +Core ++ [utils] Fixup some common URL typos in sanitize_url (#15649) +* Respect --prefer-insecure while updating (#15497) + +Extractors +* [vidio] Fix HLS URL extraction (#15675) ++ [nexx] Add support for arc.nexx.cloud URLs +* [nexx] Switch to arc API (#15652) +* [redtube] Fix duration extraction (#15659) ++ [sonyliv] Respect referrer (#15648) ++ [brightcove:new] Use referrer for formats' HTTP headers ++ [cbc] Add support for olympics.cbc.ca (#15535) ++ [fusion] Add support for fusion.tv (#15628) +* [npo] Improve quality metadata extraction +* [npo] Relax URL regular expression (#14987, #14994) ++ [npo] Capture and output error message ++ [pornhub] Add support for channels (#15613) +* [youtube] Handle shared URLs with generic extractor (#14303) + + +version 2018.02.11 + +Core ++ [YoutubeDL] Add support for filesize_approx in format selector (#15550) + +Extractors ++ [francetv] Add support for live streams (#13689) ++ [francetv] Add support for zouzous.fr and ludo.fr (#10454, #13087, #13103, + #15012) +* [francetv] Separate main extractor and rework others to delegate to it +* [francetv] Improve manifest URL signing (#15536) ++ [francetv] Sign m3u8 manifest URLs (#15565) ++ [veoh] Add support for embed URLs (#15561) +* [afreecatv] Fix extraction (#15556) +* [periscope] Use accessVideoPublic endpoint (#15554) +* [discovery] Fix auth request (#15542) ++ [6play] Extract subtitles (#15541) +* [newgrounds] Fix metadata extraction (#15531) ++ [nbc] Add support for stream.nbcolympics.com (#10295) +* [dvtv] Fix live streams extraction (#15442) + + +version 2018.02.08 + +Extractors ++ [myvi] Extend URL regular expression ++ [myvi:embed] Add support for myvi.tv embeds (#15521) ++ [prosiebensat1] Extend URL regular expression (#15520) +* [pokemon] Relax URL regular expression and extend title extraction (#15518) ++ [gameinformer] Use geo verification headers +* [la7] Fix extraction (#15501, #15502) +* [gameinformer] Fix brightcove id extraction (#15416) ++ [afreecatv] Pass referrer to video info request (#15507) ++ [telebruxelles] Add support for live streams +* [telebruxelles] Relax URL regular expression +* [telebruxelles] Fix extraction (#15504) +* [extractor/common] Respect secure schemes in _extract_wowza_formats + + +version 2018.02.04 + +Core +* [downloader/http] Randomize HTTP chunk size ++ [downloader/http] Add ability to pass downloader options via info dict +* [downloader/http] Fix 302 infinite loops by not reusing requests ++ Document http_chunk_size + +Extractors ++ [brightcove] Pass embed page URL as referrer (#15486) ++ [youtube] Enforce using chunked HTTP downloading for DASH formats + + +version 2018.02.03 + +Core ++ Introduce --http-chunk-size for chunk-based HTTP downloading ++ Add support for IronPython +* [downloader/ism] Fix Python 3.2 support + +Extractors +* [redbulltv] Fix extraction (#15481) +* [redtube] Fix metadata extraction (#15472) +* [pladform] Respect platform id and extract HLS formats (#15468) +- [rtlnl] Remove progressive formats (#15459) +* [6play] Do no modify asset URLs with a token (#15248) +* [nationalgeographic] Relax URL regular expression +* [dplay] Relax URL regular expression (#15458) +* [cbsinteractive] Fix data extraction (#15451) ++ [amcnetworks] Add support for sundancetv.com (#9260) + + +version 2018.01.27 + +Core +* [extractor/common] Improve _json_ld for articles +* Switch codebase to use compat_b64decode ++ [compat] Add compat_b64decode + +Extractors ++ [seznamzpravy] Add support for seznam.cz and seznamzpravy.cz (#14102, #14616) +* [dplay] Bypass geo restriction ++ [dplay] Add support for disco-api videos (#15396) +* [youtube] Extract precise error messages (#15284) +* [teachertube] Capture and output error message +* [teachertube] Fix and relax thumbnail extraction (#15403) ++ [prosiebensat1] Add another clip id regular expression (#15378) +* [tbs] Update tokenizer url (#15395) +* [mixcloud] Use compat_b64decode (#15394) +- [thesixtyone] Remove extractor (#15341) + + +version 2018.01.21 + +Core +* [extractor/common] Improve jwplayer DASH formats extraction (#9242, #15187) +* [utils] Improve scientific notation handling in js_to_json (#14789) + +Extractors ++ [southparkdk] Add support for southparkstudios.nu ++ [southpark] Add support for collections (#14803) +* [franceinter] Fix upload date extraction (#14996) ++ [rtvs] Add support for rtvs.sk (#9242, #15187) +* [restudy] Fix extraction and extend URL regular expression (#15347) +* [youtube:live] Improve live detection (#15365) ++ [springboardplatform] Add support for springboardplatform.com +* [prosiebensat1] Add another clip id regular expression (#15290) +- [ringtv] Remove extractor (#15345) + + +version 2018.01.18 + +Extractors +* [soundcloud] Update client id (#15306) +- [kamcord] Remove extractor (#15322) ++ [spiegel] Add support for nexx videos (#15285) +* [twitch] Fix authentication and error capture (#14090, #15264) +* [vk] Detect more errors due to copyright complaints (#15259) + + +version 2018.01.14 + +Extractors +* [youtube] Fix live streams extraction (#15202) +* [wdr] Bypass geo restriction +* [wdr] Rework extractors (#14598) ++ [wdr] Add support for wdrmaus.de/elefantenseite (#14598) ++ [gamestar] Add support for gamepro.de (#3384) +* [viafree] Skip rtmp formats (#15232) ++ [pandoratv] Add support for mobile URLs (#12441) ++ [pandoratv] Add support for new URL format (#15131) ++ [ximalaya] Add support for ximalaya.com (#14687) ++ [digg] Add support for digg.com (#15214) +* [limelight] Tolerate empty pc formats (#15150, #15151, #15207) +* [ndr:embed:base] Make separate formats extraction non fatal (#15203) ++ [weibo] Add extractor (#15079) ++ [ok] Add support for live streams +* [canalplus] Fix extraction (#15072) +* [bilibili] Fix extraction (#15188) + + +version 2018.01.07 + +Core +* [utils] Fix youtube-dl under PyPy3 on Windows +* [YoutubeDL] Output python implementation in debug header + +Extractors ++ [jwplatform] Add support for multiple embeds (#15192) +* [mitele] Fix extraction (#15186) ++ [motherless] Add support for groups (#15124) +* [lynda] Relax URL regular expression (#15185) +* [soundcloud] Fallback to avatar picture for thumbnail (#12878) +* [youku] Fix list extraction (#15135) +* [openload] Fix extraction (#15166) +* [lynda] Skip invalid subtitles (#15159) +* [twitch] Pass video id to url_result when extracting playlist (#15139) +* [rtve.es:alacarta] Fix extraction of some new URLs +* [acast] Fix extraction (#15147) + + +version 2017.12.31 + +Core ++ [extractor/common] Add container meta field for formats extracted + in _parse_mpd_formats (#13616) ++ [downloader/hls] Use HTTP headers for key request +* [common] Use AACL as the default fourcc when AudioTag is 255 +* [extractor/common] Fix extraction of DASH formats with the same + representation id (#15111) + +Extractors ++ [slutload] Add support for mobile URLs (#14806) +* [abc:iview] Bypass geo restriction +* [abc:iview] Fix extraction (#14711, #14782, #14838, #14917, #14963, #14985, + #15035, #15057, #15061, #15071, #15095, #15106) +* [openload] Fix extraction (#15118) +- [sandia] Remove extractor +- [collegerama] Remove extractor ++ [mediasite] Add support for sites based on Mediasite Video Platform (#5428, + #11185, #14343) ++ [ufctv] Add support for ufc.tv (#14520) +* [pluralsight] Fix missing first line of subtitles (#11118) +* [openload] Fallback on f-page extraction (#14665, #14879) +* [vimeo] Improve password protected videos extraction (#15114) +* [aws] Fix canonical/signed headers generation on python 2 (#15102) + + +version 2017.12.28 + +Extractors ++ [internazionale] Add support for internazionale.it (#14973) +* [playtvak] Relax video regular expression and make description optional + (#15037) ++ [filmweb] Add support for filmweb.no (#8773, #10368) ++ [23video] Add support for 23video.com ++ [espn] Add support for fivethirtyeight.com (#6864) ++ [umg:de] Add support for universal-music.de (#11582, #11584) ++ [espn] Add support for espnfc and extract more formats (#8053) +* [youku] Update ccode (#14880) ++ [openload] Add support for oload.stream (#15070) +* [youku] Fix list extraction (#15065) + + +version 2017.12.23 + +Core +* [extractor/common] Move X-Forwarded-For setup code into _request_webpage ++ [YoutubeDL] Add support for playlist_uploader and playlist_uploader_id in + output template (#11427, #15018) ++ [extractor/common] Introduce uploader, uploader_id and uploader_url + meta fields for playlists (#11427, #15018) +* [downloader/fragment] Encode filename of fragment being removed (#15020) ++ [utils] Add another date format pattern (#14999) + +Extractors ++ [kaltura] Add another embed pattern for entry_id ++ [7plus] Add support for 7plus.com.au (#15043) +* [animeondemand] Relax login error regular expression ++ [shahid] Add support for show pages (#7401) ++ [youtube] Extract uploader, uploader_id and uploader_url for playlists + (#11427, #15018) +* [afreecatv] Improve format extraction (#15019) ++ [cspan] Add support for audio only pages and catch page errors (#14995) ++ [mailru] Add support for embed URLs (#14904) +* [crunchyroll] Future-proof XML element checks (#15013) +* [cbslocal] Fix timestamp extraction (#14999, #15000) +* [discoverygo] Correct TTML subtitle extension +* [vk] Make view count optional (#14979) +* [disney] Skip Apple FairPlay formats (#14982) +* [voot] Fix format extraction (#14758) + + +version 2017.12.14 + +Core +* [postprocessor/xattr] Clarify NO_SPACE message (#14970) +* [downloader/http] Return actual download result from real_download (#14971) + +Extractors ++ [itv] Extract more subtitles and duration +* [itv] Improve extraction (#14944) ++ [byutv] Add support for geo restricted videos +* [byutv] Fix extraction (#14966, #14967) ++ [bbccouk] Fix extraction for 320k HLS streams ++ [toutv] Add support for special video URLs (#14179) +* [discovery] Fix free videos extraction (#14157, #14954) +* [tvnow] Fix extraction (#7831) ++ [nickelodeon:br] Add support for nickelodeon brazil websites (#14893) +* [nick] Improve extraction (#14876) +* [tbs] Fix extraction (#13658) + + +version 2017.12.10 + +Core ++ [utils] Add sami mimetype to mimetype2ext + +Extractors +* [culturebox] Improve video id extraction (#14947) +* [twitter] Improve extraction (#14197) ++ [udemy] Extract more HLS formats +* [udemy] Improve course id extraction (#14938) ++ [stretchinternet] Add support for portal.stretchinternet.com (#14576) +* [ellentube] Fix extraction (#14407, #14570) ++ [raiplay:playlist] Add support for playlists (#14563) +* [sonyliv] Bypass geo restriction +* [sonyliv] Extract higher quality formats (#14922) +* [fox] Extract subtitles ++ [fox] Add support for Adobe Pass authentication (#14205, #14489) +- [dailymotion:cloud] Remove extractor (#6794) +* [xhamster] Fix thumbnail extraction (#14780) ++ [xhamster] Add support for mobile URLs (#14780) +* [generic] Don't pass video id as mpd id while extracting DASH (#14902) +* [ard] Skip invalid stream URLs (#14906) +* [porncom] Fix metadata extraction (#14911) +* [pluralsight] Detect agreement request (#14913) +* [toutv] Fix login (#14614) + + +version 2017.12.02 + +Core ++ [downloader/fragment] Commit part file after each fragment ++ [extractor/common] Add durations for DASH fragments with bare SegmentURLs ++ [extractor/common] Add support for DASH manifests with SegmentLists with + bare SegmentURLs (#14844) ++ [utils] Add hvc1 codec code to parse_codecs + +Extractors +* [xhamster] Fix extraction (#14884) +* [youku] Update ccode (#14872) +* [mnet] Fix format extraction (#14883) ++ [xiami] Add Referer header to API request +* [mtv] Correct scc extention in extracted subtitles (#13730) +* [vvvvid] Fix extraction for kenc videos (#13406) ++ [br] Add support for BR Mediathek videos (#14560, #14788) ++ [daisuki] Add support for motto.daisuki.com (#14681) +* [odnoklassniki] Fix API metadata request (#14862) +* [itv] Fix HLS formats extraction ++ [pbs] Add another media id regular expression + + +version 2017.11.26 + +Core +* [extractor/common] Use final URL when dumping request (#14769) + +Extractors +* [fczenit] Fix extraction +- [firstpost] Remove extractor +* [freespeech] Fix extraction +* [nexx] Extract more formats ++ [openload] Add support for openload.link (#14763) +* [empflix] Relax URL regular expression +* [empflix] Fix extractrion +* [tnaflix] Don't modify download URLs (#14811) +- [gamersyde] Remove extractor +* [francetv:generationwhat] Fix extraction ++ [massengeschmacktv] Add support for Massengeschmack TV +* [fox9] Fix extraction +* [faz] Fix extraction and add support for Perform Group embeds (#14714) ++ [performgroup] Add support for performgroup.com ++ [jwplatform] Add support for iframes (#14828) +* [culturebox] Fix extraction (#14827) +* [youku] Fix extraction; update ccode (#14815) +* [livestream] Make SMIL extraction non fatal (#14792) ++ [drtuber] Add support for mobile URLs (#14772) ++ [spankbang] Add support for mobile URLs (#14771) +* [instagram] Fix description, timestamp and counters extraction (#14755) + + +version 2017.11.15 + +Core +* [common] Skip Apple FairPlay m3u8 manifests (#14741) +* [YoutubeDL] Fix playlist range optimization for --playlist-items (#14740) + +Extractors +* [vshare] Capture and output error message +* [vshare] Fix extraction (#14473) +* [crunchyroll] Extract old RTMP formats +* [tva] Fix extraction (#14736) +* [gamespot] Lower preference of HTTP formats (#14652) +* [instagram:user] Fix extraction (#14699) +* [ccma] Fix typo (#14730) +- Remove sensitive data from logging in messages +* [instagram:user] Fix extraction (#14699) ++ [gamespot] Add support for article URLs (#14652) +* [gamespot] Skip Brightcove Once HTTP formats (#14652) +* [cartoonnetwork] Update tokenizer_src (#14666) ++ [wsj] Recognize another URL pattern (#14704) +* [pandatv] Update API URL and sign format URLs (#14693) +* [crunchyroll] Use old login method (#11572) + + +version 2017.11.06 + +Core ++ [extractor/common] Add protocol for f4m formats +* [f4m] Prefer baseURL for relative URLs (#14660) +* [extractor/common] Respect URL query in _extract_wowza_formats (14645) + +Extractors ++ [hotstar:playlist] Add support for playlists (#12465) +* [hotstar] Bypass geo restriction (#14672) +- [22tracks] Remove extractor (#11024, #14628) ++ [skysport] Sdd support ooyala videos protected with embed_token (#14641) +* [gamespot] Extract formats referenced with new data fields (#14652) +* [spankbang] Detect unavailable videos (#14644) + + +version 2017.10.29 + +Core +* [extractor/common] Prefix format id for audio only HLS formats ++ [utils] Add support for zero years and months in parse_duration + +Extractors +* [egghead] Fix extraction (#14388) ++ [fxnetworks] Extract series metadata (#14603) ++ [younow] Add support for younow.com (#9255, #9432, #12436) +* [dctptv] Fix extraction (#14599) +* [youtube] Restrict embed regular expression (#14600) +* [vimeo] Restrict iframe embed regular expression (#14600) +* [soundgasm] Improve extraction (#14588) +- [myvideo] Remove extractor (#8557) ++ [nbc] Add support for classic-tv videos (#14575) ++ [vrtnu] Add support for cookies authentication and simplify (#11873) ++ [canvas] Add support for vrt.be/vrtnu (#11873) +* [twitch:clips] Fix title extraction (#14566) ++ [ndtv] Add support for sub-sites (#14534) +* [dramafever] Fix login error message extraction ++ [nick] Add support for more nickelodeon sites (no, dk, se, ch, fr, es, pt, + ro, hu) (#14553) + + +version 2017.10.20 + +Core +* [downloader/fragment] Report warning instead of error on inconsistent + download state +* [downloader/hls] Fix total fragments count when ad fragments exist + +Extractors +* [parliamentliveuk] Fix extraction (#14524) +* [soundcloud] Update client id (#14546) ++ [servus] Add support for servus.com (#14362) ++ [unity] Add support for unity3d.com (#14528) +* [youtube] Replace youtube redirect URLs in description (#14517) +* [pbs] Restrict direct video URL regular expression (#14519) +* [drtv] Respect preference for direct HTTP formats (#14509) ++ [eporner] Add support for embed URLs (#14507) +* [arte] Capture and output error message +* [niconico] Improve uploader metadata extraction robustness (#14135) + + +version 2017.10.15.1 + +Core +* [downloader/hls] Ignore anvato ad fragments (#14496) +* [downloader/fragment] Output ad fragment count + +Extractors +* [scrippsnetworks:watch] Bypass geo restriction ++ [anvato] Add ability to bypass geo restriction +* [redditr] Fix extraction for URLs with query (#14495) + + +version 2017.10.15 + +Core ++ [common] Add support for jwplayer youtube embeds + +Extractors +* [scrippsnetworks:watch] Fix extraction (#14389) +* [anvato] Process master m3u8 manifests +* [youtube] Fix relative URLs in description +* [spike] Bypass geo restriction ++ [howstuffworks] Add support for more domains +* [infoq] Fix http format downloading ++ [rtlnl] Add support for another type of embeds ++ [onionstudios] Add support for bulbs-video embeds +* [udn] Fix extraction +* [shahid] Fix extraction (#14448) +* [kaltura] Ignore Widevine encrypted video (.wvm) (#14471) +* [vh1] Fix extraction (#9613) + + +version 2017.10.12 + +Core +* [YoutubeDL] Improve _default_format_spec (#14461) + +Extractors +* [steam] Fix extraction (#14067) ++ [funk] Add support for funk.net (#14464) ++ [nexx] Add support for shortcuts and relax domain id extraction ++ [voxmedia] Add support for recode.net (#14173) ++ [once] Add support for vmap URLs ++ [generic] Add support for channel9 embeds (#14469) +* [tva] Fix extraction (#14328) ++ [tubitv] Add support for new URL format (#14460) +- [afreecatv:global] Remove extractor +- [youtube:shared] Removed extractor (#14420) ++ [slideslive] Add support for slideslive.com (#2680) ++ [facebook] Support thumbnails (#14416) +* [vvvvid] Fix episode number extraction (#14456) +* [hrti:playlist] Relax URL regular expression +* [wdr] Relax media link regular expression (#14447) +* [hrti] Relax URL regular expression (#14443) +* [fox] Delegate extraction to uplynk:preplay (#14147) ++ [youtube] Add support for hooktube.com (#14437) + + +version 2017.10.07 + +Core +* [YoutubeDL] Ignore duplicates in --playlist-items +* [YoutubeDL] Fix out of range --playlist-items for iterable playlists and + reduce code duplication (#14425) ++ [utils] Use cache in OnDemandPagedList by default +* [postprocessor/ffmpeg] Convert to opus using libopus (#14381) + +Extractors +* [reddit] Sort formats (#14430) +* [lnkgo] Relax URL regular expression (#14423) +* [pornflip] Extend URL regular expression (#14405, #14406) ++ [xtube] Add support for embed URLs (#14417) ++ [xvideos] Add support for embed URLs and improve extraction (#14409) +* [beeg] Fix extraction (#14403) +* [tvn24] Relax URL regular expression (#14395) +* [nbc] Fix extraction (#13651, #13715, #14137, #14198, #14312, #14314, #14378, + #14392, #14414, #14419, #14431) ++ [ketnet] Add support for videos without direct sources (#14377) +* [canvas] Generalize mediazone.vrt.be extractor and rework canvas and een ++ [afreecatv] Add support for adult videos (#14376) + + +version 2017.10.01 + +Core +* [YoutubeDL] Document youtube_include_dash_manifest + +Extractors ++ [tvp] Add support for new URL schema (#14368) ++ [generic] Add support for single format Video.js embeds (#14371) +* [yahoo] Bypass geo restriction for brightcove (#14210) +* [yahoo] Use extracted brightcove account id (#14210) +* [rtve:alacarta] Fix extraction (#14290) ++ [yahoo] Add support for custom brigthcove embeds (#14210) ++ [generic] Add support for Video.js embeds ++ [gfycat] Add support for /gifs/detail URLs (#14322) +* [generic] Fix infinite recursion for twitter:player URLs (#14339) +* [xhamsterembed] Fix extraction (#14308) + + +version 2017.09.24 + +Core ++ [options] Accept lrc as a subtitle conversion target format (#14292) +* [utils] Fix handling raw TTML subtitles (#14191) + +Extractors +* [24video] Fix timestamp extraction and make non fatal (#14295) ++ [24video] Add support for 24video.adult (#14295) ++ [kakao] Add support for tv.kakao.com (#12298, #14007) ++ [twitter] Add support for URLs without user id (#14270) ++ [americastestkitchen] Add support for americastestkitchen.com (#10764, + #13996) +* [generic] Fix support for multiple HTML5 videos on one page (#14080) +* [mixcloud] Fix extraction (#14088, #14132) ++ [lynda] Add support for educourse.ga (#14286) +* [beeg] Fix extraction (#14275) +* [nbcsports:vplayer] Correct theplatform URL (#13873) +* [twitter] Fix duration extraction (#14141) +* [tvplay] Bypass geo restriction ++ [heise] Add support for YouTube embeds (#14109) ++ [popcorntv] Add support for popcorntv.it (#5914, #14211) +* [viki] Update app data (#14181) +* [morningstar] Relax URL regular expression (#14222) +* [openload] Fix extraction (#14225, #14257) +* [noovo] Fix extraction (#14214) +* [dailymotion:playlist] Relax URL regular expression (#14219) ++ [twitch] Add support for go.twitch.tv URLs (#14215) +* [vgtv] Relax URL regular expression (#14223) + + +version 2017.09.15 + +Core +* [downloader/fragment] Restart inconsistent incomplete fragment downloads + (#13731) +* [YoutubeDL] Download raw subtitles files (#12909, #14191) + +Extractors +* [condenast] Fix extraction (#14196, #14207) ++ [orf] Add support for f4m stories +* [tv4] Relax URL regular expression (#14206) +* [animeondemand] Bypass geo restriction ++ [animeondemand] Add support for flash videos (#9944) + + +version 2017.09.11 + +Extractors +* [rutube:playlist] Fix suitable (#14166) + + +version 2017.09.10 + +Core ++ [utils] Introduce bool_or_none +* [YoutubeDL] Ensure dir existence for each requested format (#14116) + +Extractors +* [fox] Fix extraction (#14147) +* [rutube] Use bool_or_none +* [rutube] Rework and generalize playlist extractors (#13565) ++ [rutube:playlist] Add support for playlists (#13534, #13565) ++ [radiocanada] Add fallback for title extraction (#14145) +* [vk] Use dedicated YouTube embeds extraction routine +* [vice] Use dedicated YouTube embeds extraction routine +* [cracked] Use dedicated YouTube embeds extraction routine +* [chilloutzone] Use dedicated YouTube embeds extraction routine +* [abcnews] Use dedicated YouTube embeds extraction routine +* [youtube] Separate methods for embeds extraction +* [redtube] Fix formats extraction (#14122) +* [arte] Relax unavailability check (#14112) ++ [manyvids] Add support for preview videos from manyvids.com (#14053, #14059) +* [vidme:user] Relax URL regular expression (#14054) +* [bpb] Fix extraction (#14043, #14086) +* [soundcloud] Fix download URL with private tracks (#14093) +* [aliexpress:live] Add support for live.aliexpress.com (#13698, #13707) +* [viidea] Capture and output lecture error message (#14099) +* [radiocanada] Skip unsupported platforms (#14100) + + +version 2017.09.02 + +Extractors +* [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076, + #14077, #14079, #14082, #14083, #14094, #14095, #14096) +* [youtube] Fix upload date extraction (#14065) ++ [charlierose] Add support for episodes (#14062) ++ [bbccouk] Add support for w-prefixed ids (#14056) +* [googledrive] Extend URL regular expression (#9785) ++ [googledrive] Add support for source format (#14046) +* [pornhd] Fix extraction (#14005) + + +version 2017.08.27.1 + +Extractors + +* [youtube] Fix extraction with --youtube-skip-dash-manifest enabled (#14037) + + +version 2017.08.27 + +Core ++ [extractor/common] Extract height and format id for HTML5 videos (#14034) +* [downloader/http] Rework HTTP downloader (#506, #809, #2849, #4240, #6023, + #8625, #9483) + * Simplify code and split into separate routines to facilitate maintaining + * Make retry mechanism work on errors during actual download not only + during connection establishment phase + * Retry on ECONNRESET and ETIMEDOUT during reading data from network + * Retry on content too short + * Show error description on retry + +Extractors +* [generic] Lower preference for extraction from LD-JSON +* [rai] Fix audio formats extraction (#14024) +* [youtube] Fix controversy videos extraction (#14027, #14029) +* [mixcloud] Fix extraction (#14015, #14020) + + +version 2017.08.23 + +Core ++ [extractor/common] Introduce _parse_xml +* [extractor/common] Make HLS and DASH extraction in_parse_html5_media_entries + non fatal (#13970) +* [utils] Fix unescapeHTML for misformed string like "&a"" (#13935) + +Extractors +* [cbc:watch] Bypass geo restriction (#13993) +* [toutv] Relax DRM check (#13994) ++ [googledrive] Add support for subtitles (#13619, #13638) +* [pornhub] Relax uploader regular expression (#13906, #13975) +* [bandcamp:album] Extract track titles (#13962) ++ [bbccouk] Add support for events URLs (#13893) ++ [liveleak] Support multi-video pages (#6542) ++ [liveleak] Support another liveleak embedding pattern (#13336) +* [cda] Fix extraction (#13935) ++ [laola1tv] Add support for tv.ittf.com (#13965) +* [mixcloud] Fix extraction (#13958, #13974, #13980, #14003) + + +version 2017.08.18 + +Core +* [YoutubeDL] Sanitize byte string format URLs (#13951) ++ [extractor/common] Add support for float durations in _parse_mpd_formats + (#13919) + +Extractors +* [arte] Detect unavailable videos (#13945) +* [generic] Convert redirect URLs to unicode strings (#13951) +* [udemy] Fix paid course detection (#13943) +* [pluralsight] Use RPC API for course extraction (#13937) ++ [clippit] Add support for clippituser.tv ++ [qqmusic] Support new URL schemes (#13805) +* [periscope] Renew HLS extraction (#13917) +* [mixcloud] Extract decrypt key + + +version 2017.08.13 + +Core +* [YoutubeDL] Make sure format id is not empty +* [extractor/common] Make _family_friendly_search optional +* [extractor/common] Respect source's type attribute for HTML5 media (#13892) + +Extractors +* [pornhub:playlistbase] Skip videos from drop-down menu (#12819, #13902) ++ [fourtube] Add support pornerbros.com (#6022) ++ [fourtube] Add support porntube.com (#7859, #13901) ++ [fourtube] Add support fux.com +* [limelight] Improve embeds detection (#13895) ++ [reddit] Add support for v.redd.it and reddit.com (#13847) +* [aparat] Extract all formats (#13887) +* [mixcloud] Fix play info decryption (#13885) ++ [generic] Add support for vzaar embeds (#13876) + + version 2017.08.09 Core @@ -81,7 +1021,7 @@ Extractors * [youku:show] Fix playlist extraction (#13248) + [dispeak] Recognize sevt subdomain (#13276) * [adn] Improve error reporting (#13663) -* [crunchyroll] Relax series and season regex (#13659) +* [crunchyroll] Relax series and season regular expression (#13659) + [spiegel:article] Add support for nexx iframe embeds (#13029) + [nexx:embed] Add support for iframe embeds * [nexx] Improve JS embed extraction @@ -554,7 +1494,7 @@ version 2017.04.14 Core + [downloader/hls] Add basic support for EXT-X-BYTERANGE tag (#10955) -+ [adobepass] Improve Comcast and Verison login code (#10803) ++ [adobepass] Improve Comcast and Verizon login code (#10803) + [adobepass] Add support for Verizon (#10803) Extractors diff --git a/MANIFEST.in b/MANIFEST.in index 5743f605a..4e43e99f3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,9 @@ include README.md -include test/*.py -include test/*.json +include LICENSE +include AUTHORS +include ChangeLog include youtube-dl.bash-completion include youtube-dl.fish include youtube-dl.1 recursive-include docs Makefile conf.py *.rst +recursive-include test * diff --git a/Makefile b/Makefile index 84ccce2b3..4a62f44bc 100644 --- a/Makefile +++ b/Makefile @@ -14,6 +14,9 @@ PYTHON ?= /usr/bin/env python # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local SYSCONFDIR = $(shell if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi) +# set markdown input format to "markdown-smart" for pandoc version 2 and to "markdown" for pandoc prior to version 2 +MARKDOWN = $(shell if [ `pandoc -v | head -n1 | cut -d" " -f2 | head -c1` = "2" ]; then echo markdown-smart; else echo markdown; fi) + install: youtube-dl youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish install -d $(DESTDIR)$(BINDIR) install -m 755 youtube-dl $(DESTDIR)$(BINDIR) @@ -36,8 +39,17 @@ test: ot: offlinetest +# Keep this list in sync with devscripts/run_tests.sh offlinetest: codetest - $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py --exclude test_socks.py + $(PYTHON) -m nose --verbose test \ + --exclude test_age_restriction.py \ + --exclude test_download.py \ + --exclude test_iqiyi_sdk_interpreter.py \ + --exclude test_socks.py \ + --exclude test_subtitles.py \ + --exclude test_write_annotations.py \ + --exclude test_youtube_lists.py \ + --exclude test_youtube_signature.py tar: youtube-dl.tar.gz @@ -46,8 +58,15 @@ tar: youtube-dl.tar.gz pypi-files: youtube-dl.bash-completion README.txt youtube-dl.1 youtube-dl.fish youtube-dl: youtube_dl/*.py youtube_dl/*/*.py - zip --quiet youtube-dl youtube_dl/*.py youtube_dl/*/*.py - zip --quiet --junk-paths youtube-dl youtube_dl/__main__.py + mkdir -p zip + for d in youtube_dl youtube_dl/downloader youtube_dl/extractor youtube_dl/postprocessor ; do \ + mkdir -p zip/$$d ;\ + cp -pPR $$d/*.py zip/$$d/ ;\ + done + touch -t 200001010101 zip/youtube_dl/*.py zip/youtube_dl/*/*.py + mv zip/youtube_dl/__main__.py zip/ + cd zip ; zip -q ../youtube-dl youtube_dl/*.py youtube_dl/*/*.py __main__.py + rm -rf zip echo '#!$(PYTHON)' > youtube-dl cat youtube-dl.zip >> youtube-dl rm youtube-dl.zip @@ -66,11 +85,11 @@ supportedsites: $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md README.txt: README.md - pandoc -f markdown -t plain README.md -o README.txt + pandoc -f $(MARKDOWN) -t plain README.md -o README.txt youtube-dl.1: README.md $(PYTHON) devscripts/prepare_manpage.py youtube-dl.1.temp.md - pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1 + pandoc -s -f $(MARKDOWN) -t man youtube-dl.1.temp.md -o youtube-dl.1 rm -f youtube-dl.1.temp.md youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in @@ -94,7 +113,7 @@ _EXTRACTOR_FILES = $(shell find youtube_dl/extractor -iname '*.py' -and -not -in youtube_dl/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES) $(PYTHON) devscripts/make_lazy_extractors.py $@ -youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish ChangeLog +youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish ChangeLog AUTHORS @tar -czf youtube-dl.tar.gz --transform "s|^|youtube-dl/|" --owner 0 --group 0 \ --exclude '*.DS_Store' \ --exclude '*.kate-swp' \ @@ -103,11 +122,10 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash- --exclude '*~' \ --exclude '__pycache__' \ --exclude '.git' \ - --exclude 'testdata' \ --exclude 'docs/_build' \ -- \ bin devscripts test youtube_dl docs \ - ChangeLog LICENSE README.md README.txt \ + ChangeLog AUTHORS LICENSE README.md README.txt \ Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion \ - youtube-dl.zsh youtube-dl.fish setup.py \ + youtube-dl.zsh youtube-dl.fish setup.py setup.cfg \ youtube-dl diff --git a/README.md b/README.md index 0067184be..5af0f387b 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![Build Status](https://travis-ci.org/rg3/youtube-dl.svg?branch=master)](https://travis-ci.org/rg3/youtube-dl) + youtube-dl - download videos from youtube.com or other video platforms - [INSTALLATION](#installation) @@ -25,7 +27,7 @@ If you do not have curl, you can alternatively use a recent wget: sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl sudo chmod a+rx /usr/local/bin/youtube-dl -Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in any location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29) except for `%SYSTEMROOT%\System32` (e.g. **do not** put in `C:\Windows\System32`). +Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in any location on their [PATH](https://en.wikipedia.org/wiki/PATH_%28variable%29) except for `%SYSTEMROOT%\System32` (e.g. **do not** put in `C:\Windows\System32`). You can also use pip: @@ -33,7 +35,7 @@ You can also use pip: This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information. -OS X users can install youtube-dl with [Homebrew](http://brew.sh/): +OS X users can install youtube-dl with [Homebrew](https://brew.sh/): brew install youtube-dl @@ -44,7 +46,7 @@ Or with [MacPorts](https://www.macports.org/): Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://rg3.github.io/youtube-dl/download.html). # DESCRIPTION -**youtube-dl** is a command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version 2.6, 2.7, or 3.2+, and it is not platform specific. It should work on your Unix box, on Windows or on Mac OS X. It is released to the public domain, which means you can modify it, redistribute it or use it however you like. +**youtube-dl** is a command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version 2.6, 2.7, or 3.2+, and it is not platform specific. It should work on your Unix box, on Windows or on macOS. It is released to the public domain, which means you can modify it, redistribute it or use it however you like. youtube-dl [OPTIONS] URL [URL...] @@ -196,6 +198,11 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo size. By default, the buffer size is automatically resized from an initial value of SIZE. + --http-chunk-size SIZE Size of a chunk for chunk-based HTTP + downloading (e.g. 10485760 or 10M) (default + is disabled). May be useful for bypassing + bandwidth throttling imposed by a webserver + (experimental) --playlist-reverse Download playlist videos in reverse order --playlist-random Download playlist videos in random order --xattr-set-filesize Set file xattribute ytdl.filesize with @@ -216,7 +223,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo ## Filesystem Options: -a, --batch-file FILE File containing URLs to download ('-' for - stdin) + stdin), one URL per line. Lines starting + with '#', ';' or ']' are considered as + comments and ignored. --id Use only video ID in file name -o, --output TEMPLATE Output filename template, see the "OUTPUT TEMPLATE" for all the info @@ -427,7 +436,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo syntax. Example: --exec 'adb push {} /sdcard/Music/ && rm {}' --convert-subs FORMAT Convert the subtitles to other format - (currently supported: srt|ass|vtt) + (currently supported: srt|ass|vtt|lrc) # CONFIGURATION @@ -458,7 +467,7 @@ You can also use `--config-location` if you want to use custom configuration fil ### Authentication with `.netrc` file -You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by only you: +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by only you: ``` touch $HOME/.netrc chmod a-rwx,u+rw $HOME/.netrc @@ -485,7 +494,7 @@ The `-o` option allows users to indicate a template for the output file names. **tl;dr:** [navigate me to examples](#output-template-examples). -The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are: +The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are: - `id` (string): Video identifier - `title` (string): Video title @@ -509,6 +518,9 @@ The basic usage is not to set any template arguments when downloading a single f - `average_rating` (numeric): Average rating give by users, the scale used depends on the webpage - `comment_count` (numeric): Number of comments on the video - `age_limit` (numeric): Age restriction for the video (years) + - `is_live` (boolean): Whether this video is a live stream or a fixed-length video + - `start_time` (numeric): Time in seconds where the reproduction should start, as specified in the URL + - `end_time` (numeric): Time in seconds where the reproduction should end, as specified in the URL - `format` (string): A human-readable description of the format - `format_id` (string): Format code specified by `--format` - `format_note` (string): Additional info about the format @@ -534,6 +546,8 @@ The basic usage is not to set any template arguments when downloading a single f - `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according to the total length of the playlist - `playlist_id` (string): Playlist identifier - `playlist_title` (string): Playlist title + - `playlist_uploader` (string): Full name of the playlist uploader + - `playlist_uploader_id` (string): Nickname or id of the playlist uploader Available for the video that belongs to some logical chapter or section: @@ -603,7 +617,7 @@ $ youtube-dl -o '%(uploader)s/%(playlist)s/%(playlist_index)s - %(title)s.%(ext) $ youtube-dl -u user -p password -o '~/MyVideos/%(playlist)s/%(chapter_number)s - %(chapter)s/%(title)s.%(ext)s' https://www.udemy.com/java-tutorial/ # Download entire series season keeping each series and each season in separate directory under C:/MyVideos -$ youtube-dl -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" http://videomore.ru/kino_v_detalayah/5_sezon/367617 +$ youtube-dl -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" https://videomore.ru/kino_v_detalayah/5_sezon/367617 # Stream the video being downloaded to stdout $ youtube-dl -o - BaW_jenozKc @@ -716,17 +730,17 @@ $ youtube-dl --dateafter 20000101 --datebefore 20091231 ### How do I update youtube-dl? -If you've followed [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`). +If you've followed [our manual installation instructions](https://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`). If you have used pip, a simple `sudo pip install -U youtube-dl` is sufficient to update. -If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. +If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to https://yt-dl.org to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. As a last resort, you can also uninstall the version installed by your package manager and follow our manual installation instructions. For that, remove the distribution's package, with a line like sudo apt-get remove -y youtube-dl -Afterwards, simply follow [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html): +Afterwards, simply follow [our manual installation instructions](https://rg3.github.io/youtube-dl/download.html): ``` sudo wget https://yt-dl.org/latest/youtube-dl -O /usr/local/bin/youtube-dl @@ -766,11 +780,11 @@ Apparently YouTube requires you to pass a CAPTCHA test if you download too much. youtube-dl works fine on its own on most sites. However, if you want to convert video/audio, you'll need [avconv](https://libav.org/) or [ffmpeg](https://www.ffmpeg.org/). On some sites - most notably YouTube - videos can be retrieved in a higher quality format without sound. youtube-dl will detect whether avconv/ffmpeg is present and automatically pick the best option. -Videos or video formats streamed via RTMP protocol can only be downloaded when [rtmpdump](https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either [mplayer](http://mplayerhq.hu/) or [mpv](https://mpv.io/) to be installed. +Videos or video formats streamed via RTMP protocol can only be downloaded when [rtmpdump](https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either [mplayer](https://mplayerhq.hu/) or [mpv](https://mpv.io/) to be installed. ### I have downloaded a video but how can I play it? -Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org/) or [mplayer](http://www.mplayerhq.hu/). +Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](https://www.videolan.org/) or [mplayer](https://www.mplayerhq.hu/). ### I extracted a video URL with `-g`, but it does not play on another machine / in my web browser. @@ -845,10 +859,10 @@ Use the `-o` to specify an [output template](#output-template), for example `-o ### How do I download a video starting with a `-`? -Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the options with `--`: +Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the options with `--`: youtube-dl -- -wNyEUrxzFU - youtube-dl "http://www.youtube.com/watch?v=-wNyEUrxzFU" + youtube-dl "https://www.youtube.com/watch?v=-wNyEUrxzFU" ### How do I pass cookies to youtube-dl? @@ -856,15 +870,15 @@ Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox). -Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, Mac OS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. +Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. Passing cookies to youtube-dl is a good way to workaround login when a particular extractor does not implement it explicitly. Another use case is working around [CAPTCHA](https://en.wikipedia.org/wiki/CAPTCHA) some websites require you to solve in particular cases in order to get access (e.g. YouTube, CloudFlare). ### How do I stream directly to media player? -You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. For example, streaming to [vlc](http://www.videolan.org/) can be achieved with: +You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. For example, streaming to [vlc](https://www.videolan.org/) can be achieved with: - youtube-dl -o - "http://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - + youtube-dl -o - "https://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - ### How do I download only new videos from a playlist? @@ -884,7 +898,7 @@ When youtube-dl detects an HLS video, it can download it either with the built-i When youtube-dl knows that one particular downloader works better for a given website, that downloader will be picked. Otherwise, youtube-dl will pick the best downloader for general compatibility, which at the moment happens to be ffmpeg. This choice may change in future versions of youtube-dl, with improvements of the built-in downloader and/or ffmpeg. -In particular, the generic extractor (used when your website is not in the [list of supported sites by youtube-dl](http://rg3.github.io/youtube-dl/supportedsites.html) cannot mandate one specific downloader. +In particular, the generic extractor (used when your website is not in the [list of supported sites by youtube-dl](https://rg3.github.io/youtube-dl/supportedsites.html) cannot mandate one specific downloader. If you put either `--hls-prefer-native` or `--hls-prefer-ffmpeg` into your configuration, a different subset of videos will fail to download correctly. Instead, it is much better to [file an issue](https://yt-dl.org/bug) or a pull request which details why the native or the ffmpeg HLS downloader is a better choice for your use case. @@ -910,7 +924,7 @@ Feel free to bump the issue from time to time by writing a small comment ("Issue ### How can I detect whether a given URL is supported by youtube-dl? -For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/video/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. +For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from https://example.com/video/1234567 to https://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. It is *not* possible to detect whether a URL is supported or not. That's because youtube-dl contains a generic extractor which matches **all** URLs. You may be tempted to disable, exclude, or remove the generic extractor, but the generic extractor not only allows users to extract videos from lots of websites that embed a video from another service, but may also be used to extract video from a service that it's hosting itself. Therefore, we neither recommend nor support disabling, excluding, or removing the generic extractor. @@ -924,7 +938,7 @@ youtube-dl is an open-source project manned by too few volunteers, so we'd rathe # DEVELOPER INSTRUCTIONS -Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution. +Most users do not need to build youtube-dl and can [download the builds](https://rg3.github.io/youtube-dl/download.html) or get them from their distribution. To run youtube-dl as a developer, you don't need to build anything either. Simply execute @@ -936,6 +950,8 @@ To run the test, simply invoke your favorite test runner, or execute a test file python test/test_download.py nosetests +See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases. + If you want to create a build of youtube-dl yourself, you'll need * python @@ -972,7 +988,7 @@ After you have ensured this site is distributing its content legally, you can fo class YourExtractorIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' _TEST = { - 'url': 'http://yourextractor.com/watch/42', + 'url': 'https://yourextractor.com/watch/42', 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { 'id': '42', @@ -1003,10 +1019,10 @@ After you have ensured this site is distributing its content legally, you can fo } ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py @@ -1162,10 +1178,10 @@ import youtube_dl ydl_opts = {} with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) + ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc']) ``` -Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L129-L279). For a start, if you want to intercept youtube-dl's output, set a `logger` object. +Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/3e4cedf9e8cd3157df2457df7274d0c842421945/youtube_dl/YoutubeDL.py#L137-L312). For a start, if you want to intercept youtube-dl's output, set a `logger` object. Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file: @@ -1201,19 +1217,19 @@ ydl_opts = { 'progress_hooks': [my_hook], } with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) + ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc']) ``` # BUGS -Bugs and suggestions should be reported at: . Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](http://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). +Bugs and suggestions should be reported at: . Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). **Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: ``` $ youtube-dl -v [debug] System config: [] [debug] User config: [] -[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] +[debug] Command-line args: [u'-v', u'https://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 [debug] youtube-dl version 2015.12.06 [debug] Git HEAD: 135392e @@ -1244,7 +1260,7 @@ For bug reports, this means that your report should contain the *complete* outpu If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). -**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `http://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `http://www.youtube.com/`) is *not* an example URL. +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL. ### Are you using the latest version? diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 7a219ebe9..72b2ee422 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -14,7 +14,7 @@ import os import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_testcases +from test.helper import gettestcases from youtube_dl.utils import compat_urllib_parse_urlparse from youtube_dl.utils import compat_urllib_request @@ -24,7 +24,7 @@ if len(sys.argv) > 1: else: METHOD = 'EURISTIC' -for test in get_testcases(): +for test in gettestcases(): if METHOD == 'EURISTIC': try: webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() diff --git a/devscripts/gh-pages/generate-download.py b/devscripts/gh-pages/generate-download.py index fcd7e1dff..a873d32ee 100755 --- a/devscripts/gh-pages/generate-download.py +++ b/devscripts/gh-pages/generate-download.py @@ -1,27 +1,22 @@ #!/usr/bin/env python3 from __future__ import unicode_literals -import hashlib -import urllib.request import json versions_info = json.load(open('update/versions.json')) version = versions_info['latest'] -URL = versions_info['versions'][version]['bin'][0] - -data = urllib.request.urlopen(URL).read() +version_dict = versions_info['versions'][version] # Read template page with open('download.html.in', 'r', encoding='utf-8') as tmplf: template = tmplf.read() -sha256sum = hashlib.sha256(data).hexdigest() template = template.replace('@PROGRAM_VERSION@', version) -template = template.replace('@PROGRAM_URL@', URL) -template = template.replace('@PROGRAM_SHA256SUM@', sha256sum) -template = template.replace('@EXE_URL@', versions_info['versions'][version]['exe'][0]) -template = template.replace('@EXE_SHA256SUM@', versions_info['versions'][version]['exe'][1]) -template = template.replace('@TAR_URL@', versions_info['versions'][version]['tar'][0]) -template = template.replace('@TAR_SHA256SUM@', versions_info['versions'][version]['tar'][1]) +template = template.replace('@PROGRAM_URL@', version_dict['bin'][0]) +template = template.replace('@PROGRAM_SHA256SUM@', version_dict['bin'][1]) +template = template.replace('@EXE_URL@', version_dict['exe'][0]) +template = template.replace('@EXE_SHA256SUM@', version_dict['exe'][1]) +template = template.replace('@TAR_URL@', version_dict['tar'][0]) +template = template.replace('@TAR_SHA256SUM@', version_dict['tar'][1]) with open('download.html', 'w', encoding='utf-8') as dlf: dlf.write(template) diff --git a/devscripts/install_jython.sh b/devscripts/install_jython.sh new file mode 100755 index 000000000..bafca4da4 --- /dev/null +++ b/devscripts/install_jython.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +wget http://central.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar +java -jar jython-installer-2.7.1.jar -s -d "$HOME/jython" +$HOME/jython/bin/jython -m pip install nose diff --git a/devscripts/run_tests.sh b/devscripts/run_tests.sh index 6ba26720d..dd37a80f5 100755 --- a/devscripts/run_tests.sh +++ b/devscripts/run_tests.sh @@ -1,6 +1,7 @@ #!/bin/bash -DOWNLOAD_TESTS="age_restriction|download|subtitles|write_annotations|iqiyi_sdk_interpreter|youtube_lists" +# Keep this list in sync with the `offlinetest` target in Makefile +DOWNLOAD_TESTS="age_restriction|download|iqiyi_sdk_interpreter|socks|subtitles|write_annotations|youtube_lists|youtube_signature" test_set="" multiprocess_args="" diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a3bd07726..c5a48002b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -3,8 +3,7 @@ - **1up.com** - **20min** - **220.ro** - - **22tracks:genre** - - **22tracks:track** + - **23video** - **24video** - **3qsdn**: 3Q SDN - **3sat** @@ -12,6 +11,7 @@ - **56.com** - **5min** - **6play** + - **7plus** - **8tracks** - **91porn** - **9c9media** @@ -36,12 +36,13 @@ - **AdultSwim** - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network - **afreecatv**: afreecatv.com - - **afreecatv:global**: afreecatv.com - **AirMozilla** + - **AliExpressLive** - **AlJazeera** - **Allocine** - **AlphaPorno** - **AMCNetworks** + - **AmericasTestKitchen** - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **AnimeOnDemand** - **anitube.se** @@ -113,26 +114,28 @@ - **BokeCC** - **BostonGlobe** - **Bpb**: Bundeszentrale für politische Bildung - - **BR**: Bayerischer Rundfunk Mediathek + - **BR**: Bayerischer Rundfunk - **BravoTV** - **Break** - **brightcove:legacy** - **brightcove:new** + - **BRMediathek**: Bayerischer Rundfunk Mediathek - **bt:article**: Bergens Tidende Articles - **bt:vestlendingen**: Bergens Tidende - Vestlendingen - **BuzzFeed** - **BYUtv** - - **BYUtvEvent** - **Camdemy** - **CamdemyFolder** - **CamWithHer** - **canalc2.tv** - - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv - - **Canvas**: canvas.be and een.be + - **Canalplus**: mycanal.fr and piwiplus.fr + - **Canvas** + - **CanvasEen**: canvas.be and een.be - **CarambaTV** - **CarambaTVPage** - **CartoonNetwork** - **cbc.ca** + - **cbc.ca:olympics** - **cbc.ca:player** - **cbc.ca:watch** - **cbc.ca:watch:video** @@ -156,6 +159,7 @@ - **Cinchcast** - **CJSW** - **cliphunter** + - **Clippit** - **ClipRs** - **Clipsyndicate** - **CloserToTruth** @@ -168,7 +172,6 @@ - **CNN** - **CNNArticle** - **CNNBlogs** - - **CollegeRama** - **ComCarCoff** - **ComedyCentral** - **ComedyCentralFullEpisodes** @@ -187,7 +190,7 @@ - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 - **CTVNews** - - **culturebox.francetvinfo.fr** + - **Culturebox** - **CultureUnplugged** - **curiositystream** - **curiositystream:collection** @@ -196,9 +199,8 @@ - **dailymotion** - **dailymotion:playlist** - **dailymotion:user** - - **DailymotionCloud** - - **Daisuki** - - **DaisukiPlaylist** + - **DaisukiMotto** + - **DaisukiMottoPlaylist** - **daum.net** - **daum.net:clip** - **daum.net:playlist** @@ -209,6 +211,7 @@ - **defense.gouv.fr** - **democracynow** - **DHM**: Filmarchiv - Deutsches Historisches Museum + - **Digg** - **DigitallySpeaking** - **Digiteka** - **Discovery** @@ -241,8 +244,9 @@ - **eHow** - **Einthusan** - **eitb.tv** - - **EllenTV** - - **EllenTV:clips** + - **EllenTube** + - **EllenTubePlaylist** + - **EllenTubeVideo** - **ElPais**: El País - **Embedly** - **EMPFlix** @@ -253,7 +257,6 @@ - **ESPN** - **ESPNArticle** - **EsriVideo** - - **ETOnline** - **Europa** - **EveryonesMixtape** - **ExpoTV** @@ -265,10 +268,10 @@ - **fc2** - **fc2:embed** - **Fczenit** - - **fernsehkritik.tv** - **filmon** - **filmon:channel** - - **Firstpost** + - **Filmweb** + - **FiveThirtyEight** - **FiveTV** - **Flickr** - **Flipagram** @@ -282,23 +285,27 @@ - **foxnews:article** - **foxnews:insider** - **FoxSports** - - **france2.fr:generation-quoi** + - **france2.fr:generation-what** - **FranceCulture** - **FranceInter** - **FranceTV** - **FranceTVEmbed** - **francetvinfo.fr** + - **FranceTVJeunesse** + - **FranceTVSite** - **Freesound** - **freespeech.org** - **FreshLive** - **Funimation** + - **FunkChannel** + - **FunkMix** - **FunnyOrDie** - **Fusion** + - **Fux** - **FXNetworks** - **GameInformer** - **GameOne** - **gameone:playlist** - - **Gamersyde** - **GameSpot** - **GameStar** - **Gaskrank** @@ -329,6 +336,7 @@ - **HentaiStigma** - **hetklokhuis** - **hgtv.com:show** + - **HiDive** - **HistoricFilms** - **history:topic**: History.com Topic - **hitbox** @@ -337,6 +345,7 @@ - **HornBunny** - **HotNewHipHop** - **HotStar** + - **hotstar:playlist** - **Howcast** - **HowStuffWorks** - **HRTi** @@ -357,10 +366,12 @@ - **InfoQ** - **Instagram** - **instagram:user**: Instagram user profile + - **Internazionale** - **InternetVideoArchive** - **IPrima** - **iqiyi**: 爱奇艺 - **Ir90Tv** + - **ITTF** - **ITV** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations @@ -374,8 +385,8 @@ - **Jove** - **jpopsuki.tv** - **JWPlatform** + - **Kakao** - **Kaltura** - - **Kamcord** - **KanalPlay**: Kanal 5/9/11 Play - **Kankan** - **Karaoketv** @@ -407,6 +418,7 @@ - **Lecture2Go** - **LEGO** - **Lemonde** + - **Lenta** - **LePlaylist** - **LetvCloud**: 乐视云 - **Libsyn** @@ -415,8 +427,10 @@ - **limelight** - **limelight:channel** - **limelight:channel_list** + - **LineTV** - **LiTV** - **LiveLeak** + - **LiveLeakEmbed** - **livestream** - **livestream:original** - **LnkGo** @@ -429,15 +443,20 @@ - **m6** - **macgamestore**: MacGameStore trailers - **mailru**: Видео@Mail.Ru + - **mailru:music**: Музыка@Mail.Ru + - **mailru:music:search**: Музыка@Mail.Ru - **MakersChannel** - **MakerTV** - **mangomolo:live** - **mangomolo:video** + - **ManyVids** + - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** - **Medialaan** - **Mediaset** + - **Mediasite** - **Medici** - **megaphone.fm**: megaphone.fm embedded players - **Meipai**: 美拍 @@ -467,6 +486,7 @@ - **Moniker**: allmyvideos.net and vidspot.net - **Morningstar**: morningstar.com - **Motherless** + - **MotherlessGroup** - **Motorsport**: motorsport.com - **MovieClips** - **MovieFap** @@ -489,8 +509,8 @@ - **MySpace:album** - **MySpass** - **Myvi** - - **myvideo** (Currently broken) - **MyVidster** + - **MyviEmbed** - **n-tv.de** - **natgeo** - **natgeo:episodeguide** @@ -499,7 +519,8 @@ - **NBA** - **NBC** - **NBCNews** - - **NBCOlympics** + - **nbcolympics** + - **nbcolympics:stream** - **NBCSports** - **NBCSportsVPlayer** - **ndr**: NDR.de - Norddeutscher Rundfunk @@ -532,6 +553,7 @@ - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** - **nick.de** + - **nickelodeon:br** - **nickelodeonru** - **nicknight** - **niconico**: ニコニコ動画 @@ -550,8 +572,6 @@ - **nowness** - **nowness:playlist** - **nowness:series** - - **NowTV** (Currently broken) - - **NowTVList** - **nowvideo**: NowVideo - **Noz** - **npo**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl @@ -587,6 +607,7 @@ - **Openload** - **OraTV** - **orf:fm4**: radio FM4 + - **orf:fm4:story**: fm4.orf.at stories - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek @@ -600,11 +621,14 @@ - **pcmag** - **PearVideo** - **People** + - **PerformGroup** - **periscope**: Periscope - **periscope:user**: Periscope user videos - **PhilharmonieDeParis**: Philharmonie de Paris - **phoenix.de** - **Photobucket** + - **Picarto** + - **PicartoVod** - **Piksel** - **Pinkbike** - **Pladform** @@ -620,7 +644,9 @@ - **Pokemon** - **PolskieRadio** - **PolskieRadioCategory** + - **PopcornTV** - **PornCom** + - **PornerBros** - **PornFlip** - **PornHd** - **PornHub**: PornHub and Thumbzilla @@ -629,6 +655,7 @@ - **Pornotube** - **PornoVoisines** - **PornoXO** + - **PornTube** - **PressTV** - **PrimeShareTV** - **PromptFile** @@ -640,6 +667,8 @@ - **qqmusic:playlist**: QQ音乐 - 歌单 - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 + - **Quickline** + - **QuicklineLive** - **R7** - **R7Article** - **radio.de** @@ -651,9 +680,13 @@ - **Rai** - **RaiPlay** - **RaiPlayLive** + - **RaiPlayPlaylist** + - **RayWenderlich** - **RBMARadio** - **RDS**: RDS.ca - **RedBullTV** + - **Reddit** + - **RedditR** - **RedTube** - **RegioTV** - **RENTV** @@ -664,7 +697,6 @@ - **revision** - **revision3:embed** - **RICE** - - **RingTV** - **RMCDecouverte** - **RockstarGames** - **RoosterTeeth** @@ -685,6 +717,7 @@ - **rtve.es:live**: RTVE.es live streams - **rtve.es:television** - **RTVNH** + - **RTVS** - **Rudo** - **RUHD** - **RulePorn** @@ -693,13 +726,13 @@ - **rutube:embed**: Rutube embedded videos - **rutube:movie**: Rutube movies - **rutube:person**: Rutube person videos + - **rutube:playlist**: Rutube playlists - **RUTV**: RUTV.RU - **Ruutu** - **Ruv** - **safari**: safaribooksonline.com online video - **safari:api** - **safari:course**: safaribooksonline.com online courses - - **Sandia**: Sandia National Laboratories - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au @@ -712,8 +745,12 @@ - **SenateISVP** - **SendtoNews** - **ServingSys** + - **Servus** - **Sexu** + - **SeznamZpravy** + - **SeznamZpravyArticle** - **Shahid** + - **ShahidShow** - **Shared**: shared.sx - **ShowRoomLive** - **Sina** @@ -722,6 +759,7 @@ - **skynewsarabia:video** - **SkySports** - **Slideshare** + - **SlidesLive** - **Slutload** - **smotri**: Smotri.com - **smotri:broadcast**: Smotri.com broadcasts @@ -752,7 +790,7 @@ - **Sport5** - **SportBoxEmbed** - **SportDeutschland** - - **Sportschau** + - **SpringboardPlatform** - **Sprout** - **sr:mediathek**: Saarländischer Rundfunk - **SRGSSR** @@ -765,9 +803,11 @@ - **streamcloud.eu** - **StreamCZ** - **StreetVoice** + - **StretchInternet** - **SunPorno** - **SVT** - **SVTPlay**: SVT Play and Öppet arkiv + - **SVTSeries** - **SWRMediathek** - **Syfy** - **SztvHu** @@ -776,7 +816,7 @@ - **tagesschau:player** - **Tass** - **TastyTrade** - - **TBS** (Currently broken) + - **TBS** - **TDSLifeway** - **teachertube**: teachertube.com videos - **teachertube:user:collection**: teachertube.com user and collection videos @@ -791,8 +831,11 @@ - **Telegraaf** - **TeleMB** - **TeleQuebec** + - **TeleQuebecEmission** + - **TeleQuebecLive** - **TeleTask** - **Telewebion** + - **TennisTV** - **TF1** - **TFO** - **TheIntercept** @@ -800,7 +843,6 @@ - **ThePlatform** - **ThePlatformFeed** - **TheScene** - - **TheSixtyOne** - **TheStar** - **TheSun** - **TheWeatherChannel** @@ -847,6 +889,9 @@ - **tvland.com** - **TVN24** - **TVNoe** + - **TVNow** + - **TVNowList** + - **TVNowShow** - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska - **tvp:series** @@ -868,8 +913,11 @@ - **udemy** - **udemy:course** - **UDNEmbed**: 聯合影音 + - **UFCTV** - **UKTVPlay** + - **umg:de**: Universal Music Deutschland - **Unistra** + - **Unity** - **uol.com.br** - **uplynk** - **uplynk:preplay** @@ -897,7 +945,6 @@ - **vice** - **vice:article** - **vice:show** - - **Viceland** - **Vidbit** - **Viddler** - **Videa** @@ -913,6 +960,7 @@ - **VideoPress** - **videoweed**: VideoWeed - **Vidio** + - **VidLii** - **vidme** - **vidme:user** - **vidme:user:likes** @@ -953,10 +1001,12 @@ - **VoiceRepublic** - **Voot** - **VoxMedia** + - **VoxMediaVolume** - **Vporn** - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **Vrak** - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be + - **VrtNU**: VrtNU.be - **vrv** - **vrv:series** - **VShare** @@ -973,10 +1023,14 @@ - **WatchIndianPorn**: Watch Indian Porn - **WDR** - **wdr:mobile** + - **WDRElefant** + - **WDRPage** - **Webcaster** - **WebcasterFeed** - **WebOfStories** - **WebOfStoriesPlaylist** + - **Weibo** + - **WeiboMobile** - **WeiqiTV**: WQTV - **wholecloud**: WholeCloud - **Wimp** @@ -996,6 +1050,8 @@ - **xiami:artist**: 虾米音乐 - 歌手 - **xiami:collection**: 虾米音乐 - 精选集 - **xiami:song**: 虾米音乐 + - **ximalaya**: 喜马拉雅FM + - **ximalaya:album**: 喜马拉雅FM 专辑 - **XMinus** - **XNXX** - **Xstream** @@ -1009,12 +1065,16 @@ - **yandexmusic:album**: Яндекс.Музыка - Альбом - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:track**: Яндекс.Музыка - Трек + - **YapFiles** - **YesJapan** - **yinyuetai:video**: 音悦Tai - **Ynet** - **YouJizz** - **youku**: 优酷 - **youku:show** + - **YouNowChannel** + - **YouNowLive** + - **YouNowMoment** - **YouPorn** - **YourUpload** - **youtube**: YouTube.com @@ -1028,13 +1088,14 @@ - **youtube:search**: YouTube.com searches - **youtube:search:date**: YouTube.com searches, newest videos first - **youtube:search_url**: YouTube.com search URLs - - **youtube:shared** - **youtube:show**: YouTube.com (multi-season) shows - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) - **Zapiks** - **Zaq1** + - **Zattoo** + - **ZattooLive** - **ZDF** - **ZDFChannel** - **zingmp3**: mp3.zing.vn diff --git a/setup.cfg b/setup.cfg index 2dc06ffe4..5208f7ae2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,4 +3,4 @@ universal = True [flake8] exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git -ignore = E402,E501,E731 +ignore = E402,E501,E731,E741 diff --git a/setup.py b/setup.py index 67d6633ed..7dbb5805f 100644 --- a/setup.py +++ b/setup.py @@ -109,6 +109,7 @@ setup( author_email='ytdl@yt-dl.org', maintainer='Sergey M.', maintainer_email='dstftw@gmail.com', + license='Unlicense', packages=[ 'youtube_dl', 'youtube_dl.extractor', 'youtube_dl.downloader', diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 6f52e11f7..4833396a5 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -10,6 +10,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL, expect_dict, expect_value +from youtube_dl.compat import compat_etree_fromstring from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError @@ -488,6 +489,260 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + def test_parse_mpd_formats(self): + _TEST_CASES = [ + ( + # https://github.com/rg3/youtube-dl/issues/13919 + # Also tests duplicate representation ids, see + # https://github.com/rg3/youtube-dl/issues/15111 + 'float_duration', + 'http://unknown/manifest.mpd', + [{ + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'm4a', + 'format_id': '318597', + 'format_note': 'DASH audio', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'none', + 'tbr': 61.587, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '318597', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.42001f', + 'tbr': 318.597, + 'width': 340, + 'height': 192, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '638590', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.42001f', + 'tbr': 638.59, + 'width': 512, + 'height': 288, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '1022565', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.4d001f', + 'tbr': 1022.565, + 'width': 688, + 'height': 384, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '2046506', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.4d001f', + 'tbr': 2046.506, + 'width': 1024, + 'height': 576, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '3998017', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.640029', + 'tbr': 3998.017, + 'width': 1280, + 'height': 720, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '5997485', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.640032', + 'tbr': 5997.485, + 'width': 1920, + 'height': 1080, + }] + ), ( + # https://github.com/rg3/youtube-dl/pull/14844 + 'urls_only', + 'http://unknown/manifest.mpd', + [{ + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_144p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 200, + 'width': 256, + 'height': 144, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_240p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 400, + 'width': 424, + 'height': 240, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_360p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 800, + 'width': 640, + 'height': 360, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_480p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 1200, + 'width': 856, + 'height': 480, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_576p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 1600, + 'width': 1024, + 'height': 576, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_720p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 2400, + 'width': 1280, + 'height': 720, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': 'h264_aac_1080p_m4s', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'avc3.42c01e', + 'tbr': 4400, + 'width': 1920, + 'height': 1080, + }] + ) + ] + + for mpd_file, mpd_url, expected_formats in _TEST_CASES: + with io.open('./test/testdata/mpd/%s.mpd' % mpd_file, + mode='r', encoding='utf-8') as f: + formats = self.ie._parse_mpd_formats( + compat_etree_fromstring(f.read().encode('utf-8')), + mpd_url=mpd_url) + self.ie._sort_formats(formats) + expect_value(self, formats, expected_formats, None) + + def test_parse_f4m_formats(self): + _TEST_CASES = [ + ( + # https://github.com/rg3/youtube-dl/issues/14660 + 'custom_base_url', + 'http://api.new.livestream.com/accounts/6115179/events/6764928/videos/144884262.f4m', + [{ + 'manifest_url': 'http://api.new.livestream.com/accounts/6115179/events/6764928/videos/144884262.f4m', + 'ext': 'flv', + 'format_id': '2148', + 'protocol': 'f4m', + 'tbr': 2148, + 'width': 1280, + 'height': 720, + }] + ), + ] + + for f4m_file, f4m_url, expected_formats in _TEST_CASES: + with io.open('./test/testdata/f4m/%s.f4m' % f4m_file, + mode='r', encoding='utf-8') as f: + formats = self.ie._parse_f4m_formats( + compat_etree_fromstring(f.read().encode('utf-8')), + f4m_url, None) + self.ie._sort_formats(formats) + expect_value(self, formats, expected_formats, None) + + def test_parse_xspf(self): + _TEST_CASES = [ + ( + 'foo_xspf', + 'https://example.org/src/foo_xspf.xspf', + [{ + 'id': 'foo_xspf', + 'title': 'Pandemonium', + 'description': 'Visit http://bigbrother404.bandcamp.com', + 'duration': 202.416, + 'formats': [{ + 'manifest_url': 'https://example.org/src/foo_xspf.xspf', + 'url': 'https://example.org/src/cd1/track%201.mp3', + }], + }, { + 'id': 'foo_xspf', + 'title': 'Final Cartridge (Nichico Twelve Remix)', + 'description': 'Visit http://bigbrother404.bandcamp.com', + 'duration': 255.857, + 'formats': [{ + 'manifest_url': 'https://example.org/src/foo_xspf.xspf', + 'url': 'https://example.org/%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF%E3%80%80%EF%BC%92.mp3', + }], + }, { + 'id': 'foo_xspf', + 'title': 'Rebuilding Nightingale', + 'description': 'Visit http://bigbrother404.bandcamp.com', + 'duration': 287.915, + 'formats': [{ + 'manifest_url': 'https://example.org/src/foo_xspf.xspf', + 'url': 'https://example.org/src/track3.mp3', + }, { + 'manifest_url': 'https://example.org/src/foo_xspf.xspf', + 'url': 'https://example.com/track3.mp3', + }] + }] + ), + ] + + for xspf_file, xspf_url, expected_entries in _TEST_CASES: + with io.open('./test/testdata/xspf/%s.xspf' % xspf_file, + mode='r', encoding='utf-8') as f: + entries = self.ie._parse_xspf( + compat_etree_fromstring(f.read().encode('utf-8')), + xspf_file, xspf_url=xspf_url, xspf_base_url=xspf_url) + expect_value(self, entries, expected_entries, None) + for i in range(len(entries)): + expect_dict(self, entries[i], expected_entries[i]) + if __name__ == '__main__': unittest.main() diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index e70cbcd37..f0f5a8470 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -466,12 +466,18 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL({'simulate': True}) self.assertEqual(ydl._default_format_spec({}), 'bestvideo+bestaudio/best') + ydl = YDL({}) + self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best/bestvideo+bestaudio') + + ydl = YDL({'simulate': True}) + self.assertEqual(ydl._default_format_spec({'is_live': True}), 'bestvideo+bestaudio/best') + ydl = YDL({'outtmpl': '-'}) - self.assertEqual(ydl._default_format_spec({}), 'best') + self.assertEqual(ydl._default_format_spec({}), 'best/bestvideo+bestaudio') ydl = YDL({}) self.assertEqual(ydl._default_format_spec({}, download=False), 'bestvideo+bestaudio/best') - self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best') + self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best/bestvideo+bestaudio') class TestYoutubeDL(unittest.TestCase): @@ -770,6 +776,12 @@ class TestYoutubeDL(unittest.TestCase): result = get_ids({'playlist_items': '10'}) self.assertEqual(result, []) + result = get_ids({'playlist_items': '3-10'}) + self.assertEqual(result, [3, 4]) + + result = get_ids({'playlist_items': '2-4,3-4,3'}) + self.assertEqual(result, [2, 3, 4]) + def test_urlopen_no_file_protocol(self): # see https://github.com/rg3/youtube-dl/issues/8227 ydl = YDL() diff --git a/test/test_download.py b/test/test_download.py index 209f5f6d6..ebe820dfc 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -92,8 +92,8 @@ class TestDownload(unittest.TestCase): def generator(test_case, tname): def test_template(self): - ie = youtube_dl.extractor.get_info_extractor(test_case['name']) - other_ies = [get_info_extractor(ie_key) for ie_key in test_case.get('add_ie', [])] + ie = youtube_dl.extractor.get_info_extractor(test_case['name'])() + other_ies = [get_info_extractor(ie_key)() for ie_key in test_case.get('add_ie', [])] is_playlist = any(k.startswith('playlist') for k in test_case) test_cases = test_case.get( 'playlist', [] if is_playlist else [test_case]) diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py new file mode 100644 index 000000000..5cf2bf1a5 --- /dev/null +++ b/test/test_downloader_http.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# coding: utf-8 +from __future__ import unicode_literals + +# Allow direct execution +import os +import re +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import try_rm +from youtube_dl import YoutubeDL +from youtube_dl.compat import compat_http_server +from youtube_dl.downloader.http import HttpFD +from youtube_dl.utils import encodeFilename +import ssl +import threading + +TEST_DIR = os.path.dirname(os.path.abspath(__file__)) + + +def http_server_port(httpd): + if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): + # In Jython SSLSocket is not a subclass of socket.socket + sock = httpd.socket.sock + else: + sock = httpd.socket + return sock.getsockname()[1] + + +TEST_SIZE = 10 * 1024 + + +class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + def log_message(self, format, *args): + pass + + def send_content_range(self, total=None): + range_header = self.headers.get('Range') + start = end = None + if range_header: + mobj = re.search(r'^bytes=(\d+)-(\d+)', range_header) + if mobj: + start = int(mobj.group(1)) + end = int(mobj.group(2)) + valid_range = start is not None and end is not None + if valid_range: + content_range = 'bytes %d-%d' % (start, end) + if total: + content_range += '/%d' % total + self.send_header('Content-Range', content_range) + return (end - start + 1) if valid_range else total + + def serve(self, range=True, content_length=True): + self.send_response(200) + self.send_header('Content-Type', 'video/mp4') + size = TEST_SIZE + if range: + size = self.send_content_range(TEST_SIZE) + if content_length: + self.send_header('Content-Length', size) + self.end_headers() + self.wfile.write(b'#' * size) + + def do_GET(self): + if self.path == '/regular': + self.serve() + elif self.path == '/no-content-length': + self.serve(content_length=False) + elif self.path == '/no-range': + self.serve(range=False) + elif self.path == '/no-range-no-content-length': + self.serve(range=False, content_length=False) + else: + assert False + + +class FakeLogger(object): + def debug(self, msg): + pass + + def warning(self, msg): + pass + + def error(self, msg): + pass + + +class TestHttpFD(unittest.TestCase): + def setUp(self): + self.httpd = compat_http_server.HTTPServer( + ('127.0.0.1', 0), HTTPTestRequestHandler) + self.port = http_server_port(self.httpd) + self.server_thread = threading.Thread(target=self.httpd.serve_forever) + self.server_thread.daemon = True + self.server_thread.start() + + def download(self, params, ep): + params['logger'] = FakeLogger() + ydl = YoutubeDL(params) + downloader = HttpFD(ydl, params) + filename = 'testfile.mp4' + try_rm(encodeFilename(filename)) + self.assertTrue(downloader.real_download(filename, { + 'url': 'http://127.0.0.1:%d/%s' % (self.port, ep), + })) + self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE) + try_rm(encodeFilename(filename)) + + def download_all(self, params): + for ep in ('regular', 'no-content-length', 'no-range', 'no-range-no-content-length'): + self.download(params, ep) + + def test_regular(self): + self.download_all({}) + + def test_chunked(self): + self.download_all({ + 'http_chunk_size': 1000, + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_http.py b/test/test_http.py index 7a7a3510f..409fec9c8 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -47,7 +47,7 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): self.end_headers() return - new_url = 'http://localhost:%d/中文.html' % http_server_port(self.server) + new_url = 'http://127.0.0.1:%d/中文.html' % http_server_port(self.server) self.send_response(302) self.send_header(b'Location', new_url.encode('utf-8')) self.end_headers() @@ -74,7 +74,7 @@ class FakeLogger(object): class TestHTTP(unittest.TestCase): def setUp(self): self.httpd = compat_http_server.HTTPServer( - ('localhost', 0), HTTPTestRequestHandler) + ('127.0.0.1', 0), HTTPTestRequestHandler) self.port = http_server_port(self.httpd) self.server_thread = threading.Thread(target=self.httpd.serve_forever) self.server_thread.daemon = True @@ -86,15 +86,15 @@ class TestHTTP(unittest.TestCase): return ydl = YoutubeDL({'logger': FakeLogger()}) - r = ydl.extract_info('http://localhost:%d/302' % self.port) - self.assertEqual(r['entries'][0]['url'], 'http://localhost:%d/vid.mp4' % self.port) + r = ydl.extract_info('http://127.0.0.1:%d/302' % self.port) + self.assertEqual(r['entries'][0]['url'], 'http://127.0.0.1:%d/vid.mp4' % self.port) class TestHTTPS(unittest.TestCase): def setUp(self): certfn = os.path.join(TEST_DIR, 'testcert.pem') self.httpd = compat_http_server.HTTPServer( - ('localhost', 0), HTTPTestRequestHandler) + ('127.0.0.1', 0), HTTPTestRequestHandler) self.httpd.socket = ssl.wrap_socket( self.httpd.socket, certfile=certfn, server_side=True) self.port = http_server_port(self.httpd) @@ -107,11 +107,11 @@ class TestHTTPS(unittest.TestCase): ydl = YoutubeDL({'logger': FakeLogger()}) self.assertRaises( Exception, - ydl.extract_info, 'https://localhost:%d/video.html' % self.port) + ydl.extract_info, 'https://127.0.0.1:%d/video.html' % self.port) ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True}) - r = ydl.extract_info('https://localhost:%d/video.html' % self.port) - self.assertEqual(r['entries'][0]['url'], 'https://localhost:%d/vid.mp4' % self.port) + r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port) + self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port) def _build_proxy_handler(name): @@ -132,23 +132,23 @@ def _build_proxy_handler(name): class TestProxy(unittest.TestCase): def setUp(self): self.proxy = compat_http_server.HTTPServer( - ('localhost', 0), _build_proxy_handler('normal')) + ('127.0.0.1', 0), _build_proxy_handler('normal')) self.port = http_server_port(self.proxy) self.proxy_thread = threading.Thread(target=self.proxy.serve_forever) self.proxy_thread.daemon = True self.proxy_thread.start() self.geo_proxy = compat_http_server.HTTPServer( - ('localhost', 0), _build_proxy_handler('geo')) + ('127.0.0.1', 0), _build_proxy_handler('geo')) self.geo_port = http_server_port(self.geo_proxy) self.geo_proxy_thread = threading.Thread(target=self.geo_proxy.serve_forever) self.geo_proxy_thread.daemon = True self.geo_proxy_thread.start() def test_proxy(self): - geo_proxy = 'localhost:{0}'.format(self.geo_port) + geo_proxy = '127.0.0.1:{0}'.format(self.geo_port) ydl = YoutubeDL({ - 'proxy': 'localhost:{0}'.format(self.port), + 'proxy': '127.0.0.1:{0}'.format(self.port), 'geo_verification_proxy': geo_proxy, }) url = 'http://foo.com/bar' @@ -162,7 +162,7 @@ class TestProxy(unittest.TestCase): def test_proxy_with_idn(self): ydl = YoutubeDL({ - 'proxy': 'localhost:{0}'.format(self.port), + 'proxy': '127.0.0.1:{0}'.format(self.port), }) url = 'http://中文.tw/' response = ydl.urlopen(url).read().decode('utf-8') diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 1b8de822a..7d57a628e 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -232,7 +232,7 @@ class TestNPOSubtitles(BaseTestSubtitles): class TestMTVSubtitles(BaseTestSubtitles): - url = 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother' + url = 'http://www.cc.com/video-clips/p63lk0/adam-devine-s-house-party-chasing-white-swans' IE = ComedyCentralIE def getInfoDict(self): @@ -243,7 +243,7 @@ class TestMTVSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'b9f6ca22a6acf597ec76f61749765e65') + self.assertEqual(md5(subtitles['en']), '78206b8d8a0cfa9da64dc026eea48961') class TestNRKSubtitles(BaseTestSubtitles): diff --git a/test/test_utils.py b/test/test_utils.py index 2aab16b97..14503ab53 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -42,6 +42,7 @@ from youtube_dl.utils import ( is_html, js_to_json, limit_length, + merge_dicts, mimetype2ext, month_by_name, multipart_encode, @@ -53,10 +54,12 @@ from youtube_dl.utils import ( parse_filesize, parse_count, parse_iso8601, + parse_resolution, pkcs1pad, read_batch_urls, sanitize_filename, sanitize_path, + sanitize_url, expand_path, prepend_extension, replace_extension, @@ -219,6 +222,12 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_path('./abc'), 'abc') self.assertEqual(sanitize_path('./../abc'), '..\\abc') + def test_sanitize_url(self): + self.assertEqual(sanitize_url('//foo.bar'), 'http://foo.bar') + self.assertEqual(sanitize_url('httpss://foo.bar'), 'https://foo.bar') + self.assertEqual(sanitize_url('rmtps://foo.bar'), 'rtmps://foo.bar') + self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar') + def test_expand_path(self): def env(var): return '%{0}%'.format(var) if sys.platform == 'win32' else '${0}'.format(var) @@ -279,6 +288,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(unescapeHTML('/'), '/') self.assertEqual(unescapeHTML('é'), 'é') self.assertEqual(unescapeHTML('�'), '�') + self.assertEqual(unescapeHTML('&a"'), '&a"') # HTML5 entities self.assertEqual(unescapeHTML('.''), '.\'') @@ -342,6 +352,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_timestamp('Feb 7, 2016 at 6:35 pm'), 1454870100) self.assertEqual(unified_timestamp('2017-03-30T17:52:41Q'), 1490896361) self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540) + self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140) + self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') @@ -539,6 +551,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('87 Min.'), 5220) self.assertEqual(parse_duration('PT1H0.040S'), 3600.04) self.assertEqual(parse_duration('PT00H03M30SZ'), 210) + self.assertEqual(parse_duration('P0Y0M0DT0H4M20.880S'), 260.88) def test_fix_xml_ampersands(self): self.assertEqual( @@ -657,6 +670,17 @@ class TestUtil(unittest.TestCase): self.assertEqual(dict_get(d, ('b', 'c', key, )), None) self.assertEqual(dict_get(d, ('b', 'c', key, ), skip_false_values=False), false_value) + def test_merge_dicts(self): + self.assertEqual(merge_dicts({'a': 1}, {'b': 2}), {'a': 1, 'b': 2}) + self.assertEqual(merge_dicts({'a': 1}, {'a': 2}), {'a': 1}) + self.assertEqual(merge_dicts({'a': 1}, {'a': None}), {'a': 1}) + self.assertEqual(merge_dicts({'a': 1}, {'a': ''}), {'a': 1}) + self.assertEqual(merge_dicts({'a': 1}, {}), {'a': 1}) + self.assertEqual(merge_dicts({'a': None}, {'a': 1}), {'a': 1}) + self.assertEqual(merge_dicts({'a': ''}, {'a': 1}), {'a': ''}) + self.assertEqual(merge_dicts({'a': ''}, {'a': 'abc'}), {'a': 'abc'}) + self.assertEqual(merge_dicts({'a': None}, {'a': ''}, {'a': 'abc'}), {'a': 'abc'}) + def test_encode_compat_str(self): self.assertEqual(encode_compat_str(b'\xd1\x82\xd0\xb5\xd1\x81\xd1\x82', 'utf-8'), 'тест') self.assertEqual(encode_compat_str('тест', 'utf-8'), 'тест') @@ -811,6 +835,9 @@ class TestUtil(unittest.TestCase): inp = '''{"duration": "00:01:07"}''' self.assertEqual(js_to_json(inp), '''{"duration": "00:01:07"}''') + inp = '''{segments: [{"offset":-3.885780586188048e-16,"duration":39.75000000000001}]}''' + self.assertEqual(js_to_json(inp), '''{"segments": [{"offset":-3.885780586188048e-16,"duration":39.75000000000001}]}''') + def test_js_to_json_edgecases(self): on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}") self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"}) @@ -882,6 +909,13 @@ class TestUtil(unittest.TestCase): on = js_to_json('{/*comment\n*/42/*comment\n*/:/*comment\n*/42/*comment\n*/}') self.assertEqual(json.loads(on), {'42': 42}) + on = js_to_json('{42:4.2e1}') + self.assertEqual(json.loads(on), {'42': 42.0}) + + def test_js_to_json_malformed(self): + self.assertEqual(js_to_json('42a1'), '42"a1"') + self.assertEqual(js_to_json('42a-1'), '42"a"-1') + def test_extract_attributes(self): self.assertEqual(extract_attributes(''), {'x': 'y'}) self.assertEqual(extract_attributes(""), {'x': 'y'}) @@ -962,6 +996,16 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_count('1.1kk '), 1100000) self.assertEqual(parse_count('1.1kk views'), 1100000) + def test_parse_resolution(self): + self.assertEqual(parse_resolution(None), {}) + self.assertEqual(parse_resolution(''), {}) + self.assertEqual(parse_resolution('1920x1080'), {'width': 1920, 'height': 1080}) + self.assertEqual(parse_resolution('1920×1080'), {'width': 1920, 'height': 1080}) + self.assertEqual(parse_resolution('1920 x 1080'), {'width': 1920, 'height': 1080}) + self.assertEqual(parse_resolution('720p'), {'height': 720}) + self.assertEqual(parse_resolution('4k'), {'height': 2160}) + self.assertEqual(parse_resolution('8K'), {'height': 4320}) + def test_version_tuple(self): self.assertEqual(version_tuple('1'), (1,)) self.assertEqual(version_tuple('10.23.344'), (10, 23, 344)) @@ -1040,6 +1084,18 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') self.assertFalse(match_str( 'like_count > 100 & dislike_count Ignored, three

- ''' + '''.encode('utf-8') srt_data = '''1 00:00:00,000 --> 00:00:01,000 The following line contains Chinese characters and special symbols @@ -1088,7 +1144,7 @@ Line

The first line

- ''' + '''.encode('utf-8') srt_data = '''1 00:00:00,000 --> 00:00:01,000 The first line @@ -1114,7 +1170,7 @@ The first line

inner
style

-''' +'''.encode('utf-8') srt_data = '''1 00:00:02,080 --> 00:00:05,839 default stylecustom style @@ -1137,6 +1193,26 @@ part 3 ''' self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data) + dfxp_data_non_utf8 = ''' + + +
+

Line 1

+

第二行

+
+ +
'''.encode('utf-16') + srt_data = '''1 +00:00:00,000 --> 00:00:01,000 +Line 1 + +2 +00:00:01,000 --> 00:00:02,000 +第二行 + +''' + self.assertEqual(dfxp2srt(dfxp_data_non_utf8), srt_data) + def test_cli_option(self): self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128']) self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 7a33dbf88..c4f0abbea 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -61,7 +61,7 @@ class TestYoutubeLists(unittest.TestCase): dl = FakeYDL() dl.params['extract_flat'] = True ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') + result = ie.extract('https://www.youtube.com/playlist?list=PL-KKIb8rvtMSrAO9YFbeM6UQrAqoFTUWv') self.assertIsPlaylist(result) for entry in result['entries']: self.assertTrue(entry.get('title')) diff --git a/test/testdata/f4m/custom_base_url.f4m b/test/testdata/f4m/custom_base_url.f4m new file mode 100644 index 000000000..74e1539e8 --- /dev/null +++ b/test/testdata/f4m/custom_base_url.f4m @@ -0,0 +1,10 @@ + + + recorded + http://vod.livestream.com/events/0000000000673980/ + 269.293 + AAAAm2Fic3QAAAAAAAAAAQAAAAPoAAAAAAAEG+0AAAAAAAAAAAAAAAAAAQAAABlhc3J0AAAAAAAAAAABAAAAAQAAAC4BAAAAVmFmcnQAAAAAAAAD6AAAAAAEAAAAAQAAAAAAAAAAAAAXcAAAAC0AAAAAAAQHQAAAE5UAAAAuAAAAAAAEGtUAAAEYAAAAAAAAAAAAAAAAAAAAAAA= + + AgAKb25NZXRhRGF0YQgAAAAIAAhkdXJhdGlvbgBAcNSwIMSbpgAFd2lkdGgAQJQAAAAAAAAABmhlaWdodABAhoAAAAAAAAAJZnJhbWVyYXRlAEA4/7DoLwW3AA12aWRlb2RhdGFyYXRlAECe1DLgjcobAAx2aWRlb2NvZGVjaWQAQBwAAAAAAAAADWF1ZGlvZGF0YXJhdGUAQGSimlvaPKQADGF1ZGlvY29kZWNpZABAJAAAAAAAAAAACQ== + + diff --git a/test/testdata/mpd/float_duration.mpd b/test/testdata/mpd/float_duration.mpd new file mode 100644 index 000000000..8dc1d2d5e --- /dev/null +++ b/test/testdata/mpd/float_duration.mpd @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/test/testdata/mpd/urls_only.mpd b/test/testdata/mpd/urls_only.mpd new file mode 100644 index 000000000..2b9d595d3 --- /dev/null +++ b/test/testdata/mpd/urls_only.mpd @@ -0,0 +1,218 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/testdata/xspf/foo_xspf.xspf b/test/testdata/xspf/foo_xspf.xspf new file mode 100644 index 000000000..b7f0086b3 --- /dev/null +++ b/test/testdata/xspf/foo_xspf.xspf @@ -0,0 +1,34 @@ + + + 2018-03-09T18:01:43Z + + + cd1/track%201.mp3 + Pandemonium + Foilverb + Visit http://bigbrother404.bandcamp.com + Pandemonium EP + 1 + 202416 + + + ../%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF%E3%80%80%EF%BC%92.mp3 + Final Cartridge (Nichico Twelve Remix) + Visit http://bigbrother404.bandcamp.com + Foilverb + Pandemonium EP + 2 + 255857 + + + track3.mp3 + https://example.com/track3.mp3 + Rebuilding Nightingale + Visit http://bigbrother404.bandcamp.com + Foilverb + Pandemonium EP + 3 + 287915 + + + diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 367ae3533..ad3598805 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -65,6 +65,7 @@ from .utils import ( locked_file, make_HTTPS_handler, MaxDownloadsReached, + orderedSet, PagedList, parse_filesize, PerRequestProxyHandler, @@ -92,6 +93,7 @@ from .utils import ( ) from .cache import Cache from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER +from .extractor.openload import PhantomJSwrapper from .downloader import get_suitable_downloader from .downloader.rtmp import rtmpdump_version from .postprocessor import ( @@ -296,13 +298,20 @@ class YoutubeDL(object): the downloader (see youtube_dl/downloader/common.py): nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle, - xattr_set_filesize, external_downloader_args, hls_use_mpegts. + xattr_set_filesize, external_downloader_args, hls_use_mpegts, + http_chunk_size. The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, otherwise prefer avconv. postprocessor_args: A list of additional command-line arguments for the postprocessor. + + The following options are used by the Youtube extractor: + youtube_include_dash_manifest: If True (default), DASH manifests and related + data will be downloaded and processed by extractor. + You can reduce network I/O by disabling it if you don't + care about DASH. """ _NUMERIC_FIELDS = set(( @@ -523,6 +532,8 @@ class YoutubeDL(object): def save_console_title(self): if not self.params.get('consoletitle', False): return + if self.params.get('simulate', False): + return if compat_os_name != 'nt' and 'TERM' in os.environ: # Save the title on stack self._write_string('\033[22;0t', self._screen_file) @@ -530,6 +541,8 @@ class YoutubeDL(object): def restore_console_title(self): if not self.params.get('consoletitle', False): return + if self.params.get('simulate', False): + return if compat_os_name != 'nt' and 'TERM' in os.environ: # Restore the title from stack self._write_string('\033[23;0t', self._screen_file) @@ -901,15 +914,25 @@ class YoutubeDL(object): yield int(item) else: yield int(string_segment) - playlistitems = iter_playlistitems(playlistitems_str) + playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) ie_entries = ie_result['entries'] + + def make_playlistitems_entries(list_ie_entries): + num_entries = len(list_ie_entries) + return [ + list_ie_entries[i - 1] for i in playlistitems + if -num_entries <= i - 1 < num_entries] + + def report_download(num_entries): + self.to_screen( + '[%s] playlist %s: Downloading %d videos' % + (ie_result['extractor'], playlist, num_entries)) + if isinstance(ie_entries, list): n_all_entries = len(ie_entries) if playlistitems: - entries = [ - ie_entries[i - 1] for i in playlistitems - if -n_all_entries <= i - 1 < n_all_entries] + entries = make_playlistitems_entries(ie_entries) else: entries = ie_entries[playliststart:playlistend] n_entries = len(entries) @@ -927,20 +950,16 @@ class YoutubeDL(object): entries = ie_entries.getslice( playliststart, playlistend) n_entries = len(entries) - self.to_screen( - '[%s] playlist %s: Downloading %d videos' % - (ie_result['extractor'], playlist, n_entries)) + report_download(n_entries) else: # iterable if playlistitems: - entry_list = list(ie_entries) - entries = [entry_list[i - 1] for i in playlistitems] + entries = make_playlistitems_entries(list(itertools.islice( + ie_entries, 0, max(playlistitems)))) else: entries = list(itertools.islice( ie_entries, playliststart, playlistend)) n_entries = len(entries) - self.to_screen( - '[%s] playlist %s: Downloading %d videos' % - (ie_result['extractor'], playlist, n_entries)) + report_download(n_entries) if self.params.get('playlistreverse', False): entries = entries[::-1] @@ -961,6 +980,8 @@ class YoutubeDL(object): 'playlist': playlist, 'playlist_id': ie_result.get('id'), 'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), 'playlist_index': i + playliststart, 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], @@ -1016,7 +1037,7 @@ class YoutubeDL(object): '!=': operator.ne, } operator_rex = re.compile(r'''(?x)\s* - (?Pwidth|height|tbr|abr|vbr|asr|filesize|fps) + (?Pwidth|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps) \s*(?P%s)(?P\s*\?)?\s* (?P[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) $ @@ -1065,22 +1086,27 @@ class YoutubeDL(object): return _filter def _default_format_spec(self, info_dict, download=True): - req_format_list = [] - def can_have_partial_formats(): - if self.params.get('simulate', False): - return True - if not download: - return True - if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-': - return False - if info_dict.get('is_live'): - return False + def can_merge(): merger = FFmpegMergerPP(self) return merger.available and merger.can_merge() - if can_have_partial_formats(): - req_format_list.append('bestvideo+bestaudio') - req_format_list.append('best') + + def prefer_best(): + if self.params.get('simulate', False): + return False + if not download: + return False + if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-': + return True + if info_dict.get('is_live'): + return True + if not can_merge(): + return True + return False + + req_format_list = ['bestvideo+bestaudio', 'best'] + if prefer_best(): + req_format_list.reverse() return '/'.join(req_format_list) def build_format_selector(self, format_spec): @@ -1483,12 +1509,14 @@ class YoutubeDL(object): def is_wellformed(f): url = f.get('url') - valid_url = url and isinstance(url, compat_str) - if not valid_url: + if not url: self.report_warning( '"url" field is missing or empty - skipping format, ' 'there is an error in extractor') - return valid_url + return False + if isinstance(url, bytes): + sanitize_string_field(f, 'url') + return True # Filter out malformed formats for better extraction robustness formats = list(filter(is_wellformed, formats)) @@ -1500,7 +1528,7 @@ class YoutubeDL(object): sanitize_string_field(format, 'format_id') sanitize_numeric_fields(format) format['url'] = sanitize_url(format['url']) - if format.get('format_id') is None: + if not format.get('format_id'): format['format_id'] = compat_str(i) else: # Sanitize format_id from characters used in format selector expression @@ -1708,12 +1736,17 @@ class YoutubeDL(object): if filename is None: return - try: - dn = os.path.dirname(sanitize_path(encodeFilename(filename))) - if dn and not os.path.exists(dn): - os.makedirs(dn) - except (OSError, IOError) as err: - self.report_error('unable to create directory ' + error_to_compat_str(err)) + def ensure_dir_exists(path): + try: + dn = os.path.dirname(path) + if dn and not os.path.exists(dn): + os.makedirs(dn) + return True + except (OSError, IOError) as err: + self.report_error('unable to create directory ' + error_to_compat_str(err)) + return False + + if not ensure_dir_exists(sanitize_path(encodeFilename(filename))): return if self.params.get('writedescription', False): @@ -1756,29 +1789,30 @@ class YoutubeDL(object): ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] - if sub_info.get('data') is not None: - sub_data = sub_info['data'] + sub_filename = subtitles_filename(filename, sub_lang, sub_format) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): + self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) else: - try: - sub_data = ie._download_webpage( - sub_info['url'], info_dict['id'], note=False) - except ExtractorError as err: - self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, error_to_compat_str(err.cause))) - continue - try: - sub_filename = subtitles_filename(filename, sub_lang, sub_format) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): - self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) + self.to_screen('[info] Writing video subtitles to: ' + sub_filename) + if sub_info.get('data') is not None: + try: + # Use newline='' to prevent conversion of newline characters + # See https://github.com/rg3/youtube-dl/issues/10268 + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: + subfile.write(sub_info['data']) + except (OSError, IOError): + self.report_error('Cannot write subtitles file ' + sub_filename) + return else: - self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - # Use newline='' to prevent conversion of newline characters - # See https://github.com/rg3/youtube-dl/issues/10268 - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: - subfile.write(sub_data) - except (OSError, IOError): - self.report_error('Cannot write subtitles file ' + sub_filename) - return + try: + sub_data = ie._request_webpage( + sub_info['url'], info_dict['id'], note=False).read() + with io.open(encodeFilename(sub_filename), 'wb') as subfile: + subfile.write(sub_data) + except (ExtractorError, IOError, OSError, ValueError) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, error_to_compat_str(err))) + continue if self.params.get('writeinfojson', False): infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) @@ -1819,7 +1853,7 @@ class YoutubeDL(object): def compatible_formats(formats): video, audio = formats # Check extension - video_ext, audio_ext = audio.get('ext'), video.get('ext') + video_ext, audio_ext = video.get('ext'), audio.get('ext') if video_ext and audio_ext: COMPATIBLE_EXTS = ( ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'), @@ -1851,8 +1885,11 @@ class YoutubeDL(object): for f in requested_formats: new_info = dict(info_dict) new_info.update(f) - fname = self.prepare_filename(new_info) - fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext']) + fname = prepend_extension( + self.prepare_filename(new_info), + 'f%s' % f['format_id'], new_info['ext']) + if not ensure_dir_exists(fname): + return downloaded.append(fname) partial_success = dl(fname, new_info) success = success and partial_success @@ -2201,11 +2238,20 @@ class YoutubeDL(object): sys.exc_clear() except Exception: pass - self._write_string('[debug] Python version %s - %s\n' % ( - platform.python_version(), platform_name())) + + def python_implementation(): + impl_name = platform.python_implementation() + if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'): + return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] + return impl_name + + self._write_string('[debug] Python version %s (%s) - %s\n' % ( + platform.python_version(), python_implementation(), + platform_name())) exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version() + exe_versions['phantomjs'] = PhantomJSwrapper._version() exe_str = ', '.join( '%s %s' % (exe, v) for exe, v in sorted(exe_versions.items()) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c4589411e..9bb952457 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -191,6 +191,11 @@ def _real_main(argv=None): if numeric_buffersize is None: parser.error('invalid buffer size specified') opts.buffersize = numeric_buffersize + if opts.http_chunk_size is not None: + numeric_chunksize = FileDownloader.parse_bytes(opts.http_chunk_size) + if not numeric_chunksize: + parser.error('invalid http chunk size specified') + opts.http_chunk_size = numeric_chunksize if opts.playliststart <= 0: raise ValueError('Playlist start must be positive') if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: @@ -206,7 +211,7 @@ def _real_main(argv=None): if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']: parser.error('invalid video recode format specified') if opts.convertsubtitles is not None: - if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: + if opts.convertsubtitles not in ['srt', 'vtt', 'ass', 'lrc']: parser.error('invalid subtitle format specified') if opts.date is not None: @@ -346,6 +351,7 @@ def _real_main(argv=None): 'keep_fragments': opts.keep_fragments, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, + 'http_chunk_size': opts.http_chunk_size, 'continuedl': opts.continue_dl, 'noprogress': opts.noprogress, 'progress_with_newline': opts.progress_with_newline, diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index c5bb3c4ef..461bb6d41 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -1,8 +1,8 @@ from __future__ import unicode_literals -import base64 from math import ceil +from .compat import compat_b64decode from .utils import bytes_to_intlist, intlist_to_bytes BLOCK_SIZE_BYTES = 16 @@ -180,7 +180,7 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) + data = bytes_to_intlist(compat_b64decode(data)) password = bytes_to_intlist(password.encode('utf-8')) key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 9e4e13bcf..4a611f183 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,13 +1,17 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import binascii import collections +import ctypes import email import getpass import io +import itertools import optparse import os +import platform import re import shlex import shutil @@ -15,7 +19,6 @@ import socket import struct import subprocess import sys -import itertools import xml.etree.ElementTree @@ -2894,19 +2897,72 @@ except TypeError: if isinstance(spec, compat_str): spec = spec.encode('ascii') return struct.unpack(spec, *args) + + class compat_Struct(struct.Struct): + def __init__(self, fmt): + if isinstance(fmt, compat_str): + fmt = fmt.encode('ascii') + super(compat_Struct, self).__init__(fmt) else: compat_struct_pack = struct.pack compat_struct_unpack = struct.unpack + if platform.python_implementation() == 'IronPython' and sys.version_info < (2, 7, 8): + class compat_Struct(struct.Struct): + def unpack(self, string): + if not isinstance(string, buffer): # noqa: F821 + string = buffer(string) # noqa: F821 + return super(compat_Struct, self).unpack(string) + else: + compat_Struct = struct.Struct + + +try: + from future_builtins import zip as compat_zip +except ImportError: # not 2.6+ or is 3.x + try: + from itertools import izip as compat_zip # < 2.5 or 3.x + except ImportError: + compat_zip = zip + + +if sys.version_info < (3, 3): + def compat_b64decode(s, *args, **kwargs): + if isinstance(s, compat_str): + s = s.encode('ascii') + return base64.b64decode(s, *args, **kwargs) +else: + compat_b64decode = base64.b64decode + + +if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): + # PyPy2 prior to version 5.4.0 expects byte strings as Windows function + # names, see the original PyPy issue [1] and the youtube-dl one [2]. + # 1. https://bitbucket.org/pypy/pypy/issues/2360/windows-ctypescdll-typeerror-function-name + # 2. https://github.com/rg3/youtube-dl/pull/4392 + def compat_ctypes_WINFUNCTYPE(*args, **kwargs): + real = ctypes.WINFUNCTYPE(*args, **kwargs) + + def resf(tpl, *args, **kwargs): + funcname, dll = tpl + return real((str(funcname), dll), *args, **kwargs) + + return resf +else: + def compat_ctypes_WINFUNCTYPE(*args, **kwargs): + return ctypes.WINFUNCTYPE(*args, **kwargs) __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', + 'compat_Struct', + 'compat_b64decode', 'compat_basestring', 'compat_chr', 'compat_cookiejar', 'compat_cookies', + 'compat_ctypes_WINFUNCTYPE', 'compat_etree_fromstring', 'compat_etree_register_namespace', 'compat_expanduser', @@ -2948,5 +3004,6 @@ __all__ = [ 'compat_urlretrieve', 'compat_xml_parse_error', 'compat_xpath', + 'compat_zip', 'workaround_optparse_bug9161', ] diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 77242dacc..edd125ee2 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -49,6 +49,9 @@ class FileDownloader(object): external_downloader_args: A list of additional command-line arguments for the external downloader. hls_use_mpegts: Use the mpegts container for HLS videos. + http_chunk_size: Size of a chunk for chunk-based HTTP downloading. May be + useful for bypassing bandwidth throttling imposed by + a webserver (experimental) Subclasses of this one must re-define the real_download method. """ @@ -246,12 +249,13 @@ class FileDownloader(object): if self.params.get('noprogress', False): self.to_screen('[download] Download completed') else: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) + msg_template = '100%%' + if s.get('total_bytes') is not None: + s['_total_bytes_str'] = format_bytes(s['total_bytes']) + msg_template += ' of %(_total_bytes_str)s' if s.get('elapsed') is not None: s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template = '100%% of %(_total_bytes_str)s in %(_elapsed_str)s' - else: - msg_template = '100%% of %(_total_bytes_str)s' + msg_template += ' in %(_elapsed_str)s' self._report_progress_status( msg_template % s, is_last_line=True) @@ -304,11 +308,11 @@ class FileDownloader(object): """Report attempt to resume at given byte.""" self.to_screen('[download] Resuming download at byte %s' % resume_len) - def report_retry(self, count, retries): + def report_retry(self, err, count, retries): """Report retry in case of HTTP error 5xx""" self.to_screen( - '[download] Got server HTTP error. Retrying (attempt %d of %s)...' - % (count, self.format_retries(retries))) + '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...' + % (error_to_compat_str(err), count, self.format_retries(retries))) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index db018fa89..958d00aac 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -1,9 +1,10 @@ from __future__ import unicode_literals import os.path +import re import subprocess import sys -import re +import time from .common import FileDownloader from ..compat import ( @@ -30,6 +31,7 @@ class ExternalFD(FileDownloader): tmpfilename = self.temp_name(filename) try: + started = time.time() retval = self._call_downloader(tmpfilename, info_dict) except KeyboardInterrupt: if not info_dict.get('is_live'): @@ -41,15 +43,20 @@ class ExternalFD(FileDownloader): self.to_screen('[%s] Interrupted by user' % self.get_basename()) if retval == 0: - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize)) - self.try_rename(tmpfilename, filename) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, + status = { 'filename': filename, 'status': 'finished', - }) + 'elapsed': time.time() - started, + } + if filename != '-': + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize)) + self.try_rename(tmpfilename, filename) + status.update({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + }) + self._hook_progress(status) return True else: self.to_stderr('\n') diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index c8fde9a89..15e71be9a 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -1,12 +1,12 @@ from __future__ import division, unicode_literals -import base64 import io import itertools import time from .fragment import FragmentFD from ..compat import ( + compat_b64decode, compat_etree_fromstring, compat_urlparse, compat_urllib_error, @@ -243,8 +243,17 @@ def remove_encrypted_media(media): media)) -def _add_ns(prop): - return '{http://ns.adobe.com/f4m/1.0}%s' % prop +def _add_ns(prop, ver=1): + return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop) + + +def get_base_url(manifest): + base_url = xpath_text( + manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)], + 'base URL', default=None) + if base_url: + base_url = base_url.strip() + return base_url class F4mFD(FragmentFD): @@ -303,7 +312,7 @@ class F4mFD(FragmentFD): boot_info = self._get_bootstrap_from_url(bootstrap_url) else: bootstrap_url = None - bootstrap = base64.b64decode(node.text.encode('ascii')) + bootstrap = compat_b64decode(node.text) boot_info = read_bootstrap_info(bootstrap) return boot_info, bootstrap_url @@ -330,17 +339,17 @@ class F4mFD(FragmentFD): rate, media = list(filter( lambda f: int(f[0]) == requested_bitrate, formats))[0] - base_url = compat_urlparse.urljoin(man_url, media.attrib['url']) + # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. + man_base_url = get_base_url(doc) or man_url + + base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) - # From Adobe F4M 3.0 spec: - # The element SHALL be the base URL for all relative - # (HTTP-based) URLs in the manifest. If is not present, said - # URLs should be relative to the location of the containing document. - boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url) + boot_info, bootstrap_url = self._parse_bootstrap_node( + bootstrap_node, man_base_url) live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: - metadata = base64.b64decode(metadata_node.text.encode('ascii')) + metadata = compat_b64decode(metadata_node.text) else: metadata = None diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index bccc8ecc1..917f6dc01 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -74,9 +74,14 @@ class FragmentFD(FileDownloader): return not ctx['live'] and not ctx['tmpfilename'] == '-' def _read_ytdl_file(self, ctx): + assert 'ytdl_corrupt' not in ctx stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r') - ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index'] - stream.close() + try: + ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index'] + except Exception: + ctx['ytdl_corrupt'] = True + finally: + stream.close() def _write_ytdl_file(self, ctx): frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w') @@ -107,19 +112,26 @@ class FragmentFD(FileDownloader): def _append_fragment(self, ctx, frag_content): try: ctx['dest_stream'].write(frag_content) + ctx['dest_stream'].flush() finally: if self.__do_ytdl_file(ctx): self._write_ytdl_file(ctx) if not self.params.get('keep_fragments', False): - os.remove(ctx['fragment_filename_sanitized']) + os.remove(encodeFilename(ctx['fragment_filename_sanitized'])) del ctx['fragment_filename_sanitized'] def _prepare_frag_download(self, ctx): if 'live' not in ctx: ctx['live'] = False + if not ctx['live']: + total_frags_str = '%d' % ctx['total_frags'] + ad_frags = ctx.get('ad_frags', 0) + if ad_frags: + total_frags_str += ' (not including %d ad)' % ad_frags + else: + total_frags_str = 'unknown (live)' self.to_screen( - '[%s] Total fragments: %s' - % (self.FD_NAME, ctx['total_frags'] if not ctx['live'] else 'unknown (live)')) + '[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str)) self.report_destination(ctx['filename']) dl = HttpQuietDownloader( self.ydl, @@ -151,10 +163,21 @@ class FragmentFD(FileDownloader): if self.__do_ytdl_file(ctx): if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): self._read_ytdl_file(ctx) + is_corrupt = ctx.get('ytdl_corrupt') is True + is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 + if is_corrupt or is_inconsistent: + message = ( + '.ytdl file is corrupt' if is_corrupt else + 'Inconsistent state of incomplete fragment download') + self.report_warning( + '%s. Restarting from the beginning...' % message) + ctx['fragment_index'] = resume_len = 0 + if 'ytdl_corrupt' in ctx: + del ctx['ytdl_corrupt'] + self._write_ytdl_file(ctx) else: self._write_ytdl_file(ctx) - if ctx['fragment_index'] > 0: - assert resume_len > 0 + assert ctx['fragment_index'] == 0 dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) @@ -229,12 +252,16 @@ class FragmentFD(FileDownloader): if os.path.isfile(ytdl_filename): os.remove(ytdl_filename) elapsed = time.time() - ctx['started'] - self.try_rename(ctx['tmpfilename'], ctx['filename']) - fsize = os.path.getsize(encodeFilename(ctx['filename'])) + + if ctx['tmpfilename'] == '-': + downloaded_bytes = ctx['complete_frags_downloaded_bytes'] + else: + self.try_rename(ctx['tmpfilename'], ctx['filename']) + downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, + 'downloaded_bytes': downloaded_bytes, + 'total_bytes': downloaded_bytes, 'filename': ctx['filename'], 'status': 'finished', 'elapsed': elapsed, diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 46308cf07..fd304527e 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -75,15 +75,31 @@ class HlsFD(FragmentFD): fd.add_progress_hook(ph) return fd.real_download(filename, info_dict) - total_frags = 0 + def is_ad_fragment(s): + return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s or + s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')) + + media_frags = 0 + ad_frags = 0 + ad_frag_next = False for line in s.splitlines(): line = line.strip() - if line and not line.startswith('#'): - total_frags += 1 + if not line: + continue + if line.startswith('#'): + if is_ad_fragment(line): + ad_frags += 1 + ad_frag_next = True + continue + if ad_frag_next: + ad_frag_next = False + continue + media_frags += 1 ctx = { 'filename': filename, - 'total_frags': total_frags, + 'total_frags': media_frags, + 'ad_frags': ad_frags, } self._prepare_and_start_frag_download(ctx) @@ -101,10 +117,14 @@ class HlsFD(FragmentFD): decrypt_info = {'METHOD': 'NONE'} byte_range = {} frag_index = 0 + ad_frag_next = False for line in s.splitlines(): line = line.strip() if line: if not line.startswith('#'): + if ad_frag_next: + ad_frag_next = False + continue frag_index += 1 if frag_index <= ctx['fragment_index']: continue @@ -144,7 +164,8 @@ class HlsFD(FragmentFD): return False if decrypt_info['METHOD'] == 'AES-128': iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) - decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(decrypt_info['URI']).read() + decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( + self._prepare_url(info_dict, decrypt_info['URI'])).read() frag_content = AES.new( decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) self._append_fragment(ctx, frag_content) @@ -175,6 +196,8 @@ class HlsFD(FragmentFD): 'start': sub_range_start, 'end': sub_range_start + int(splitted_byte_range[0]), } + elif is_ad_fragment(line): + ad_frag_next = True self._finish_frag_download(ctx) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index af405b950..a22875f69 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -4,13 +4,18 @@ import errno import os import socket import time +import random import re from .common import FileDownloader -from ..compat import compat_urllib_error +from ..compat import ( + compat_str, + compat_urllib_error, +) from ..utils import ( ContentTooShortError, encodeFilename, + int_or_none, sanitize_open, sanitized_Request, write_xattr, @@ -22,79 +27,133 @@ from ..utils import ( class HttpFD(FileDownloader): def real_download(self, filename, info_dict): url = info_dict['url'] - tmpfilename = self.temp_name(filename) - stream = None + + class DownloadContext(dict): + __getattr__ = dict.get + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + ctx = DownloadContext() + ctx.filename = filename + ctx.tmpfilename = self.temp_name(filename) + ctx.stream = None # Do not include the Accept-Encoding header headers = {'Youtubedl-no-compression': 'True'} add_headers = info_dict.get('http_headers') if add_headers: headers.update(add_headers) - basic_request = sanitized_Request(url, None, headers) - request = sanitized_Request(url, None, headers) is_test = self.params.get('test', False) + chunk_size = self._TEST_FILE_SIZE if is_test else ( + info_dict.get('downloader_options', {}).get('http_chunk_size') or + self.params.get('http_chunk_size') or 0) - if is_test: - request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1)) + ctx.open_mode = 'wb' + ctx.resume_len = 0 + ctx.data_len = None + ctx.block_size = self.params.get('buffersize', 1024) + ctx.start_time = time.time() + ctx.chunk_size = None - # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): - resume_len = os.path.getsize(encodeFilename(tmpfilename)) - else: - resume_len = 0 + if self.params.get('continuedl', True): + # Establish possible resume length + if os.path.isfile(encodeFilename(ctx.tmpfilename)): + ctx.resume_len = os.path.getsize( + encodeFilename(ctx.tmpfilename)) - open_mode = 'wb' - if resume_len != 0: - if self.params.get('continuedl', True): - self.report_resuming_byte(resume_len) - request.add_header('Range', 'bytes=%d-' % resume_len) - open_mode = 'ab' - else: - resume_len = 0 + ctx.is_resume = ctx.resume_len > 0 count = 0 retries = self.params.get('retries', 0) - while count <= retries: + + class SucceedDownload(Exception): + pass + + class RetryDownload(Exception): + def __init__(self, source_error): + self.source_error = source_error + + class NextFragment(Exception): + pass + + def set_range(req, start, end): + range_header = 'bytes=%d-' % start + if end: + range_header += compat_str(end) + req.add_header('Range', range_header) + + def establish_connection(): + ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size) + if not is_test and chunk_size else chunk_size) + if ctx.resume_len > 0: + range_start = ctx.resume_len + if ctx.is_resume: + self.report_resuming_byte(ctx.resume_len) + ctx.open_mode = 'ab' + elif ctx.chunk_size > 0: + range_start = 0 + else: + range_start = None + ctx.is_resume = False + range_end = range_start + ctx.chunk_size - 1 if ctx.chunk_size else None + if range_end and ctx.data_len is not None and range_end >= ctx.data_len: + range_end = ctx.data_len - 1 + has_range = range_start is not None + ctx.has_range = has_range + request = sanitized_Request(url, None, headers) + if has_range: + set_range(request, range_start, range_end) # Establish connection try: - data = self.ydl.urlopen(request) + ctx.data = self.ydl.urlopen(request) # When trying to resume, Content-Range HTTP header of response has to be checked # to match the value of requested Range HTTP header. This is due to a webservers # that don't support resuming and serve a whole file with no Content-Range # set in response despite of requested Range (see # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) - if resume_len > 0: - content_range = data.headers.get('Content-Range') + if has_range: + content_range = ctx.data.headers.get('Content-Range') if content_range: - content_range_m = re.search(r'bytes (\d+)-', content_range) + content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range) # Content-Range is present and matches requested Range, resume is possible - if content_range_m and resume_len == int(content_range_m.group(1)): - break + if content_range_m: + if range_start == int(content_range_m.group(1)): + content_range_end = int_or_none(content_range_m.group(2)) + content_len = int_or_none(content_range_m.group(3)) + accept_content_len = ( + # Non-chunked download + not ctx.chunk_size or + # Chunked download and requested piece or + # its part is promised to be served + content_range_end == range_end or + content_len < range_end) + if accept_content_len: + ctx.data_len = content_len + return # Content-Range is either not present or invalid. Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload self.report_unable_to_resume() - resume_len = 0 - open_mode = 'wb' - break + ctx.resume_len = 0 + ctx.open_mode = 'wb' + ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None)) + return except (compat_urllib_error.HTTPError, ) as err: - if (err.code < 500 or err.code >= 600) and err.code != 416: - # Unexpected HTTP error - raise - elif err.code == 416: + if err.code == 416: # Unable to resume (requested range not satisfiable) try: # Open the connection again without the range header - data = self.ydl.urlopen(basic_request) - content_length = data.info()['Content-Length'] + ctx.data = self.ydl.urlopen( + sanitized_Request(url, None, headers)) + content_length = ctx.data.info()['Content-Length'] except (compat_urllib_error.HTTPError, ) as err: if err.code < 500 or err.code >= 600: raise else: # Examine the reported length if (content_length is not None and - (resume_len - 100 < int(content_length) < resume_len + 100)): + (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): # The file had already been fully downloaded. # Explanation to the above condition: in issue #175 it was revealed that # YouTube sometimes adds or removes a few bytes from the end of the file, @@ -102,152 +161,193 @@ class HttpFD(FileDownloader): # I decided to implement a suggested change and consider the file # completely downloaded if the file size differs less than 100 bytes from # the one in the hard drive. - self.report_file_already_downloaded(filename) - self.try_rename(tmpfilename, filename) + self.report_file_already_downloaded(ctx.filename) + self.try_rename(ctx.tmpfilename, ctx.filename) self._hook_progress({ - 'filename': filename, + 'filename': ctx.filename, 'status': 'finished', - 'downloaded_bytes': resume_len, - 'total_bytes': resume_len, + 'downloaded_bytes': ctx.resume_len, + 'total_bytes': ctx.resume_len, }) - return True + raise SucceedDownload() else: # The length does not match, we start the download over self.report_unable_to_resume() - resume_len = 0 - open_mode = 'wb' - break - except socket.error as e: - if e.errno != errno.ECONNRESET: + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return + elif err.code < 500 or err.code >= 600: + # Unexpected HTTP error + raise + raise RetryDownload(err) + except socket.error as err: + if err.errno != errno.ECONNRESET: # Connection reset is no problem, just retry raise + raise RetryDownload(err) - # Retry - count += 1 - if count <= retries: - self.report_retry(count, retries) + def download(): + data_len = ctx.data.info().get('Content-length', None) - if count > retries: - self.report_error('giving up after %s retries' % retries) - return False + # Range HTTP header may be ignored/unsupported by a webserver + # (e.g. extractor/scivee.py, extractor/bambuser.py). + # However, for a test we still would like to download just a piece of a file. + # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control + # block size when downloading a file. + if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): + data_len = self._TEST_FILE_SIZE - data_len = data.info().get('Content-length', None) - - # Range HTTP header may be ignored/unsupported by a webserver - # (e.g. extractor/scivee.py, extractor/bambuser.py). - # However, for a test we still would like to download just a piece of a file. - # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control - # block size when downloading a file. - if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): - data_len = self._TEST_FILE_SIZE - - if data_len is not None: - data_len = int(data_len) + resume_len - min_data_len = self.params.get('min_filesize') - max_data_len = self.params.get('max_filesize') - if min_data_len is not None and data_len < min_data_len: - self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) - return False - if max_data_len is not None and data_len > max_data_len: - self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) - return False - - byte_counter = 0 + resume_len - block_size = self.params.get('buffersize', 1024) - start = time.time() - - # measure time over whole while-loop, so slow_down() and best_block_size() work together properly - now = None # needed for slow_down() in the first loop run - before = start # start measuring - while True: - - # Download and write - data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) - byte_counter += len(data_block) - - # exit loop when download is finished - if len(data_block) == 0: - break - - # Open destination file just in time - if stream is None: - try: - (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) - assert stream is not None - filename = self.undo_temp_name(tmpfilename) - self.report_destination(filename) - except (OSError, IOError) as err: - self.report_error('unable to open for writing: %s' % str(err)) + if data_len is not None: + data_len = int(data_len) + ctx.resume_len + min_data_len = self.params.get('min_filesize') + max_data_len = self.params.get('max_filesize') + if min_data_len is not None and data_len < min_data_len: + self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) + return False + if max_data_len is not None and data_len > max_data_len: + self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) return False - if self.params.get('xattr_set_filesize', False) and data_len is not None: + byte_counter = 0 + ctx.resume_len + block_size = ctx.block_size + start = time.time() + + # measure time over whole while-loop, so slow_down() and best_block_size() work together properly + now = None # needed for slow_down() in the first loop run + before = start # start measuring + + def retry(e): + if ctx.tmpfilename != '-': + ctx.stream.close() + ctx.stream = None + ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + raise RetryDownload(e) + + while True: + try: + # Download and write + data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) + # socket.timeout is a subclass of socket.error but may not have + # errno set + except socket.timeout as e: + retry(e) + except socket.error as e: + if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT): + raise + retry(e) + + byte_counter += len(data_block) + + # exit loop when download is finished + if len(data_block) == 0: + break + + # Open destination file just in time + if ctx.stream is None: try: - write_xattr(tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) - except (XAttrUnavailableError, XAttrMetadataError) as err: - self.report_error('unable to set filesize xattr: %s' % str(err)) + ctx.stream, ctx.tmpfilename = sanitize_open( + ctx.tmpfilename, ctx.open_mode) + assert ctx.stream is not None + ctx.filename = self.undo_temp_name(ctx.tmpfilename) + self.report_destination(ctx.filename) + except (OSError, IOError) as err: + self.report_error('unable to open for writing: %s' % str(err)) + return False - try: - stream.write(data_block) - except (IOError, OSError) as err: + if self.params.get('xattr_set_filesize', False) and data_len is not None: + try: + write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) + except (XAttrUnavailableError, XAttrMetadataError) as err: + self.report_error('unable to set filesize xattr: %s' % str(err)) + + try: + ctx.stream.write(data_block) + except (IOError, OSError) as err: + self.to_stderr('\n') + self.report_error('unable to write data: %s' % str(err)) + return False + + # Apply rate limit + self.slow_down(start, now, byte_counter - ctx.resume_len) + + # end measuring of one loop run + now = time.time() + after = now + + # Adjust block size + if not self.params.get('noresizebuffer', False): + block_size = self.best_block_size(after - before, len(data_block)) + + before = after + + # Progress message + speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) + if ctx.data_len is None: + eta = None + else: + eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len) + + self._hook_progress({ + 'status': 'downloading', + 'downloaded_bytes': byte_counter, + 'total_bytes': ctx.data_len, + 'tmpfilename': ctx.tmpfilename, + 'filename': ctx.filename, + 'eta': eta, + 'speed': speed, + 'elapsed': now - ctx.start_time, + }) + + if is_test and byte_counter == data_len: + break + + if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: + ctx.resume_len = byte_counter + # ctx.block_size = block_size + raise NextFragment() + + if ctx.stream is None: self.to_stderr('\n') - self.report_error('unable to write data: %s' % str(err)) + self.report_error('Did not get any data blocks') return False + if ctx.tmpfilename != '-': + ctx.stream.close() - # Apply rate limit - self.slow_down(start, now, byte_counter - resume_len) + if data_len is not None and byte_counter != data_len: + err = ContentTooShortError(byte_counter, int(data_len)) + if count <= retries: + retry(err) + raise err - # end measuring of one loop run - now = time.time() - after = now + self.try_rename(ctx.tmpfilename, ctx.filename) - # Adjust block size - if not self.params.get('noresizebuffer', False): - block_size = self.best_block_size(after - before, len(data_block)) - - before = after - - # Progress message - speed = self.calc_speed(start, now, byte_counter - resume_len) - if data_len is None: - eta = None - else: - eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) + # Update file modification time + if self.params.get('updatetime', True): + info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None)) self._hook_progress({ - 'status': 'downloading', 'downloaded_bytes': byte_counter, - 'total_bytes': data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'eta': eta, - 'speed': speed, - 'elapsed': now - start, + 'total_bytes': byte_counter, + 'filename': ctx.filename, + 'status': 'finished', + 'elapsed': time.time() - ctx.start_time, }) - if is_test and byte_counter == data_len: - break + return True - if stream is None: - self.to_stderr('\n') - self.report_error('Did not get any data blocks') - return False - if tmpfilename != '-': - stream.close() + while count <= retries: + try: + establish_connection() + return download() + except RetryDownload as e: + count += 1 + if count <= retries: + self.report_retry(e.source_error, count, retries) + continue + except NextFragment: + continue + except SucceedDownload: + return True - if data_len is not None and byte_counter != data_len: - raise ContentTooShortError(byte_counter, int(data_len)) - self.try_rename(tmpfilename, filename) - - # Update file modification time - if self.params.get('updatetime', True): - info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) - - self._hook_progress({ - 'downloaded_bytes': byte_counter, - 'total_bytes': byte_counter, - 'filename': filename, - 'status': 'finished', - 'elapsed': time.time() - start, - }) - - return True + self.report_error('giving up after %s retries' % retries) + return False diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py index 9b001ecff..063fcf444 100644 --- a/youtube_dl/downloader/ism.py +++ b/youtube_dl/downloader/ism.py @@ -1,25 +1,27 @@ from __future__ import unicode_literals import time -import struct import binascii import io from .fragment import FragmentFD -from ..compat import compat_urllib_error +from ..compat import ( + compat_Struct, + compat_urllib_error, +) -u8 = struct.Struct(b'>B') -u88 = struct.Struct(b'>Bx') -u16 = struct.Struct(b'>H') -u1616 = struct.Struct(b'>Hxx') -u32 = struct.Struct(b'>I') -u64 = struct.Struct(b'>Q') +u8 = compat_Struct('>B') +u88 = compat_Struct('>Bx') +u16 = compat_Struct('>H') +u1616 = compat_Struct('>Hxx') +u32 = compat_Struct('>I') +u64 = compat_Struct('>Q') -s88 = struct.Struct(b'>bx') -s16 = struct.Struct(b'>h') -s1616 = struct.Struct(b'>hxx') -s32 = struct.Struct(b'>i') +s88 = compat_Struct('>bx') +s16 = compat_Struct('>h') +s1616 = compat_Struct('>hxx') +s32 = compat_Struct('>i') unity_matrix = (s32.pack(0x10000) + s32.pack(0) * 3) * 2 + s32.pack(0x40000000) @@ -139,7 +141,7 @@ def write_piff_header(stream, params): sample_entry_payload += u16.pack(0x18) # depth sample_entry_payload += s16.pack(-1) # pre defined - codec_private_data = binascii.unhexlify(params['codec_private_data']) + codec_private_data = binascii.unhexlify(params['codec_private_data'].encode('utf-8')) if fourcc in ('H264', 'AVC1'): sps, pps = codec_private_data.split(u32.pack(1))[1:] avcc_payload = u8.pack(1) # configuration version diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 60f753b95..512f04684 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -1,6 +1,9 @@ from __future__ import unicode_literals +import hashlib +import hmac import re +import time from .common import InfoExtractor from ..compat import compat_str @@ -10,6 +13,8 @@ from ..utils import ( int_or_none, parse_iso8601, try_get, + unescapeHTML, + update_url_query, ) @@ -101,21 +106,25 @@ class ABCIE(InfoExtractor): class ABCIViewIE(InfoExtractor): IE_NAME = 'abc.net.au:iview' _VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P[^/?#]+)' + _GEO_COUNTRIES = ['AU'] # ABC iview programs are normally available for 14 days only. _TESTS = [{ - 'url': 'http://iview.abc.net.au/programs/diaries-of-a-broken-mind/ZX9735A001S00', + 'url': 'https://iview.abc.net.au/programs/ben-and-hollys-little-kingdom/ZY9247A021S00', 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', 'info_dict': { - 'id': 'ZX9735A001S00', + 'id': 'ZY9247A021S00', 'ext': 'mp4', - 'title': 'Diaries Of A Broken Mind', - 'description': 'md5:7de3903874b7a1be279fe6b68718fc9e', - 'upload_date': '20161010', - 'uploader_id': 'abc2', - 'timestamp': 1476064920, + 'title': "Gaston's Visit", + 'series': "Ben And Holly's Little Kingdom", + 'description': 'md5:18db170ad71cf161e006a4c688e33155', + 'upload_date': '20180318', + 'uploader_id': 'abc4kids', + 'timestamp': 1521400959, + }, + 'params': { + 'skip_download': True, }, - 'skip': 'Video gone', }] def _real_extract(self, url): @@ -126,20 +135,30 @@ class ABCIViewIE(InfoExtractor): title = video_params.get('title') or video_params['seriesTitle'] stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') - format_urls = [ - try_get(stream, lambda x: x['hds-unmetered'], compat_str)] + house_number = video_params.get('episodeHouseNumber') + path = '/auth/hls/sign?ts={0}&hn={1}&d=android-mobile'.format( + int(time.time()), house_number) + sig = hmac.new( + 'android.content.res.Resources'.encode('utf-8'), + path.encode('utf-8'), hashlib.sha256).hexdigest() + token = self._download_webpage( + 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id) - # May have higher quality video - sd_url = try_get( - stream, lambda x: x['streams']['hds']['sd'], compat_str) - if sd_url: - format_urls.append(sd_url.replace('metered', 'um')) + def tokenize_url(url, token): + return update_url_query(url, { + 'hdnea': token, + }) - formats = [] - for format_url in format_urls: - if format_url: - formats.extend( - self._extract_akamai_formats(format_url, video_id)) + for sd in ('sd', 'sd-low'): + sd_url = try_get( + stream, lambda x: x['streams']['hls'][sd], compat_str) + if not sd_url: + continue + formats = self._extract_m3u8_formats( + tokenize_url(sd_url, token), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + if formats: + break self._sort_formats(formats) subtitles = {} @@ -152,12 +171,12 @@ class ABCIViewIE(InfoExtractor): return { 'id': video_id, - 'title': title, + 'title': unescapeHTML(title), 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), 'thumbnail': self._html_search_meta(['og:image', 'twitter:image:src'], webpage), 'duration': int_or_none(video_params.get('eventDuration')), 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), - 'series': video_params.get('seriesTitle'), + 'series': unescapeHTML(video_params.get('seriesTitle')), 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], 'episode_number': int_or_none(self._html_search_meta('episodeNumber', webpage, default=None)), 'episode': self._html_search_meta('episode_title', webpage, default=None), diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 74d54560c..cd29aca77 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -7,6 +7,7 @@ import time from .amp import AMPIE from .common import InfoExtractor +from .youtube import YoutubeIE from ..compat import compat_urlparse @@ -65,7 +66,7 @@ class AbcNewsIE(InfoExtractor): _TESTS = [{ 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', 'info_dict': { - 'id': '10498713', + 'id': '10505354', 'ext': 'flv', 'display_id': 'dramatic-video-rare-death-job-america', 'title': 'Occupational Hazards', @@ -78,7 +79,7 @@ class AbcNewsIE(InfoExtractor): }, { 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', 'info_dict': { - 'id': '39125818', + 'id': '38897857', 'ext': 'mp4', 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016', 'title': 'Justin Timberlake Drops Hints For Secret Single', @@ -108,9 +109,7 @@ class AbcNewsIE(InfoExtractor): r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') full_video_url = compat_urlparse.urljoin(url, video_url) - youtube_url = self._html_search_regex( - r']+src="(https://www\.youtube\.com/embed/[^"]+)"', - webpage, 'YouTube URL', default=None) + youtube_url = YoutubeIE._extract_url(webpage) timestamp = None date_str = self._html_search_regex( @@ -140,7 +139,7 @@ class AbcNewsIE(InfoExtractor): } if youtube_url: - entries = [entry, self.url_result(youtube_url, 'Youtube')] + entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] return self.playlist_result(entries) return entry diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 6dace3051..6d846ea7a 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -7,8 +7,10 @@ import functools from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + float_or_none, int_or_none, - parse_iso8601, + try_get, + unified_timestamp, OnDemandPagedList, ) @@ -24,39 +26,58 @@ class ACastIE(InfoExtractor): 'id': '57de3baa-4bb0-487e-9418-2692c1277a34', 'ext': 'mp3', 'title': '"Where Are You?": Taipei 101, Taiwan', + 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', 'timestamp': 1196172000, 'upload_date': '20071127', - 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', 'duration': 211, + 'creator': 'Concierge', + 'series': 'Condé Nast Traveler Podcast', + 'episode': '"Where Are You?": Taipei 101, Taiwan', } }, { # test with multiple blings 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': '55c0097badd7095f494c99a172f86501', + 'md5': 'a02393c74f3bdb1801c3ec2695577ce0', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', 'title': '2. Raggarmordet - Röster ur det förflutna', + 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', 'timestamp': 1477346700, 'upload_date': '20161024', - 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', - 'duration': 2797, + 'duration': 2766.602563, + 'creator': 'Anton Berg & Martin Johnson', + 'series': 'Spår', + 'episode': '2. Raggarmordet - Röster ur det förflutna', } }] def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() + s = self._download_json( + 'https://play-api.acast.com/stitch/%s/%s' % (channel, display_id), + display_id)['result'] + media_url = s['url'] cast_data = self._download_json( - 'https://embed.acast.com/api/acasts/%s/%s' % (channel, display_id), display_id) + 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), + display_id)['result'] + e = cast_data['episode'] + title = e['name'] return { - 'id': compat_str(cast_data['id']), + 'id': compat_str(e['id']), 'display_id': display_id, - 'url': [b['audio'] for b in cast_data['blings'] if b['type'] == 'BlingAudio'][0], - 'title': cast_data['name'], - 'description': cast_data.get('description'), - 'thumbnail': cast_data.get('image'), - 'timestamp': parse_iso8601(cast_data.get('publishingDate')), - 'duration': int_or_none(cast_data.get('duration')), + 'url': media_url, + 'title': title, + 'description': e.get('description') or e.get('summary'), + 'thumbnail': e.get('image'), + 'timestamp': unified_timestamp(e.get('publishingDate')), + 'duration': float_or_none(s.get('duration') or e.get('duration')), + 'filesize': int_or_none(e.get('contentLength')), + 'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str), + 'series': try_get(cast_data, lambda x: x['show']['name'], compat_str), + 'season_number': int_or_none(e.get('seasonNumber')), + 'episode': title, + 'episode_number': int_or_none(e.get('episodeNumber')), } diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index cffdab6ca..041c61aff 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -1,13 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import json import os from .common import InfoExtractor from ..aes import aes_cbc_decrypt -from ..compat import compat_ord +from ..compat import ( + compat_b64decode, + compat_ord, +) from ..utils import ( bytes_to_intlist, ExtractorError, @@ -48,9 +50,9 @@ class ADNIE(InfoExtractor): # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( - bytes_to_intlist(base64.b64decode(enc_subtitles[24:])), - bytes_to_intlist(b'\x1b\xe0\x29\x61\x38\x94\x24\x00\x12\xbd\xc5\x80\xac\xce\xbe\xb0'), - bytes_to_intlist(base64.b64decode(enc_subtitles[:24])) + bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), + bytes_to_intlist(b'\xc8\x6e\x06\xbc\xbe\xc6\x49\xf5\x88\x0d\xc8\x47\xc4\x27\x0c\x60'), + bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( dec_subtitles[:-compat_ord(dec_subtitles[-1])].decode(), @@ -105,15 +107,18 @@ class ADNIE(InfoExtractor): options = player_config.get('options') or {} metas = options.get('metas') or {} - title = metas.get('title') or video_info['title'] links = player_config.get('links') or {} + sub_path = player_config.get('subtitles') error = None if not links: - links_url = player_config['linksurl'] + links_url = player_config.get('linksurl') or options['videoUrl'] links_data = self._download_json(urljoin( self._BASE_URL, links_url), video_id) links = links_data.get('links') or {} + metas = metas or links_data.get('meta') or {} + sub_path = sub_path or links_data.get('subtitles') error = links_data.get('error') + title = metas.get('title') or video_info['title'] formats = [] for format_id, qualities in links.items(): @@ -144,7 +149,7 @@ class ADNIE(InfoExtractor): 'description': strip_or_none(metas.get('summary') or video_info.get('resume')), 'thumbnail': video_info.get('image'), 'formats': formats, - 'subtitles': self.extract_subtitles(player_config.get('subtitles'), video_id), + 'subtitles': self.extract_subtitles(sub_path, video_id), 'episode': metas.get('subtitle') or video_info.get('videoTitle'), 'series': video_info.get('playlistTitle'), } diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 2dcdba9d2..398e56ea3 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -122,7 +122,8 @@ class AENetworksIE(AENetworksBaseIE): query = { 'mbr': 'true', - 'assetTypes': 'high_video_s3' + 'assetTypes': 'high_video_ak', + 'switch': 'hls_high_ak', } video_id = self._html_search_meta('aetn:VideoID', webpage) media_url = self._search_regex( @@ -131,7 +132,7 @@ class AENetworksIE(AENetworksBaseIE): r'data-media-url=(["\'])(?P(?:(?!\1).)+?)\1'], webpage, 'video url', group='url') theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) info = self._parse_theplatform_metadata(theplatform_metadata) if theplatform_metadata.get('AETN$isBehindWall'): requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index c8cb91dcb..4b3d97136 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + urlencode_postdata, xpath_text, ) @@ -28,6 +29,7 @@ class AfreecaTVIE(InfoExtractor): ) (?P\d+) ''' + _NETRC_MACHINE = 'afreecatv' _TESTS = [{ 'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=', 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', @@ -138,6 +140,23 @@ class AfreecaTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # PARTIAL_ADULT + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439', + 'info_dict': { + 'id': '20180327_27901457_202289533_1', + 'ext': 'mp4', + 'title': '[생]빨개요♥ (part 1)', + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': '[SA]서아', + 'uploader_id': 'bjdyrksu', + 'upload_date': '20180327', + 'duration': 3601, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['adult content'], }, { 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', 'only_matching': True, @@ -155,17 +174,107 @@ class AfreecaTVIE(InfoExtractor): video_key['part'] = int(m.group('part')) return video_key + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_form = { + 'szWork': 'login', + 'szType': 'json', + 'szUid': username, + 'szPassword': password, + 'isSaveId': 'false', + 'szScriptVar': 'oLoginRet', + 'szAction': '', + } + + response = self._download_json( + 'https://login.afreecatv.com/app/LoginAction.php', None, + 'Logging in', data=urlencode_postdata(login_form)) + + _ERRORS = { + -4: 'Your account has been suspended due to a violation of our terms and policies.', + -5: 'https://member.afreecatv.com/app/user_delete_progress.php', + -6: 'https://login.afreecatv.com/membership/changeMember.php', + -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.", + -9: 'https://member.afreecatv.com/app/pop_login_block.php', + -11: 'https://login.afreecatv.com/afreeca/second_login.php', + -12: 'https://member.afreecatv.com/app/user_security.php', + 0: 'The username does not exist or you have entered the wrong password.', + -1: 'The username does not exist or you have entered the wrong password.', + -3: 'You have entered your username/password incorrectly.', + -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.', + -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.', + -32008: 'You have failed to log in. Please contact our Help Center.', + } + + result = int_or_none(response.get('RESULT')) + if result != 1: + error = _ERRORS.get(result, 'You have failed to log in.') + raise ExtractorError( + 'Unable to login: %s said: %s' % (self.IE_NAME, error), + expected=True) + def _real_extract(self, url): video_id = self._match_id(url) - video_xml = self._download_xml( - 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', - video_id, query={'nTitleNo': video_id}) + webpage = self._download_webpage(url, video_id) - video_element = video_xml.findall(compat_xpath('./track/video'))[1] + if re.search(r'alert\(["\']This video has been deleted', webpage): + raise ExtractorError( + 'Video %s has been deleted' % video_id, expected=True) + + station_id = self._search_regex( + r'nStationNo\s*=\s*(\d+)', webpage, 'station') + bbs_id = self._search_regex( + r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs') + video_id = self._search_regex( + r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) + + partial_view = False + for _ in range(2): + query = { + 'nTitleNo': video_id, + 'nStationNo': station_id, + 'nBbsNo': bbs_id, + } + if partial_view: + query['partialView'] = 'SKIP_ADULT' + video_xml = self._download_xml( + 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', + video_id, 'Downloading video info XML%s' + % (' (skipping adult)' if partial_view else ''), + video_id, headers={ + 'Referer': url, + }, query=query) + + flag = xpath_text(video_xml, './track/flag', 'flag', default=None) + if flag and flag == 'SUCCEED': + break + if flag == 'PARTIAL_ADULT': + self._downloader.report_warning( + 'In accordance with local laws and regulations, underage users are restricted from watching adult content. ' + 'Only content suitable for all ages will be downloaded. ' + 'Provide account credentials if you wish to download restricted content.') + partial_view = True + continue + elif flag == 'ADULT': + error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.' + else: + error = flag + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + else: + raise ExtractorError('Unable to download video info') + + video_element = video_xml.findall(compat_xpath('./track/video'))[-1] if video_element is None or video_element.text is None: - raise ExtractorError('Specified AfreecaTV video does not exist', - expected=True) + raise ExtractorError( + 'Video %s video does not exist' % video_id, expected=True) video_url = video_element.text.strip() @@ -203,10 +312,19 @@ class AfreecaTVIE(InfoExtractor): r'^(\d{8})_', key, 'upload date', default=None) file_duration = int_or_none(file_element.get('duration')) format_id = key if key else '%s_%s' % (video_id, file_num) - formats = self._extract_m3u8_formats( - file_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', - note='Downloading part %d m3u8 information' % file_num) + if determine_ext(file_url) == 'm3u8': + formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', + note='Downloading part %d m3u8 information' % file_num) + else: + formats = [{ + 'url': file_url, + 'format_id': 'http', + }] + if not formats: + continue + self._sort_formats(formats) file_info = common_entry.copy() file_info.update({ 'id': format_id, @@ -246,107 +364,3 @@ class AfreecaTVIE(InfoExtractor): }) return info - - -class AfreecaTVGlobalIE(AfreecaTVIE): - IE_NAME = 'afreecatv:global' - _VALID_URL = r'https?://(?:www\.)?afreeca\.tv/(?P\d+)(?:/v/(?P\d+))?' - _TESTS = [{ - 'url': 'http://afreeca.tv/36853014/v/58301', - 'info_dict': { - 'id': '58301', - 'title': 'tryhard top100', - 'uploader_id': '36853014', - 'uploader': 'makgi Hearthstone Live!', - }, - 'playlist_count': 3, - }] - - def _real_extract(self, url): - channel_id, video_id = re.match(self._VALID_URL, url).groups() - video_type = 'video' if video_id else 'live' - query = { - 'pt': 'view', - 'bid': channel_id, - } - if video_id: - query['vno'] = video_id - video_data = self._download_json( - 'http://api.afreeca.tv/%s/view_%s.php' % (video_type, video_type), - video_id or channel_id, query=query)['channel'] - - if video_data.get('result') != 1: - raise ExtractorError('%s said: %s' % (self.IE_NAME, video_data['remsg'])) - - title = video_data['title'] - - info = { - 'thumbnail': video_data.get('thumb'), - 'view_count': int_or_none(video_data.get('vcnt')), - 'age_limit': int_or_none(video_data.get('grade')), - 'uploader_id': channel_id, - 'uploader': video_data.get('cname'), - } - - if video_id: - entries = [] - for i, f in enumerate(video_data.get('flist', [])): - video_key = self.parse_video_key(f.get('key', '')) - f_url = f.get('file') - if not video_key or not f_url: - continue - entries.append({ - 'id': '%s_%s' % (video_id, video_key.get('part', i + 1)), - 'title': title, - 'upload_date': video_key.get('upload_date'), - 'duration': int_or_none(f.get('length')), - 'url': f_url, - 'protocol': 'm3u8_native', - 'ext': 'mp4', - }) - - info.update({ - 'id': video_id, - 'title': title, - 'duration': int_or_none(video_data.get('length')), - }) - if len(entries) > 1: - info['_type'] = 'multi_video' - info['entries'] = entries - elif len(entries) == 1: - i = entries[0].copy() - i.update(info) - info = i - else: - formats = [] - for s in video_data.get('strm', []): - s_url = s.get('purl') - if not s_url: - continue - stype = s.get('stype') - if stype == 'HLS': - formats.extend(self._extract_m3u8_formats( - s_url, channel_id, 'mp4', m3u8_id=stype, fatal=False)) - elif stype == 'RTMP': - format_id = [stype] - label = s.get('label') - if label: - format_id.append(label) - formats.append({ - 'format_id': '-'.join(format_id), - 'url': s_url, - 'tbr': int_or_none(s.get('bps')), - 'height': int_or_none(s.get('brt')), - 'ext': 'flv', - 'rtmp_live': True, - }) - self._sort_formats(formats) - - info.update({ - 'id': channel_id, - 'title': self._live_title(title), - 'is_live': True, - 'formats': formats, - }) - - return info diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py new file mode 100644 index 000000000..6f241e683 --- /dev/null +++ b/youtube_dl/extractor/aliexpress.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + try_get, +) + + +class AliExpressLiveIE(InfoExtractor): + _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P\d+)' + _TEST = { + 'url': 'https://live.aliexpress.com/live/2800002704436634', + 'md5': 'e729e25d47c5e557f2630eaf99b740a5', + 'info_dict': { + 'id': '2800002704436634', + 'ext': 'mp4', + 'title': 'CASIMA7.22', + 'thumbnail': r're:http://.*\.jpg', + 'uploader': 'CASIMA Official Store', + 'timestamp': 1500717600, + 'upload_date': '20170722', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + data = self._parse_json( + self._search_regex( + r'(?s)runParams\s*=\s*({.+?})\s*;?\s*var', + webpage, 'runParams'), + video_id) + + title = data['title'] + + formats = self._extract_m3u8_formats( + data['replyStreamUrl'], video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': data.get('coverUrl'), + 'uploader': try_get( + data, lambda x: x['followBar']['name'], compat_str), + 'timestamp': float_or_none(data.get('startTimeLong'), scale=1000), + 'formats': formats, + } diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index dd3b18d72..6fb3d6c53 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -11,7 +11,7 @@ from ..utils import ( class AMCNetworksIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|wetv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', 'md5': '', @@ -51,6 +51,9 @@ class AMCNetworksIE(ThePlatformIE): }, { 'url': 'http://www.wetv.com/shows/la-hair/videos/season-05/episode-09-episode-9-2/episode-9-sneak-peek-3', 'only_matching': True, + }, { + 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py new file mode 100644 index 000000000..01736872d --- /dev/null +++ b/youtube_dl/extractor/americastestkitchen.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + try_get, + unified_strdate, +) + + +class AmericasTestKitchenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', + 'md5': 'b861c3e365ac38ad319cfd509c30577f', + 'info_dict': { + 'id': '1_5g5zua6e', + 'title': 'Summer Dinner Party', + 'ext': 'mp4', + 'description': 'md5:858d986e73a4826979b6a5d9f8f6a1ec', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1497285541, + 'upload_date': '20170612', + 'uploader_id': 'roger.metcalf@americastestkitchen.com', + 'release_date': '20170617', + 'series': "America's Test Kitchen", + 'season_number': 17, + 'episode': 'Summer Dinner Party', + 'episode_number': 24, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + partner_id = self._search_regex( + r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', + webpage, 'kaltura partner id') + + video_data = self._parse_json( + self._search_regex( + r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', + webpage, 'initial context'), + video_id) + + ep_data = try_get( + video_data, + (lambda x: x['episodeDetail']['content']['data'], + lambda x: x['videoDetail']['content']['data']), dict) + ep_meta = ep_data.get('full_video', {}) + external_id = ep_data.get('external_id') or ep_meta['external_id'] + + title = ep_data.get('title') or ep_meta.get('title') + description = clean_html(ep_meta.get('episode_description') or ep_data.get( + 'description') or ep_meta.get('description')) + thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url']) + release_date = unified_strdate(ep_data.get('aired_at')) + + season_number = int_or_none(ep_meta.get('season_number')) + episode = ep_meta.get('title') + episode_number = int_or_none(ep_meta.get('episode_number')) + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, external_id), + 'ie_key': 'Kaltura', + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'release_date': release_date, + 'series': "America's Test Kitchen", + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + } diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 9e28f2579..e4fa72f46 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -3,16 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_str, -) +from ..compat import compat_str from ..utils import ( determine_ext, extract_attributes, ExtractorError, - sanitized_Request, urlencode_postdata, + urljoin, ) @@ -21,6 +18,8 @@ class AnimeOnDemandIE(InfoExtractor): _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' + # German-speaking countries of Europe + _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU'] _TESTS = [{ # jap, OmU 'url': 'https://www.anime-on-demand.de/anime/161', @@ -46,6 +45,10 @@ class AnimeOnDemandIE(InfoExtractor): # Full length film, non-series, ger/jap, Dub/OmU, account required 'url': 'https://www.anime-on-demand.de/anime/185', 'only_matching': True, + }, { + # Flash videos + 'url': 'https://www.anime-on-demand.de/anime/12', + 'only_matching': True, }] def _login(self): @@ -72,19 +75,18 @@ class AnimeOnDemandIE(InfoExtractor): 'post url', default=self._LOGIN_URL, group='url') if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - request = sanitized_Request( - post_url, urlencode_postdata(login_form)) - request.add_header('Referer', self._LOGIN_URL) + post_url = urljoin(self._LOGIN_URL, post_url) response = self._download_webpage( - request, None, 'Logging in as %s' % username) + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), headers={ + 'Referer': self._LOGIN_URL, + }) if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): error = self._search_regex( - r'

(.+?)

', - response, 'error', default=None) + r']+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P.+?)

', + response, 'error', default=None, group='error') if error: raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') @@ -120,10 +122,11 @@ class AnimeOnDemandIE(InfoExtractor): formats = [] for input_ in re.findall( - r']+class=["\'].*?streamstarter_html5[^>]+>', html): + r']+class=["\'].*?streamstarter[^>]+>', html): attributes = extract_attributes(input_) + title = attributes.get('data-dialog-header') playlist_urls = [] - for playlist_key in ('data-playlist', 'data-otherplaylist'): + for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'): playlist_url = attributes.get(playlist_key) if isinstance(playlist_url, compat_str) and re.match( r'/?[\da-zA-Z]+', playlist_url): @@ -147,19 +150,38 @@ class AnimeOnDemandIE(InfoExtractor): format_id_list.append(compat_str(num)) format_id = '-'.join(format_id_list) format_note = ', '.join(filter(None, (kind, lang_note))) - request = sanitized_Request( - compat_urlparse.urljoin(url, playlist_url), + item_id_list = [] + if format_id: + item_id_list.append(format_id) + item_id_list.append('videomaterial') + playlist = self._download_json( + urljoin(url, playlist_url), video_id, + 'Downloading %s JSON' % ' '.join(item_id_list), headers={ 'X-Requested-With': 'XMLHttpRequest', 'X-CSRF-Token': csrf_token, 'Referer': url, 'Accept': 'application/json, text/javascript, */*; q=0.01', - }) - playlist = self._download_json( - request, video_id, 'Downloading %s playlist JSON' % format_id, - fatal=False) + }, fatal=False) if not playlist: continue + stream_url = playlist.get('streamurl') + if stream_url: + rtmp = re.search( + r'^(?Prtmpe?://(?P[^/]+)/(?P.+/))(?Pmp[34]:.+)', + stream_url) + if rtmp: + formats.append({ + 'url': rtmp.group('url'), + 'app': rtmp.group('app'), + 'play_path': rtmp.group('playpath'), + 'page_url': url, + 'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf', + 'rtmp_real_time': True, + 'format_id': 'rtmp', + 'ext': 'flv', + }) + continue start_video = playlist.get('startvideo', 0) playlist = playlist.get('playlist') if not playlist or not isinstance(playlist, list): @@ -222,7 +244,7 @@ class AnimeOnDemandIE(InfoExtractor): f.update({ 'id': '%s-%s' % (f['id'], m.group('kind').lower()), 'title': m.group('title'), - 'url': compat_urlparse.urljoin(url, m.group('href')), + 'url': urljoin(url, m.group('href')), }) entries.append(f) diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index 8023da702..7a29cd2c6 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -18,6 +18,7 @@ from ..utils import ( int_or_none, strip_jsonp, unescapeHTML, + unsmuggle_url, ) @@ -197,12 +198,16 @@ class AnvatoIE(InfoExtractor): 'tbr': tbr if tbr != 0 else None, } - if ext == 'm3u8' or media_format in ('m3u8', 'm3u8-variant'): - if tbr is not None: - a_format.update({ - 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), - 'ext': 'mp4', - }) + if media_format == 'm3u8' and tbr is not None: + a_format.update({ + 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), + 'ext': 'mp4', + }) + elif media_format == 'm3u8-variant' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue elif ext == 'mp3' or media_format == 'mp3': a_format['vcodec'] = 'none' else: @@ -271,6 +276,9 @@ class AnvatoIE(InfoExtractor): anvplayer_data['accessKey'], anvplayer_data['video']) def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + mobj = re.match(self._VALID_URL, url) access_key, video_id = mobj.group('access_key_or_mcp', 'id') if access_key not in self._ANVACK_TABLE: diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 025e29aa4..e394cb661 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - ExtractorError, - HEADRequest, + int_or_none, + mimetype2ext, ) class AparatIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' _TEST = { 'url': 'http://www.aparat.com/v/wP8On', @@ -29,30 +29,41 @@ class AparatIE(InfoExtractor): # Note: There is an easier-to-parse configuration at # http://www.aparat.com/video/video/config/videohash/%video_id # but the URL in there does not work - embed_url = 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id - webpage = self._download_webpage(embed_url, video_id) - - file_list = self._parse_json(self._search_regex( - r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id) - for i, item in enumerate(file_list[0]): - video_url = item['file'] - req = HEADRequest(video_url) - res = self._request_webpage( - req, video_id, note='Testing video URL %d' % i, errnote=False) - if res: - break - else: - raise ExtractorError('No working video URLs found') + webpage = self._download_webpage( + 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, + video_id) title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title') + + file_list = self._parse_json( + self._search_regex( + r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, + 'file list'), + video_id) + + formats = [] + for item in file_list[0]: + file_url = item.get('file') + if not file_url: + continue + ext = mimetype2ext(item.get('type')) + label = item.get('label') + formats.append({ + 'url': file_url, + 'ext': ext, + 'format_id': label or ext, + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', label or '', 'height', default=None)), + }) + self._sort_formats(formats) + thumbnail = self._search_regex( r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False) return { 'id': video_id, 'title': title, - 'url': video_url, - 'ext': 'mp4', 'thumbnail': thumbnail, 'age_limit': self._family_friendly_search(webpage), + 'formats': formats, } diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index b45b431e1..a9ef733e0 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -117,7 +117,7 @@ class AppleTrailersIE(InfoExtractor): continue formats.append({ 'format_id': '%s-%s' % (version, size), - 'url': re.sub(r'_(\d+p.mov)', r'_h\1', src), + 'url': re.sub(r'_(\d+p\.mov)', r'_h\1', src), 'width': int_or_none(size_data.get('width')), 'height': int_or_none(size_data.get('height')), 'language': version[:2], @@ -179,7 +179,7 @@ class AppleTrailersIE(InfoExtractor): formats = [] for format in settings['metadata']['sizes']: # The src is a file pointing to the real video file - format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) + format_url = re.sub(r'_(\d*p\.mov)', r'_h\1', format['src']) formats.append({ 'url': format_url, 'format': format['type'], diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 3c7d7250b..c79c58e82 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -41,7 +41,7 @@ class ArchiveOrgIE(InfoExtractor): webpage = self._download_webpage( 'http://archive.org/embed/' + video_id, video_id) jwplayer_playlist = self._parse_json(self._search_regex( - r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\);", + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", webpage, 'jwplayer playlist'), video_id) info = self._parse_jwplayer_data( {'playlist': jwplayer_playlist}, video_id, base_url=url) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 3f248b147..86951d975 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from .generic import GenericIE +from ..compat import compat_str from ..utils import ( determine_ext, ExtractorError, @@ -23,57 +24,30 @@ class ARDMediathekIE(InfoExtractor): _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' _TESTS = [{ - 'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114', + # available till 26.07.2022 + 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', 'info_dict': { - 'id': '29582122', + 'id': '44726822', 'ext': 'mp4', - 'title': 'Ich liebe das Leben trotzdem', - 'description': 'md5:45e4c225c72b27993314b31a84a5261c', - 'duration': 4557, + 'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', + 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', + 'duration': 1740, }, 'params': { # m3u8 download 'skip_download': True, - }, - 'skip': 'HTTP Error 404: Not Found', - }, { - 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916', - 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e', - 'info_dict': { - 'id': '29522730', - 'ext': 'mp4', - 'title': 'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)', - 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1', - 'duration': 5252, - }, - 'skip': 'HTTP Error 404: Not Found', + } }, { # audio 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', - 'md5': '219d94d8980b4f538c7fcb0865eb7f2c', - 'info_dict': { - 'id': '28488308', - 'ext': 'mp3', - 'title': 'Tod eines Fußballers', - 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef', - 'duration': 3240, - }, - 'skip': 'HTTP Error 404: Not Found', + 'only_matching': True, }, { 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', 'only_matching': True, }, { # audio 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', - 'md5': '4e8f00631aac0395fee17368ac0e9867', - 'info_dict': { - 'id': '30796318', - 'ext': 'mp3', - 'title': 'Vor dem Fest', - 'description': 'md5:c0c1c8048514deaed2a73b3a60eecacb', - 'duration': 3287, - }, - 'skip': 'Video is no longer available', + 'only_matching': True, }] def _extract_media_info(self, media_info_url, webpage, video_id): @@ -126,6 +100,8 @@ class ARDMediathekIE(InfoExtractor): quality = stream.get('_quality') server = stream.get('_server') for stream_url in stream_urls: + if not isinstance(stream_url, compat_str) or '//' not in stream_url: + continue ext = determine_ext(stream_url) if quality != 'auto' and ext in ('f4m', 'm3u8'): continue @@ -146,13 +122,11 @@ class ARDMediathekIE(InfoExtractor): 'play_path': stream_url, 'format_id': 'a%s-rtmp-%s' % (num, quality), } - elif stream_url.startswith('http'): + else: f = { 'url': stream_url, 'format_id': 'a%s-%s-%s' % (num, ext, quality) } - else: - continue m = re.search(r'_(?P\d+)x(?P\d+)\.mp4$', stream_url) if m: f.update({ @@ -195,7 +169,7 @@ class ARDMediathekIE(InfoExtractor): title = self._html_search_regex( [r'(.*?)', - r'', + r'', r'

(.*?)

'], webpage, 'title') description = self._html_search_meta( @@ -251,20 +225,23 @@ class ARDMediathekIE(InfoExtractor): class ARDIE(InfoExtractor): _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos/(?P[^/?#]+)-(?P[0-9]+))\.html' - _TEST = { - 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', - 'md5': 'd216c3a86493f9322545e045ddc3eb35', + _TESTS = [{ + # available till 14.02.2019 + 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', + 'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49', 'info_dict': { - 'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge', - 'id': '100', + 'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video', + 'id': '102', 'ext': 'mp4', - 'duration': 2600, - 'title': 'Die Story im Ersten: Mission unter falscher Flagge', - 'upload_date': '20140804', + 'duration': 4435.0, + 'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?', + 'upload_date': '20180214', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'skip': 'HTTP Error 404: Not Found', - } + }, { + 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 56baef29d..ffc321821 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -6,15 +6,18 @@ import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, ) from ..utils import ( + ExtractorError, find_xpath_attr, - unified_strdate, get_element_by_attribute, int_or_none, NO_DEFAULT, qualities, + try_get, + unified_strdate, ) # There are different sources of video in arte.tv, the extraction process @@ -79,6 +82,16 @@ class ArteTVBaseIE(InfoExtractor): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] + vsr = try_get(player_info, lambda x: x['VSR'], dict) + if not vsr: + error = None + if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error': + error = try_get( + player_info, lambda x: x['custom_msg']['msg'], compat_str) + if not error: + error = 'Video %s is not available' % player_info.get('VID') or video_id + raise ExtractorError(error, expected=True) + upload_date_str = player_info.get('shootingDate') if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] @@ -107,7 +120,7 @@ class ArteTVBaseIE(InfoExtractor): langcode = LANGS.get(lang, lang) formats = [] - for format_id, format_dict in player_info['VSR'].items(): + for format_id, format_dict in vsr.items(): f = dict(format_dict) versionCode = f.get('versionCode') l = re.escape(langcode) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 01fa308ff..1a31ebe08 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -87,7 +87,7 @@ class AtresPlayerIE(InfoExtractor): self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') response = self._download_webpage( - request, None, 'Logging in as %s' % username) + request, None, 'Logging in') error = self._html_search_regex( r'(?s)]+class="[^"]*\blist_error\b[^"]*">(.+?)', diff --git a/youtube_dl/extractor/aws.py b/youtube_dl/extractor/aws.py new file mode 100644 index 000000000..dccfeaf73 --- /dev/null +++ b/youtube_dl/extractor/aws.py @@ -0,0 +1,78 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import datetime +import hashlib +import hmac + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlencode + + +class AWSIE(InfoExtractor): + _AWS_ALGORITHM = 'AWS4-HMAC-SHA256' + _AWS_REGION = 'us-east-1' + + def _aws_execute_api(self, aws_dict, video_id, query=None): + query = query or {} + amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ') + date = amz_date[:8] + headers = { + 'Accept': 'application/json', + 'Host': self._AWS_PROXY_HOST, + 'X-Amz-Date': amz_date, + 'X-Api-Key': self._AWS_API_KEY + } + session_token = aws_dict.get('session_token') + if session_token: + headers['X-Amz-Security-Token'] = session_token + + def aws_hash(s): + return hashlib.sha256(s.encode('utf-8')).hexdigest() + + # Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html + canonical_querystring = compat_urllib_parse_urlencode(query) + canonical_headers = '' + for header_name, header_value in sorted(headers.items()): + canonical_headers += '%s:%s\n' % (header_name.lower(), header_value) + signed_headers = ';'.join([header.lower() for header in sorted(headers.keys())]) + canonical_request = '\n'.join([ + 'GET', + aws_dict['uri'], + canonical_querystring, + canonical_headers, + signed_headers, + aws_hash('') + ]) + + # Task 2: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html + credential_scope_list = [date, self._AWS_REGION, 'execute-api', 'aws4_request'] + credential_scope = '/'.join(credential_scope_list) + string_to_sign = '\n'.join([self._AWS_ALGORITHM, amz_date, credential_scope, aws_hash(canonical_request)]) + + # Task 3: http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html + def aws_hmac(key, msg): + return hmac.new(key, msg.encode('utf-8'), hashlib.sha256) + + def aws_hmac_digest(key, msg): + return aws_hmac(key, msg).digest() + + def aws_hmac_hexdigest(key, msg): + return aws_hmac(key, msg).hexdigest() + + k_signing = ('AWS4' + aws_dict['secret_key']).encode('utf-8') + for value in credential_scope_list: + k_signing = aws_hmac_digest(k_signing, value) + + signature = aws_hmac_hexdigest(k_signing, string_to_sign) + + # Task 4: http://docs.aws.amazon.com/general/latest/gr/sigv4-add-signature-to-request.html + headers['Authorization'] = ', '.join([ + '%s Credential=%s/%s' % (self._AWS_ALGORITHM, aws_dict['access_key'], credential_scope), + 'SignedHeaders=%s' % signed_headers, + 'Signature=%s' % signature, + ]) + + return self._download_json( + 'https://%s%s%s' % (self._AWS_PROXY_HOST, aws_dict['uri'], '?' + canonical_querystring if canonical_querystring else ''), + video_id, headers=headers) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index f4e07d901..68f26e2ca 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -47,7 +47,7 @@ class AZMedienIE(AZMedienBaseIE): 'url': 'http://www.telezueri.ch/62-show-zuerinews/13772-episode-sonntag-18-dezember-2016/32419-segment-massenabweisungen-beim-hiltl-club-wegen-pelzboom', 'info_dict': { 'id': '1_2444peh4', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'Massenabweisungen beim Hiltl Club wegen Pelzboom', 'description': 'md5:9ea9dd1b159ad65b36ddcf7f0d7c76a8', 'uploader_id': 'TeleZ?ri', diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index 0eb1930c2..633c57553 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -59,7 +59,7 @@ class BambuserIE(InfoExtractor): self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Referer', self._LOGIN_URL) response = self._download_webpage( - request, None, 'Logging in as %s' % username) + request, None, 'Logging in') login_error = self._html_search_regex( r'(?s)
(.+?)
', diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 9ddb9af17..be41bd5a2 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -242,7 +242,12 @@ class BandcampAlbumIE(InfoExtractor): raise ExtractorError('The page doesn\'t contain any tracks') # Only tracks with duration info have songs entries = [ - self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) + self.url_result( + compat_urlparse.urljoin(url, t_path), + ie=BandcampIE.ie_key(), + video_title=self._search_regex( + r']+\bitemprop=["\']name["\'][^>]*>([^<]+)', + elem_content, 'track title', fatal=False)) for elem_content, t_path in track_elements if self._html_search_meta('duration', elem_content, default=None)] diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 79ded6ba1..8b20c03d6 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -29,7 +29,7 @@ from ..compat import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'[pb][\da-z]{7}' + _ID_REGEX = r'[pbw][\da-z]{7}' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ @@ -37,7 +37,8 @@ class BBCCoUkIE(InfoExtractor): programmes/(?!articles/)| iplayer(?:/[^/]+)?/(?:episode/|playlist/)| music/(?:clips|audiovideo/popular)[/#]| - radio/player/ + radio/player/| + events/[^/]+/play/[^/]+/ ) (?P%s)(?!/(?:episodes|broadcasts|clips)) ''' % _ID_REGEX @@ -232,6 +233,9 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55', 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', + 'only_matching': True, }] _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index d5c5822f2..bf22a41b7 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -9,6 +9,7 @@ from ..compat import ( from ..utils import ( int_or_none, parse_iso8601, + urljoin, ) @@ -36,9 +37,11 @@ class BeegIE(InfoExtractor): webpage = self._download_webpage(url, video_id) cpl_url = self._search_regex( - r']+src=(["\'])(?P(?:https?:)?//static\.beeg\.com/cpl/\d+\.js.*?)\1', + r']+src=(["\'])(?P(?:/static|(?:https?:)?//static\.beeg\.com)/cpl/\d+\.js.*?)\1', webpage, 'cpl', default=None, group='url') + cpl_url = urljoin(url, cpl_url) + beeg_version, beeg_salt = [None] * 2 if cpl_url: @@ -54,12 +57,16 @@ class BeegIE(InfoExtractor): r'beeg_salt\s*=\s*(["\'])(?P.+?)\1', cpl, 'beeg salt', default=None, group='beeg_salt') - beeg_version = beeg_version or '2000' + beeg_version = beeg_version or '2185' beeg_salt = beeg_salt or 'pmweAkq8lAYKdfWcFCUj0yoVgoPlinamH5UE1CB3H' - video = self._download_json( - 'https://api.beeg.com/api/v6/%s/video/%s' % (beeg_version, video_id), - video_id) + for api_path in ('', 'api.'): + video = self._download_json( + 'https://%sbeeg.com/api/v6/%s/video/%s' + % (api_path, beeg_version, video_id), video_id, + fatal=api_path == 'api.') + if video: + break def split(o, e): def cut(s, x): diff --git a/youtube_dl/extractor/bigflix.py b/youtube_dl/extractor/bigflix.py index b4ce767af..28e3e59f6 100644 --- a/youtube_dl/extractor/bigflix.py +++ b/youtube_dl/extractor/bigflix.py @@ -1,11 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote, +) class BigflixIE(InfoExtractor): @@ -39,8 +41,8 @@ class BigflixIE(InfoExtractor): webpage, 'title') def decode_url(quoted_b64_url): - return base64.b64decode(compat_urllib_parse_unquote( - quoted_b64_url).encode('ascii')).decode('utf-8') + return compat_b64decode(compat_urllib_parse_unquote( + quoted_b64_url)).decode('utf-8') formats = [] for height, encoded_url in re.findall( diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 1e57310d6..3e3348ef5 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -27,14 +27,14 @@ class BiliBiliIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', - 'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e', + 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', 'info_dict': { 'id': '1074402', - 'ext': 'mp4', + 'ext': 'flv', 'title': '【金坷垃】金泡沫', 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'duration': 308.315, - 'timestamp': 1398012660, + 'duration': 308.067, + 'timestamp': 1398012678, 'upload_date': '20140420', 'thumbnail': r're:^https?://.+\.jpg', 'uploader': '菊子桑', @@ -59,17 +59,38 @@ class BiliBiliIE(InfoExtractor): 'url': 'http://www.bilibili.com/video/av8903802/', 'info_dict': { 'id': '8903802', - 'ext': 'mp4', 'title': '阿滴英文|英文歌分享#6 "Closer', 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', - 'uploader': '阿滴英文', - 'uploader_id': '65880958', - 'timestamp': 1488382620, - 'upload_date': '20170301', - }, - 'params': { - 'skip_download': True, # Test metadata only }, + 'playlist': [{ + 'info_dict': { + 'id': '8903802_part1', + 'ext': 'flv', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', + 'uploader': '阿滴英文', + 'uploader_id': '65880958', + 'timestamp': 1488382634, + 'upload_date': '20170301', + }, + 'params': { + 'skip_download': True, # Test metadata only + }, + }, { + 'info_dict': { + 'id': '8903802_part2', + 'ext': 'flv', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', + 'uploader': '阿滴英文', + 'uploader_id': '65880958', + 'timestamp': 1488382634, + 'upload_date': '20170301', + }, + 'params': { + 'skip_download': True, # Test metadata only + }, + }] }] _APP_KEY = '84956560bc028eb7' @@ -92,8 +113,12 @@ class BiliBiliIE(InfoExtractor): webpage = self._download_webpage(url, video_id) if 'anime/' not in url: - cid = compat_parse_qs(self._search_regex( + cid = self._search_regex( + r'cid(?:["\']:|=)(\d+)', webpage, 'cid', + default=None + ) or compat_parse_qs(self._search_regex( [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', + r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], webpage, 'player parameters'))['cid'][0] else: @@ -102,6 +127,7 @@ class BiliBiliIE(InfoExtractor): video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) headers = { 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': url } headers.update(self.geo_verification_headers()) @@ -113,48 +139,66 @@ class BiliBiliIE(InfoExtractor): self._report_error(js) cid = js['result']['cid'] - payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) - sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() - - video_info = self._download_json( - 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), - video_id, note='Downloading video info page', - headers=self.geo_verification_headers()) - - if 'durl' not in video_info: - self._report_error(video_info) + headers = { + 'Referer': url + } + headers.update(self.geo_verification_headers()) entries = [] - for idx, durl in enumerate(video_info['durl']): - formats = [{ - 'url': durl['url'], - 'filesize': int_or_none(durl['size']), - }] - for backup_url in durl.get('backup_url', []): - formats.append({ - 'url': backup_url, - # backup URLs have lower priorities - 'preference': -2 if 'hd.mp4' in backup_url else -3, + RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') + for num, rendition in enumerate(RENDITIONS, start=1): + payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) + sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() + + video_info = self._download_json( + 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), + video_id, note='Downloading video info page', + headers=headers, fatal=num == len(RENDITIONS)) + + if not video_info: + continue + + if 'durl' not in video_info: + if num < len(RENDITIONS): + continue + self._report_error(video_info) + + for idx, durl in enumerate(video_info['durl']): + formats = [{ + 'url': durl['url'], + 'filesize': int_or_none(durl['size']), + }] + for backup_url in durl.get('backup_url', []): + formats.append({ + 'url': backup_url, + # backup URLs have lower priorities + 'preference': -2 if 'hd.mp4' in backup_url else -3, + }) + + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + + self._sort_formats(formats) + + entries.append({ + 'id': '%s_part%s' % (video_id, idx), + 'duration': float_or_none(durl.get('length'), 1000), + 'formats': formats, }) + break - for a_format in formats: - a_format.setdefault('http_headers', {}).update({ - 'Referer': url, - }) - - self._sort_formats(formats) - - entries.append({ - 'id': '%s_part%s' % (video_id, idx), - 'duration': float_or_none(durl.get('length'), 1000), - 'formats': formats, - }) - - title = self._html_search_regex(']*>([^<]+)', webpage, 'title') + title = self._html_search_regex( + (']+\btitle=(["\'])(?P(?:(?!\1).)+)\1', + '(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title', + group='title') description = self._html_search_meta('description', webpage) timestamp = unified_timestamp(self._html_search_regex( - r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', default=None)) + r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', + default=None) or self._html_search_meta( + 'uploadDate', webpage, 'timestamp', default=None)) thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) # TODO 'view_count' requires deobfuscating Javascript @@ -168,13 +212,16 @@ class BiliBiliIE(InfoExtractor): } uploader_mobj = re.search( - r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"', + r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)', webpage) if uploader_mobj: info.update({ 'uploader': uploader_mobj.group('name'), 'uploader_id': uploader_mobj.group('id'), }) + if not info.get('uploader'): + info['uploader'] = self._html_search_meta( + 'author', webpage, 'uploader', default=None) for entry in entries: entry.update(info) diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index 9661ade4f..07833532e 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -33,13 +33,18 @@ class BpbIE(InfoExtractor): title = self._html_search_regex( r'<h2 class="white">(.*?)</h2>', webpage, 'title') video_info_dicts = re.findall( - r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage) + r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) formats = [] for video_info in video_info_dicts: - video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) - quality = video_info['quality'] - video_url = video_info['src'] + video_info = self._parse_json( + video_info, video_id, transform_source=js_to_json, fatal=False) + if not video_info: + continue + video_url = video_info.get('src') + if not video_url: + continue + quality = 'high' if '_high' in video_url else 'low' formats.append({ 'url': video_url, 'preference': 10 if quality == 'high' else 0, diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 2c32b6ae2..9bde7f2d8 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -1,20 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, int_or_none, parse_duration, + parse_iso8601, xpath_element, xpath_text, ) class BRIE(InfoExtractor): - IE_DESC = 'Bayerischer Rundfunk Mediathek' + IE_DESC = 'Bayerischer Rundfunk' _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' _TESTS = [ @@ -123,10 +126,10 @@ class BRIE(InfoExtractor): for asset in assets.findall('asset'): format_url = xpath_text(asset, ['downloadUrl', 'url']) asset_type = asset.get('type') - if asset_type == 'HDS': + if asset_type.startswith('HDS'): formats.extend(self._extract_f4m_formats( format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False)) - elif asset_type == 'HLS': + elif asset_type.startswith('HLS'): formats.extend(self._extract_m3u8_formats( format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False)) else: @@ -169,3 +172,140 @@ class BRIE(InfoExtractor): } for variant in variants.findall('variant') if xpath_text(variant, 'url')] thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) return thumbnails + + +class BRMediathekIE(InfoExtractor): + IE_DESC = 'Bayerischer Rundfunk Mediathek' + _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})' + + _TESTS = [{ + 'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e', + 'md5': 'fdc3d485835966d1622587d08ba632ec', + 'info_dict': { + 'id': 'av:5a1e6a6e8fce6d001871cc8e', + 'ext': 'mp4', + 'title': 'Die Sendung vom 28.11.2017', + 'description': 'md5:6000cdca5912ab2277e5b7339f201ccc', + 'timestamp': 1511942766, + 'upload_date': '20171129', + } + }] + + def _real_extract(self, url): + clip_id = self._match_id(url) + + clip = self._download_json( + 'https://proxy-base.master.mango.express/graphql', + clip_id, data=json.dumps({ + "query": """{ + viewer { + clip(id: "%s") { + title + description + duration + createdAt + ageRestriction + videoFiles { + edges { + node { + publicLocation + fileSize + videoProfile { + width + height + bitrate + encoding + } + } + } + } + captionFiles { + edges { + node { + publicLocation + } + } + } + teaserImages { + edges { + node { + imageFiles { + edges { + node { + publicLocation + width + height + } + } + } + } + } + } + } + } +}""" % clip_id}).encode(), headers={ + 'Content-Type': 'application/json', + })['data']['viewer']['clip'] + title = clip['title'] + + formats = [] + for edge in clip.get('videoFiles', {}).get('edges', []): + node = edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + ext = determine_ext(n_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + n_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + video_profile = node.get('videoProfile', {}) + tbr = int_or_none(video_profile.get('bitrate')) + format_id = 'http' + if tbr: + format_id += '-%d' % tbr + formats.append({ + 'format_id': format_id, + 'url': n_url, + 'width': int_or_none(video_profile.get('width')), + 'height': int_or_none(video_profile.get('height')), + 'tbr': tbr, + 'filesize': int_or_none(node.get('fileSize')), + }) + self._sort_formats(formats) + + subtitles = {} + for edge in clip.get('captionFiles', {}).get('edges', []): + node = edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + subtitles.setdefault('de', []).append({ + 'url': n_url, + }) + + thumbnails = [] + for edge in clip.get('teaserImages', {}).get('edges', []): + for image_edge in edge.get('node', {}).get('imageFiles', {}).get('edges', []): + node = image_edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + thumbnails.append({ + 'url': n_url, + 'width': int_or_none(node.get('width')), + 'height': int_or_none(node.get('height')), + }) + + return { + 'id': clip_id, + 'title': title, + 'description': clip.get('description'), + 'duration': int_or_none(clip.get('duration')), + 'timestamp': parse_iso8601(clip.get('createdAt')), + 'age_limit': int_or_none(clip.get('ageRestriction')), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 5a87c2661..70d16767f 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -3,15 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_age_limit, -) +from ..utils import int_or_none class BreakIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<site>break|screenjunkies)\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' + _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', 'info_dict': { @@ -19,125 +17,73 @@ class BreakIE(InfoExtractor): 'ext': 'mp4', 'title': 'When Girls Act Like D-Bags', 'age_limit': 13, + }, + }, { + # youtube embed + 'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work', + 'info_dict': { + 'id': 'RrrDLdeL2HQ', + 'ext': 'mp4', + 'title': 'Whale Watching Boat Crashing Into San Diego Dock', + 'description': 'md5:afc1b2772f0a8468be51dd80eb021069', + 'upload_date': '20160331', + 'uploader': 'Steve Holden', + 'uploader_id': 'sdholden07', + }, + 'params': { + 'skip_download': True, } - }, { - 'url': 'http://www.screenjunkies.com/video/best-quentin-tarantino-movie-2841915', - 'md5': '5c2b686bec3d43de42bde9ec047536b0', - 'info_dict': { - 'id': '2841915', - 'display_id': 'best-quentin-tarantino-movie', - 'ext': 'mp4', - 'title': 'Best Quentin Tarantino Movie', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3671, - 'age_limit': 13, - 'tags': list, - }, - }, { - 'url': 'http://www.screenjunkies.com/video/honest-trailers-the-dark-knight', - 'info_dict': { - 'id': '2348808', - 'display_id': 'honest-trailers-the-dark-knight', - 'ext': 'mp4', - 'title': 'Honest Trailers - The Dark Knight', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)', - 'age_limit': 10, - 'tags': list, - }, - }, { - # requires subscription but worked around - 'url': 'http://www.screenjunkies.com/video/knocking-dead-ep-1-the-show-so-far-3003285', - 'info_dict': { - 'id': '3003285', - 'display_id': 'knocking-dead-ep-1-the-show-so-far', - 'ext': 'mp4', - 'title': 'State of The Dead Recap: Knocking Dead Pilot', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3307, - 'age_limit': 13, - 'tags': list, - }, }, { 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', 'only_matching': True, }] - _DEFAULT_BITRATES = (48, 150, 320, 496, 864, 2240, 3264) - def _real_extract(self, url): - site, display_id, video_id = re.match(self._VALID_URL, url).groups() + display_id, video_id = re.match(self._VALID_URL, url).groups() - if not video_id: - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - (r'src=["\']/embed/(\d+)', r'data-video-content-id=["\'](\d+)'), - webpage, 'video id') + webpage = self._download_webpage(url, display_id) - webpage = self._download_webpage( - 'http://www.%s.com/embed/%s' % (site, video_id), - display_id, 'Downloading video embed page') - embed_vars = self._parse_json( + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) + + content = self._parse_json( self._search_regex( - r'(?s)embedVars\s*=\s*({.+?})\s*</script>', webpage, 'embed vars'), + r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', webpage, + 'content'), display_id) - youtube_id = embed_vars.get('youtubeId') - if youtube_id: - return self.url_result(youtube_id, 'Youtube') - - title = embed_vars['contentName'] - formats = [] - bitrates = [] - for f in embed_vars.get('media', []): - if not f.get('uri') or f.get('mediaPurpose') != 'play': + for video in content: + video_url = video.get('url') + if not video_url or not isinstance(video_url, compat_str): continue - bitrate = int_or_none(f.get('bitRate')) - if bitrate: - bitrates.append(bitrate) + bitrate = int_or_none(self._search_regex( + r'(\d+)_kbps', video_url, 'tbr', default=None)) formats.append({ - 'url': f['uri'], + 'url': video_url, 'format_id': 'http-%d' % bitrate if bitrate else 'http', - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), 'tbr': bitrate, - 'format': 'mp4', }) - - if not bitrates: - # When subscriptionLevel > 0, i.e. plus subscription is required - # media list will be empty. However, hds and hls uris are still - # available. We can grab them assuming bitrates to be default. - bitrates = self._DEFAULT_BITRATES - - auth_token = embed_vars.get('AuthToken') - - def construct_manifest_url(base_url, ext): - pieces = [base_url] - pieces.extend([compat_str(b) for b in bitrates]) - pieces.append('_kbps.mp4.%s?%s' % (ext, auth_token)) - return ','.join(pieces) - - if bitrates and auth_token: - hds_url = embed_vars.get('hdsUri') - if hds_url: - formats.extend(self._extract_f4m_formats( - construct_manifest_url(hds_url, 'f4m'), - display_id, f4m_id='hds', fatal=False)) - hls_url = embed_vars.get('hlsUri') - if hls_url: - formats.extend(self._extract_m3u8_formats( - construct_manifest_url(hls_url, 'm3u8'), - display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) + title = self._search_regex( + (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + r'<h1[^>]*>(?P<value>[^<]+)'), webpage, 'title', group='value') + + def get(key, name): + return int_or_none(self._search_regex( + r'%s["\']\s*:\s*["\'](\d+)' % key, webpage, name, + default=None)) + + age_limit = get('ratings', 'age limit') + video_id = video_id or get('pid', 'video id') or display_id + return { 'id': video_id, 'display_id': display_id, 'title': title, - 'thumbnail': embed_vars.get('thumbUri'), - 'duration': int_or_none(embed_vars.get('videoLengthInSeconds')) or None, - 'age_limit': parse_age_limit(embed_vars.get('audienceRating')), - 'tags': embed_vars.get('tags', '').split(','), + 'thumbnail': self._og_search_thumbnail(webpage), + 'age_limit': age_limit, 'formats': formats, } diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0ed59bcbc..0e4eaef65 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -464,7 +464,7 @@ class BrightcoveNewIE(AdobePassIE): 'timestamp': 1441391203, 'upload_date': '20150904', 'uploader_id': '929656772001', - 'formats': 'mincount:22', + 'formats': 'mincount:20', }, }, { # with rtmp streams @@ -478,7 +478,7 @@ class BrightcoveNewIE(AdobePassIE): 'timestamp': 1433556729, 'upload_date': '20150606', 'uploader_id': '4036320279001', - 'formats': 'mincount:41', + 'formats': 'mincount:39', }, 'params': { # m3u8 download @@ -564,59 +564,7 @@ class BrightcoveNewIE(AdobePassIE): return entries - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass(smuggled_data.get('geo_countries')) - - account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage( - 'http://players.brightcove.net/%s/%s_%s/index.min.js' - % (account_id, player_id, embed), video_id) - - policy_key = None - - catalog = self._search_regex( - r'catalog\(({.+?})\);', webpage, 'catalog', default=None) - if catalog: - catalog = self._parse_json( - js_to_json(catalog), video_id, fatal=False) - if catalog: - policy_key = catalog.get('policyKey') - - if not policy_key: - policy_key = self._search_regex( - r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', - webpage, 'policy key', group='pk') - - api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) - try: - json_data = self._download_json(api_url, video_id, headers={ - 'Accept': 'application/json;pk=%s' % policy_key - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - json_data = self._parse_json(e.cause.read().decode(), video_id)[0] - message = json_data.get('message') or json_data['error_code'] - if json_data.get('error_subcode') == 'CLIENT_GEO': - self.raise_geo_restricted(msg=message) - raise ExtractorError(message, expected=True) - raise - - errors = json_data.get('errors') - if errors and errors[0].get('error_subcode') == 'TVE_AUTH': - custom_fields = json_data['custom_fields'] - tve_token = self._extract_mvpd_auth( - smuggled_data['source_url'], video_id, - custom_fields['bcadobepassrequestorid'], - custom_fields['bcadobepassresourceid']) - json_data = self._download_json( - api_url, video_id, headers={ - 'Accept': 'application/json;pk=%s' % policy_key - }, query={ - 'tveToken': tve_token, - }) - + def _parse_brightcove_metadata(self, json_data, video_id, headers={}): title = json_data['name'].strip() formats = [] @@ -682,6 +630,7 @@ class BrightcoveNewIE(AdobePassIE): }) formats.append(f) + errors = json_data.get('errors') if not formats and errors: error = errors[0] raise ExtractorError( @@ -689,6 +638,9 @@ class BrightcoveNewIE(AdobePassIE): self._sort_formats(formats) + for f in formats: + f.setdefault('http_headers', {}).update(headers) + subtitles = {} for text_track in json_data.get('text_tracks', []): if text_track.get('src'): @@ -708,9 +660,72 @@ class BrightcoveNewIE(AdobePassIE): 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), 'duration': duration, 'timestamp': parse_iso8601(json_data.get('published_at')), - 'uploader_id': account_id, + 'uploader_id': json_data.get('account_id'), 'formats': formats, 'subtitles': subtitles, 'tags': json_data.get('tags', []), 'is_live': is_live, } + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, embed), video_id) + + policy_key = None + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', + webpage, 'policy key', group='pk') + + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) + headers = { + 'Accept': 'application/json;pk=%s' % policy_key, + } + referrer = smuggled_data.get('referrer') + if referrer: + headers.update({ + 'Referer': referrer, + 'Origin': re.search(r'https?://[^/]+', referrer).group(0), + }) + try: + json_data = self._download_json(api_url, video_id, headers=headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + message = json_data.get('message') or json_data['error_code'] + if json_data.get('error_subcode') == 'CLIENT_GEO': + self.raise_geo_restricted(msg=message) + raise ExtractorError(message, expected=True) + raise + + errors = json_data.get('errors') + if errors and errors[0].get('error_subcode') == 'TVE_AUTH': + custom_fields = json_data['custom_fields'] + tve_token = self._extract_mvpd_auth( + smuggled_data['source_url'], video_id, + custom_fields['bcadobepassrequestorid'], + custom_fields['bcadobepassresourceid']) + json_data = self._download_json( + api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }, query={ + 'tveToken': tve_token, + }) + + return self._parse_brightcove_metadata( + json_data, video_id, headers=headers) diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 8ef089653..4bf4efe1f 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -3,20 +3,19 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError class BYUtvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?byutv\.org/watch/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?' + _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?' _TESTS = [{ 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', 'info_dict': { - 'id': '6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', + 'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH', 'display_id': 'studio-c-season-5-episode-5', 'ext': 'mp4', 'title': 'Season 5 Episode 5', - 'description': 'md5:e07269172baff037f8e8bf9956bc9747', - 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:1d31dc18ef4f075b28f6a65937d22c65', + 'thumbnail': r're:^https?://.*', 'duration': 1486.486, }, 'params': { @@ -26,6 +25,9 @@ class BYUtvIE(InfoExtractor): }, { 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', 'only_matching': True, + }, { + 'url': 'https://www.byutv.org/player/27741493-dc83-40b0-8420-e7ae38a2ae98/byu-football-toledo-vs-byu-93016?listid=4fe0fee5-0d3c-4a29-b725-e4948627f472&listindex=0&q=toledo', + 'only_matching': True, }] def _real_extract(self, url): @@ -33,16 +35,16 @@ class BYUtvIE(InfoExtractor): video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id - webpage = self._download_webpage(url, display_id) - episode_code = self._search_regex( - r'(?s)episode:(.*?\}),\s*\n', webpage, 'episode information') - - ep = self._parse_json( - episode_code, display_id, transform_source=lambda s: - re.sub(r'(\n\s+)([a-zA-Z]+):\s+\'(.*?)\'', r'\1"\2": "\3"', s)) - - if ep['providerType'] != 'Ooyala': - raise ExtractorError('Unsupported provider %s' % ep['provider']) + ep = self._download_json( + 'https://api.byutv.org/api3/catalog/getvideosforcontent', video_id, + query={ + 'contentid': video_id, + 'channel': 'byutv', + 'x-byutv-context': 'web$US', + }, headers={ + 'x-byutv-context': 'web$US', + 'x-byutv-platformkey': 'xsaaw9c7y5', + })['ooyalaVOD'] return { '_type': 'url_transparent', @@ -50,44 +52,7 @@ class BYUtvIE(InfoExtractor): 'url': 'ooyala:%s' % ep['providerId'], 'id': video_id, 'display_id': display_id, - 'title': ep['title'], + 'title': ep.get('title'), 'description': ep.get('description'), 'thumbnail': ep.get('imageThumbnail'), } - - -class BYUtvEventIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?byutv\.org/watch/event/(?P<id>[0-9a-f-]+)' - _TEST = { - 'url': 'http://www.byutv.org/watch/event/29941b9b-8bf6-48d2-aebf-7a87add9e34b', - 'info_dict': { - 'id': '29941b9b-8bf6-48d2-aebf-7a87add9e34b', - 'ext': 'mp4', - 'title': 'Toledo vs. BYU (9/30/16)', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - ooyala_id = self._search_regex( - r'providerId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'ooyala id', group='id') - - title = self._search_regex( - r'class=["\']description["\'][^>]*>\s*<h1>([^<]+)</h1>', webpage, - 'title').strip() - - return { - '_type': 'url_transparent', - 'ie_key': 'Ooyala', - 'url': 'ooyala:%s' % ooyala_id, - 'id': video_id, - 'title': title, - } diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index acd87e371..407cc8084 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -31,6 +31,10 @@ class Canalc2IE(InfoExtractor): webpage = self._download_webpage( 'http://www.canalc2.tv/video/%s' % video_id, video_id) + title = self._html_search_regex( + r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.+?)</h3>', + webpage, 'title') + formats = [] for _, video_url in re.findall(r'file\s*=\s*(["\'])(.+?)\1', webpage): if video_url.startswith('rtmp://'): @@ -49,17 +53,21 @@ class Canalc2IE(InfoExtractor): 'url': video_url, 'format_id': 'http', }) - self._sort_formats(formats) - title = self._html_search_regex( - r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.*?)</h3>', webpage, 'title') - duration = parse_duration(self._search_regex( - r'id=["\']video_duree["\'][^>]*>([^<]+)', - webpage, 'duration', fatal=False)) + if formats: + info = { + 'formats': formats, + } + else: + info = self._parse_html5_media_entries(url, webpage, url)[0] - return { + self._sort_formats(info['formats']) + + info.update({ 'id': video_id, 'title': title, - 'duration': duration, - 'formats': formats, - } + 'duration': parse_duration(self._search_regex( + r'id=["\']video_duree["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)), + }) + return info diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index d8bf073f4..51c11cb7e 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -4,59 +4,36 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( - dict_get, # ExtractorError, # HEADRequest, int_or_none, qualities, - remove_end, unified_strdate, ) class CanalplusIE(InfoExtractor): - IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv' - _VALID_URL = r'''(?x) - https?:// - (?: - (?: - (?:(?:www|m)\.)?canalplus\.fr| - (?:www\.)?piwiplus\.fr| - (?:www\.)?d8\.tv| - (?:www\.)?c8\.fr| - (?:www\.)?d17\.tv| - (?:(?:football|www)\.)?cstar\.fr| - (?:www\.)?itele\.fr - )/(?:(?:[^/]+/)*(?P<display_id>[^/?#&]+))?(?:\?.*\bvid=(?P<vid>\d+))?| - player\.canalplus\.fr/#/(?P<id>\d+) - ) - - ''' + IE_DESC = 'mycanal.fr and piwiplus.fr' + _VALID_URL = r'https?://(?:www\.)?(?P<site>mycanal|piwiplus)\.fr/(?:[^/]+/)*(?P<display_id>[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P<id>\d+)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' _SITE_ID_MAP = { - 'canalplus': 'cplus', + 'mycanal': 'cplus', 'piwiplus': 'teletoon', - 'd8': 'd8', - 'c8': 'd8', - 'd17': 'd17', - 'cstar': 'd17', - 'itele': 'itele', } # Only works for direct mp4 URLs _GEO_COUNTRIES = ['FR'] _TESTS = [{ - 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814', + 'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061', 'info_dict': { - 'id': '1405510', - 'display_id': 'pid1830-c-zapping', + 'id': '1397061', + 'display_id': 'lolywood', 'ext': 'mp4', - 'title': 'Zapping - 02/07/2016', - 'description': 'Le meilleur de toutes les chaînes, tous les jours', - 'upload_date': '20160702', + 'title': 'Euro 2016 : Je préfère te prévenir - Lolywood - Episode 34', + 'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e', + 'upload_date': '20160602', }, }, { # geo restricted, bypassed @@ -70,64 +47,12 @@ class CanalplusIE(InfoExtractor): 'upload_date': '20140724', }, 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, { - # geo restricted, bypassed - 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html?vid=1443684', - 'md5': 'bb6f9f343296ab7ebd88c97b660ecf8d', - 'info_dict': { - 'id': '1443684', - 'display_id': 'pid6318-videos-integrales', - 'ext': 'mp4', - 'title': 'Guess my iep ! - TPMP - 07/04/2017', - 'description': 'md5:6f005933f6e06760a9236d9b3b5f17fa', - 'upload_date': '20170407', - }, - 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, { - 'url': 'http://www.itele.fr/chroniques/invite-michael-darmon/rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', - 'info_dict': { - 'id': '1420176', - 'display_id': 'rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', - 'ext': 'mp4', - 'title': 'L\'invité de Michaël Darmon du 14/10/2016 - ', - 'description': 'Chaque matin du lundi au vendredi, Michaël Darmon reçoit un invité politique à 8h25.', - 'upload_date': '20161014', - }, - }, { - 'url': 'http://football.cstar.fr/cstar-minisite-foot/pid7566-feminines-videos.html?vid=1416769', - 'info_dict': { - 'id': '1416769', - 'display_id': 'pid7566-feminines-videos', - 'ext': 'mp4', - 'title': 'France - Albanie : les temps forts de la soirée - 20/09/2016', - 'description': 'md5:c3f30f2aaac294c1c969b3294de6904e', - 'upload_date': '20160921', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://m.canalplus.fr/?vid=1398231', - 'only_matching': True, - }, { - 'url': 'http://www.d17.tv/emissions/pid8303-lolywood.html?vid=1397061', - 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + site, display_id, video_id = re.match(self._VALID_URL, url).groups() - site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]] - - # Beware, some subclasses do not define an id group - display_id = remove_end(dict_get(mobj.groupdict(), ('display_id', 'id', 'vid')), '.html') - - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)', - r'id=["\']canal_video_player(?P<id>\d+)', - r'data-video=["\'](?P<id>\d+)'], - webpage, 'video id', default=mobj.group('vid'), group='id') + site_id = self._SITE_ID_MAP[site] info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) video_data = self._download_json(info_url, video_id, 'Downloading video JSON') @@ -161,7 +86,7 @@ class CanalplusIE(InfoExtractor): format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False)) else: formats.append({ - # the secret extracted ya function in http://player.canalplus.fr/common/js/canalPlayer.js + # the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js 'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes', 'format_id': format_id, 'preference': preference(format_id), diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index aada02917..8ac62c1a6 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -1,26 +1,112 @@ from __future__ import unicode_literals import re +import json from .common import InfoExtractor -from ..utils import float_or_none +from .gigya import GigyaBaseIE +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + strip_or_none, + float_or_none, + int_or_none, + parse_iso8601, +) class CanvasIE(InfoExtractor): + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrtvideo)/assets/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'md5': '90139b746a0a9bd7bb631283f6e2a64e', + 'info_dict': { + 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'ext': 'flv', + 'title': 'Nachtwacht: De Greystook', + 'description': 'md5:1db3f5dc4c7109c821261e7512975be7', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1468.03, + }, + 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], + }, { + 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site_id, video_id = mobj.group('site_id'), mobj.group('id') + + data = self._download_json( + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), video_id) + + title = data['title'] + description = data.get('description') + + formats = [] + for target in data['targetUrls']: + format_url, format_type = target.get('url'), target.get('type') + if not format_url or not format_type: + continue + if format_type == 'HLS': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_type, fatal=False)) + elif format_type == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_type, fatal=False)) + elif format_type == 'MPEG_DASH': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id=format_type, fatal=False)) + elif format_type == 'HSS': + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + else: + formats.append({ + 'format_id': format_type, + 'url': format_url, + }) + self._sort_formats(formats) + + subtitles = {} + subtitle_urls = data.get('subtitleUrls') + if isinstance(subtitle_urls, list): + for subtitle in subtitle_urls: + subtitle_url = subtitle.get('url') + if subtitle_url and subtitle.get('type') == 'CLOSED': + subtitles.setdefault('nl', []).append({'url': subtitle_url}) + + return { + 'id': video_id, + 'display_id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'duration': float_or_none(data.get('duration'), 1000), + 'thumbnail': data.get('posterImageUrl'), + 'subtitles': subtitles, + } + + +class CanvasEenIE(InfoExtractor): IE_DESC = 'canvas.be and een.be' _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', - 'md5': 'ea838375a547ac787d4064d8c7860a6c', + 'md5': 'ed66976748d12350b118455979cca293', 'info_dict': { 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'De afspraak veilt voor de Warmste Week', 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 49.02, - } + }, + 'expected_warnings': ['is not a supported codec'], }, { # with subtitles 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167', @@ -40,7 +126,8 @@ class CanvasIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Pagina niet gevonden', }, { 'url': 'https://www.een.be/sorry-voor-alles/herbekijk-sorry-voor-alles', 'info_dict': { @@ -54,7 +141,8 @@ class CanvasIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Episode no longer available', }, { 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', 'only_matching': True, @@ -66,55 +154,157 @@ class CanvasIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - title = (self._search_regex( + title = strip_or_none(self._search_regex( r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>', webpage, 'title', default=None) or self._og_search_title( - webpage)).strip() + webpage, default=None)) video_id = self._html_search_regex( - r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', group='id') - - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), display_id) - - formats = [] - for target in data['targetUrls']: - format_url, format_type = target.get('url'), target.get('type') - if not format_url or not format_type: - continue - if format_type == 'HLS': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, entry_protocol='m3u8_native', - ext='mp4', preference=0, fatal=False, m3u8_id=format_type)) - elif format_type == 'HDS': - formats.extend(self._extract_f4m_formats( - format_url, display_id, f4m_id=format_type, fatal=False)) - elif format_type == 'MPEG_DASH': - formats.extend(self._extract_mpd_formats( - format_url, display_id, mpd_id=format_type, fatal=False)) - else: - formats.append({ - 'format_id': format_type, - 'url': format_url, - }) - self._sort_formats(formats) - - subtitles = {} - subtitle_urls = data.get('subtitleUrls') - if isinstance(subtitle_urls, list): - for subtitle in subtitle_urls: - subtitle_url = subtitle.get('url') - if subtitle_url and subtitle.get('type') == 'CLOSED': - subtitles.setdefault('nl', []).append({'url': subtitle_url}) + r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', + group='id') return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id), + 'ie_key': CanvasIE.ie_key(), 'id': video_id, 'display_id': display_id, 'title': title, 'description': self._og_search_description(webpage), - 'formats': formats, - 'duration': float_or_none(data.get('duration'), 1000), - 'thumbnail': data.get('posterImageUrl'), - 'subtitles': subtitles, + } + + +class VrtNUIE(GigyaBaseIE): + IE_DESC = 'VrtNU.be' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', + 'info_dict': { + 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', + 'ext': 'flv', + 'title': 'De zwarte weduwe', + 'description': 'md5:d90c21dced7db869a85db89a623998d4', + 'duration': 1457.04, + 'thumbnail': r're:^https?://.*\.jpg$', + 'season': '1', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'This video is only available for registered users' + }] + _NETRC_MACHINE = 'vrtnu' + _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' + _CONTEXT_ID = 'R3595707040' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + auth_data = { + 'APIKey': self._APIKEY, + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + } + + auth_info = self._gigya_login(auth_data) + + # Sometimes authentication fails for no good reason, retry + login_attempt = 1 + while login_attempt <= 3: + try: + # When requesting a token, no actual token is returned, but the + # necessary cookies are set. + self._request_webpage( + 'https://token.vrt.be', + None, note='Requesting a token', errnote='Could not get a token', + headers={ + 'Content-Type': 'application/json', + 'Referer': 'https://www.vrt.be/vrtnu/', + }, + data=json.dumps({ + 'uid': auth_info['UID'], + 'uidsig': auth_info['UIDSignature'], + 'ts': auth_info['signatureTimestamp'], + 'email': auth_info['profile']['email'], + }).encode('utf-8')) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + login_attempt += 1 + self.report_warning('Authentication failed') + self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') + else: + raise e + else: + break + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage, urlh = self._download_webpage_handle(url, display_id) + + title = self._html_search_regex( + r'(?ms)<h1 class="content__heading">(.+?)</h1>', + webpage, 'title').strip() + + description = self._html_search_regex( + r'(?ms)<div class="content__description">(.+?)</div>', + webpage, 'description', default=None) + + season = self._html_search_regex( + [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s* + <span>seizoen\ (.+?)</span>\s* + </div>''', + r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'], + webpage, 'season', default=None) + + season_number = int_or_none(season) + + episode_number = int_or_none(self._html_search_regex( + r'''(?xms)<div\ class="content__episode">\s* + <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span> + </div>''', + webpage, 'episode_number', default=None)) + + release_date = parse_iso8601(self._html_search_regex( + r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"', + webpage, 'release_date', default=None)) + + # If there's a ? or a # in the URL, remove them and everything after + clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/') + securevideo_url = clean_url + '.mssecurevideo.json' + + try: + video = self._download_json(securevideo_url, display_id) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self.raise_login_required() + raise + + # We are dealing with a '../<show>.relevant' URL + redirect_url = video.get('url') + if redirect_url: + return self.url_result(self._proto_relative_url(redirect_url, 'https:')) + + # There is only one entry, but with an unknown key, so just get + # the first one + video_id = list(video.values())[0].get('videoid') + + return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'season': season, + 'season_number': season_number, + 'episode_number': episode_number, + 'release_date': release_date, } diff --git a/youtube_dl/extractor/cartoonnetwork.py b/youtube_dl/extractor/cartoonnetwork.py index 086ec90c9..6aeebd7b3 100644 --- a/youtube_dl/extractor/cartoonnetwork.py +++ b/youtube_dl/extractor/cartoonnetwork.py @@ -31,7 +31,7 @@ class CartoonNetworkIE(TurnerBaseIE): 'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?networkName=CN2&' + query, video_id, { 'secure': { 'media_src': 'http://androidhls-secure.cdn.turner.com/toon/big', - 'tokenizer_src': 'http://www.cartoonnetwork.com/cntv/mvpd/processors/services/token_ipadAdobe.do', + 'tokenizer_src': 'https://token.vgtf.net/token/token_mobile', }, }, { 'url': url, diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 87ad14e91..54b4b9be9 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -1,10 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_HTTPError, +) from ..utils import ( js_to_json, smuggle_url, @@ -13,6 +17,7 @@ from ..utils import ( xpath_element, xpath_with_ns, find_xpath_attr, + parse_duration, parse_iso8601, parse_age_limit, int_or_none, @@ -200,33 +205,52 @@ class CBCWatchBaseIE(InfoExtractor): 'media': 'http://search.yahoo.com/mrss/', 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', } + _GEO_COUNTRIES = ['CA'] def _call_api(self, path, video_id): url = path if path.startswith('http') else self._API_BASE_URL + path - result = self._download_xml(url, video_id, headers={ - 'X-Clearleap-DeviceId': self._device_id, - 'X-Clearleap-DeviceToken': self._device_token, - }) + for _ in range(2): + try: + result = self._download_xml(url, video_id, headers={ + 'X-Clearleap-DeviceId': self._device_id, + 'X-Clearleap-DeviceToken': self._device_token, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + # Device token has expired, re-acquiring device token + self._register_device() + continue + raise error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage') if error_message: raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message)) return result def _real_initialize(self): - if not self._device_id or not self._device_token: - device = self._downloader.cache.load('cbcwatch', 'device') or {} - self._device_id, self._device_token = device.get('id'), device.get('token') - if not self._device_id or not self._device_token: - result = self._download_xml( - self._API_BASE_URL + 'device/register', - None, data=b'<device><type>web</type></device>') - self._device_id = xpath_text(result, 'deviceId', fatal=True) - self._device_token = xpath_text(result, 'deviceToken', fatal=True) - self._downloader.cache.store( - 'cbcwatch', 'device', { - 'id': self._device_id, - 'token': self._device_token, - }) + if self._valid_device_token(): + return + device = self._downloader.cache.load('cbcwatch', 'device') or {} + self._device_id, self._device_token = device.get('id'), device.get('token') + if self._valid_device_token(): + return + self._register_device() + + def _valid_device_token(self): + return self._device_id and self._device_token + + def _register_device(self): + self._device_id = self._device_token = None + result = self._download_xml( + self._API_BASE_URL + 'device/register', + None, 'Acquiring device token', + data=b'<device><type>web</type></device>') + self._device_id = xpath_text(result, 'deviceId', fatal=True) + self._device_token = xpath_text(result, 'deviceToken', fatal=True) + self._downloader.cache.store( + 'cbcwatch', 'device', { + 'id': self._device_id, + 'token': self._device_token, + }) def _parse_rss_feed(self, rss): channel = xpath_element(rss, 'channel', fatal=True) @@ -287,6 +311,11 @@ class CBCWatchBaseIE(InfoExtractor): class CBCWatchVideoIE(CBCWatchBaseIE): IE_NAME = 'cbc.ca:watch:video' _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TEST = { + # geo-restricted to Canada, bypassable + 'url': 'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235', + 'only_matching': True, + } def _real_extract(self, url): video_id = self._match_id(url) @@ -323,9 +352,10 @@ class CBCWatchIE(CBCWatchBaseIE): IE_NAME = 'cbc.ca:watch' _VALID_URL = r'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)' _TESTS = [{ + # geo-restricted to Canada, bypassable 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4', 'info_dict': { - 'id': '38e815a-009e3ab12e4', + 'id': '9673749a-5e77-484c-8b62-a1092a6b5168', 'ext': 'mp4', 'title': 'Customer (Dis)Service', 'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87', @@ -337,8 +367,8 @@ class CBCWatchIE(CBCWatchBaseIE): 'skip_download': True, 'format': 'bestvideo', }, - 'skip': 'Geo-restricted to Canada', }, { + # geo-restricted to Canada, bypassable 'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057', 'info_dict': { 'id': '1ed4b385-cd84-49cf-95f0-80f004680057', @@ -346,10 +376,69 @@ class CBCWatchIE(CBCWatchBaseIE): 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.', }, 'playlist_mincount': 30, - 'skip': 'Geo-restricted to Canada', }] def _real_extract(self, url): video_id = self._match_id(url) rss = self._call_api('web/browse/' + video_id, video_id) return self._parse_rss_feed(rss) + + +class CBCOlympicsIE(InfoExtractor): + IE_NAME = 'cbc.ca:olympics' + _VALID_URL = r'https?://olympics\.cbc\.ca/video/[^/]+/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._hidden_inputs(webpage)['videoId'] + video_doc = self._download_xml( + 'https://olympics.cbc.ca/videodata/%s.xml' % video_id, video_id) + title = xpath_text(video_doc, 'title', fatal=True) + is_live = xpath_text(video_doc, 'kind') == 'Live' + if is_live: + title = self._live_title(title) + + formats = [] + for video_source in video_doc.findall('videoSources/videoSource'): + uri = xpath_text(video_source, 'uri') + if not uri: + continue + tokenize = self._download_json( + 'https://olympics.cbc.ca/api/api-akamai/tokenize', + video_id, data=json.dumps({ + 'VideoSource': uri, + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Referer': url, + # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js + 'Cookie': '_dvp=TK:C0ObxjerU', # AKAMAI CDN cookie + }, fatal=False) + if not tokenize: + continue + content_url = tokenize['ContentUrl'] + video_source_format = video_source.get('format') + if video_source_format == 'IIS': + formats.extend(self._extract_ism_formats( + content_url, video_id, ism_id=video_source_format, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + content_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', + m3u8_id=video_source_format, fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': xpath_text(video_doc, 'description'), + 'thumbnail': xpath_text(video_doc, 'thumbnailUrl'), + 'duration': parse_duration(xpath_text(video_doc, 'duration')), + 'formats': formats, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 1268e38ef..1799d63ea 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .theplatform import ThePlatformFeedIE from ..utils import ( + ExtractorError, int_or_none, find_xpath_attr, xpath_element, @@ -61,9 +62,10 @@ class CBSIE(CBSBaseIE): asset_types = [] subtitles = {} formats = [] + last_e = None for item in items_data.findall('.//item'): asset_type = xpath_text(item, 'assetType') - if not asset_type or asset_type in asset_types: + if not asset_type or asset_type in asset_types or asset_type in ('HLS_FPS', 'DASH_CENC'): continue asset_types.append(asset_type) query = { @@ -74,11 +76,17 @@ class CBSIE(CBSBaseIE): query['formats'] = 'MPEG4,M3U' elif asset_type in ('RTMP', 'WIFI', '3G'): query['formats'] = 'MPEG4,FLV' - tp_formats, tp_subtitles = self._extract_theplatform_smil( - update_url_query(tp_release_url, query), content_id, - 'Downloading %s SMIL data' % asset_type) + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + update_url_query(tp_release_url, query), content_id, + 'Downloading %s SMIL data' % asset_type) + except ExtractorError as e: + last_e = e + continue formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if last_e and not formats: + raise last_e self._sort_formats(formats) info = self._extract_theplatform_metadata(tp_path, content_id) diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py index 681d63e29..6596e98a6 100644 --- a/youtube_dl/extractor/cbsinteractive.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -75,10 +75,10 @@ class CBSInteractiveIE(CBSIE): webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"data-(?:cnet|zdnet)-video(?:-uvp(?:js)?)?-options='([^']+)'", + r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'", webpage, 'data json') data = self._parse_json(data_json, display_id) - vdata = data.get('video') or data['videos'][0] + vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0] video_id = vdata['mpxRefId'] diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 7d78e3aae..90852a9ef 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -91,12 +91,10 @@ class CBSLocalIE(AnvatoIE): info_dict = self._extract_anvato_videos(webpage, display_id) - time_str = self._html_search_regex( - r'class="entry-date">([^<]+)<', webpage, 'released date', default=None) - if time_str: - timestamp = unified_timestamp(time_str) - else: - timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage)) + timestamp = unified_timestamp(self._html_search_regex( + r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage, + 'released date', default=None)) or parse_iso8601( + self._html_search_meta('uploadDate', webpage)) info_dict.update({ 'display_id': display_id, diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 3a62c840b..83b764762 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -4,28 +4,35 @@ from .cbs import CBSBaseIE class CBSSportsIE(CBSBaseIE): - _VALID_URL = r'https?://(?:www\.)?cbssports\.com/video/player/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.cbssports.com/video/player/videos/708337219968/0/ben-simmons-the-next-lebron?-not-so-fast', + 'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/', 'info_dict': { - 'id': '708337219968', + 'id': '1214315075735', 'ext': 'mp4', - 'title': 'Ben Simmons the next LeBron? Not so fast', - 'description': 'md5:854294f627921baba1f4b9a990d87197', - 'timestamp': 1466293740, - 'upload_date': '20160618', + 'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder', + 'description': 'md5:df6f48622612c2d6bd2e295ddef58def', + 'timestamp': 1524111457, + 'upload_date': '20180419', 'uploader': 'CBSI-NEW', }, 'params': { # m3u8 download 'skip_download': True, } + }, { + 'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/', + 'only_matching': True, }] def _extract_video_info(self, filter_query, video_id): return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], + webpage, 'video id') return self._extract_video_info('byId=%s' % video_id, video_id) diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index 39938c9ac..07f5206c1 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -4,11 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + clean_html, int_or_none, parse_duration, parse_iso8601, - clean_html, + parse_resolution, ) @@ -40,34 +42,42 @@ class CCMAIE(InfoExtractor): def _real_extract(self, url): media_type, media_id = re.match(self._VALID_URL, url).groups() - media_data = {} - formats = [] - profiles = ['pc'] if media_type == 'audio' else ['mobil', 'pc'] - for i, profile in enumerate(profiles): - md = self._download_json('http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ + + media = self._download_json( + 'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ 'media': media_type, 'idint': media_id, - 'profile': profile, - }, fatal=False) - if md: - media_data = md - media_url = media_data.get('media', {}).get('url') - if media_url: - formats.append({ - 'format_id': profile, - 'url': media_url, - 'quality': i, - }) + }) + + formats = [] + media_url = media['media']['url'] + if isinstance(media_url, list): + for format_ in media_url: + format_url = format_.get('file') + if not format_url or not isinstance(format_url, compat_str): + continue + label = format_.get('label') + f = parse_resolution(label) + f.update({ + 'url': format_url, + 'format_id': label, + }) + formats.append(f) + else: + formats.append({ + 'url': media_url, + 'vcodec': 'none' if media_type == 'audio' else None, + }) self._sort_formats(formats) - informacio = media_data['informacio'] + informacio = media['informacio'] title = informacio['titol'] durada = informacio.get('durada', {}) duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc')) subtitles = {} - subtitols = media_data.get('subtitols', {}) + subtitols = media.get('subtitols', {}) if subtitols: sub_url = subtitols.get('url') if sub_url: @@ -77,7 +87,7 @@ class CCMAIE(InfoExtractor): }) thumbnails = [] - imatges = media_data.get('imatges', {}) + imatges = media.get('imatges', {}) if imatges: thumbnail_url = imatges.get('url') if thumbnail_url: @@ -93,7 +103,7 @@ class CCMAIE(InfoExtractor): 'description': clean_html(informacio.get('descripcio')), 'duration': duration, 'timestamp': timestamp, - 'thumnails': thumbnails, + 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, } diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py old mode 100755 new mode 100644 index 78b7a923c..0c3af23d5 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -124,7 +124,7 @@ class CDAIE(InfoExtractor): } def extract_format(page, version): - json_str = self._search_regex( + json_str = self._html_search_regex( r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, '%s player_json' % version, fatal=False, group='player_data') if not json_str: diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index e250de18c..6bad90859 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -13,6 +13,7 @@ from ..utils import ( float_or_none, sanitized_Request, unescapeHTML, + update_url_query, urlencode_postdata, USER_AGENTS, ) @@ -265,6 +266,10 @@ class CeskaTelevizePoradyIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, }] def _real_extract(self, url): @@ -272,8 +277,11 @@ class CeskaTelevizePoradyIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - data_url = unescapeHTML(self._search_regex( - r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'iframe player url', group='url')) + data_url = update_url_query(unescapeHTML(self._search_regex( + (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), + webpage, 'iframe player url', group='url')), query={ + 'autoStart': 'true', + }) return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index e92894246..81108e704 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -81,6 +81,12 @@ class Channel9IE(InfoExtractor): _RSS_URL = 'http://channel9.msdn.com/%s/RSS' + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b', + webpage) + def _extract_list(self, video_id, rss_url=None): if not rss_url: rss_url = self._RSS_URL % video_id diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py index 2d517f231..42c9af263 100644 --- a/youtube_dl/extractor/charlierose.py +++ b/youtube_dl/extractor/charlierose.py @@ -5,7 +5,7 @@ from ..utils import remove_end class CharlieRoseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://charlierose.com/videos/27996', 'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', @@ -24,6 +24,9 @@ class CharlieRoseIE(InfoExtractor): }, { 'url': 'https://charlierose.com/videos/27996', 'only_matching': True, + }, { + 'url': 'https://charlierose.com/episodes/30887?autoplay=true', + 'only_matching': True, }] _PLAYER_BASE = 'https://charlierose.com/video/player/%s' diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index 0206d96db..5aac21299 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -1,10 +1,11 @@ from __future__ import unicode_literals import re -import base64 import json from .common import InfoExtractor +from .youtube import YoutubeIE +from ..compat import compat_b64decode from ..utils import ( clean_html, ExtractorError @@ -57,7 +58,7 @@ class ChilloutzoneIE(InfoExtractor): base64_video_info = self._html_search_regex( r'var cozVidData = "(.+?)";', webpage, 'video data') - decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8') + decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8') video_info_dict = json.loads(decoded_video_info) # get video information from dict @@ -70,11 +71,9 @@ class ChilloutzoneIE(InfoExtractor): # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed) if native_platform is None: - youtube_url = self._html_search_regex( - r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', - webpage, 'fallback video URL', default=None) - if youtube_url is not None: - return self.url_result(youtube_url, ie='Youtube') + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) # Non Fallback: Decide to use native source (e.g. youtube or vimeo) or # the own CDN diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 4815b34be..8d75cdf19 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,10 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import re from .common import InfoExtractor +from ..compat import compat_b64decode from ..utils import parse_duration @@ -44,8 +44,7 @@ class ChirbitIE(InfoExtractor): # Reverse engineered from https://chirb.it/js/chirbit.player.js (look # for soundURL) - audio_url = base64.b64decode( - data_fd[::-1].encode('ascii')).decode('utf-8') + audio_url = compat_b64decode(data_fd[::-1]).decode('utf-8') title = self._search_regex( r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title') diff --git a/youtube_dl/extractor/clippit.py b/youtube_dl/extractor/clippit.py new file mode 100644 index 000000000..a1a7a774c --- /dev/null +++ b/youtube_dl/extractor/clippit.py @@ -0,0 +1,74 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + qualities, +) + +import re + + +class ClippitIE(InfoExtractor): + + _VALID_URL = r'https?://(?:www\.)?clippituser\.tv/c/(?P<id>[a-z]+)' + _TEST = { + 'url': 'https://www.clippituser.tv/c/evmgm', + 'md5': '963ae7a59a2ec4572ab8bf2f2d2c5f09', + 'info_dict': { + 'id': 'evmgm', + 'ext': 'mp4', + 'title': 'Bye bye Brutus. #BattleBots - Clippit', + 'uploader': 'lizllove', + 'uploader_url': 'https://www.clippituser.tv/p/lizllove', + 'timestamp': 1472183818, + 'upload_date': '20160826', + 'description': 'BattleBots | ABC', + 'thumbnail': r're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<title.*>(.+?)', webpage, 'title') + + FORMATS = ('sd', 'hd') + quality = qualities(FORMATS) + formats = [] + for format_id in FORMATS: + url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id, + webpage, 'url', fatal=False) + if not url: + continue + match = re.search(r'/(?P\d+)\.mp4', url) + formats.append({ + 'url': url, + 'format_id': format_id, + 'quality': quality(format_id), + 'height': int(match.group('height')) if match else None, + }) + + uploader = self._html_search_regex(r'class="username".*>\s+(.+?)\n', + webpage, 'uploader', fatal=False) + uploader_url = ('https://www.clippituser.tv/p/' + uploader + if uploader else None) + + timestamp = self._html_search_regex(r'datetime="(.+?)"', + webpage, 'date', fatal=False) + thumbnail = self._html_search_regex(r'data-image="(.+?)"', + webpage, 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'uploader': uploader, + 'uploader_url': uploader_url, + 'timestamp': parse_iso8601(timestamp), + 'description': self._og_search_description(webpage), + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py deleted file mode 100644 index 6a41db87c..000000000 --- a/youtube_dl/extractor/collegerama.py +++ /dev/null @@ -1,93 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - sanitized_Request, -) - - -class CollegeRamaIE(InfoExtractor): - _VALID_URL = r'https?://collegerama\.tudelft\.nl/Mediasite/Play/(?P[\da-f]+)' - _TESTS = [ - { - 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d', - 'md5': '481fda1c11f67588c0d9d8fbdced4e39', - 'info_dict': { - 'id': '585a43626e544bdd97aeb71a0ec907a01d', - 'ext': 'mp4', - 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', - 'description': '', - 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', - 'duration': 7713.088, - 'timestamp': 1413309600, - 'upload_date': '20141014', - }, - }, - { - 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4', - 'md5': 'ef1fdded95bdf19b12c5999949419c92', - 'info_dict': { - 'id': '86a9ea9f53e149079fbdb4202b521ed21d', - 'ext': 'wmv', - 'title': '64ste Vakantiecursus: Afvalwater', - 'description': 'md5:7fd774865cc69d972f542b157c328305', - 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', - 'duration': 10853, - 'timestamp': 1326446400, - 'upload_date': '20120113', - }, - }, - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - - player_options_request = { - 'getPlayerOptionsRequest': { - 'ResourceId': video_id, - 'QueryString': '', - } - } - - request = sanitized_Request( - 'http://collegerama.tudelft.nl/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', - json.dumps(player_options_request)) - request.add_header('Content-Type', 'application/json') - - player_options = self._download_json(request, video_id) - - presentation = player_options['d']['Presentation'] - title = presentation['Title'] - description = presentation.get('Description') - thumbnail = None - duration = float_or_none(presentation.get('Duration'), 1000) - timestamp = int_or_none(presentation.get('UnixTime'), 1000) - - formats = [] - for stream in presentation['Streams']: - for video in stream['VideoUrls']: - thumbnail_url = stream.get('ThumbnailUrl') - if thumbnail_url: - thumbnail = 'http://collegerama.tudelft.nl' + thumbnail_url - format_id = video['MediaType'] - if format_id == 'SS': - continue - formats.append({ - 'url': video['Location'], - 'format_id': format_id, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 4cac29415..d08b909a6 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -120,13 +120,16 @@ class ComedyCentralTVIE(MTVServicesInfoExtractor): class ComedyCentralShortnameIE(InfoExtractor): - _VALID_URL = r'^:(?Ptds|thedailyshow)$' + _VALID_URL = r'^:(?Ptds|thedailyshow|theopposition)$' _TESTS = [{ 'url': ':tds', 'only_matching': True, }, { 'url': ':thedailyshow', 'only_matching': True, + }, { + 'url': ':theopposition', + 'only_matching': True, }] def _real_extract(self, url): @@ -134,5 +137,6 @@ class ComedyCentralShortnameIE(InfoExtractor): shortcut_map = { 'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', + 'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes', } return self.url_result(shortcut_map[video_id]) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 459e7ffd6..a9939b0fd 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -27,8 +27,12 @@ from ..compat import ( compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, + compat_xml_parse_error, +) +from ..downloader.f4m import ( + get_base_url, + remove_encrypted_media, ) -from ..downloader.f4m import remove_encrypted_media from ..utils import ( NO_DEFAULT, age_restricted, @@ -170,6 +174,8 @@ class InfoExtractor(object): width : height ratio as float. * no_resume The server does not support resuming the (HTTP or RTMP) download. Boolean. + * downloader_options A dictionary of downloader options as + described in FileDownloader url: Final video URL. ext: Video filename extension. @@ -297,8 +303,9 @@ class InfoExtractor(object): There must be a key "entries", which is a list, an iterable, or a PagedList object, each element of which is a valid dictionary by this specification. - Additionally, playlists can have "title", "description" and "id" attributes - with the same semantics as videos (see above). + Additionally, playlists can have "id", "title", "description", "uploader", + "uploader_id", "uploader_url" attributes with the same semantics as videos + (see above). _type "multi_video" indicates that there are multiple videos that @@ -490,6 +497,16 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) + + # Some sites check X-Forwarded-For HTTP header in order to figure out + # the origin of the client behind proxy. This allows bypassing geo + # restriction by faking this header's value to IP that belongs to some + # geo unrestricted country. We will do so once we encounter any + # geo restriction error. + if self._x_forwarded_for_ip: + if 'X-Forwarded-For' not in headers: + headers['X-Forwarded-For'] = self._x_forwarded_for_ip + if isinstance(url_or_request, compat_urllib_request.Request): url_or_request = update_Request( url_or_request, data=data, headers=headers, query=query) @@ -519,15 +536,6 @@ class InfoExtractor(object): if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - # Some sites check X-Forwarded-For HTTP header in order to figure out - # the origin of the client behind proxy. This allows bypassing geo - # restriction by faking this header's value to IP that belongs to some - # geo unrestricted country. We will do so once we encounter any - # geo restriction error. - if self._x_forwarded_for_ip: - if 'X-Forwarded-For' not in headers: - headers['X-Forwarded-For'] = self._x_forwarded_for_ip - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) if urlh is False: assert not fatal @@ -588,19 +596,11 @@ class InfoExtractor(object): if not encoding: encoding = self._guess_encoding_from_content(content_type, webpage_bytes) if self._downloader.params.get('dump_intermediate_pages', False): - try: - url = url_or_request.get_full_url() - except AttributeError: - url = url_or_request - self.to_screen('Dumping request to ' + url) + self.to_screen('Dumping request to ' + urlh.geturl()) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) if self._downloader.params.get('write_pages', False): - try: - url = url_or_request.get_full_url() - except AttributeError: - url = url_or_request - basen = '%s_%s' % (video_id, url) + basen = '%s_%s' % (video_id, urlh.geturl()) if len(basen) > 240: h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() basen = basen[:240 - len(h)] + h @@ -644,30 +644,68 @@ class InfoExtractor(object): content, _ = res return content - def _download_xml(self, url_or_request, video_id, - note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): - """Return the xml as an xml.etree.ElementTree.Element""" - xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) - if xml_string is False: - return xml_string - if transform_source: - xml_string = transform_source(xml_string) - return compat_etree_fromstring(xml_string.encode('utf-8')) - - def _download_json(self, url_or_request, video_id, - note='Downloading JSON metadata', - errnote='Unable to download JSON metadata', - transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}): - json_string = self._download_webpage( + def _download_xml_handle( + self, url_or_request, video_id, note='Downloading XML', + errnote='Unable to download XML', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}): + """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)""" + res = self._download_webpage_handle( url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) - if (not fatal) and json_string is False: - return None + if res is False: + return res + xml_string, urlh = res + return self._parse_xml( + xml_string, video_id, transform_source=transform_source, + fatal=fatal), urlh + + def _download_xml(self, url_or_request, video_id, + note='Downloading XML', errnote='Unable to download XML', + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}): + """Return the xml as an xml.etree.ElementTree.Element""" + res = self._download_xml_handle( + url_or_request, video_id, note=note, errnote=errnote, + transform_source=transform_source, fatal=fatal, encoding=encoding, + data=data, headers=headers, query=query) + return res if res is False else res[0] + + def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): + if transform_source: + xml_string = transform_source(xml_string) + try: + return compat_etree_fromstring(xml_string.encode('utf-8')) + except compat_xml_parse_error as ve: + errmsg = '%s: Failed to parse XML ' % video_id + if fatal: + raise ExtractorError(errmsg, cause=ve) + else: + self.report_warning(errmsg + str(ve)) + + def _download_json_handle( + self, url_or_request, video_id, note='Downloading JSON metadata', + errnote='Unable to download JSON metadata', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}): + """Return a tuple (JSON object, URL handle)""" + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding, data=data, headers=headers, query=query) + if res is False: + return res + json_string, urlh = res return self._parse_json( - json_string, video_id, transform_source=transform_source, fatal=fatal) + json_string, video_id, transform_source=transform_source, + fatal=fatal), urlh + + def _download_json( + self, url_or_request, video_id, note='Downloading JSON metadata', + errnote='Unable to download JSON metadata', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}): + res = self._download_json_handle( + url_or_request, video_id, note=note, errnote=errnote, + transform_source=transform_source, fatal=fatal, encoding=encoding, + data=data, headers=headers, query=query) + return res if res is False else res[0] def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): if transform_source: @@ -940,7 +978,8 @@ class InfoExtractor(object): def _family_friendly_search(self, html): # See http://schema.org/VideoObject - family_friendly = self._html_search_meta('isFamilyFriendly', html) + family_friendly = self._html_search_meta( + 'isFamilyFriendly', html, default=None) if not family_friendly: return None @@ -981,6 +1020,40 @@ class InfoExtractor(object): if isinstance(json_ld, dict): json_ld = [json_ld] + INTERACTION_TYPE_MAP = { + 'CommentAction': 'comment', + 'AgreeAction': 'like', + 'DisagreeAction': 'dislike', + 'LikeAction': 'like', + 'DislikeAction': 'dislike', + 'ListenAction': 'view', + 'WatchAction': 'view', + 'ViewAction': 'view', + } + + def extract_interaction_statistic(e): + interaction_statistic = e.get('interactionStatistic') + if not isinstance(interaction_statistic, list): + return + for is_e in interaction_statistic: + if not isinstance(is_e, dict): + continue + if is_e.get('@type') != 'InteractionCounter': + continue + interaction_type = is_e.get('interactionType') + if not isinstance(interaction_type, compat_str): + continue + interaction_count = int_or_none(is_e.get('userInteractionCount')) + if interaction_count is None: + continue + count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1]) + if not count_kind: + continue + count_key = '%s_count' % count_kind + if info.get(count_key) is not None: + continue + info[count_key] = interaction_count + def extract_video_object(e): assert e['@type'] == 'VideoObject' info.update({ @@ -996,9 +1069,10 @@ class InfoExtractor(object): 'height': int_or_none(e.get('height')), 'view_count': int_or_none(e.get('interactionCount')), }) + extract_interaction_statistic(e) for e in json_ld: - if e.get('@context') == 'http://schema.org': + if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')): item_type = e.get('@type') if expected_type is not None and expected_type != item_type: return info @@ -1014,7 +1088,7 @@ class InfoExtractor(object): part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type == 'Article': + elif item_type in ('Article', 'NewsArticle'): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), @@ -1223,11 +1297,8 @@ class InfoExtractor(object): media_nodes = remove_encrypted_media(media_nodes) if not media_nodes: return formats - base_url = xpath_text( - manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], - 'base URL', default=None) - if base_url: - base_url = base_url.strip() + + manifest_base_url = get_base_url(manifest) bootstrap_info = xpath_element( manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], @@ -1259,7 +1330,7 @@ class InfoExtractor(object): continue manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') - else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) + else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) # If media_url is itself a f4m manifest do the recursive extraction # since bitrates in parent manifest (this one) and media_url manifest # may differ leading to inability to resolve the format by requested @@ -1294,6 +1365,7 @@ class InfoExtractor(object): 'url': manifest_url, 'manifest_url': manifest_url, 'ext': 'flv' if bootstrap_info is not None else None, + 'protocol': 'f4m', 'tbr': tbr, 'width': width, 'height': height, @@ -1339,6 +1411,9 @@ class InfoExtractor(object): if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access return [] + if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay + return [] + formats = [] format_url = lambda u: ( @@ -1385,7 +1460,7 @@ class InfoExtractor(object): media_url = media.get('URI') if media_url: format_id = [] - for v in (group_id, name): + for v in (m3u8_id, group_id, name): if v: format_id.append(v) f = { @@ -1678,22 +1753,24 @@ class InfoExtractor(object): }) return subtitles - def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True): + def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True): xspf = self._download_xml( - playlist_url, playlist_id, 'Downloading xpsf playlist', + xspf_url, playlist_id, 'Downloading xpsf playlist', 'Unable to download xspf manifest', fatal=fatal) if xspf is False: return [] - return self._parse_xspf(xspf, playlist_id) + return self._parse_xspf( + xspf, playlist_id, xspf_url=xspf_url, + xspf_base_url=base_url(xspf_url)) - def _parse_xspf(self, playlist, playlist_id): + def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None): NS_MAP = { 'xspf': 'http://xspf.org/ns/0/', 's1': 'http://static.streamone.nl/player/ns/0', } entries = [] - for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): + for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): title = xpath_text( track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) description = xpath_text( @@ -1703,12 +1780,18 @@ class InfoExtractor(object): duration = float_or_none( xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) - formats = [{ - 'url': location.text, - 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), - 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), - 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), - } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] + formats = [] + for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)): + format_url = urljoin(xspf_base_url, location.text) + if not format_url: + continue + formats.append({ + 'url': format_url, + 'manifest_url': xspf_url, + 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), + 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), + 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), + }) self._sort_formats(formats) entries.append({ @@ -1722,18 +1805,18 @@ class InfoExtractor(object): return entries def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): - res = self._download_webpage_handle( + res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', fatal=fatal) if res is False: return [] - mpd, urlh = res + mpd_doc, urlh = res mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( - compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, + mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url, formats_dict=formats_dict, mpd_url=mpd_url) def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): @@ -1785,7 +1868,7 @@ class InfoExtractor(object): ms_info['timescale'] = int(timescale) segment_duration = source.get('duration') if segment_duration: - ms_info['segment_duration'] = int(segment_duration) + ms_info['segment_duration'] = float(segment_duration) def extract_Initialization(source): initialization = source.find(_add_ns('Initialization')) @@ -1866,6 +1949,7 @@ class InfoExtractor(object): 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 'format_note': 'DASH %s' % content_type, 'filesize': filesize, + 'container': mimetype2ext(mime_type) + '_dash', } f.update(parse_codecs(representation_attrib.get('codecs'))) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) @@ -1904,7 +1988,7 @@ class InfoExtractor(object): # can't be used at the same time if '%(Number' in media_template and 's' not in representation_ms_info: segment_duration = None - if 'total_number' not in representation_ms_info and 'segment_duration': + if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) representation_ms_info['fragments'] = [{ @@ -1963,6 +2047,22 @@ class InfoExtractor(object): }) segment_index += 1 representation_ms_info['fragments'] = fragments + elif 'segment_urls' in representation_ms_info: + # Segment URLs with no SegmentTimeline + # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 + # https://github.com/rg3/youtube-dl/pull/14844 + fragments = [] + segment_duration = float_or_none( + representation_ms_info['segment_duration'], + representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None + for segment_url in representation_ms_info['segment_urls']: + fragment = { + location_key(segment_url): segment_url, + } + if segment_duration: + fragment['duration'] = segment_duration + fragments.append(fragment) + representation_ms_info['fragments'] = fragments # NB: MPD manifest may contain direct URLs to unfragmented media. # No fragments key is present in this case. if 'fragments' in representation_ms_info: @@ -1977,32 +2077,29 @@ class InfoExtractor(object): f['url'] = initialization_url f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) - try: - existing_format = next( - fo for fo in formats - if fo['format_id'] == representation_id) - except StopIteration: - full_info = formats_dict.get(representation_id, {}).copy() - full_info.update(f) - formats.append(full_info) - else: - existing_format.update(f) + # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation + # is not necessarily unique within a Period thus formats with + # the same `format_id` are quite possible. There are numerous examples + # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111, + # https://github.com/rg3/youtube-dl/issues/13919) + full_info = formats_dict.get(representation_id, {}).copy() + full_info.update(f) + formats.append(full_info) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): - res = self._download_webpage_handle( + res = self._download_xml_handle( ism_url, video_id, note=note or 'Downloading ISM manifest', errnote=errnote or 'Failed to download ISM manifest', fatal=fatal) if res is False: return [] - ism, urlh = res + ism_doc, urlh = res - return self._parse_ism_formats( - compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id) + return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): """ @@ -2026,7 +2123,7 @@ class InfoExtractor(object): stream_timescale = int_or_none(stream.get('TimeScale')) or timescale stream_name = stream.get('Name') for track in stream.findall('QualityLevel'): - fourcc = track.get('FourCC') + fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None) # TODO: add support for WVC1 and WMAP if fourcc not in ('H264', 'AVC1', 'AACL'): self.report_warning('%s is not a supported codec' % fourcc) @@ -2100,8 +2197,8 @@ class InfoExtractor(object): return formats def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None): - def absolute_url(video_url): - return compat_urlparse.urljoin(base_url, video_url) + def absolute_url(item_url): + return urljoin(base_url, item_url) def parse_content_type(content_type): if not content_type: @@ -2114,19 +2211,19 @@ class InfoExtractor(object): return f return {} - def _media_formats(src, cur_media_type): + def _media_formats(src, cur_media_type, type_info={}): full_url = absolute_url(src) - ext = determine_ext(full_url) + ext = type_info.get('ext') or determine_ext(full_url) if ext == 'm3u8': is_plain_url = False formats = self._extract_m3u8_formats( full_url, video_id, ext='mp4', entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, - preference=preference) + preference=preference, fatal=False) elif ext == 'mpd': is_plain_url = False formats = self._extract_mpd_formats( - full_url, video_id, mpd_id=mpd_id) + full_url, video_id, mpd_id=mpd_id, fatal=False) else: is_plain_url = True formats = [{ @@ -2158,16 +2255,22 @@ class InfoExtractor(object): if src: _, formats = _media_formats(src, media_type) media_info['formats'].extend(formats) - media_info['thumbnail'] = media_attributes.get('poster') + media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) if media_content: for source_tag in re.findall(r']+>', media_content): source_attributes = extract_attributes(source_tag) src = source_attributes.get('src') if not src: continue - is_plain_url, formats = _media_formats(src, media_type) + f = parse_content_type(source_attributes.get('type')) + is_plain_url, formats = _media_formats(src, media_type, f) if is_plain_url: - f = parse_content_type(source_attributes.get('type')) + # res attribute is not standard but seen several times + # in the wild + f.update({ + 'height': int_or_none(source_attributes.get('res')), + 'format_id': source_attributes.get('label'), + }) f.update(formats[0]) media_info['formats'].append(f) else: @@ -2211,27 +2314,36 @@ class InfoExtractor(object): return formats def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): + query = compat_urlparse.urlparse(url).query url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) - url_base = self._search_regex( - r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') - http_base_url = '%s:%s' % ('http', url_base) + mobj = re.search( + r'(?:(?:http|rtmp|rtsp)(?Ps)?:)?(?P//[^?]+)', url) + url_base = mobj.group('url') + http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base) formats = [] + + def manifest_url(manifest): + m_url = '%s/%s' % (http_base_url, manifest) + if query: + m_url += '?%s' % query + return m_url + if 'm3u8' not in skip_protocols: formats.extend(self._extract_m3u8_formats( - http_base_url + '/playlist.m3u8', video_id, 'mp4', + manifest_url('playlist.m3u8'), video_id, 'mp4', m3u8_entry_protocol, m3u8_id='hls', fatal=False)) if 'f4m' not in skip_protocols: formats.extend(self._extract_f4m_formats( - http_base_url + '/manifest.f4m', + manifest_url('manifest.f4m'), video_id, f4m_id='hds', fatal=False)) if 'dash' not in skip_protocols: formats.extend(self._extract_mpd_formats( - http_base_url + '/manifest.mpd', + manifest_url('manifest.mpd'), video_id, mpd_id='dash', fatal=False)) if re.search(r'(?:/smil:|\.smil)', url_base): if 'smil' not in skip_protocols: rtmp_formats = self._extract_smil_formats( - http_base_url + '/jwplayer.smil', + manifest_url('jwplayer.smil'), video_id, fatal=False) for rtmp_format in rtmp_formats: rtsp_format = rtmp_format.copy() @@ -2300,7 +2412,6 @@ class InfoExtractor(object): formats = self._parse_jwplayer_formats( video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id, mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) - self._sort_formats(formats) subtitles = {} tracks = video_data.get('tracks') @@ -2308,7 +2419,10 @@ class InfoExtractor(object): for track in tracks: if not isinstance(track, dict): continue - if track.get('kind') != 'captions': + track_kind = track.get('kind') + if not track_kind or not isinstance(track_kind, compat_str): + continue + if track_kind.lower() not in ('captions', 'subtitles'): continue track_url = urljoin(base_url, track.get('file')) if not track_url: @@ -2317,16 +2431,25 @@ class InfoExtractor(object): 'url': self._proto_relative_url(track_url) }) - entries.append({ + entry = { 'id': this_video_id, - 'title': video_data['title'] if require_title else video_data.get('title'), + 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), 'description': video_data.get('description'), 'thumbnail': self._proto_relative_url(video_data.get('image')), 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, - 'formats': formats, - }) + } + # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 + if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): + entry.update({ + '_type': 'url_transparent', + 'url': formats[0]['url'], + }) + else: + self._sort_formats(formats) + entry['formats'] = formats + entries.append(entry) if len(entries) == 1: return entries[0] else: @@ -2353,7 +2476,7 @@ class InfoExtractor(object): formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif ext == 'mpd': + elif source_type == 'dash' or ext == 'mpd': formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) elif ext == 'smil': @@ -2427,10 +2550,12 @@ class InfoExtractor(object): self._downloader.report_warning(msg) return res - def _set_cookie(self, domain, name, value, expire_time=None): + def _set_cookie(self, domain, name, value, expire_time=None, port=None, + path='/', secure=False, discard=False, rest={}, **kwargs): cookie = compat_cookiejar.Cookie( - 0, name, value, None, None, domain, None, - None, '/', True, False, expire_time, '', None, None, None) + 0, name, value, port, port is not None, domain, True, + domain.startswith('.'), path, True, secure, expire_time, + discard, None, None, rest) self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 0c3f0c0e4..ed278fefc 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -116,16 +116,16 @@ class CondeNastIE(InfoExtractor): entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) - def _extract_video_params(self, webpage): - query = {} - params = self._search_regex( - r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) - if params: - query.update({ - 'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'), - 'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'), - 'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'), - }) + def _extract_video_params(self, webpage, display_id): + query = self._parse_json( + self._search_regex( + r'(?s)var\s+params\s*=\s*({.+?})[;,]', webpage, 'player params', + default='{}'), + display_id, transform_source=js_to_json, fatal=False) + if query: + query['videoId'] = self._search_regex( + r'(?:data-video-id=|currentVideoId\s*=\s*)["\']([\da-f]+)', + webpage, 'video id', default=None) else: params = extract_attributes(self._search_regex( r'(<[^>]+data-js="video-player"[^>]+>)', @@ -141,17 +141,27 @@ class CondeNastIE(InfoExtractor): video_id = params['videoId'] video_info = None - if params.get('playerId'): - info_page = self._download_json( - 'http://player.cnevids.com/player/video.js', - video_id, 'Downloading video info', fatal=False, query=params) - if info_page: - video_info = info_page.get('video') - if not video_info: - info_page = self._download_webpage( - 'http://player.cnevids.com/player/loader.js', - video_id, 'Downloading loader info', query=params) - else: + + # New API path + query = params.copy() + query['embedType'] = 'inline' + info_page = self._download_json( + 'http://player.cnevids.com/embed-api.json', video_id, + 'Downloading embed info', fatal=False, query=query) + + # Old fallbacks + if not info_page: + if params.get('playerId'): + info_page = self._download_json( + 'http://player.cnevids.com/player/video.js', video_id, + 'Downloading video info', fatal=False, query=params) + if info_page: + video_info = info_page.get('video') + if not video_info: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=params) + if not video_info: info_page = self._download_webpage( 'https://player.cnevids.com/inline/video/%s.js' % video_id, video_id, 'Downloading inline info', query={ @@ -215,7 +225,7 @@ class CondeNastIE(InfoExtractor): if url_type == 'series': return self._extract_series(url, webpage) else: - params = self._extract_video_params(webpage) + params = self._extract_video_params(webpage, display_id) info = self._search_json_ld( webpage, display_id, fatal=False) info.update(self._extract_video(params)) diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py index 94d03ce2a..f77a68ece 100644 --- a/youtube_dl/extractor/cracked.py +++ b/youtube_dl/extractor/cracked.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( parse_iso8601, str_to_int, @@ -41,11 +42,9 @@ class CrackedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - youtube_url = self._search_regex( - r']+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', - webpage, 'youtube url', default=None) + youtube_url = YoutubeIE._extract_url(webpage) if youtube_url: - return self.url_result(youtube_url, 'Youtube') + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) video_url = self._html_search_regex( [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'\d+)' _TEST = { - 'url': 'http://www.crackle.com/comedians-in-cars-getting-coffee/2498934', + # geo restricted to CA + 'url': 'https://www.crackle.com/andromeda/2502343', 'info_dict': { - 'id': '2498934', + 'id': '2502343', 'ext': 'mp4', - 'title': 'Everybody Respects A Bloody Nose', - 'description': 'Jerry is kaffeeklatsching in L.A. with funnyman J.B. Smoove (Saturday Night Live, Real Husbands of Hollywood). They’re headed for brew at 10 Speed Coffee in a 1964 Studebaker Avanti.', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 906, - 'series': 'Comedians In Cars Getting Coffee', - 'season_number': 8, - 'episode_number': 4, - 'subtitles': { - 'en-US': [ - {'ext': 'vtt'}, - {'ext': 'tt'}, - ] - }, + 'title': 'Under The Night', + 'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a', + 'duration': 2583, + 'view_count': int, + 'average_rating': 0, + 'age_limit': 14, + 'genre': 'Action, Sci-Fi', + 'creator': 'Allan Kroeker', + 'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe', + 'release_year': 2000, + 'series': 'Andromeda', + 'episode': 'Under The Night', + 'season_number': 1, + 'episode_number': 1, }, 'params': { # m3u8 download @@ -33,109 +47,118 @@ class CrackleIE(InfoExtractor): } } - _THUMBNAIL_RES = [ - (120, 90), - (208, 156), - (220, 124), - (220, 220), - (240, 180), - (250, 141), - (315, 236), - (320, 180), - (360, 203), - (400, 300), - (421, 316), - (460, 330), - (460, 460), - (462, 260), - (480, 270), - (587, 330), - (640, 480), - (700, 330), - (700, 394), - (854, 480), - (1024, 1024), - (1920, 1080), - ] - - # extracted from http://legacyweb-us.crackle.com/flash/ReferrerRedirect.ashx - _MEDIA_FILE_SLOTS = { - 'c544.flv': { - 'width': 544, - 'height': 306, - }, - '360p.mp4': { - 'width': 640, - 'height': 360, - }, - '480p.mp4': { - 'width': 852, - 'height': 478, - }, - '480p_1mbps.mp4': { - 'width': 852, - 'height': 478, - }, - } - def _real_extract(self, url): video_id = self._match_id(url) - config_doc = self._download_xml( - 'http://legacyweb-us.crackle.com/flash/QueryReferrer.ashx?site=16', - video_id, 'Downloading config') + country_code = self._downloader.params.get('geo_bypass_country', None) + countries = [country_code] if country_code else ( + 'US', 'AU', 'CA', 'AS', 'FM', 'GU', 'MP', 'PR', 'PW', 'MH', 'VI') - item = self._download_xml( - 'http://legacyweb-us.crackle.com/app/revamp/vidwallcache.aspx?flags=-1&fm=%s' % video_id, - video_id, headers=self.geo_verification_headers()).find('i') - title = item.attrib['t'] + last_e = None - subtitles = {} - formats = self._extract_m3u8_formats( - 'http://content.uplynk.com/ext/%s/%s.m3u8' % (config_doc.attrib['strUplynkOwnerId'], video_id), - video_id, 'mp4', m3u8_id='hls', fatal=None) - thumbnails = [] - path = item.attrib.get('p') - if path: - for width, height in self._THUMBNAIL_RES: - res = '%dx%d' % (width, height) - thumbnails.append({ - 'id': res, - 'url': 'http://images-us-am.crackle.com/%stnl_%s.jpg' % (path, res), - 'width': width, - 'height': height, - 'resolution': res, - }) - http_base_url = 'http://ahttp.crackle.com/' + path - for mfs_path, mfs_info in self._MEDIA_FILE_SLOTS.items(): - formats.append({ - 'url': http_base_url + mfs_path, - 'format_id': 'http-' + mfs_path.split('.')[0], - 'width': mfs_info['width'], - 'height': mfs_info['height'], - }) - for cc in item.findall('cc'): - locale = cc.attrib.get('l') - v = cc.attrib.get('v') - if locale and v: - if locale not in subtitles: - subtitles[locale] = [] - for url_ext, ext in (('vtt', 'vtt'), ('xml', 'tt')): - subtitles.setdefault(locale, []).append({ - 'url': '%s/%s%s_%s.%s' % (config_doc.attrib['strSubtitleServer'], path, locale, v, url_ext), - 'ext': ext, - }) - self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + for country in countries: + try: + media = self._download_json( + 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s' + % (video_id, country), video_id, + 'Downloading media JSON as %s' % country, + 'Unable to download media JSON', query={ + 'disableProtocols': 'true', + 'format': 'json' + }) + except ExtractorError as e: + # 401 means geo restriction, trying next country + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + last_e = e + continue + raise - return { - 'id': video_id, - 'title': title, - 'description': item.attrib.get('d'), - 'duration': int(item.attrib.get('r'), 16) / 1000 if item.attrib.get('r') else None, - 'series': item.attrib.get('sn'), - 'season_number': int_or_none(item.attrib.get('se')), - 'episode_number': int_or_none(item.attrib.get('ep')), - 'thumbnails': thumbnails, - 'subtitles': subtitles, - 'formats': formats, - } + media_urls = media.get('MediaURLs') + if not media_urls or not isinstance(media_urls, list): + continue + + title = media['Title'] + + formats = [] + for e in media['MediaURLs']: + if e.get('UseDRM') is True: + continue + format_url = e.get('Path') + if not format_url or not isinstance(format_url, compat_str): + continue + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + description = media.get('Description') + duration = int_or_none(media.get( + 'DurationInSeconds')) or parse_duration(media.get('Duration')) + view_count = int_or_none(media.get('CountViews')) + average_rating = float_or_none(media.get('UserRating')) + age_limit = parse_age_limit(media.get('Rating')) + genre = media.get('Genre') + release_year = int_or_none(media.get('ReleaseYear')) + creator = media.get('Directors') + artist = media.get('Cast') + + if media.get('MediaTypeDisplayValue') == 'Full Episode': + series = media.get('ShowName') + episode = title + season_number = int_or_none(media.get('Season')) + episode_number = int_or_none(media.get('Episode')) + else: + series = episode = season_number = episode_number = None + + subtitles = {} + cc_files = media.get('ClosedCaptionFiles') + if isinstance(cc_files, list): + for cc_file in cc_files: + if not isinstance(cc_file, dict): + continue + cc_url = cc_file.get('Path') + if not cc_url or not isinstance(cc_url, compat_str): + continue + lang = cc_file.get('Locale') or 'en' + subtitles.setdefault(lang, []).append({'url': cc_url}) + + thumbnails = [] + images = media.get('Images') + if isinstance(images, list): + for image_key, image_url in images.items(): + mobj = re.search(r'Img_(\d+)[xX](\d+)', image_key) + if not mobj: + continue + thumbnails.append({ + 'url': image_url, + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'age_limit': age_limit, + 'genre': genre, + 'creator': creator, + 'artist': artist, + 'release_year': release_year, + 'series': series, + 'episode': episode, + 'season_number': season_number, + 'episode_number': episode_number, + 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'formats': formats, + } + + raise last_e diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 8bdaf0c2c..3efdc8c21 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals import re import json -import base64 import zlib from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( + compat_b64decode, compat_etree_fromstring, compat_urllib_parse_urlencode, compat_urllib_request, @@ -38,11 +38,32 @@ class CrunchyrollBaseIE(InfoExtractor): _LOGIN_FORM = 'login_form' _NETRC_MACHINE = 'crunchyroll' + def _call_rpc_api(self, method, video_id, note=None, data=None): + data = data or {} + data['req'] = 'RpcApi' + method + data = compat_urllib_parse_urlencode(data).encode('utf-8') + return self._download_xml( + 'http://www.crunchyroll.com/xml/', + video_id, note, fatal=False, data=data, headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + def _login(self): (username, password) = self._get_login_info() if username is None: return + self._download_webpage( + 'https://www.crunchyroll.com/?a=formhandler', + None, 'Logging in', 'Wrong login info', + data=urlencode_postdata({ + 'formname': 'RpcApiUser_Login', + 'next_url': 'https://www.crunchyroll.com/acct/membership', + 'name': username, + 'password': password, + })) + + ''' login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') @@ -86,6 +107,7 @@ class CrunchyrollBaseIE(InfoExtractor): raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') + ''' def _real_initialize(self): self._login() @@ -250,8 +272,8 @@ class CrunchyrollIE(CrunchyrollBaseIE): } def _decrypt_subtitles(self, data, iv, id): - data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) - iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8'))) + data = bytes_to_intlist(compat_b64decode(data)) + iv = bytes_to_intlist(compat_b64decode(iv)) id = int(id) def obfuscate_key_aux(count, modulo, start): @@ -365,15 +387,19 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text def _get_subtitles(self, video_id, webpage): subtitles = {} for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage): - sub_page = self._download_webpage( - 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, - video_id, note='Downloading subtitles for ' + sub_name) - id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) - iv = self._search_regex(r'([^<]+)', sub_page, 'subtitle_iv', fatal=False) - data = self._search_regex(r'([^<]+)', sub_page, 'subtitle_data', fatal=False) - if not id or not iv or not data: + sub_doc = self._call_rpc_api( + 'Subtitle_GetXml', video_id, + 'Downloading subtitles for ' + sub_name, data={ + 'subtitle_script_id': sub_id, + }) + if sub_doc is None: continue - subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') + sid = sub_doc.get('id') + iv = xpath_text(sub_doc, 'iv', 'subtitle iv') + data = xpath_text(sub_doc, 'data', 'subtitle data') + if not sid or not iv or not data: + continue + subtitle = self._decrypt_subtitles(data, iv, sid).decode('utf-8') lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) if not lang_code: continue @@ -444,65 +470,79 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text for fmt in available_fmts: stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' - streamdata_req = sanitized_Request( - 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' - % (video_id, stream_format, stream_quality), - compat_urllib_parse_urlencode({'current_page': url}).encode('utf-8')) - streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - streamdata = self._download_xml( - streamdata_req, video_id, - note='Downloading media info for %s' % video_format) - stream_info = streamdata.find('./{default}preload/stream_info') - video_encode_id = xpath_text(stream_info, './video_encode_id') - if video_encode_id in video_encode_ids: - continue - video_encode_ids.append(video_encode_id) + stream_infos = [] + streamdata = self._call_rpc_api( + 'VideoPlayer_GetStandardConfig', video_id, + 'Downloading media info for %s' % video_format, data={ + 'media_id': video_id, + 'video_format': stream_format, + 'video_quality': stream_quality, + 'current_page': url, + }) + if streamdata is not None: + stream_info = streamdata.find('./{default}preload/stream_info') + if stream_info is not None: + stream_infos.append(stream_info) + stream_info = self._call_rpc_api( + 'VideoEncode_GetStreamInfo', video_id, + 'Downloading stream info for %s' % video_format, data={ + 'media_id': video_id, + 'video_format': stream_format, + 'video_encode_quality': stream_quality, + }) + if stream_info is not None: + stream_infos.append(stream_info) + for stream_info in stream_infos: + video_encode_id = xpath_text(stream_info, './video_encode_id') + if video_encode_id in video_encode_ids: + continue + video_encode_ids.append(video_encode_id) - video_file = xpath_text(stream_info, './file') - if not video_file: - continue - if video_file.startswith('http'): - formats.extend(self._extract_m3u8_formats( - video_file, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - - video_url = xpath_text(stream_info, './host') - if not video_url: - continue - metadata = stream_info.find('./metadata') - format_info = { - 'format': video_format, - 'format_id': video_format, - 'height': int_or_none(xpath_text(metadata, './height')), - 'width': int_or_none(xpath_text(metadata, './width')), - } - - if '.fplive.net/' in video_url: - video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) - parsed_video_url = compat_urlparse.urlparse(video_url) - direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( - netloc='v.lvlt.crcdn.net', - path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) - if self._is_valid_url(direct_video_url, video_id, video_format): - format_info.update({ - 'url': direct_video_url, - }) - formats.append(format_info) + video_file = xpath_text(stream_info, './file') + if not video_file: + continue + if video_file.startswith('http'): + formats.extend(self._extract_m3u8_formats( + video_file, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) continue - format_info.update({ - 'url': video_url, - 'play_path': video_file, - 'ext': 'flv', - }) - formats.append(format_info) - self._sort_formats(formats) + video_url = xpath_text(stream_info, './host') + if not video_url: + continue + metadata = stream_info.find('./metadata') + format_info = { + 'format': video_format, + 'height': int_or_none(xpath_text(metadata, './height')), + 'width': int_or_none(xpath_text(metadata, './width')), + } - metadata = self._download_xml( - 'http://www.crunchyroll.com/xml', video_id, - note='Downloading media info', query={ - 'req': 'RpcApiVideoPlayer_GetMediaMetadata', + if '.fplive.net/' in video_url: + video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) + parsed_video_url = compat_urlparse.urlparse(video_url) + direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( + netloc='v.lvlt.crcdn.net', + path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) + if self._is_valid_url(direct_video_url, video_id, video_format): + format_info.update({ + 'format_id': 'http-' + video_format, + 'url': direct_video_url, + }) + formats.append(format_info) + continue + + format_info.update({ + 'format_id': 'rtmp-' + video_format, + 'url': video_url, + 'play_path': video_file, + 'ext': 'flv', + }) + formats.append(format_info) + self._sort_formats(formats, ('height', 'width', 'tbr', 'fps')) + + metadata = self._call_rpc_api( + 'VideoPlayer_GetMediaMetadata', video_id, + note='Downloading media info', data={ 'media_id': video_id, }) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 171820e27..67d6df4b0 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -4,13 +4,14 @@ import re from .common import InfoExtractor from ..utils import ( - int_or_none, - unescapeHTML, - find_xpath_attr, - smuggle_url, determine_ext, ExtractorError, extract_attributes, + find_xpath_attr, + get_element_by_class, + int_or_none, + smuggle_url, + unescapeHTML, ) from .senateisvp import SenateISVPIE from .ustream import UstreamIE @@ -68,6 +69,10 @@ class CSpanIE(InfoExtractor): 'uploader': 'HouseCommittee', 'uploader_id': '12987475', }, + }, { + # Audio Only + 'url': 'https://www.c-span.org/video/?437336-1/judiciary-antitrust-competition-policy-consumer-rights', + 'only_matching': True, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' @@ -111,7 +116,15 @@ class CSpanIE(InfoExtractor): title = self._og_search_title(webpage) surl = smuggle_url(senate_isvp_url, {'force_title': title}) return self.url_result(surl, 'SenateISVP', video_id, title) + video_id = self._search_regex( + r'jwsetup\.clipprog\s*=\s*(\d+);', + webpage, 'jwsetup program id', default=None) + if video_id: + video_type = 'program' if video_type is None or video_id is None: + error_message = get_element_by_class('VLplayer-error-message', webpage) + if error_message: + raise ExtractorError(error_message) raise ExtractorError('unable to find video id and type') def get_text_attr(d, attr): @@ -138,7 +151,7 @@ class CSpanIE(InfoExtractor): entries = [] for partnum, f in enumerate(files): formats = [] - for quality in f['qualities']: + for quality in f.get('qualities', []): formats.append({ 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), 'url': unescapeHTML(get_text_attr(quality, 'file')), diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 74e991331..0e7d587dd 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -235,7 +235,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): # vevo embed vevo_id = self._search_regex( - r'[\w]*)', + r'[\w]*)', webpage, 'vevo embed', default=None) if vevo_id: return self.url_result('vevo:%s' % vevo_id, 'Vevo') @@ -325,7 +325,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P.+?)/' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P[^/?#&]+)' _MORE_PAGES_INDICATOR = r'(?s)
.*?[^/?]+)' % _VALID_URL_PREFIX - _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX - - _TESTS = [{ - # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html - # Tested at FranceTvInfo_2 - 'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1', - 'only_matching': True, - }, { - # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html - 'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1', - 'only_matching': True, - }] - - @classmethod - def _extract_dmcloud_url(cls, webpage): - mobj = re.search(r']+src=[\'"](%s)[\'"]' % cls._VALID_EMBED_URL, webpage) - if mobj: - return mobj.group(1) - - mobj = re.search( - r']+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % cls._VALID_EMBED_URL, - webpage) - if mobj: - return mobj.group(1) - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage_no_ff(url, video_id) - - title = self._html_search_regex(r'([^>]+)', webpage, 'title') - - video_info = self._parse_json(self._search_regex( - r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id) - - # TODO: parse ios_url, which is in fact a manifest - video_url = video_info['mp4_url'] - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'thumbnail': video_info.get('thumbnail_url'), - } diff --git a/youtube_dl/extractor/daisuki.py b/youtube_dl/extractor/daisuki.py index 58cc98666..dbc1aa5d4 100644 --- a/youtube_dl/extractor/daisuki.py +++ b/youtube_dl/extractor/daisuki.py @@ -10,36 +10,34 @@ from ..aes import ( aes_cbc_decrypt, aes_cbc_encrypt, ) +from ..compat import compat_b64decode from ..utils import ( bytes_to_intlist, bytes_to_long, - clean_html, + extract_attributes, ExtractorError, intlist_to_bytes, - get_element_by_id, js_to_json, int_or_none, long_to_bytes, pkcs1pad, - remove_end, ) -class DaisukiIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?daisuki\.net/[^/]+/[^/]+/[^/]+/watch\.[^.]+\.(?P\d+)\.html' +class DaisukiMottoIE(InfoExtractor): + _VALID_URL = r'https?://motto\.daisuki\.net/framewatch/embed/[^/]+/(?P[0-9a-zA-Z]{3})' _TEST = { - 'url': 'http://www.daisuki.net/tw/en/anime/watch.TheIdolMasterCG.11213.html', + 'url': 'http://motto.daisuki.net/framewatch/embed/embedDRAGONBALLSUPERUniverseSurvivalsaga/V2e/760/428', 'info_dict': { - 'id': '11213', + 'id': 'V2e', 'ext': 'mp4', - 'title': '#01 Who is in the pumpkin carriage? - THE IDOLM@STER CINDERELLA GIRLS', + 'title': '#117 SHOWDOWN OF LOVE! ANDROIDS VS UNIVERSE 2!!', 'subtitles': { 'mul': [{ 'ext': 'ttml', }], }, - 'creator': 'BANDAI NAMCO Entertainment', }, 'params': { 'skip_download': True, # AES-encrypted HLS stream @@ -73,15 +71,17 @@ class DaisukiIE(InfoExtractor): n, e = self._RSA_KEY encrypted_aeskey = long_to_bytes(pow(bytes_to_long(padded_aeskey), e, n)) - init_data = self._download_json('http://www.daisuki.net/bin/bgn/init', video_id, query={ - 's': flashvars.get('s', ''), - 'c': flashvars.get('ss3_prm', ''), - 'e': url, - 'd': base64.b64encode(intlist_to_bytes(aes_cbc_encrypt( - bytes_to_intlist(json.dumps(data)), - aes_key, iv))).decode('ascii'), - 'a': base64.b64encode(encrypted_aeskey).decode('ascii'), - }, note='Downloading JSON metadata' + (' (try #%d)' % (idx + 1) if idx > 0 else '')) + init_data = self._download_json( + 'http://motto.daisuki.net/fastAPI/bgn/init/', + video_id, query={ + 's': flashvars.get('s', ''), + 'c': flashvars.get('ss3_prm', ''), + 'e': url, + 'd': base64.b64encode(intlist_to_bytes(aes_cbc_encrypt( + bytes_to_intlist(json.dumps(data)), + aes_key, iv))).decode('ascii'), + 'a': base64.b64encode(encrypted_aeskey).decode('ascii'), + }, note='Downloading JSON metadata' + (' (try #%d)' % (idx + 1) if idx > 0 else '')) if 'rtn' in init_data: encrypted_rtn = init_data['rtn'] @@ -94,18 +94,15 @@ class DaisukiIE(InfoExtractor): rtn = self._parse_json( intlist_to_bytes(aes_cbc_decrypt(bytes_to_intlist( - base64.b64decode(encrypted_rtn)), + compat_b64decode(encrypted_rtn)), aes_key, iv)).decode('utf-8').rstrip('\0'), video_id) + title = rtn['title_str'] + formats = self._extract_m3u8_formats( rtn['play_url'], video_id, ext='mp4', entry_protocol='m3u8_native') - title = remove_end(self._og_search_title(webpage), ' - DAISUKI') - - creator = self._html_search_regex( - r'Creator\s*:\s*([^<]+)', webpage, 'creator', fatal=False) - subtitles = {} caption_url = rtn.get('caption_url') if caption_url: @@ -120,21 +117,18 @@ class DaisukiIE(InfoExtractor): 'title': title, 'formats': formats, 'subtitles': subtitles, - 'creator': creator, } -class DaisukiPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)daisuki\.net/[^/]+/[^/]+/[^/]+/detail\.(?P[a-zA-Z0-9]+)\.html' +class DaisukiMottoPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://motto\.daisuki\.net/(?Pinformation)/' _TEST = { - 'url': 'http://www.daisuki.net/tw/en/anime/detail.TheIdolMasterCG.html', + 'url': 'http://motto.daisuki.net/information/', 'info_dict': { - 'id': 'TheIdolMasterCG', - 'title': 'THE IDOLM@STER CINDERELLA GIRLS', - 'description': 'md5:0f2c028a9339f7a2c7fbf839edc5c5d8', + 'title': 'DRAGON BALL SUPER', }, - 'playlist_count': 26, + 'playlist_mincount': 117, } def _real_extract(self, url): @@ -142,18 +136,19 @@ class DaisukiPlaylistIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - episode_pattern = r'''(?sx) - ]+delay="[^"]+/(\d+)/movie\.jpg".+? - ]+class=".*?\bepisodeNumber\b.*?">(?:]+>)?([^<]+)''' - entries = [{ - '_type': 'url_transparent', - 'url': url.replace('detail', 'watch').replace('.html', '.' + movie_id + '.html'), - 'episode_id': episode_id, - 'episode_number': int_or_none(episode_id), - } for movie_id, episode_id in re.findall(episode_pattern, webpage)] + entries = [] + for li in re.findall(r'(]+?data-product_id="[a-zA-Z0-9]{3}"[^>]+>)', webpage): + attr = extract_attributes(li) + ad_id = attr.get('data-ad_id') + product_id = attr.get('data-product_id') + if ad_id and product_id: + episode_id = attr.get('data-chapter') + entries.append({ + '_type': 'url_transparent', + 'url': 'http://motto.daisuki.net/framewatch/embed/%s/%s/760/428' % (ad_id, product_id), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode_id), + 'ie_key': 'DaisukiMotto', + }) - playlist_title = remove_end( - self._og_search_title(webpage, fatal=False), ' - Anime - DAISUKI') - playlist_description = clean_html(get_element_by_id('synopsisTxt', webpage)) - - return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + return self.playlist_result(entries, playlist_title='DRAGON BALL SUPER') diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 00fbbff2f..3a6d0560e 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -2,53 +2,85 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate +from ..compat import compat_str +from ..utils import ( + float_or_none, + unified_strdate, +) class DctpTvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(#/)?filme/(?P.+?)/$' + _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P[^/?#&]+)' _TEST = { 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', - 'md5': '174dd4a8a6225cf5655952f969cfbe24', 'info_dict': { 'id': '95eaa4f33dad413aa17b4ee613cccc6c', 'display_id': 'videoinstallation-fuer-eine-kaufhausfassade', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Videoinstallation für eine Kaufhausfassade', 'description': 'Kurzfilm', 'upload_date': '20110407', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 71.24, + }, + 'params': { + # rtmp download + 'skip_download': True, }, } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) - object_id = self._html_search_meta('DC.identifier', webpage) + webpage = self._download_webpage(url, display_id) - servers_json = self._download_json( - 'http://www.dctp.tv/elastic_streaming_client/get_streaming_server/', - video_id, note='Downloading server list') - server = servers_json[0]['server'] - m3u8_path = self._search_regex( - r'\'([^\'"]+/playlist\.m3u8)"', webpage, 'm3u8 path') - formats = self._extract_m3u8_formats( - 'http://%s%s' % (server, m3u8_path), video_id, ext='mp4', - entry_protocol='m3u8_native') + video_id = self._html_search_meta( + 'DC.identifier', webpage, 'video id', + default=None) or self._search_regex( + r'id=["\']uuid[^>]+>([^<]+)<', webpage, 'video id') title = self._og_search_title(webpage) + + servers = self._download_json( + 'http://www.dctp.tv/streaming_servers/', display_id, + note='Downloading server list', fatal=False) + + if servers: + endpoint = next( + server['endpoint'] + for server in servers + if isinstance(server.get('endpoint'), compat_str) and + 'cloudfront' in server['endpoint']) + else: + endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/' + + app = self._search_regex( + r'^rtmpe?://[^/]+/(?P.*)$', endpoint, 'app') + + formats = [{ + 'url': endpoint, + 'app': app, + 'play_path': 'mp4:%s_dctp_0500_4x3.m4v' % video_id, + 'page_url': url, + 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-109.swf', + 'ext': 'flv', + }] + description = self._html_search_meta('DC.description', webpage) upload_date = unified_strdate( self._html_search_meta('DC.date.created', webpage)) thumbnail = self._og_search_thumbnail(webpage) + duration = float_or_none(self._search_regex( + r'id=["\']duration_in_ms[^+]>(\d+)', webpage, 'duration', + default=None), scale=1000) return { - 'id': object_id, + 'id': video_id, 'title': title, 'formats': formats, - 'display_id': video_id, + 'display_id': display_id, 'description': description, 'upload_date': upload_date, 'thumbnail': thumbnail, + 'duration': duration, } diff --git a/youtube_dl/extractor/deezer.py b/youtube_dl/extractor/deezer.py index ec87b94db..a38b2683d 100644 --- a/youtube_dl/extractor/deezer.py +++ b/youtube_dl/extractor/deezer.py @@ -19,7 +19,7 @@ class DeezerPlaylistIE(InfoExtractor): 'id': '176747451', 'title': 'Best!', 'uploader': 'Anonymous', - 'thumbnail': r're:^https?://cdn-images.deezer.com/images/cover/.*\.jpg$', + 'thumbnail': r're:^https?://cdn-images\.deezer\.com/images/cover/.*\.jpg$', }, 'playlist_count': 30, 'skip': 'Only available in .de', diff --git a/youtube_dl/extractor/digg.py b/youtube_dl/extractor/digg.py new file mode 100644 index 000000000..913c1750f --- /dev/null +++ b/youtube_dl/extractor/digg.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import js_to_json + + +class DiggIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?digg\.com/video/(?P[^/?#&]+)' + _TESTS = [{ + # JWPlatform via provider + 'url': 'http://digg.com/video/sci-fi-short-jonah-daniel-kaluuya-get-out', + 'info_dict': { + 'id': 'LcqvmS0b', + 'ext': 'mp4', + 'title': "'Get Out' Star Daniel Kaluuya Goes On 'Moby Dick'-Like Journey In Sci-Fi Short 'Jonah'", + 'description': 'md5:541bb847648b6ee3d6514bc84b82efda', + 'upload_date': '20180109', + 'timestamp': 1515530551, + }, + 'params': { + 'skip_download': True, + }, + }, { + # Youtube via provider + 'url': 'http://digg.com/video/dog-boat-seal-play', + 'only_matching': True, + }, { + # vimeo as regular embed + 'url': 'http://digg.com/video/dream-girl-short-film', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + info = self._parse_json( + self._search_regex( + r'(?s)video_info\s*=\s*({.+?});\n', webpage, 'video info', + default='{}'), display_id, transform_source=js_to_json, + fatal=False) + + video_id = info.get('video_id') + + if video_id: + provider = info.get('provider_name') + if provider == 'youtube': + return self.url_result( + video_id, ie='Youtube', video_id=video_id) + elif provider == 'jwplayer': + return self.url_result( + 'jwplatform:%s' % video_id, ie='JWPlatform', + video_id=video_id) + + return self.url_result(url, 'Generic') diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 55853f76f..91449dcd8 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -1,15 +1,20 @@ from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import ( - parse_duration, - parse_iso8601, -) +import random +import re +import string + +from .discoverygo import DiscoveryGoBaseIE from ..compat import compat_str +from ..utils import ( + ExtractorError, + try_get, +) +from ..compat import compat_HTTPError -class DiscoveryIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?: +class DiscoveryIE(DiscoveryGoBaseIE): + _VALID_URL = r'''(?x)https?://(?:www\.)?(?P discovery| investigationdiscovery| discoverylife| @@ -19,79 +24,64 @@ class DiscoveryIE(InfoExtractor): sciencechannel| tlc| velocity - )\.com/(?:[^/]+/)*(?P[^./?#]+)''' + )\.com(?P/tv-shows/[^/]+/(?:video|full-episode)s/(?P[^./?#]+))''' _TESTS = [{ - 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', + 'url': 'https://www.discovery.com/tv-shows/cash-cab/videos/dave-foley', 'info_dict': { - 'id': '20769', + 'id': '5a2d9b4d6b66d17a5026e1fd', 'ext': 'mp4', - 'title': 'Mission Impossible Outtakes', - 'description': ('Watch Jamie Hyneman and Adam Savage practice being' - ' each other -- to the point of confusing Jamie\'s dog -- and ' - 'don\'t miss Adam moon-walking as Jamie ... behind Jamie\'s' - ' back.'), - 'duration': 156, - 'timestamp': 1302032462, - 'upload_date': '20110405', - 'uploader_id': '103207', + 'title': 'Dave Foley', + 'description': 'md5:4b39bcafccf9167ca42810eb5f28b01f', + 'duration': 608, }, 'params': { 'skip_download': True, # requires ffmpeg } }, { - 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons', - 'info_dict': { - 'id': 'mythbusters-the-simpsons', - 'title': 'MythBusters: The Simpsons', - }, - 'playlist_mincount': 10, - }, { - 'url': 'http://www.animalplanet.com/longfin-eels-maneaters/', - 'info_dict': { - 'id': '78326', - 'ext': 'mp4', - 'title': 'Longfin Eels: Maneaters?', - 'description': 'Jeremy Wade tests whether or not New Zealand\'s longfin eels are man-eaters by covering himself in fish guts and getting in the water with them.', - 'upload_date': '20140725', - 'timestamp': 1406246400, - 'duration': 116, - 'uploader_id': '103207', - }, - 'params': { - 'skip_download': True, # requires ffmpeg - } + 'url': 'https://www.investigationdiscovery.com/tv-shows/final-vision/full-episodes/final-vision', + 'only_matching': True, }] + _GEO_COUNTRIES = ['US'] + _GEO_BYPASS = False def _real_extract(self, url): - display_id = self._match_id(url) - info = self._download_json(url + '?flat=1', display_id) + site, path, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) - video_title = info.get('playlist_title') or info.get('video_title') + react_data = self._parse_json(self._search_regex( + r'window\.__reactTransmitPacket\s*=\s*({.+?});', + webpage, 'react data'), display_id) + content_blocks = react_data['layout'][path]['contentBlocks'] + video = next(cb for cb in content_blocks if cb.get('type') == 'video')['content']['items'][0] + video_id = video['id'] - entries = [] + access_token = self._download_json( + 'https://www.%s.com/anonymous' % site, display_id, query={ + 'authRel': 'authorization', + 'client_id': try_get( + react_data, lambda x: x['application']['apiClientId'], + compat_str) or '3020a40c2356a645b4b4', + 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + 'redirectUri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html?https://www.%s.com' % site, + })['access_token'] - for idx, video_info in enumerate(info['playlist']): - subtitles = {} - caption_url = video_info.get('captionsUrl') - if caption_url: - subtitles = { - 'en': [{ - 'url': caption_url, - }] - } + try: + stream = self._download_json( + 'https://api.discovery.com/v1/streaming/video/' + video_id, + display_id, headers={ + 'Authorization': 'Bearer ' + access_token, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + e_description = self._parse_json( + e.cause.read().decode(), display_id)['description'] + if 'resource not available for country' in e_description: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + if 'Authorized Networks' in e_description: + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. You may want to use --cookies.', expected=True) + raise ExtractorError(e_description) + raise - entries.append({ - '_type': 'url_transparent', - 'url': 'http://players.brightcove.net/103207/default_default/index.html?videoId=ref:%s' % video_info['referenceId'], - 'id': compat_str(video_info['id']), - 'title': video_info['title'], - 'description': video_info.get('description'), - 'duration': parse_duration(video_info.get('video_length')), - 'webpage_url': video_info.get('href') or video_info.get('url'), - 'thumbnail': video_info.get('thumbnailURL'), - 'alt_title': video_info.get('secondary_title'), - 'timestamp': parse_iso8601(video_info.get('publishedDate')), - 'subtitles': subtitles, - }) - - return self.playlist_result(entries, display_id, video_title) + return self._extract_video_info(video, stream, display_id) diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py index 7cd5d4291..3368c4c07 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + determine_ext, extract_attributes, ExtractorError, int_or_none, @@ -27,42 +28,9 @@ class DiscoveryGoBaseIE(InfoExtractor): velocitychannel )go\.com/%s(?P[^/?#&]+)''' - -class DiscoveryGoIE(DiscoveryGoBaseIE): - _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+' - _GEO_COUNTRIES = ['US'] - _TEST = { - 'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/', - 'info_dict': { - 'id': '58c167d86b66d12f2addeb01', - 'ext': 'mp4', - 'title': 'Reaper Madness', - 'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78', - 'duration': 2519, - 'series': 'Bering Sea Gold', - 'season_number': 8, - 'episode_number': 6, - 'age_limit': 14, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - container = extract_attributes( - self._search_regex( - r'(]+class=["\']video-player-container[^>]+>)', - webpage, 'video container')) - - video = self._parse_json( - container.get('data-video') or container.get('data-json'), - display_id) - + def _extract_video_info(self, video, stream, display_id): title = video['name'] - stream = video.get('stream') if not stream: if video.get('authenticated') is True: raise ExtractorError( @@ -106,7 +74,11 @@ class DiscoveryGoIE(DiscoveryGoBaseIE): not subtitle_url.startswith('http')): continue lang = caption.get('fileLang', 'en') - subtitles.setdefault(lang, []).append({'url': subtitle_url}) + ext = determine_ext(subtitle_url) + subtitles.setdefault(lang, []).append({ + 'url': subtitle_url, + 'ext': 'ttml' if ext == 'xml' else ext, + }) return { 'id': video_id, @@ -124,6 +96,43 @@ class DiscoveryGoIE(DiscoveryGoBaseIE): } +class DiscoveryGoIE(DiscoveryGoBaseIE): + _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+' + _GEO_COUNTRIES = ['US'] + _TEST = { + 'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/', + 'info_dict': { + 'id': '58c167d86b66d12f2addeb01', + 'ext': 'mp4', + 'title': 'Reaper Madness', + 'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78', + 'duration': 2519, + 'series': 'Bering Sea Gold', + 'season_number': 8, + 'episode_number': 6, + 'age_limit': 14, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + container = extract_attributes( + self._search_regex( + r'(]+class=["\']video-player-container[^>]+>)', + webpage, 'video container')) + + video = self._parse_json( + container.get('data-video') or container.get('data-json'), + display_id) + + stream = video.get('stream') + + return self._extract_video_info(video, stream, display_id) + + class DiscoveryGoPlaylistIE(DiscoveryGoBaseIE): _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % '' _TEST = { diff --git a/youtube_dl/extractor/disney.py b/youtube_dl/extractor/disney.py index 968c4c7fd..0eee82fd6 100644 --- a/youtube_dl/extractor/disney.py +++ b/youtube_dl/extractor/disney.py @@ -10,6 +10,7 @@ from ..utils import ( compat_str, determine_ext, ExtractorError, + update_url_query, ) @@ -108,9 +109,16 @@ class DisneyIE(InfoExtractor): continue tbr = int_or_none(flavor.get('bitrate')) if tbr == 99999: - formats.extend(self._extract_m3u8_formats( + # wrong ks(Kaltura Signature) causes 404 Error + flavor_url = update_url_query(flavor_url, {'ks': ''}) + m3u8_formats = self._extract_m3u8_formats( flavor_url, video_id, 'mp4', - m3u8_id=flavor_format, fatal=False)) + m3u8_id=flavor_format, fatal=False) + for f in m3u8_formats: + # Apple FairPlay + if '/fpshls/' in f['url']: + continue + formats.append(f) continue format_id = [] if flavor_format: diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 76e784105..b73446773 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -12,25 +12,28 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + determine_ext, ExtractorError, + float_or_none, int_or_none, remove_end, try_get, unified_strdate, + unified_timestamp, update_url_query, USER_AGENTS, ) class DPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?Pwww\.dplay\.(?:dk|se|no))/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://(?Pwww\.(?Pdplay\.(?Pdk|se|no)))/(?:video(?:er|s)/)?(?P[^/]+/[^/?#]+)' _TESTS = [{ # non geo restricted, via secure api, unsigned download hls URL 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/', 'info_dict': { 'id': '3172', - 'display_id': 'season-1-svensken-lar-sig-njuta-av-livet', + 'display_id': 'nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet', 'ext': 'mp4', 'title': 'Svensken lär sig njuta av livet', 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8', @@ -48,7 +51,7 @@ class DPlayIE(InfoExtractor): 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/', 'info_dict': { 'id': '70816', - 'display_id': 'season-6-episode-12', + 'display_id': 'mig-og-min-mor/season-6-episode-12', 'ext': 'mp4', 'title': 'Episode 12', 'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90', @@ -65,6 +68,33 @@ class DPlayIE(InfoExtractor): # geo restricted, via direct unsigned hls URL 'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/', 'only_matching': True, + }, { + # disco-api + 'url': 'https://www.dplay.no/videoer/i-kongens-klr/sesong-1-episode-7', + 'info_dict': { + 'id': '40206', + 'display_id': 'i-kongens-klr/sesong-1-episode-7', + 'ext': 'mp4', + 'title': 'Episode 7', + 'description': 'md5:e3e1411b2b9aebeea36a6ec5d50c60cf', + 'duration': 2611.16, + 'timestamp': 1516726800, + 'upload_date': '20180123', + 'series': 'I kongens klær', + 'season_number': 1, + 'episode_number': 7, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + + 'url': 'https://www.dplay.dk/videoer/singleliv/season-5-episode-3', + 'only_matching': True, + }, { + 'url': 'https://www.dplay.se/videos/sofias-anglar/sofias-anglar-1001', + 'only_matching': True, }] def _real_extract(self, url): @@ -72,10 +102,81 @@ class DPlayIE(InfoExtractor): display_id = mobj.group('id') domain = mobj.group('domain') + self._initialize_geo_bypass([mobj.group('country').upper()]) + webpage = self._download_webpage(url, display_id) video_id = self._search_regex( - r'data-video-id=["\'](\d+)', webpage, 'video id') + r'data-video-id=["\'](\d+)', webpage, 'video id', default=None) + + if not video_id: + host = mobj.group('host') + disco_base = 'https://disco-api.%s' % host + self._download_json( + '%s/token' % disco_base, display_id, 'Downloading token', + query={ + 'realm': host.replace('.', ''), + }) + video = self._download_json( + '%s/content/videos/%s' % (disco_base, display_id), display_id, + headers={ + 'Referer': url, + 'x-disco-client': 'WEB:UNKNOWN:dplay-client:0.0.1', + }, query={ + 'include': 'show' + }) + video_id = video['data']['id'] + info = video['data']['attributes'] + title = info['name'] + formats = [] + for format_id, format_dict in self._download_json( + '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), + display_id)['data']['attributes']['streaming'].items(): + if not isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url: + continue + ext = determine_ext(format_url) + if format_id == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, display_id, mpd_id='dash', fatal=False)) + elif format_id == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + + series = None + try: + included = video.get('included') + if isinstance(included, list): + show = next(e for e in included if e.get('type') == 'show') + series = try_get( + show, lambda x: x['attributes']['name'], compat_str) + except StopIteration: + pass + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': info.get('description'), + 'duration': float_or_none( + info.get('videoDuration'), scale=1000), + 'timestamp': unified_timestamp(info.get('publishStart')), + 'series': series, + 'season_number': int_or_none(info.get('seasonNumber')), + 'episode_number': int_or_none(info.get('episodeNumber')), + 'age_limit': int_or_none(info.get('minimum_age')), + 'formats': formats, + } info = self._download_json( 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 9a498d72a..ffbd2623d 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -2,26 +2,26 @@ from __future__ import unicode_literals import itertools +import json -from .amp import AMPIE +from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_str, compat_urlparse, ) from ..utils import ( - ExtractorError, clean_html, + ExtractorError, int_or_none, - remove_end, - sanitized_Request, - urlencode_postdata + parse_age_limit, + parse_duration, + unified_timestamp, ) -class DramaFeverBaseIE(AMPIE): - _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' +class DramaFeverBaseIE(InfoExtractor): _NETRC_MACHINE = 'dramafever' - _GEO_COUNTRIES = ['US', 'CA'] _CONSUMER_SECRET = 'DA59dtVXYLxajktV' @@ -38,8 +38,8 @@ class DramaFeverBaseIE(AMPIE): 'consumer secret', default=self._CONSUMER_SECRET) def _real_initialize(self): - self._login() self._consumer_secret = self._get_consumer_secret() + self._login() def _login(self): (username, password) = self._get_login_info() @@ -51,37 +51,49 @@ class DramaFeverBaseIE(AMPIE): 'password': password, } - request = sanitized_Request( - self._LOGIN_URL, urlencode_postdata(login_form)) - response = self._download_webpage( - request, None, 'Logging in as %s' % username) + try: + response = self._download_json( + 'https://www.dramafever.com/api/users/login', None, 'Logging in', + data=json.dumps(login_form).encode('utf-8'), headers={ + 'x-consumer-key': self._consumer_secret, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (403, 404): + response = self._parse_json( + e.cause.read().decode('utf-8'), None) + else: + raise - if all(logout_pattern not in response - for logout_pattern in ['href="/accounts/logout/"', '>Log out<']): - error = self._html_search_regex( - r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<', - response, 'error message', default=None) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise ExtractorError('Unable to log in') + # Successful login + if response.get('result') or response.get('guid') or response.get('user_guid'): + return + + errors = response.get('errors') + if errors and isinstance(errors, list): + error = errors[0] + message = error.get('message') or error['reason'] + raise ExtractorError('Unable to login: %s' % message, expected=True) + raise ExtractorError('Unable to log in') class DramaFeverIE(DramaFeverBaseIE): IE_NAME = 'dramafever' _VALID_URL = r'https?://(?:www\.)?dramafever\.com/(?:[^/]+/)?drama/(?P[0-9]+/[0-9]+)(?:/|$)' _TESTS = [{ - 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', + 'url': 'https://www.dramafever.com/drama/4274/1/Heirs/', 'info_dict': { - 'id': '4512.1', - 'ext': 'flv', - 'title': 'Cooking with Shin', - 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0', + 'id': '4274.1', + 'ext': 'wvm', + 'title': 'Heirs - Episode 1', + 'description': 'md5:362a24ba18209f6276e032a651c50bc2', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 3783, + 'timestamp': 1381354993, + 'upload_date': '20131009', + 'series': 'Heirs', + 'season_number': 1, 'episode': 'Episode 1', 'episode_number': 1, - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1404336058, - 'upload_date': '20140702', - 'duration': 344, }, 'params': { # m3u8 download @@ -110,50 +122,95 @@ class DramaFeverIE(DramaFeverBaseIE): 'only_matching': True, }] + def _call_api(self, path, video_id, note, fatal=False): + return self._download_json( + 'https://www.dramafever.com/api/5/' + path, + video_id, note=note, headers={ + 'x-consumer-key': self._consumer_secret, + }, fatal=fatal) + + def _get_subtitles(self, video_id): + subtitles = {} + subs = self._call_api( + 'video/%s/subtitles/webvtt/' % video_id, video_id, + 'Downloading subtitles JSON', fatal=False) + if not subs or not isinstance(subs, list): + return subtitles + for sub in subs: + if not isinstance(sub, dict): + continue + sub_url = sub.get('url') + if not sub_url or not isinstance(sub_url, compat_str): + continue + subtitles.setdefault( + sub.get('code') or sub.get('language') or 'en', []).append({ + 'url': sub_url + }) + return subtitles + def _real_extract(self, url): video_id = self._match_id(url).replace('/', '.') - try: - info = self._extract_feed_info( - 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - self.raise_geo_restricted( - msg='Currently unavailable in your country', - countries=self._GEO_COUNTRIES) - raise - - # title is postfixed with video id for some reason, removing - if info.get('title'): - info['title'] = remove_end(info['title'], video_id).strip() - series_id, episode_number = video_id.split('.') - episode_info = self._download_json( - # We only need a single episode info, so restricting page size to one episode - # and dealing with page number as with episode number - r'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_number=%s&page_size=1' - % (self._consumer_secret, series_id, episode_number), - video_id, 'Downloading episode info JSON', fatal=False) - if episode_info: - value = episode_info.get('value') - if isinstance(value, list): - for v in value: - if v.get('type') == 'Episode': - subfile = v.get('subfile') or v.get('new_subfile') - if subfile and subfile != 'http://www.dramafever.com/st/': - info.setdefault('subtitles', {}).setdefault('English', []).append({ - 'ext': 'srt', - 'url': subfile, - }) - episode_number = int_or_none(v.get('number')) - episode_fallback = 'Episode' - if episode_number: - episode_fallback += ' %d' % episode_number - info['episode'] = v.get('title') or episode_fallback - info['episode_number'] = episode_number - break - return info + video = self._call_api( + 'series/%s/episodes/%s/' % (series_id, episode_number), video_id, + 'Downloading video JSON') + + formats = [] + download_assets = video.get('download_assets') + if download_assets and isinstance(download_assets, dict): + for format_id, format_dict in download_assets.items(): + if not isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url or not isinstance(format_url, compat_str): + continue + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'filesize': int_or_none(video.get('filesize')), + }) + + stream = self._call_api( + 'video/%s/stream/' % video_id, video_id, 'Downloading stream JSON', + fatal=False) + if stream: + stream_url = stream.get('stream_url') + if stream_url: + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + title = video.get('title') or 'Episode %s' % episode_number + description = video.get('description') + thumbnail = video.get('thumbnail') + timestamp = unified_timestamp(video.get('release_date')) + duration = parse_duration(video.get('duration')) + age_limit = parse_age_limit(video.get('tv_rating')) + series = video.get('series_title') + season_number = int_or_none(video.get('season')) + + if series: + title = '%s - %s' % (series, title) + + subtitles = self.extract_subtitles(video_id) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'series': series, + 'season_number': season_number, + 'episode_number': int_or_none(episode_number), + 'formats': formats, + 'subtitles': subtitles, + } class DramaFeverSeriesIE(DramaFeverBaseIE): diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index c5d56a9ad..5c41c8022 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -10,7 +10,7 @@ from ..utils import ( class DrTuberIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?drtuber\.com/(?:video|embed)/(?P\d+)(?:/(?P[\w-]+))?' + _VALID_URL = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P\d+)(?:/(?P[\w-]+))?' _TESTS = [{ 'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf', 'md5': '93e680cf2536ad0dfb7e74d94a89facd', @@ -28,6 +28,9 @@ class DrTuberIE(InfoExtractor): }, { 'url': 'http://www.drtuber.com/embed/489939', 'only_matching': True, + }, { + 'url': 'http://m.drtuber.com/video/3893529/lingerie-blowjob-from-beautiful-teen', + 'only_matching': True, }] @staticmethod @@ -63,7 +66,9 @@ class DrTuberIE(InfoExtractor): self._sort_formats(formats) title = self._html_search_regex( - (r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<', + (r']+class=["\']title[^>]+>([^<]+)', + r'([^<]+)\s*@\s+DrTuber', + r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<', r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'), webpage, 'title') diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 69effba58..f757745ba 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -138,6 +138,7 @@ class DRTVIE(InfoExtractor): 'tbr': int_or_none(bitrate), 'ext': link.get('FileFormat'), 'vcodec': 'none' if kind == 'AudioResource' else None, + 'preference': preference, }) subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py index c9fc9b5a9..be2e3d378 100644 --- a/youtube_dl/extractor/dumpert.py +++ b/youtube_dl/extractor/dumpert.py @@ -1,10 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import re from .common import InfoExtractor +from ..compat import compat_b64decode from ..utils import ( qualities, sanitized_Request, @@ -42,7 +42,7 @@ class DumpertIE(InfoExtractor): r'data-files="([^"]+)"', webpage, 'data files') files = self._parse_json( - base64.b64decode(files_base64.encode('utf-8')).decode('utf-8'), + compat_b64decode(files_base64).decode('utf-8'), video_id) quality = qualities(['flv', 'mobile', 'tablet', '720p']) diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index e85c58bd5..3f760888e 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -32,7 +32,7 @@ class DVTVIE(InfoExtractor): }, { 'url': 'http://video.aktualne.cz/dvtv/dvtv-16-12-2014-utok-talibanu-boj-o-kliniku-uprchlici/r~973eb3bc854e11e498be002590604f2e/', 'info_dict': { - 'title': 'DVTV 16. 12. 2014: útok Talibanu, boj o kliniku, uprchlíci', + 'title': r're:^DVTV 16\. 12\. 2014: útok Talibanu, boj o kliniku, uprchlíci', 'id': '973eb3bc854e11e498be002590604f2e', }, 'playlist': [{ @@ -91,10 +91,24 @@ class DVTVIE(InfoExtractor): }, { 'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/', 'only_matching': True, + }, { + 'url': 'https://video.aktualne.cz/dvtv/babis-a-zeman-nesou-vinu-za-to-ze-nemame-jasno-v-tom-kdo-bud/r~026afb54fad711e79704ac1f6b220ee8/', + 'md5': '87defe16681b1429c91f7a74809823c6', + 'info_dict': { + 'id': 'f5ae72f6fad611e794dbac1f6b220ee8', + 'ext': 'mp4', + 'title': 'Babiš a Zeman nesou vinu za to, že nemáme jasno v tom, kdo bude vládnout, říká Pekarová Adamová', + }, + 'params': { + 'skip_download': True, + }, }] - def _parse_video_metadata(self, js, video_id): + def _parse_video_metadata(self, js, video_id, live_js=None): data = self._parse_json(js, video_id, transform_source=js_to_json) + if live_js: + data.update(self._parse_json( + live_js, video_id, transform_source=js_to_json)) title = unescapeHTML(data['title']) @@ -142,13 +156,18 @@ class DVTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) + # live content + live_item = self._search_regex( + r'(?s)embedData[0-9a-f]{32}\.asset\.liveStarter\s*=\s*(\{.+?\});', + webpage, 'video', default=None) + # single video item = self._search_regex( r'(?s)embedData[0-9a-f]{32}\[["\']asset["\']\]\s*=\s*(\{.+?\});', - webpage, 'video', default=None, fatal=False) + webpage, 'video', default=None) if item: - return self._parse_video_metadata(item, video_id) + return self._parse_video_metadata(item, video_id, live_item) # playlist items = re.findall( diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index e4a3046af..edabaafe6 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + determine_ext, int_or_none, try_get, unified_timestamp, @@ -17,7 +19,7 @@ class EggheadCourseIE(InfoExtractor): 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', 'playlist_count': 29, 'info_dict': { - 'id': 'professor-frisby-introduces-composable-functional-javascript', + 'id': '72', 'title': 'Professor Frisby Introduces Composable Functional JavaScript', 'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$', }, @@ -26,14 +28,28 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - course = self._download_json( - 'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id) + lessons = self._download_json( + 'https://egghead.io/api/v1/series/%s/lessons' % playlist_id, + playlist_id, 'Downloading course lessons JSON') - entries = [ - self.url_result( - 'wistia:%s' % lesson['wistia_id'], ie='Wistia', - video_id=lesson['wistia_id'], video_title=lesson.get('title')) - for lesson in course['lessons'] if lesson.get('wistia_id')] + entries = [] + for lesson in lessons: + lesson_url = lesson.get('http_url') + if not lesson_url or not isinstance(lesson_url, compat_str): + continue + lesson_id = lesson.get('id') + if lesson_id: + lesson_id = compat_str(lesson_id) + entries.append(self.url_result( + lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) + + course = self._download_json( + 'https://egghead.io/api/v1/series/%s' % playlist_id, + playlist_id, 'Downloading course JSON', fatal=False) or {} + + playlist_id = course.get('id') + if playlist_id: + playlist_id = compat_str(playlist_id) return self.playlist_result( entries, playlist_id, course.get('title'), @@ -43,11 +59,12 @@ class EggheadCourseIE(InfoExtractor): class EggheadLessonIE(InfoExtractor): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' - _VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)' - _TEST = { + _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)' + _TESTS = [{ 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', 'info_dict': { - 'id': 'fv5yotjxcg', + 'id': '1196', + 'display_id': 'javascript-linear-data-flow-with-container-style-types-box', 'ext': 'mp4', 'title': 'Create linear data flow with container style types (Box)', 'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e', @@ -60,25 +77,51 @@ class EggheadLessonIE(InfoExtractor): }, 'params': { 'skip_download': True, + 'format': 'bestvideo', }, - } + }, { + 'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application', + 'only_matching': True, + }] def _real_extract(self, url): - lesson_id = self._match_id(url) + display_id = self._match_id(url) lesson = self._download_json( - 'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id) + 'https://egghead.io/api/v1/lessons/%s' % display_id, display_id) + + lesson_id = compat_str(lesson['id']) + title = lesson['title'] + + formats = [] + for _, format_url in lesson['media_urls'].items(): + if not format_url or not isinstance(format_url, compat_str): + continue + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, lesson_id, 'mp4', entry_protocol='m3u8', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, lesson_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, + }) + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'ie_key': 'Wistia', - 'url': 'wistia:%s' % lesson['wistia_id'], - 'id': lesson['wistia_id'], - 'title': lesson.get('title'), + 'id': lesson_id, + 'display_id': display_id, + 'title': title, 'description': lesson.get('summary'), 'thumbnail': lesson.get('thumb_nail'), 'timestamp': unified_timestamp(lesson.get('published_at')), 'duration': int_or_none(lesson.get('duration')), 'view_count': int_or_none(lesson.get('plays_count')), 'tags': try_get(lesson, lambda x: x['tag_list'], list), + 'series': try_get( + lesson, lambda x: x['series']['title'], compat_str), + 'formats': formats, } diff --git a/youtube_dl/extractor/einthusan.py b/youtube_dl/extractor/einthusan.py index 3f6268637..4485bf8c1 100644 --- a/youtube_dl/extractor/einthusan.py +++ b/youtube_dl/extractor/einthusan.py @@ -1,13 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import json from .common import InfoExtractor from ..compat import ( - compat_urlparse, + compat_b64decode, compat_str, + compat_urlparse, ) from ..utils import ( extract_attributes, @@ -36,9 +36,9 @@ class EinthusanIE(InfoExtractor): # reversed from jsoncrypto.prototype.decrypt() in einthusan-PGMovieWatcher.js def _decrypt(self, encrypted_data, video_id): - return self._parse_json(base64.b64decode(( + return self._parse_json(compat_b64decode(( encrypted_data[:10] + encrypted_data[-1] + encrypted_data[12:-1] - ).encode('ascii')).decode('utf-8'), video_id) + )).decode('utf-8'), video_id) def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/ellentube.py b/youtube_dl/extractor/ellentube.py new file mode 100644 index 000000000..544473274 --- /dev/null +++ b/youtube_dl/extractor/ellentube.py @@ -0,0 +1,133 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + float_or_none, + int_or_none, + try_get, +) + + +class EllenTubeBaseIE(InfoExtractor): + def _extract_data_config(self, webpage, video_id): + details = self._search_regex( + r'(<[^>]+\bdata-component=(["\'])[Dd]etails.+?></div>)', webpage, + 'details') + return self._parse_json( + extract_attributes(details)['data-config'], video_id) + + def _extract_video(self, data, video_id): + title = data['title'] + + formats = [] + duration = None + for entry in data.get('media'): + if entry.get('id') == 'm3u8': + formats = self._extract_m3u8_formats( + entry['url'], video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + duration = int_or_none(entry.get('duration')) + break + self._sort_formats(formats) + + def get_insight(kind): + return int_or_none(try_get( + data, lambda x: x['insight']['%ss' % kind])) + + return { + 'extractor_key': EllenTubeIE.ie_key(), + 'id': video_id, + 'title': title, + 'description': data.get('description'), + 'duration': duration, + 'thumbnail': data.get('thumbnail'), + 'timestamp': float_or_none(data.get('publishTime'), scale=1000), + 'view_count': get_insight('view'), + 'like_count': get_insight('like'), + 'formats': formats, + } + + +class EllenTubeIE(EllenTubeBaseIE): + _VALID_URL = r'''(?x) + (?: + ellentube:| + https://api-prod\.ellentube\.com/ellenapi/api/item/ + ) + (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + ''' + _TESTS = [{ + 'url': 'https://api-prod.ellentube.com/ellenapi/api/item/0822171c-3829-43bf-b99f-d77358ae75e3', + 'md5': '2fabc277131bddafdd120e0fc0f974c9', + 'info_dict': { + 'id': '0822171c-3829-43bf-b99f-d77358ae75e3', + 'ext': 'mp4', + 'title': 'Ellen Meets Las Vegas Survivors Jesus Campos and Stephen Schuck', + 'description': 'md5:76e3355e2242a78ad9e3858e5616923f', + 'thumbnail': r're:^https?://.+?', + 'duration': 514, + 'timestamp': 1508505120, + 'upload_date': '20171020', + 'view_count': int, + 'like_count': int, + } + }, { + 'url': 'ellentube:734a3353-f697-4e79-9ca9-bfc3002dc1e0', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._download_json( + 'https://api-prod.ellentube.com/ellenapi/api/item/%s' % video_id, + video_id) + return self._extract_video(data, video_id) + + +class EllenTubeVideoIE(EllenTubeBaseIE): + _VALID_URL = r'https?://(?:www\.)?ellentube\.com/video/(?P<id>.+?)\.html' + _TEST = { + 'url': 'https://www.ellentube.com/video/ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck.html', + 'only_matching': True, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._extract_data_config(webpage, display_id)['id'] + return self.url_result( + 'ellentube:%s' % video_id, ie=EllenTubeIE.ie_key(), + video_id=video_id) + + +class EllenTubePlaylistIE(EllenTubeBaseIE): + _VALID_URL = r'https?://(?:www\.)?ellentube\.com/(?:episode|studios)/(?P<id>.+?)\.html' + _TESTS = [{ + 'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html', + 'info_dict': { + 'id': 'dax-shepard-jordan-fisher-haim', + 'title': "Dax Shepard, 'DWTS' Team Jordan Fisher & Lindsay Arnold, HAIM", + 'description': 'md5:bfc982194dabb3f4e325e43aa6b2e21c', + }, + 'playlist_count': 6, + }, { + 'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + data = self._extract_data_config(webpage, display_id)['data'] + feed = self._download_json( + 'https://api-prod.ellentube.com/ellenapi/api/feed/?%s' + % data['filter'], display_id) + entries = [ + self._extract_video(elem, elem['id']) + for elem in feed if elem.get('type') == 'VIDEO' and elem.get('id')] + return self.playlist_result( + entries, display_id, data.get('title'), + clean_html(data.get('description'))) diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py deleted file mode 100644 index e0a13dd76..000000000 --- a/youtube_dl/extractor/ellentv.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from .kaltura import KalturaIE -from ..utils import NO_DEFAULT - - -class EllenTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)' - _TESTS = [{ - 'url': 'http://www.ellentv.com/videos/0-ipq1gsai/', - 'md5': '4294cf98bc165f218aaa0b89e0fd8042', - 'info_dict': { - 'id': '0_ipq1gsai', - 'ext': 'mov', - 'title': 'Fast Fingers of Fate', - 'description': 'md5:3539013ddcbfa64b2a6d1b38d910868a', - 'timestamp': 1428035648, - 'upload_date': '20150403', - 'uploader_id': 'batchUser', - }, - }, { - # not available via http://widgets.ellentube.com/ - 'url': 'http://www.ellentv.com/videos/1-szkgu2m2/', - 'info_dict': { - 'id': '1_szkgu2m2', - 'ext': 'flv', - 'title': "Ellen's Amazingly Talented Audience", - 'description': 'md5:86ff1e376ff0d717d7171590e273f0a5', - 'timestamp': 1255140900, - 'upload_date': '20091010', - 'uploader_id': 'ellenkaltura@gmail.com', - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - URLS = ('http://widgets.ellentube.com/videos/%s' % video_id, url) - - for num, url_ in enumerate(URLS, 1): - webpage = self._download_webpage( - url_, video_id, fatal=num == len(URLS)) - - default = NO_DEFAULT if num == len(URLS) else None - - partner_id = self._search_regex( - r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id', - default=default) - - kaltura_id = self._search_regex( - [r'id="kaltura_player_([^"]+)"', - r"_wb_entry_id\s*:\s*'([^']+)", - r'data-kaltura-entry-id="([^"]+)'], - webpage, 'kaltura id', default=default) - - if partner_id and kaltura_id: - break - - return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), KalturaIE.ie_key()) - - -class EllenTVClipsIE(InfoExtractor): - IE_NAME = 'EllenTV:clips' - _VALID_URL = r'https?://(?:www\.)?ellentv\.com/episodes/(?P<id>[a-z0-9_-]+)' - _TEST = { - 'url': 'http://www.ellentv.com/episodes/meryl-streep-vanessa-hudgens/', - 'info_dict': { - 'id': 'meryl-streep-vanessa-hudgens', - 'title': 'Meryl Streep, Vanessa Hudgens', - }, - 'playlist_mincount': 5, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - playlist = self._extract_playlist(webpage, playlist_id) - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': self._og_search_title(webpage), - 'entries': self._extract_entries(playlist) - } - - def _extract_playlist(self, webpage, playlist_id): - json_string = self._search_regex(r'playerView.addClips\(\[\{(.*?)\}\]\);', webpage, 'json') - return self._parse_json('[{' + json_string + '}]', playlist_id) - - def _extract_entries(self, playlist): - return [ - self.url_result( - 'kaltura:%s:%s' % (item['kaltura_partner_id'], item['kaltura_entry_id']), - KalturaIE.ie_key(), video_id=item['kaltura_entry_id']) - for item in playlist] diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index f3734e9f8..81f2e2ee1 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -15,7 +15,7 @@ from ..utils import ( class EpornerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' + _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:hd-porn|embed)/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' _TESTS = [{ 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', 'md5': '39d486f046212d8e1b911c52ab4691f8', @@ -35,6 +35,9 @@ class EpornerIE(InfoExtractor): }, { 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', 'only_matching': True, + }, { + 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 7a7436068..127c69b2e 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -1,6 +1,9 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from .once import OnceIE from ..compat import compat_str from ..utils import ( determine_ext, @@ -9,22 +12,27 @@ from ..utils import ( ) -class ESPNIE(InfoExtractor): +class ESPNIE(OnceIE): _VALID_URL = r'''(?x) https?:// - (?: - (?:(?:\w+\.)+)?espn\.go| - (?:www\.)?espn - )\.com/ (?: (?: - video/clip| - watch/player - ) - (?: - \?.*?\bid=| - /_/id/ - ) + (?: + (?:(?:\w+\.)+)?espn\.go| + (?:www\.)?espn + )\.com/ + (?: + (?: + video/(?:clip|iframe/twitter)| + watch/player + ) + (?: + .*?\?.*?\bid=| + /_/id/ + ) + ) + )| + (?:www\.)espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/ ) (?P<id>\d+) ''' @@ -77,6 +85,15 @@ class ESPNIE(InfoExtractor): }, { 'url': 'http://www.espn.com/video/clip/_/id/17989860', 'only_matching': True, + }, { + 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', + 'only_matching': True, + }, { + 'url': 'http://www.espnfc.us/video/espn-fc-tv/86/video/3319154/nashville-unveiled-as-the-newest-club-in-mls', + 'only_matching': True, + }, { + 'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets', + 'only_matching': True, }] def _real_extract(self, url): @@ -93,7 +110,9 @@ class ESPNIE(InfoExtractor): def traverse_source(source, base_source_id=None): for source_id, source in source.items(): - if isinstance(source, compat_str): + if source_id == 'alert': + continue + elif isinstance(source, compat_str): extract_source(source, base_source_id) elif isinstance(source, dict): traverse_source( @@ -106,7 +125,9 @@ class ESPNIE(InfoExtractor): return format_urls.add(source_url) ext = determine_ext(source_url) - if ext == 'smil': + if OnceIE.suitable(source_url): + formats.extend(self._extract_once_formats(source_url)) + elif ext == 'smil': formats.extend(self._extract_smil_formats( source_url, video_id, fatal=False)) elif ext == 'f4m': @@ -117,12 +138,24 @@ class ESPNIE(InfoExtractor): source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=source_id, fatal=False)) else: - formats.append({ + f = { 'url': source_url, 'format_id': source_id, - }) + } + mobj = re.search(r'(\d+)p(\d+)_(\d+)k\.', source_url) + if mobj: + f.update({ + 'height': int(mobj.group(1)), + 'fps': int(mobj.group(2)), + 'tbr': int(mobj.group(3)), + }) + if source_id == 'mezzanine': + f['preference'] = 1 + formats.append(f) - traverse_source(clip['links']['source']) + links = clip.get('links', {}) + traverse_source(links.get('source', {})) + traverse_source(links.get('mobile', {})) self._sort_formats(formats) description = clip.get('caption') or clip.get('description') @@ -144,9 +177,6 @@ class ESPNIE(InfoExtractor): class ESPNArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/(?:[^/]+/)*(?P<id>[^/]+)' _TESTS = [{ - 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', - 'only_matching': True, - }, { 'url': 'http://espn.go.com/nba/recap?gameId=400793786', 'only_matching': True, }, { @@ -175,3 +205,34 @@ class ESPNArticleIE(InfoExtractor): return self.url_result( 'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key()) + + +class FiveThirtyEightIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fivethirtyeight\.com/features/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://fivethirtyeight.com/features/how-the-6-8-raiders-can-still-make-the-playoffs/', + 'info_dict': { + 'id': '21846851', + 'ext': 'mp4', + 'title': 'FiveThirtyEight: The Raiders can still make the playoffs', + 'description': 'Neil Paine breaks down the simplest scenario that will put the Raiders into the playoffs at 8-8.', + 'timestamp': 1513960621, + 'upload_date': '20171222', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'data-video-id=["\'](?P<id>\d+)', + webpage, 'video id', group='id') + + return self.url_result( + 'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key()) diff --git a/youtube_dl/extractor/etonline.py b/youtube_dl/extractor/etonline.py deleted file mode 100644 index 17d7cfec6..000000000 --- a/youtube_dl/extractor/etonline.py +++ /dev/null @@ -1,39 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class ETOnlineIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?etonline\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.etonline.com/tv/211130_dove_cameron_liv_and_maddie_emotional_episode_series_finale/', - 'info_dict': { - 'id': '211130_dove_cameron_liv_and_maddie_emotional_episode_series_finale', - 'title': 'md5:a21ec7d3872ed98335cbd2a046f34ee6', - 'description': 'md5:8b94484063f463cca709617c79618ccd', - }, - 'playlist_count': 2, - }, { - 'url': 'http://www.etonline.com/media/video/here_are_the_stars_who_love_bringing_their_moms_as_dates_to_the_oscars-211359/', - 'only_matching': True, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911076001/default_default/index.html?videoId=ref:%s' - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - entries = [ - self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % video_id, 'BrightcoveNew', video_id) - for video_id in re.findall( - r'site\.brightcove\s*\([^,]+,\s*["\'](title_\d+)', webpage)] - - return self.playlist_result( - entries, playlist_id, - self._og_search_title(webpage, fatal=False), - self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7ca2cfd19..f90a549b3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -31,20 +31,19 @@ from .aenetworks import ( AENetworksIE, HistoryTopicIE, ) -from .afreecatv import ( - AfreecaTVIE, - AfreecaTVGlobalIE, -) +from .afreecatv import AfreecaTVIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE from .amcnetworks import AMCNetworksIE +from .americastestkitchen import AmericasTestKitchenIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE from .anvato import AnvatoIE from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE +from .aliexpress import AliExpressLiveIE from .aparat import AparatIE from .appleconnect import AppleConnectIE from .appletrailers import ( @@ -128,7 +127,10 @@ from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bostonglobe import BostonGlobeIE from .bpb import BpbIE -from .br import BRIE +from .br import ( + BRIE, + BRMediathekIE, +) from .bravotv import BravoTVIE from .breakcom import BreakIE from .brightcove import ( @@ -136,10 +138,7 @@ from .brightcove import ( BrightcoveNewIE, ) from .buzzfeed import BuzzFeedIE -from .byutv import ( - BYUtvIE, - BYUtvEventIE, -) +from .byutv import BYUtvIE from .c56 import C56IE from .camdemy import ( CamdemyIE, @@ -148,7 +147,11 @@ from .camdemy import ( from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE -from .canvas import CanvasIE +from .canvas import ( + CanvasIE, + CanvasEenIE, + VrtNUIE, +) from .carambatv import ( CarambaTVIE, CarambaTVPageIE, @@ -159,6 +162,7 @@ from .cbc import ( CBCPlayerIE, CBCWatchVideoIE, CBCWatchIE, + CBCOlympicsIE, ) from .cbs import CBSIE from .cbslocal import CBSLocalIE @@ -187,6 +191,7 @@ from .chirbit import ( from .cinchcast import CinchcastIE from .cjsw import CJSWIE from .cliphunter import CliphunterIE +from .clippit import ClippitIE from .cliprs import ClipRsIE from .clipsyndicate import ClipsyndicateIE from .closertotruth import CloserToTruthIE @@ -201,7 +206,6 @@ from .cnn import ( CNNArticleIE, ) from .coub import CoubIE -from .collegerama import CollegeRamaIE from .comedycentral import ( ComedyCentralFullEpisodesIE, ComedyCentralIE, @@ -239,11 +243,10 @@ from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, DailymotionUserIE, - DailymotionCloudIE, ) from .daisuki import ( - DaisukiIE, - DaisukiPlaylistIE, + DaisukiMottoIE, + DaisukiMottoPlaylistIE, ) from .daum import ( DaumIE, @@ -257,6 +260,7 @@ from .deezer import DeezerPlaylistIE from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE +from .digg import DiggIE from .dotsub import DotsubIE from .douyutv import ( DouyuShowIE, @@ -305,9 +309,10 @@ from .ehow import EHowIE from .eighttracks import EightTracksIE from .einthusan import EinthusanIE from .eitb import EitbIE -from .ellentv import ( - EllenTVIE, - EllenTVClipsIE, +from .ellentube import ( + EllenTubeIE, + EllenTubeVideoIE, + EllenTubePlaylistIE, ) from .elpais import ElPaisIE from .embedly import EmbedlyIE @@ -318,9 +323,9 @@ from .escapist import EscapistIE from .espn import ( ESPNIE, ESPNArticleIE, + FiveThirtyEightIE, ) from .esri import EsriVideoIE -from .etonline import ETOnlineIE from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE @@ -340,17 +345,21 @@ from .filmon import ( FilmOnIE, FilmOnChannelIE, ) -from .firstpost import FirstpostIE +from .filmweb import FilmwebIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE from .fivetv import FiveTVIE -from .fktv import FKTVIE from .flickr import FlickrIE from .flipagram import FlipagramIE from .folketinget import FolketingetIE from .footyroom import FootyRoomIE from .formula1 import Formula1IE -from .fourtube import FourTubeIE +from .fourtube import ( + FourTubeIE, + PornTubeIE, + PornerBrosIE, + FuxIE, +) from .fox import FOXIE from .fox9 import FOX9IE from .foxgay import FoxgayIE @@ -364,15 +373,21 @@ from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( FranceTVIE, + FranceTVSiteIE, FranceTVEmbedIE, FranceTVInfoIE, - GenerationQuoiIE, + FranceTVJeunesseIE, + GenerationWhatIE, CultureboxIE, ) from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE from .funimation import FunimationIE +from .funk import ( + FunkMixIE, + FunkChannelIE, +) from .funnyordie import FunnyOrDieIE from .fusion import FusionIE from .fxnetworks import FXNetworksIE @@ -381,7 +396,6 @@ from .gameone import ( GameOneIE, GameOnePlaylistIE, ) -from .gamersyde import GamersydeIE from .gamespot import GameSpotIE from .gamestar import GameStarIE from .gaskrank import GaskrankIE @@ -417,12 +431,16 @@ from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .hgtv import HGTVComShowIE +from .hidive import HiDiveIE from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE from .hitrecord import HitRecordIE from .hornbunny import HornBunnyIE from .hotnewhiphop import HotNewHipHopIE -from .hotstar import HotStarIE +from .hotstar import ( + HotStarIE, + HotStarPlaylistIE, +) from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE from .hrti import ( @@ -454,6 +472,7 @@ from .indavideo import ( ) from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE +from .internazionale import InternazionaleIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .iqiyi import IqiyiIE @@ -475,8 +494,8 @@ from .jove import JoveIE from .joj import JojIE from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE +from .kakao import KakaoIE from .kaltura import KalturaIE -from .kamcord import KamcordIE from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE @@ -503,6 +522,7 @@ from .la7 import LA7IE from .laola1tv import ( Laola1TvEmbedIE, Laola1TvIE, + ITTFIE, ) from .lci import LCIIE from .lcp import ( @@ -511,13 +531,14 @@ from .lcp import ( ) from .learnr import LearnrIE from .lecture2go import Lecture2GoIE -from .lego import LEGOIE -from .lemonde import LemondeIE from .leeco import ( LeIE, LePlaylistIE, LetvCloudIE, ) +from .lego import LEGOIE +from .lemonde import LemondeIE +from .lenta import LentaIE from .libraryofcongress import LibraryOfCongressIE from .libsyn import LibsynIE from .lifenews import ( @@ -529,8 +550,12 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) +from .line import LineTVIE from .litv import LiTVIE -from .liveleak import LiveLeakIE +from .liveleak import ( + LiveLeakIE, + LiveLeakEmbedIE, +) from .livestream import ( LivestreamIE, LivestreamOriginalIE, @@ -546,16 +571,23 @@ from .lynda import ( ) from .m6 import M6IE from .macgamestore import MacGameStoreIE -from .mailru import MailRuIE +from .mailru import ( + MailRuIE, + MailRuMusicIE, + MailRuMusicSearchIE, +) from .makerschannel import MakersChannelIE from .makertv import MakerTVIE from .mangomolo import ( MangomoloVideoIE, MangomoloLiveIE, ) +from .manyvids import ManyVidsIE +from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE from .mediaset import MediasetIE +from .mediasite import MediasiteIE from .medici import MediciIE from .megaphone import MegaphoneIE from .meipai import MeipaiIE @@ -589,7 +621,10 @@ from .mofosex import MofosexIE from .mojvideo import MojvideoIE from .moniker import MonikerIE from .morningstar import MorningstarIE -from .motherless import MotherlessIE +from .motherless import ( + MotherlessIE, + MotherlessGroupIE +) from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE @@ -607,8 +642,10 @@ from .musicplayon import MusicPlayOnIE from .mwave import MwaveIE, MwaveMeetGreetIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE -from .myvi import MyviIE -from .myvideo import MyVideoIE +from .myvi import ( + MyviIE, + MyviEmbedIE, +) from .myvidster import MyVidsterIE from .nationalgeographic import ( NationalGeographicVideoIE, @@ -622,6 +659,7 @@ from .nbc import ( NBCIE, NBCNewsIE, NBCOlympicsIE, + NBCOlympicsStreamIE, NBCSportsIE, NBCSportsVPlayerIE, ) @@ -670,6 +708,7 @@ from .nhl import ( ) from .nick import ( NickIE, + NickBrIE, NickDeIE, NickNightIE, NickRuIE, @@ -702,10 +741,6 @@ from .nowness import ( NownessPlaylistIE, NownessSeriesIE, ) -from .nowtv import ( - NowTVIE, - NowTVListIE, -) from .noz import NozIE from .npo import ( AndereTijdenIE, @@ -756,6 +791,7 @@ from .ora import OraTVIE from .orf import ( ORFTVthekIE, ORFFM4IE, + ORFFM4StoryIE, ORFOE1IE, ORFIPTVIE, ) @@ -770,6 +806,7 @@ from .patreon import PatreonIE from .pbs import PBSIE from .pearvideo import PearVideoIE from .people import PeopleIE +from .performgroup import PerformGroupIE from .periscope import ( PeriscopeIE, PeriscopeUserIE, @@ -777,6 +814,10 @@ from .periscope import ( from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .picarto import ( + PicartoIE, + PicartoVodIE, +) from .piksel import PikselIE from .pinkbike import PinkbikeIE from .pladform import PladformIE @@ -795,6 +836,7 @@ from .polskieradio import ( PolskieRadioIE, PolskieRadioCategoryIE, ) +from .popcorntv import PopcornTVIE from .porn91 import Porn91IE from .porncom import PornComIE from .pornflip import PornFlipIE @@ -835,11 +877,17 @@ from .radiofrance import RadioFranceIE from .rai import ( RaiPlayIE, RaiPlayLiveIE, + RaiPlayPlaylistIE, RaiIE, ) +from .raywenderlich import RayWenderlichIE from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redbulltv import RedBullTVIE +from .reddit import ( + RedditIE, + RedditRIE, +) from .redtube import RedTubeIE from .regiotv import RegioTVIE from .rentv import ( @@ -854,7 +902,6 @@ from .revision3 import ( Revision3IE, ) from .rice import RICEIE -from .ringtv import RingTVIE from .rmcdecouverte import RMCDecouverteIE from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE @@ -874,6 +921,7 @@ from .rtp import RTPIE from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE from .rtvnh import RTVNHIE +from .rtvs import RTVSIE from .rudo import RudoIE from .ruhd import RUHDIE from .ruleporn import RulePornIE @@ -883,11 +931,11 @@ from .rutube import ( RutubeEmbedIE, RutubeMovieIE, RutubePersonIE, + RutubePlaylistIE, ) from .rutv import RUTVIE from .ruutu import RuutuIE from .ruv import RuvIE -from .sandia import SandiaIE from .safari import ( SafariIE, SafariApiIE, @@ -903,8 +951,17 @@ from .seeker import SeekerIE from .senateisvp import SenateISVPIE from .sendtonews import SendtoNewsIE from .servingsys import ServingSysIE +from .servus import ServusIE +from .sevenplus import SevenPlusIE from .sexu import SexuIE -from .shahid import ShahidIE +from .seznamzpravy import ( + SeznamZpravyIE, + SeznamZpravyArticleIE, +) +from .shahid import ( + ShahidIE, + ShahidShowIE, +) from .shared import ( SharedIE, VivoIE, @@ -919,6 +976,7 @@ from .skynewsarabia import ( ) from .skysports import SkySportsIE from .slideshare import SlideshareIE +from .slideslive import SlidesLiveIE from .slutload import SlutloadIE from .smotri import ( SmotriIE, @@ -963,6 +1021,7 @@ from .spreaker import ( SpreakerAPIEpisodeIE, SpreakerPlaylistIE ) +from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE from .srgssr import ( SRGSSRIE, @@ -976,10 +1035,12 @@ from .streamango import StreamangoIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE +from .stretchinternet import StretchInternetIE from .sunporno import SunPornoIE from .svt import ( SVTIE, SVTPlayIE, + SVTSeriesIE, ) from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE @@ -1005,9 +1066,14 @@ from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE from .telemb import TeleMBIE -from .telequebec import TeleQuebecIE +from .telequebec import ( + TeleQuebecIE, + TeleQuebecEmissionIE, + TeleQuebecLiveIE, +) from .teletask import TeleTaskIE from .telewebion import TelewebionIE +from .tennistv import TennisTVIE from .testurl import TestURLIE from .tf1 import TF1IE from .tfo import TFOIE @@ -1017,7 +1083,6 @@ from .theplatform import ( ThePlatformFeedIE, ) from .thescene import TheSceneIE -from .thesixtyone import TheSixtyOneIE from .thestar import TheStarIE from .thesun import TheSunIE from .theweatherchannel import TheWeatherChannelIE @@ -1078,6 +1143,11 @@ from .tvigle import TvigleIE from .tvland import TVLandIE from .tvn24 import TVN24IE from .tvnoe import TVNoeIE +from .tvnow import ( + TVNowIE, + TVNowListIE, + TVNowShowIE, +) from .tvp import ( TVPEmbedIE, TVPIE, @@ -1091,10 +1161,7 @@ from .tvplayer import TVPlayerIE from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE -from .twentytwotracks import ( - TwentyTwoTracksIE, - TwentyTwoTracksGenreIE -) +from .twentythreevideo import TwentyThreeVideoIE from .twitch import ( TwitchVideoIE, TwitchChapterIE, @@ -1117,9 +1184,12 @@ from .udemy import ( UdemyCourseIE ) from .udn import UDNEmbedIE +from .ufctv import UFCTVIE from .uktvplay import UKTVPlayIE from .digiteka import DigitekaIE +from .umg import UMGDeIE from .unistra import UnistraIE +from .unity import UnityIE from .uol import UOLIE from .uplynk import ( UplynkIE, @@ -1159,7 +1229,6 @@ from .vice import ( ViceArticleIE, ViceShowIE, ) -from .viceland import VicelandIE from .vidbit import VidbitIE from .viddler import ViddlerIE from .videa import VideaIE @@ -1174,6 +1243,7 @@ from .videomore import ( from .videopremium import VideoPremiumIE from .videopress import VideoPressIE from .vidio import VidioIE +from .vidlii import VidLiiIE from .vidme import ( VidmeIE, VidmeUserIE, @@ -1227,7 +1297,10 @@ from .vodpl import VODPlIE from .vodplatform import VODPlatformIE from .voicerepublic import VoiceRepublicIE from .voot import VootIE -from .voxmedia import VoxMediaIE +from .voxmedia import ( + VoxMediaVolumeIE, + VoxMediaIE, +) from .vporn import VpornIE from .vrt import VRTIE from .vrak import VrakIE @@ -1252,6 +1325,8 @@ from .watchbox import WatchBoxIE from .watchindianporn import WatchIndianPornIE from .wdr import ( WDRIE, + WDRPageIE, + WDRElefantIE, WDRMobileIE, ) from .webcaster import ( @@ -1262,6 +1337,10 @@ from .webofstories import ( WebOfStoriesIE, WebOfStoriesPlaylistIE, ) +from .weibo import ( + WeiboIE, + WeiboMobileIE +) from .weiqitv import WeiqiTVIE from .wimp import WimpIE from .wistia import WistiaIE @@ -1287,6 +1366,10 @@ from .xiami import ( XiamiArtistIE, XiamiCollectionIE ) +from .ximalaya import ( + XimalayaIE, + XimalayaAlbumIE +) from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE @@ -1304,6 +1387,7 @@ from .yandexmusic import ( YandexMusicPlaylistIE, ) from .yandexdisk import YandexDiskIE +from .yapfiles import YapFilesIE from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE from .ynet import YnetIE @@ -1312,6 +1396,11 @@ from .youku import ( YoukuIE, YoukuShowIE, ) +from .younow import ( + YouNowLiveIE, + YouNowChannelIE, + YouNowMomentIE, +) from .youporn import YouPornIE from .yourupload import YourUploadIE from .youtube import ( @@ -1326,7 +1415,6 @@ from .youtube import ( YoutubeSearchDateIE, YoutubeSearchIE, YoutubeSearchURLIE, - YoutubeSharedVideoIE, YoutubeShowIE, YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, @@ -1336,5 +1424,11 @@ from .youtube import ( ) from .zapiks import ZapiksIE from .zaq1 import Zaq1IE +from .zattoo import ( + QuicklineIE, + QuicklineLiveIE, + ZattooIE, + ZattooLiveIE, +) from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 445f9438d..acd4090fa 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -8,12 +8,12 @@ class ExtremeTubeIE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', - 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0', + 'md5': '92feaafa4b58e82f261e5419f39c60cb', 'info_dict': { 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', - 'uploader': 'unknown', + 'uploader': 'anonim', 'view_count': int, 'age_limit': 18, } @@ -36,10 +36,10 @@ class ExtremeTubeIE(KeezMoviesIE): r'<h1[^>]+title="([^"]+)"[^>]*>', webpage, 'title') uploader = self._html_search_regex( - r'Uploaded by:\s*</strong>\s*(.+?)\s*</div>', + r'Uploaded by:\s*</[^>]+>\s*<a[^>]+>(.+?)</a>', webpage, 'uploader', fatal=False) view_count = str_to_int(self._search_regex( - r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>', + r'Views:\s*</[^>]+>\s*<[^>]+>([\d,\.]+)</', webpage, 'view count', fatal=False)) info.update({ diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 4b3f6cc86..220ada3a6 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -67,9 +67,9 @@ class FacebookIE(InfoExtractor): 'uploader': 'Tennis on Facebook', 'upload_date': '20140908', 'timestamp': 1410199200, - } + }, + 'skip': 'Requires logging in', }, { - 'note': 'Video without discernible title', 'url': 'https://www.facebook.com/video.php?v=274175099429670', 'info_dict': { 'id': '274175099429670', @@ -78,6 +78,7 @@ class FacebookIE(InfoExtractor): 'uploader': 'Asif Nawab Butt', 'upload_date': '20140506', 'timestamp': 1399398998, + 'thumbnail': r're:^https?://.*', }, 'expected_warnings': [ 'title' @@ -94,6 +95,7 @@ class FacebookIE(InfoExtractor): 'upload_date': '20160110', 'timestamp': 1452431627, }, + 'skip': 'Requires logging in', }, { 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', @@ -121,7 +123,11 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '10153664894881749', 'ext': 'mp4', - 'title': 'Facebook video #10153664894881749', + 'title': 'Average time to confirm recent Supreme Court nominees: 67 days Longest it\'s t...', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1456259628, + 'upload_date': '20160223', + 'uploader': 'Barack Obama', }, }, { # have 1080P, but only up to 720p in swf params @@ -130,10 +136,11 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', - 'title': 'Holocaust survivor becomes US citizen', + 'title': 'She survived the holocaust — and years later, she’s getting her citizenship s...', 'timestamp': 1477818095, 'upload_date': '20161030', 'uploader': 'CNN', + 'thumbnail': r're:^https?://.*', }, }, { # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall @@ -158,6 +165,7 @@ class FacebookIE(InfoExtractor): 'timestamp': 1477305000, 'upload_date': '20161024', 'uploader': 'La Guía Del Varón', + 'thumbnail': r're:^https?://.*', }, 'params': { 'skip_download': True, @@ -376,6 +384,7 @@ class FacebookIE(InfoExtractor): timestamp = int_or_none(self._search_regex( r'<abbr[^>]+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) + thumbnail = self._og_search_thumbnail(webpage) info_dict = { 'id': video_id, @@ -383,6 +392,7 @@ class FacebookIE(InfoExtractor): 'formats': formats, 'uploader': uploader, 'timestamp': timestamp, + 'thumbnail': thumbnail, } return webpage, info_dict diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index 4bc8fc512..312ee2aee 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -1,7 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_etree_fromstring from ..utils import ( xpath_element, xpath_text, @@ -43,10 +46,15 @@ class FazIE(InfoExtractor): webpage = self._download_webpage(url, video_id) description = self._og_search_description(webpage) - config_xml_url = self._search_regex( - r'videoXMLURL\s*=\s*"([^"]+)', webpage, 'config xml url') - config = self._download_xml( - config_xml_url, video_id, 'Downloading config xml') + media = self._html_search_regex( + r"data-videojs-media='([^']+)", + webpage, 'media') + if media == 'extern': + perform_url = self._search_regex( + r"<iframe[^>]+?src='((?:http:)?//player\.performgroup\.com/eplayer/eplayer\.html#/?[0-9a-f]{26}\.[0-9a-z]{26})", + webpage, 'perform url') + return self.url_result(perform_url) + config = compat_etree_fromstring(media) encodings = xpath_element(config, 'ENCODINGS', 'encodings', True) formats = [] @@ -55,12 +63,24 @@ class FazIE(InfoExtractor): if encoding is not None: encoding_url = xpath_text(encoding, 'FILENAME') if encoding_url: - formats.append({ + tbr = xpath_text(encoding, 'AVERAGEBITRATE', 1000) + if tbr: + tbr = int_or_none(tbr.replace(',', '.')) + f = { 'url': encoding_url, 'format_id': code.lower(), 'quality': pref, - 'tbr': int_or_none(xpath_text(encoding, 'AVERAGEBITRATE')), - }) + 'tbr': tbr, + 'vcodec': xpath_text(encoding, 'CODEC'), + } + mobj = re.search(r'(\d+)x(\d+)_(\d+)\.mp4', encoding_url) + if mobj: + f.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + 'tbr': tbr or int(mobj.group(3)), + }) + formats.append(f) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py index 8d1010b88..8db7c5963 100644 --- a/youtube_dl/extractor/fczenit.py +++ b/youtube_dl/extractor/fczenit.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + float_or_none, +) class FczenitIE(InfoExtractor): @@ -14,6 +17,8 @@ class FczenitIE(InfoExtractor): 'id': '41044', 'ext': 'mp4', 'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»', + 'timestamp': 1462283735, + 'upload_date': '20160503', }, } @@ -21,28 +26,31 @@ class FczenitIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex( - r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title') + msi_id = self._search_regex( + r"(?s)config\s*=\s*{.+?video_id\s*:\s*'([^']+)'", webpage, 'msi id') - video_items = self._parse_json(self._search_regex( - r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'), - video_id) - - def merge_dicts(*dicts): - ret = {} - for a_dict in dicts: - ret.update(a_dict) - return ret + msi_data = self._download_json( + 'http://player.fc-zenit.ru/msi/video', msi_id, query={ + 'video': msi_id, + })['data'] + title = msi_data['name'] formats = [{ - 'url': compat_urlparse.urljoin(url, video_url), - 'tbr': int(tbr), - } for tbr, video_url in merge_dicts(*video_items).items()] + 'format_id': q.get('label'), + 'url': q['url'], + 'height': int_or_none(q.get('label')), + } for q in msi_data['qualities'] if q.get('url')] self._sort_formats(formats) + tags = [tag['label'] for tag in msi_data.get('tags', []) if tag.get('label')] + return { 'id': video_id, - 'title': video_title, + 'title': title, + 'thumbnail': msi_data.get('preview'), 'formats': formats, + 'duration': float_or_none(msi_data.get('duration')), + 'timestamp': int_or_none(msi_data.get('date')), + 'tags': tags, } diff --git a/youtube_dl/extractor/filmweb.py b/youtube_dl/extractor/filmweb.py new file mode 100644 index 000000000..56000bc5b --- /dev/null +++ b/youtube_dl/extractor/filmweb.py @@ -0,0 +1,42 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class FilmwebIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?filmweb\.no/(?P<type>trailere|filmnytt)/article(?P<id>\d+)\.ece' + _TEST = { + 'url': 'http://www.filmweb.no/trailere/article1264921.ece', + 'md5': 'e353f47df98e557d67edaceda9dece89', + 'info_dict': { + 'id': '13033574', + 'ext': 'mp4', + 'title': 'Det som en gang var', + 'upload_date': '20160316', + 'timestamp': 1458140101, + 'uploader_id': '12639966', + 'uploader': 'Live Roaldset', + } + } + + def _real_extract(self, url): + article_type, article_id = re.match(self._VALID_URL, url).groups() + if article_type == 'filmnytt': + webpage = self._download_webpage(url, article_id) + article_id = self._search_regex(r'data-videoid="(\d+)"', webpage, 'article id') + embed_code = self._download_json( + 'https://www.filmweb.no/template_v2/ajax/json_trailerEmbed.jsp', + article_id, query={ + 'articleId': article_id, + })['embedCode'] + iframe_url = self._proto_relative_url(self._search_regex( + r'<iframe[^>]+src="([^"]+)', embed_code, 'iframe url')) + + return { + '_type': 'url_transparent', + 'id': article_id, + 'url': iframe_url, + 'ie_key': 'TwentyThreeVideo', + } diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py deleted file mode 100644 index e8936cb24..000000000 --- a/youtube_dl/extractor/firstpost.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class FirstpostIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html' - - _TEST = { - 'url': 'http://www.firstpost.com/india/india-to-launch-indigenous-aircraft-carrier-monday-1025403.html', - 'md5': 'ee9114957692f01fb1263ed87039112a', - 'info_dict': { - 'id': '1025403', - 'ext': 'mp4', - 'title': 'India to launch indigenous aircraft carrier INS Vikrant today', - 'description': 'md5:feef3041cb09724e0bdc02843348f5f4', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - page = self._download_webpage(url, video_id) - - title = self._html_search_meta('twitter:title', page, 'title', fatal=True) - description = self._html_search_meta('twitter:description', page, 'title') - - data = self._download_xml( - 'http://www.firstpost.com/getvideoxml-%s.xml' % video_id, video_id, - 'Downloading video XML') - - item = data.find('./playlist/item') - thumbnail = item.find('./image').text - - formats = [ - { - 'url': details.find('./file').text, - 'format_id': details.find('./label').text.strip(), - 'width': int(details.find('./width').text.strip()), - 'height': int(details.find('./height').text.strip()), - } for details in item.findall('./source/file_details') if details.find('./file').text - ] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py deleted file mode 100644 index 2958452f4..000000000 --- a/youtube_dl/extractor/fktv.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - clean_html, - determine_ext, - js_to_json, -) - - -class FKTVIE(InfoExtractor): - IE_NAME = 'fernsehkritik.tv' - _VALID_URL = r'https?://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?' - - _TEST = { - 'url': 'http://fernsehkritik.tv/folge-1', - 'md5': '21f0b0c99bce7d5b524eb1b17b1c6d79', - 'info_dict': { - 'id': '1', - 'ext': 'mp4', - 'title': 'Folge 1 vom 10. April 2007', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - } - - def _real_extract(self, url): - episode = self._match_id(url) - - webpage = self._download_webpage( - 'http://fernsehkritik.tv/folge-%s/play' % episode, episode) - title = clean_html(self._html_search_regex( - '<h3>([^<]+)</h3>', webpage, 'title')) - thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False) - sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json) - - formats = [] - for source in sources: - furl = source.get('src') - if furl: - formats.append({ - 'url': furl, - 'format_id': determine_ext(furl), - }) - self._sort_formats(formats) - - return { - 'id': episode, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index e3fd08bcf..ad273a0e7 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -3,39 +3,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( parse_duration, parse_iso8601, - sanitized_Request, str_to_int, ) -class FourTubeIE(InfoExtractor): - IE_NAME = '4tube' - _VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P<id>\d+)' - - _TEST = { - 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', - 'md5': '6516c8ac63b03de06bc8eac14362db4f', - 'info_dict': { - 'id': '209733', - 'ext': 'mp4', - 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black', - 'uploader': 'WCP Club', - 'uploader_id': 'wcp-club', - 'upload_date': '20131031', - 'timestamp': 1383263892, - 'duration': 583, - 'view_count': int, - 'like_count': int, - 'categories': list, - 'age_limit': 18, - } - } - +class FourTubeBaseIE(InfoExtractor): def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + kind, video_id, display_id = mobj.group('kind', 'id', 'display_id') + + if kind == 'm' or not display_id: + url = self._URL_TEMPLATE % video_id + webpage = self._download_webpage(url, video_id) title = self._html_search_meta('name', webpage) @@ -43,10 +26,10 @@ class FourTubeIE(InfoExtractor): 'uploadDate', webpage)) thumbnail = self._html_search_meta('thumbnailUrl', webpage) uploader_id = self._html_search_regex( - r'<a class="item-to-subscribe" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">', + r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/([^/"]+)" title="Go to [^"]+ page">', webpage, 'uploader id', fatal=False) uploader = self._html_search_regex( - r'<a class="item-to-subscribe" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">', + r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/[^/"]+" title="Go to ([^"]+) page">', webpage, 'uploader', fatal=False) categories_html = self._search_regex( @@ -60,10 +43,10 @@ class FourTubeIE(InfoExtractor): view_count = str_to_int(self._search_regex( r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">', - webpage, 'view count', fatal=False)) + webpage, 'view count', default=None)) like_count = str_to_int(self._search_regex( r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">', - webpage, 'like count', fatal=False)) + webpage, 'like count', default=None)) duration = parse_duration(self._html_search_meta('duration', webpage)) media_id = self._search_regex( @@ -87,12 +70,12 @@ class FourTubeIE(InfoExtractor): token_url = 'https://tkn.kodicdn.com/{0}/desktop/{1}'.format( media_id, '+'.join(sources)) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - b'Origin': b'https://www.4tube.com', - } - token_req = sanitized_Request(token_url, b'{}', headers) - tokens = self._download_json(token_req, video_id) + + parsed_url = compat_urlparse.urlparse(url) + tokens = self._download_json(token_url, video_id, data=b'', headers={ + 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname), + 'Referer': url, + }) formats = [{ 'url': tokens[format]['token'], 'format_id': format + 'p', @@ -115,3 +98,126 @@ class FourTubeIE(InfoExtractor): 'duration': duration, 'age_limit': 18, } + + +class FourTubeIE(FourTubeBaseIE): + IE_NAME = '4tube' + _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?4tube\.com/(?:videos|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' + _URL_TEMPLATE = 'https://www.4tube.com/videos/%s/video' + _TESTS = [{ + 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '209733', + 'ext': 'mp4', + 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black', + 'uploader': 'WCP Club', + 'uploader_id': 'wcp-club', + 'upload_date': '20131031', + 'timestamp': 1383263892, + 'duration': 583, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + }, { + 'url': 'http://www.4tube.com/embed/209733', + 'only_matching': True, + }, { + 'url': 'http://m.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'only_matching': True, + }] + + +class FuxIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?fux\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' + _URL_TEMPLATE = 'https://www.fux.com/video/%s/video' + _TESTS = [{ + 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow', + 'info_dict': { + 'id': '195359', + 'ext': 'mp4', + 'title': 'Awesome fucking in the kitchen ends with cum swallow', + 'uploader': 'alenci2342', + 'uploader_id': 'alenci2342', + 'upload_date': '20131230', + 'timestamp': 1388361660, + 'duration': 289, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.fux.com/embed/195359', + 'only_matching': True, + }, { + 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow', + 'only_matching': True, + }] + + +class PornTubeIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?porntube\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)' + _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s' + _TESTS = [{ + 'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759', + 'info_dict': { + 'id': '7089759', + 'ext': 'mp4', + 'title': 'Teen couple doing anal', + 'uploader': 'Alexy', + 'uploader_id': 'Alexy', + 'upload_date': '20150606', + 'timestamp': 1433595647, + 'duration': 5052, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.porntube.com/embed/7089759', + 'only_matching': True, + }, { + 'url': 'https://m.porntube.com/videos/teen-couple-doing-anal_7089759', + 'only_matching': True, + }] + + +class PornerBrosIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?pornerbros\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)' + _URL_TEMPLATE = 'https://www.pornerbros.com/videos/video_%s' + _TESTS = [{ + 'url': 'https://www.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '181369', + 'ext': 'mp4', + 'title': 'Skinny brunette takes big cock down her anal hole', + 'uploader': 'PornerBros HD', + 'uploader_id': 'pornerbros-hd', + 'upload_date': '20130130', + 'timestamp': 1359527401, + 'duration': 1224, + 'view_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.pornerbros.com/embed/181369', + 'only_matching': True, + }, { + 'url': 'https://m.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 159fdf9c4..11d6c9c32 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -2,57 +2,132 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE +from .uplynk import UplynkPreplayIE +from ..compat import compat_str from ..utils import ( - smuggle_url, + HEADRequest, + int_or_none, + parse_age_limit, + parse_duration, + try_get, + unified_timestamp, update_url_query, ) class FOXIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.fox.com/watch/255180355939/7684182528', + _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)' + _TESTS = [{ + # clip + 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/', 'md5': 'ebd296fcc41dd4b19f8115d8461a3165', 'info_dict': { - 'id': '255180355939', + 'id': '4b765a60490325103ea69888fb2bd4e8', 'ext': 'mp4', - 'title': 'Official Trailer: Gotham', - 'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.', - 'duration': 129, - 'timestamp': 1400020798, - 'upload_date': '20140513', - 'uploader': 'NEWA-FNG-FOXCOM', + 'title': 'Aftermath: Bruce Wayne Develops Into The Dark Knight', + 'description': 'md5:549cd9c70d413adb32ce2a779b53b486', + 'duration': 102, + 'timestamp': 1504291893, + 'upload_date': '20170901', + 'creator': 'FOX', + 'series': 'Gotham', }, - 'add_ie': ['ThePlatform'], - } + 'params': { + 'skip_download': True, + }, + }, { + # episode, geo-restricted + 'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/', + 'only_matching': True, + }, { + # episode, geo-restricted, tv provided required + 'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), video_id) - fox_pdk_player = settings['fox_pdk_player'] - release_url = fox_pdk_player['release_url'] - query = { - 'mbr': 'true', - 'switch': 'http' - } - if fox_pdk_player.get('access') == 'locked': - ap_p = settings['foxAdobePassProvider'] - rating = ap_p.get('videoRating') - if rating == 'n/a': - rating = None - resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource) + video = self._download_json( + 'https://api.fox.com/fbc-content/v1_4/video/%s' % video_id, + video_id, headers={ + 'apikey': 'abdcbed02c124d393b39e818a4312055', + 'Content-Type': 'application/json', + 'Referer': url, + }) - info = self._search_json_ld(webpage, video_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), + title = video['name'] + release_url = video['videoRelease']['url'] + + description = video.get('description') + duration = int_or_none(video.get('durationInSeconds')) or int_or_none( + video.get('duration')) or parse_duration(video.get('duration')) + timestamp = unified_timestamp(video.get('datePublished')) + rating = video.get('contentRating') + age_limit = parse_age_limit(rating) + + data = try_get( + video, lambda x: x['trackingData']['properties'], dict) or {} + + creator = data.get('brand') or data.get('network') or video.get('network') + + series = video.get('seriesName') or data.get( + 'seriesName') or data.get('show') + season_number = int_or_none(video.get('seasonNumber')) + episode = video.get('name') + episode_number = int_or_none(video.get('episodeNumber')) + release_year = int_or_none(video.get('releaseYear')) + + if data.get('authRequired'): + resource = self._get_mvpd_resource( + 'fbc-fox', title, video.get('guid'), rating) + release_url = update_url_query( + release_url, { + 'auth': self._extract_mvpd_auth( + url, video_id, 'fbc-fox', resource) + }) + + subtitles = {} + for doc_rel in video.get('documentReleases', []): + rel_url = doc_rel.get('url') + if not url or doc_rel.get('format') != 'SCC': + continue + subtitles['en'] = [{ + 'url': rel_url, + 'ext': 'scc', + }] + break + + info = { 'id': video_id, - }) + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'creator': creator, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'release_year': release_year, + 'subtitles': subtitles, + } + urlh = self._request_webpage(HEADRequest(release_url), video_id) + video_url = compat_str(urlh.geturl()) + + if UplynkPreplayIE.suitable(video_url): + info.update({ + '_type': 'url_transparent', + 'url': video_url, + 'ie_key': UplynkPreplayIE.ie_key(), + }) + else: + m3u8_url = self._download_json(release_url, video_id)['playURL'] + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + info['formats'] = formats return info diff --git a/youtube_dl/extractor/fox9.py b/youtube_dl/extractor/fox9.py index 56d9975d0..17dfffa7b 100644 --- a/youtube_dl/extractor/fox9.py +++ b/youtube_dl/extractor/fox9.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .anvato import AnvatoIE -from ..utils import js_to_json class FOX9IE(AnvatoIE): @@ -34,9 +33,9 @@ class FOX9IE(AnvatoIE): video_id = self._parse_json( self._search_regex( - r'AnvatoPlaylist\s*\(\s*(\[.+?\])\s*\)\s*;', + r"this\.videosJson\s*=\s*'(\[.+?\])';", webpage, 'anvato playlist'), - video_id, transform_source=js_to_json)[0]['video'] + video_id)[0]['video'] return self._get_anvato_videos( 'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b', diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 707b9e00d..05806895c 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -33,7 +33,7 @@ class FranceInterIE(InfoExtractor): description = self._og_search_description(webpage) upload_date_str = self._search_regex( - r'class=["\']cover-emission-period["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', + r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', webpage, 'upload date', fatal=False) if upload_date_str: upload_date_list = upload_date_str.split() diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 2bcbb3e39..c02cd03de 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -3,25 +3,91 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( clean_html, + determine_ext, ExtractorError, int_or_none, parse_duration, - determine_ext, -) -from .dailymotion import ( - DailymotionIE, - DailymotionCloudIE, + try_get, ) +from .dailymotion import DailymotionIE class FranceTVBaseInfoExtractor(InfoExtractor): + def _make_url_result(self, video_or_full_id, catalog=None): + full_id = 'francetv:%s' % video_or_full_id + if '@' not in video_or_full_id and catalog: + full_id += '@%s' % catalog + return self.url_result( + full_id, ie=FranceTVIE.ie_key(), + video_id=video_or_full_id.split('@')[0]) + + +class FranceTVIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + https?:// + sivideo\.webservices\.francetelevisions\.fr/tools/getInfosOeuvre/v2/\? + .*?\bidDiffusion=[^&]+| + (?: + https?://videos\.francetv\.fr/video/| + francetv: + ) + (?P<id>[^@]+)(?:@(?P<catalog>.+))? + ) + ''' + + _TESTS = [{ + # without catalog + 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0', + 'md5': 'c2248a8de38c4e65ea8fae7b5df2d84f', + 'info_dict': { + 'id': '162311093', + 'ext': 'mp4', + 'title': '13h15, le dimanche... - Les mystères de Jésus', + 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', + 'timestamp': 1502623500, + 'upload_date': '20170813', + }, + }, { + # with catalog + 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=NI_1004933&catalogue=Zouzous&callback=_jsonp_loader_callback_request_4', + 'only_matching': True, + }, { + 'url': 'http://videos.francetv.fr/video/NI_657393@Regions', + 'only_matching': True, + }, { + 'url': 'francetv:162311093', + 'only_matching': True, + }, { + 'url': 'francetv:NI_1004933@Zouzous', + 'only_matching': True, + }, { + 'url': 'francetv:NI_983319@Info-web', + 'only_matching': True, + }, { + 'url': 'francetv:NI_983319', + 'only_matching': True, + }, { + 'url': 'francetv:NI_657393@Regions', + 'only_matching': True, + }, { + # france-3 live + 'url': 'francetv:SIM_France3', + 'only_matching': True, + }] + def _extract_video(self, video_id, catalogue=None): + # Videos are identified by idDiffusion so catalogue part is optional. + # However when provided, some extra formats may be returned so we pass + # it if available. info = self._download_json( 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/', video_id, 'Downloading video JSON', query={ @@ -31,7 +97,8 @@ class FranceTVBaseInfoExtractor(InfoExtractor): if info.get('status') == 'NOK': raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, info['message']), expected=True) + '%s returned error: %s' % (self.IE_NAME, info['message']), + expected=True) allowed_countries = info['videos'][0].get('geoblocage') if allowed_countries: georestricted = True @@ -46,6 +113,21 @@ class FranceTVBaseInfoExtractor(InfoExtractor): else: georestricted = False + def sign(manifest_url, manifest_id): + for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'): + signed_url = self._download_webpage( + 'https://%s/esi/TA' % host, video_id, + 'Downloading signed %s manifest URL' % manifest_id, + fatal=False, query={ + 'url': manifest_url, + }) + if (signed_url and isinstance(signed_url, compat_str) and + re.search(r'^(?:https?:)?//', signed_url)): + return signed_url + return manifest_url + + is_live = None + formats = [] for video in info['videos']: if video['statut'] != 'ONLINE': @@ -53,6 +135,10 @@ class FranceTVBaseInfoExtractor(InfoExtractor): video_url = video['url'] if not video_url: continue + if is_live is None: + is_live = (try_get( + video, lambda x: x['plages_ouverture'][0]['direct'], + bool) is True) or '/live.francetv.fr/' in video_url format_id = video['format'] ext = determine_ext(video_url) if ext == 'f4m': @@ -60,17 +146,14 @@ class FranceTVBaseInfoExtractor(InfoExtractor): # See https://github.com/rg3/youtube-dl/issues/3963 # m3u8 urls work fine continue - f4m_url = self._download_webpage( - 'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url, - video_id, 'Downloading f4m manifest token', fatal=False) - if f4m_url: - formats.extend(self._extract_f4m_formats( - f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', - video_id, f4m_id=format_id, fatal=False)) + formats.extend(self._extract_f4m_formats( + sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', + video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) + sign(video_url, format_id), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, @@ -101,33 +184,48 @@ class FranceTVBaseInfoExtractor(InfoExtractor): return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': clean_html(info['synopsis']), 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), 'timestamp': int_or_none(info['diffusion']['timestamp']), + 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, } + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + catalog = mobj.group('catalog') -class FranceTVIE(FranceTVBaseInfoExtractor): + if not video_id: + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = qs.get('idDiffusion', [None])[0] + catalog = qs.get('catalogue', [None])[0] + if not video_id: + raise ExtractorError('Invalid URL', expected=True) + + return self._extract_video(video_id, catalog) + + +class FranceTVSiteIE(FranceTVBaseInfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)*(?P<id>[^/]+)\.html' _TESTS = [{ 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', 'info_dict': { - 'id': '157550144', + 'id': '162311093', 'ext': 'mp4', 'title': '13h15, le dimanche... - Les mystères de Jésus', 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', - 'timestamp': 1494156300, - 'upload_date': '20170507', + 'timestamp': 1502623500, + 'upload_date': '20170813', }, 'params': { - # m3u8 downloads 'skip_download': True, }, + 'add_ie': [FranceTVIE.ie_key()], }, { # france3 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', @@ -160,6 +258,10 @@ class FranceTVIE(FranceTVBaseInfoExtractor): }, { 'url': 'https://www.france.tv/142749-rouge-sang.html', 'only_matching': True, + }, { + # france-3 live + 'url': 'https://www.france.tv/france-3/direct.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -176,13 +278,14 @@ class FranceTVIE(FranceTVBaseInfoExtractor): video_id, catalogue = self._html_search_regex( r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', webpage, 'video ID').split('@') - return self._extract_video(video_id, catalogue) + + return self._make_url_result(video_id, catalogue) class FranceTVEmbedIE(FranceTVBaseInfoExtractor): _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961', 'info_dict': { 'id': 'NI_983319', @@ -192,7 +295,11 @@ class FranceTVEmbedIE(FranceTVBaseInfoExtractor): 'timestamp': 1493981780, 'duration': 16, }, - } + 'params': { + 'skip_download': True, + }, + 'add_ie': [FranceTVIE.ie_key()], + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -201,12 +308,12 @@ class FranceTVEmbedIE(FranceTVBaseInfoExtractor): 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id, video_id) - return self._extract_video(video['video_id'], video.get('catalog')) + return self._make_url_result(video['video_id'], video.get('catalog')) class FranceTVInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)' + _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&.]+)' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', @@ -221,51 +328,18 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): }, }, 'params': { - # m3u8 downloads 'skip_download': True, }, + 'add_ie': [FranceTVIE.ie_key()], }, { 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', - 'info_dict': { - 'id': 'EV_20019', - 'ext': 'mp4', - 'title': 'Débat des candidats à la Commission européenne', - 'description': 'Débat des candidats à la Commission européenne', - }, - 'params': { - 'skip_download': 'HLS (reqires ffmpeg)' - }, - 'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.', + 'only_matching': True, }, { 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html', - 'md5': 'f485bda6e185e7d15dbc69b72bae993e', - 'info_dict': { - 'id': 'NI_173343', - 'ext': 'mp4', - 'title': 'Les entreprises familiales : le secret de la réussite', - 'thumbnail': r're:^https?://.*\.jpe?g$', - 'timestamp': 1433273139, - 'upload_date': '20150602', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, + 'only_matching': True, }, { 'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html', - 'md5': 'f485bda6e185e7d15dbc69b72bae993e', - 'info_dict': { - 'id': 'NI_657393', - 'ext': 'mp4', - 'title': 'Olivier Monthus, réalisateur de "Bretagne, le choix de l’Armor"', - 'description': 'md5:a3264114c9d29aeca11ced113c37b16c', - 'thumbnail': r're:^https?://.*\.jpe?g$', - 'timestamp': 1458300695, - 'upload_date': '20160318', - }, - 'params': { - 'skip_download': True, - }, + 'only_matching': True, }, { # Dailymotion embed 'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html', @@ -287,13 +361,9 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_title = mobj.group('title') - webpage = self._download_webpage(url, page_title) + display_id = self._match_id(url) - dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) - if dmcloud_url: - return self.url_result(dmcloud_url, DailymotionCloudIE.ie_key()) + webpage = self._download_webpage(url, display_id) dailymotion_urls = DailymotionIE._extract_urls(webpage) if dailymotion_urls: @@ -305,64 +375,116 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): (r'id-video=([^@]+@[^"]+)', r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), webpage, 'video id').split('@') - return self._extract_video(video_id, catalogue) + + return self._make_url_result(video_id, catalogue) -class GenerationQuoiIE(InfoExtractor): - IE_NAME = 'france2.fr:generation-quoi' - _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<id>[^/?#]+)' +class GenerationWhatIE(InfoExtractor): + IE_NAME = 'france2.fr:generation-what' + _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://generation-quoi.france2.fr/portrait/garde-a-vous', + _TESTS = [{ + 'url': 'http://generation-what.francetv.fr/portrait/video/present-arms', 'info_dict': { - 'id': 'k7FJX8VBcvvLmX4wA5Q', + 'id': 'wtvKYUG45iw', 'ext': 'mp4', - 'title': 'Génération Quoi - Garde à Vous', - 'uploader': 'Génération Quoi', + 'title': 'Generation What - Garde à vous - FRA', + 'uploader': 'Generation What', + 'uploader_id': 'UCHH9p1eetWCgt4kXBYCb3_w', + 'upload_date': '20160411', }, 'params': { - # It uses Dailymotion 'skip_download': True, }, - } + 'add_ie': ['Youtube'], + }, { + 'url': 'http://generation-what.francetv.fr/europe/video/present-arms', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) - info_url = compat_urlparse.urljoin(url, '/medias/video/%s.json' % display_id) - info_json = self._download_webpage(info_url, display_id) - info = json.loads(info_json) - return self.url_result('http://www.dailymotion.com/video/%s' % info['id'], - ie='Dailymotion') + + webpage = self._download_webpage(url, display_id) + + youtube_id = self._search_regex( + r"window\.videoURL\s*=\s*'([0-9A-Za-z_-]{11})';", + webpage, 'youtube id') + + return self.url_result(youtube_id, ie='Youtube', video_id=youtube_id) class CultureboxIE(FranceTVBaseInfoExtractor): - IE_NAME = 'culturebox.francetvinfo.fr' - _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)' + _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://culturebox.francetvinfo.fr/live/musique/musique-classique/le-livre-vermeil-de-montserrat-a-la-cathedrale-delne-214511', - 'md5': '9b88dc156781c4dbebd4c3e066e0b1d6', + _TESTS = [{ + 'url': 'https://culturebox.francetvinfo.fr/opera-classique/musique-classique/c-est-baroque/concerts/cantates-bwv-4-106-et-131-de-bach-par-raphael-pichon-57-268689', 'info_dict': { - 'id': 'EV_50111', - 'ext': 'flv', - 'title': "Le Livre Vermeil de Montserrat à la Cathédrale d'Elne", - 'description': 'md5:f8a4ad202e8fe533e2c493cc12e739d9', - 'upload_date': '20150320', - 'timestamp': 1426892400, - 'duration': 2760.9, + 'id': 'EV_134885', + 'ext': 'mp4', + 'title': 'Cantates BWV 4, 106 et 131 de Bach par Raphaël Pichon 5/7', + 'description': 'md5:19c44af004b88219f4daa50fa9a351d4', + 'upload_date': '20180206', + 'timestamp': 1517945220, + 'duration': 5981, }, - } + 'params': { + 'skip_download': True, + }, + 'add_ie': [FranceTVIE.ie_key()], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + if ">Ce live n'est plus disponible en replay<" in webpage: + raise ExtractorError( + 'Video %s is not available' % display_id, expected=True) + + video_id, catalogue = self._search_regex( + r'["\'>]https?://videos\.francetv\.fr/video/([^@]+@.+?)["\'<]', + webpage, 'video id').split('@') + + return self._make_url_result(video_id, catalogue) + + +class FranceTVJeunesseIE(FranceTVBaseInfoExtractor): + _VALID_URL = r'(?P<url>https?://(?:www\.)?(?:zouzous|ludo)\.fr/heros/(?P<id>[^/?#&]+))' + + _TESTS = [{ + 'url': 'https://www.zouzous.fr/heros/simon', + 'info_dict': { + 'id': 'simon', + }, + 'playlist_count': 9, + }, { + 'url': 'https://www.ludo.fr/heros/ninjago', + 'info_dict': { + 'id': 'ninjago', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.zouzous.fr/heros/simon?abc', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') + playlist_id = mobj.group('id') - webpage = self._download_webpage(url, name) + playlist = self._download_json( + '%s/%s' % (mobj.group('url'), 'playlist'), playlist_id) - if ">Ce live n'est plus disponible en replay<" in webpage: - raise ExtractorError('Video %s is not available' % name, expected=True) + if not playlist.get('count'): + raise ExtractorError( + '%s is not available' % playlist_id, expected=True) - video_id, catalogue = self._search_regex( - r'"http://videos\.francetv\.fr/video/([^@]+@[^"]+)"', webpage, 'video id').split('@') + entries = [] + for item in playlist['items']: + identity = item.get('identity') + if identity and isinstance(identity, compat_str): + entries.append(self._make_url_result(identity)) - return self._extract_video(video_id, catalogue) + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py index 0a70ca763..486a49c05 100644 --- a/youtube_dl/extractor/freespeech.py +++ b/youtube_dl/extractor/freespeech.py @@ -1,37 +1,34 @@ from __future__ import unicode_literals -import re -import json - from .common import InfoExtractor class FreespeechIE(InfoExtractor): IE_NAME = 'freespeech.org' - _VALID_URL = r'https?://(?:www\.)?freespeech\.org/video/(?P<title>.+)' + _VALID_URL = r'https?://(?:www\.)?freespeech\.org/stories/(?P<id>.+)' _TEST = { 'add_ie': ['Youtube'], - 'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0', + 'url': 'http://www.freespeech.org/stories/fcc-announces-net-neutrality-rollback-whats-stake/', 'info_dict': { - 'id': 'poKsVCZ64uU', - 'ext': 'webm', - 'title': 'Obama, Romney Campaign in Colorado Ahead of Debate', - 'description': 'Obama, Romney Campaign in Colorado Ahead of Debate', - 'uploader': 'freespeechtv', + 'id': 'waRk6IPqyWM', + 'ext': 'mp4', + 'title': 'What\'s At Stake - Net Neutrality Special', + 'description': 'Presented by MNN and FSTV', + 'upload_date': '20170728', 'uploader_id': 'freespeechtv', - 'upload_date': '20121002', + 'uploader': 'freespeechtv', }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - webpage = self._download_webpage(url, title) - info_json = self._search_regex(r'jQuery.extend\(Drupal.settings, ({.*?})\);', webpage, 'info') - info = json.loads(info_json) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + youtube_url = self._search_regex( + r'data-video-url="([^"]+)"', + webpage, 'youtube url') return { '_type': 'url', - 'url': info['jw_player']['basic_video_node_player']['file'], + 'url': youtube_url, 'ie_key': 'Youtube', } diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 8c37509ec..107f658ba 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -57,7 +57,7 @@ class FunimationIE(InfoExtractor): try: data = self._download_json( 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/', - None, 'Logging in as %s' % username, data=urlencode_postdata({ + None, 'Logging in', data=urlencode_postdata({ 'username': username, 'password': password, })) diff --git a/youtube_dl/extractor/funk.py b/youtube_dl/extractor/funk.py new file mode 100644 index 000000000..0ff058619 --- /dev/null +++ b/youtube_dl/extractor/funk.py @@ -0,0 +1,131 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .nexx import NexxIE +from ..utils import ( + int_or_none, + try_get, +) + + +class FunkBaseIE(InfoExtractor): + def _make_url_result(self, video): + return { + '_type': 'url_transparent', + 'url': 'nexx:741:%s' % video['sourceId'], + 'ie_key': NexxIE.ie_key(), + 'id': video['sourceId'], + 'title': video.get('title'), + 'description': video.get('description'), + 'duration': int_or_none(video.get('duration')), + 'season_number': int_or_none(video.get('seasonNr')), + 'episode_number': int_or_none(video.get('episodeNr')), + } + + +class FunkMixIE(FunkBaseIE): + _VALID_URL = r'https?://(?:www\.)?funk\.net/mix/(?P<id>[^/]+)/(?P<alias>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.funk.net/mix/59d65d935f8b160001828b5b/die-realste-kifferdoku-aller-zeiten', + 'md5': '8edf617c2f2b7c9847dfda313f199009', + 'info_dict': { + 'id': '123748', + 'ext': 'mp4', + 'title': '"Die realste Kifferdoku aller Zeiten"', + 'description': 'md5:c97160f5bafa8d47ec8e2e461012aa9d', + 'timestamp': 1490274721, + 'upload_date': '20170323', + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + mix_id = mobj.group('id') + alias = mobj.group('alias') + + lists = self._download_json( + 'https://www.funk.net/api/v3.1/curation/curatedLists/', + mix_id, headers={ + 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbC12Mi4wIiwic2NvcGUiOiJzdGF0aWMtY29udGVudC1hcGksY3VyYXRpb24tc2VydmljZSxzZWFyY2gtYXBpIn0.SGCC1IXHLtZYoo8PvRKlU2gXH1su8YSu47sB3S4iXBI', + 'Referer': url, + }, query={ + 'size': 100, + })['result']['lists'] + + metas = next( + l for l in lists + if mix_id in (l.get('entityId'), l.get('alias')))['videoMetas'] + video = next( + meta['videoDataDelegate'] + for meta in metas if meta.get('alias') == alias) + + return self._make_url_result(video) + + +class FunkChannelIE(FunkBaseIE): + _VALID_URL = r'https?://(?:www\.)?funk\.net/channel/(?P<id>[^/]+)/(?P<alias>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.funk.net/channel/ba/die-lustigsten-instrumente-aus-dem-internet-teil-2', + 'info_dict': { + 'id': '1155821', + 'ext': 'mp4', + 'title': 'Die LUSTIGSTEN INSTRUMENTE aus dem Internet - Teil 2', + 'description': 'md5:a691d0413ef4835588c5b03ded670c1f', + 'timestamp': 1514507395, + 'upload_date': '20171229', + }, + 'params': { + 'skip_download': True, + }, + }, { + # only available via byIdList API + 'url': 'https://www.funk.net/channel/informr/martin-sonneborn-erklaert-die-eu', + 'info_dict': { + 'id': '205067', + 'ext': 'mp4', + 'title': 'Martin Sonneborn erklärt die EU', + 'description': 'md5:050f74626e4ed87edf4626d2024210c0', + 'timestamp': 1494424042, + 'upload_date': '20170510', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.funk.net/channel/59d5149841dca100012511e3/mein-erster-job-lovemilla-folge-1/lovemilla/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + alias = mobj.group('alias') + + headers = { + 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc', + 'Referer': url, + } + + video = None + + by_id_list = self._download_json( + 'https://www.funk.net/api/v3.0/content/videos/byIdList', channel_id, + headers=headers, query={ + 'ids': alias, + }, fatal=False) + if by_id_list: + video = try_get(by_id_list, lambda x: x['result'][0], dict) + + if not video: + results = self._download_json( + 'https://www.funk.net/api/v3.0/content/videos/filter', channel_id, + headers=headers, query={ + 'channelId': channel_id, + 'size': 100, + })['result'] + video = next(r for r in results if r.get('alias') == alias) + + return self._make_url_result(video) diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py index ede729b52..25e284d46 100644 --- a/youtube_dl/extractor/fusion.py +++ b/youtube_dl/extractor/fusion.py @@ -5,9 +5,9 @@ from .ooyala import OoyalaIE class FusionIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?fusion\.(?:net|tv)/video/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://fusion.net/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/', + 'url': 'http://fusion.tv/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/', 'info_dict': { 'id': 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P', 'ext': 'mp4', @@ -20,7 +20,7 @@ class FusionIE(InfoExtractor): }, 'add_ie': ['Ooyala'], }, { - 'url': 'http://fusion.net/video/201781', + 'url': 'http://fusion.tv/video/201781', 'only_matching': True, }] diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py index 629897317..00e67426b 100644 --- a/youtube_dl/extractor/fxnetworks.py +++ b/youtube_dl/extractor/fxnetworks.py @@ -3,27 +3,31 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - update_url_query, extract_attributes, + int_or_none, parse_age_limit, smuggle_url, + update_url_query, ) class FXNetworksIE(AdobePassIE): _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.fxnetworks.com/video/719841347694', - 'md5': '1447d4722e42ebca19e5232ab93abb22', + 'url': 'http://www.fxnetworks.com/video/1032565827847', + 'md5': '8d99b97b4aa7a202f55b6ed47ea7e703', 'info_dict': { - 'id': '719841347694', + 'id': 'dRzwHC_MMqIv', 'ext': 'mp4', - 'title': 'Vanpage', - 'description': 'F*ck settling down. You\'re the Worst returns for an all new season August 31st on FXX.', + 'title': 'First Look: Better Things - Season 2', + 'description': 'Because real life is like a fart. Watch this FIRST LOOK to see what inspired the new season of Better Things.', 'age_limit': 14, 'uploader': 'NEWA-FNG-FX', - 'upload_date': '20160706', - 'timestamp': 1467844741, + 'upload_date': '20170825', + 'timestamp': 1503686274, + 'episode_number': 0, + 'season_number': 2, + 'series': 'Better Things', }, 'add_ie': ['ThePlatform'], }, { @@ -37,7 +41,7 @@ class FXNetworksIE(AdobePassIE): if 'The content you are trying to access is not available in your region.' in webpage: self.raise_geo_restricted() video_data = extract_attributes(self._search_regex( - r'(<a.+?rel="http://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) + r'(<a.+?rel="https?://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) release_url = video_data['rel'] title = video_data['data-title'] @@ -64,6 +68,9 @@ class FXNetworksIE(AdobePassIE): 'id': video_id, 'title': title, 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), + 'series': video_data.get('data-show-title'), + 'episode_number': int_or_none(video_data.get('data-episode')), + 'season_number': int_or_none(video_data.get('data-season')), 'thumbnail': video_data.get('data-large-thumb'), 'age_limit': parse_age_limit(rating), 'ie_key': 'ThePlatform', diff --git a/youtube_dl/extractor/gameinformer.py b/youtube_dl/extractor/gameinformer.py index a66e309de..a2920a793 100644 --- a/youtube_dl/extractor/gameinformer.py +++ b/youtube_dl/extractor/gameinformer.py @@ -23,6 +23,11 @@ class GameInformerIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - brightcove_id = self._search_regex(r"getVideo\('[^']+video_id=(\d+)", webpage, 'brightcove id') - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + webpage = self._download_webpage( + url, display_id, headers=self.geo_verification_headers()) + brightcove_id = self._search_regex( + [r'<[^>]+\bid=["\']bc_(\d+)', r"getVideo\('[^']+video_id=(\d+)"], + webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', + brightcove_id) diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py deleted file mode 100644 index a218a6944..000000000 --- a/youtube_dl/extractor/gamersyde.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - js_to_json, - parse_duration, - remove_start, -) - - -class GamersydeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_(?P<display_id>[\da-z_]+)-(?P<id>\d+)_[a-z]{2}\.html' - _TEST = { - 'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html', - 'md5': 'f38d400d32f19724570040d5ce3a505f', - 'info_dict': { - 'id': '34371', - 'ext': 'mp4', - 'duration': 372, - 'title': 'Bloodborne - Birth of a hero', - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - playlist = self._parse_json( - self._search_regex( - r'(?s)playlist: \[({.+?})\]\s*}\);', webpage, 'files'), - display_id, transform_source=js_to_json) - - formats = [] - for source in playlist['sources']: - video_url = source.get('file') - if not video_url: - continue - format_id = source.get('label') - f = { - 'url': video_url, - 'format_id': format_id, - } - m = re.search(r'^(?P<height>\d+)[pP](?P<fps>\d+)fps', format_id) - if m: - f.update({ - 'height': int(m.group('height')), - 'fps': int(m.group('fps')), - }) - formats.append(f) - self._sort_formats(formats) - - title = remove_start(playlist['title'], '%s - ' % video_id) - thumbnail = playlist.get('image') - duration = parse_duration(self._search_regex( - r'Length:</label>([^<]+)<', webpage, 'duration', fatal=False)) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 00d311158..ab647dd41 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(OnceIE): - _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' + _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article)s/(?:[^/]+/\d+-|embed/)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', @@ -35,6 +35,12 @@ class GameSpotIE(OnceIE): 'params': { 'skip_download': True, # m3u8 downloads }, + }, { + 'url': 'https://www.gamespot.com/videos/embed/6439218/', + 'only_matching': True, + }, { + 'url': 'https://www.gamespot.com/articles/the-last-of-us-2-receives-new-ps4-trailer/1100-6454469/', + 'only_matching': True, }] def _real_extract(self, url): @@ -52,7 +58,7 @@ class GameSpotIE(OnceIE): manifest_url = f4m_url formats.extend(self._extract_f4m_formats( f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False)) - m3u8_url = streams.get('m3u8_stream') + m3u8_url = dict_get(streams, ('m3u8_stream', 'adaptive_stream')) if m3u8_url: manifest_url = m3u8_url m3u8_formats = self._extract_m3u8_formats( @@ -60,7 +66,7 @@ class GameSpotIE(OnceIE): m3u8_id='hls', fatal=False) formats.extend(m3u8_formats) progressive_url = dict_get( - streams, ('progressive_hd', 'progressive_high', 'progressive_low')) + streams, ('progressive_hd', 'progressive_high', 'progressive_low', 'other_lr')) if progressive_url and manifest_url: qualities_basename = self._search_regex( r'/([^/]+)\.csmil/', @@ -105,7 +111,8 @@ class GameSpotIE(OnceIE): onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri') if onceux_url: formats.extend(self._extract_once_formats(re.sub( - r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url).replace('ads/vmap/', ''))) + r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url), + http_formats_preference=-1)) if not formats: for quality in ['sd', 'hd']: diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index e607d6ab8..f00dab2f3 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( int_or_none, @@ -9,27 +11,34 @@ from ..utils import ( class GameStarIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?game(?P<site>pro|star)\.de/videos/.*,(?P<id>[0-9]+)\.html' + _TESTS = [{ 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', - 'md5': '96974ecbb7fd8d0d20fca5a00810cea7', + 'md5': 'ee782f1f8050448c95c5cacd63bc851c', 'info_dict': { 'id': '76110', 'ext': 'mp4', 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1406542020, + 'timestamp': 1406542380, 'upload_date': '20140728', - 'duration': 17 + 'duration': 17, } - } + }, { + 'url': 'http://www.gamepro.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', + 'only_matching': True, + }, { + 'url': 'http://www.gamestar.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + mobj = re.match(self._VALID_URL, url) + site = mobj.group('site') + video_id = mobj.group('id') - url = 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id + webpage = self._download_webpage(url, video_id) # TODO: there are multiple ld+json objects in the webpage, # while _search_json_ld finds only the first one @@ -37,16 +46,17 @@ class GameStarIE(InfoExtractor): r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>[^<]+VideoObject[^<]+)</script>', webpage, 'JSON-LD', group='json_ld'), video_id) info_dict = self._json_ld(json_ld, video_id) - info_dict['title'] = remove_end(info_dict['title'], ' - GameStar') + info_dict['title'] = remove_end( + info_dict['title'], ' - Game%s' % site.title()) - view_count = json_ld.get('interactionCount') + view_count = int_or_none(json_ld.get('interactionCount')) comment_count = int_or_none(self._html_search_regex( - r'([0-9]+) Kommentare</span>', webpage, 'comment_count', - fatal=False)) + r'<span>Kommentare</span>\s*<span[^>]+class=["\']count[^>]+>\s*\(\s*([0-9]+)', + webpage, 'comment count', fatal=False)) info_dict.update({ 'id': video_id, - 'url': url, + 'url': 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id, 'ext': 'mp4', 'view_count': view_count, 'comment_count': comment_count diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 34e814988..73980601c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -22,6 +22,9 @@ from ..utils import ( HEADRequest, is_html, js_to_json, + KNOWN_EXTENSIONS, + merge_dicts, + mimetype2ext, orderedSet, sanitized_Request, smuggle_url, @@ -56,11 +59,9 @@ from .xhamster import XHamsterEmbedIE from .tnaflix import TNAFlixNetworkEmbedIE from .drtuber import DrTuberIE from .redtube import RedTubeIE +from .tube8 import Tube8IE from .vimeo import VimeoIE -from .dailymotion import ( - DailymotionIE, - DailymotionCloudIE, -) +from .dailymotion import DailymotionIE from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE @@ -98,6 +99,14 @@ from .wistia import WistiaIE from .mediaset import MediasetIE from .joj import JojIE from .megaphone import MegaphoneIE +from .vzaar import VzaarIE +from .channel9 import Channel9IE +from .vshare import VShareIE +from .mediasite import MediasiteIE +from .springboardplatform import SpringboardPlatformIE +from .yapfiles import YapFilesIE +from .vice import ViceIE +from .xfileshare import XFileShareIE class GenericIE(InfoExtractor): @@ -182,6 +191,16 @@ class GenericIE(InfoExtractor): 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', } }, + # RSS feed with enclosures and unsupported link URLs + { + 'url': 'http://www.hellointernet.fm/podcast?format=rss', + 'info_dict': { + 'id': 'http://www.hellointernet.fm/podcast?format=rss', + 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', + 'title': 'Hello Internet', + }, + 'playlist_mincount': 100, + }, # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng { 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', @@ -1087,23 +1106,24 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20150212', 'uploader': 'The National Archives UK', - 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6', + 'description': 'md5:8078af856dca76edc42910b61273dbbf', 'uploader_id': 'NationalArchives08', 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue', }, }, # jwplayer rtmp { - 'url': 'http://www.suffolk.edu/sjc/', + 'url': 'http://www.suffolk.edu/sjc/live.php', 'info_dict': { - 'id': 'sjclive', + 'id': 'live', 'ext': 'flv', 'title': 'Massachusetts Supreme Judicial Court Oral Arguments', 'uploader': 'www.suffolk.edu', }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/', }, # Complex jwplayer { @@ -1112,6 +1132,7 @@ class GenericIE(InfoExtractor): 'id': 'videos', 'ext': 'mp4', 'title': 'king machine trailer 1', + 'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.', 'thumbnail': r're:^https?://.*\.jpg$', }, }, @@ -1129,13 +1150,55 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, + { + # JWPlatform iframe + 'url': 'https://www.mediaite.com/tv/dem-senator-claims-gary-cohn-faked-a-bad-connection-during-trump-call-to-get-him-off-the-phone/', + 'md5': 'ca00a040364b5b439230e7ebfd02c4e9', + 'info_dict': { + 'id': 'O0c5JcKT', + 'ext': 'mp4', + 'upload_date': '20171122', + 'timestamp': 1511366290, + 'title': 'Dem Senator Claims Gary Cohn Faked a Bad Connection During Trump Call to Get Him Off the Phone', + }, + 'add_ie': [JWPlatformIE.ie_key()], + }, + { + # Video.js embed, multiple formats + 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', + 'info_dict': { + 'id': 'yygqldloqIk', + 'ext': 'mp4', + 'title': 'SolidWorks. Урок 6 Настройка чертежа', + 'description': 'md5:baf95267792646afdbf030e4d06b2ab3', + 'upload_date': '20130314', + 'uploader': 'PROстое3D', + 'uploader_id': 'PROstoe3D', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Video.js embed, single format + 'url': 'https://www.vooplayer.com/v3/watch/watch.php?v=NzgwNTg=', + 'info_dict': { + 'id': 'watch', + 'ext': 'mp4', + 'title': 'Step 1 - Good Foundation', + 'description': 'md5:d1e7ff33a29fc3eb1673d6c270d344f4', + }, + 'params': { + 'skip_download': True, + }, + }, # rtl.nl embed { 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', 'playlist_mincount': 5, 'info_dict': { 'id': 'aanslagen-kopenhagen', - 'title': 'Aanslagen Kopenhagen | RTL Nieuws', + 'title': 'Aanslagen Kopenhagen', } }, # Zapiks embed @@ -1168,7 +1231,7 @@ class GenericIE(InfoExtractor): 'title': '35871', 'timestamp': 1355743100, 'upload_date': '20121217', - 'uploader_id': 'batchUser', + 'uploader_id': 'cplapp@learn360.com', }, 'add_ie': ['Kaltura'], }, @@ -1219,23 +1282,38 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, - # EaglePlatform embed (generic URL) { - 'url': 'http://lenta.ru/news/2015/03/06/navalny/', - # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used + # Kaltura iframe embed, more sophisticated + 'url': 'http://www.cns.nyu.edu/~eero/math-tools/Videos/lecture-05sep2017.html', 'info_dict': { - 'id': '227304', + 'id': '1_9gzouybz', 'ext': 'mp4', - 'title': 'Навальный вышел на свободу', - 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 87, - 'view_count': int, - 'age_limit': 0, + 'title': 'lecture-05sep2017', + 'description': 'md5:40f347d91fd4ba047e511c5321064b49', + 'upload_date': '20170913', + 'uploader_id': 'eps2', + 'timestamp': 1505340777, }, 'params': { 'skip_download': True, }, + 'add_ie': ['Kaltura'], + }, + { + # meta twitter:player + 'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/', + 'info_dict': { + 'id': '0_01b42zps', + 'ext': 'mp4', + 'title': 'Main Twerk (Video)', + 'upload_date': '20171208', + 'uploader_id': 'sebastian.salinas@thechive.com', + 'timestamp': 1512713057, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Kaltura'], }, # referrer protected EaglePlatform embed { @@ -1267,6 +1345,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is unavailable.', }, # Pladform embed { @@ -1280,6 +1359,7 @@ class GenericIE(InfoExtractor): 'duration': 694, 'age_limit': 0, }, + 'skip': 'HTTP Error 404: Not Found', }, # Playwire embed { @@ -1300,6 +1380,14 @@ class GenericIE(InfoExtractor): 'id': '518726732', 'ext': 'mp4', 'title': 'Facebook Creates "On This Day" | Crunch Report', + 'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild', + 'timestamp': 1427237531, + 'uploader': 'Crunch Report', + 'upload_date': '20150324', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, }, # SVT embed @@ -1351,16 +1439,20 @@ class GenericIE(InfoExtractor): 'upload_date': '20140107', 'timestamp': 1389118457, }, + 'skip': 'Invalid Page URL', }, # NBC News embed { 'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html', 'md5': '1aa589c675898ae6d37a17913cf68d66', 'info_dict': { - 'id': '701714499682', + 'id': 'x_dtl_oa_LettermanliftPR_160608', 'ext': 'mp4', - 'title': 'PREVIEW: On Assignment: David Letterman', + 'title': 'David Letterman: A Preview', 'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. Airs Sunday June 12 at 7/6c.', + 'upload_date': '20160609', + 'timestamp': 1465431544, + 'uploader': 'NBCU-NEWS', }, }, # UDN embed @@ -1377,6 +1469,7 @@ class GenericIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Failed to parse JSON Expecting value'], }, # Ooyala embed { @@ -1384,7 +1477,7 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', 'ext': 'mp4', - 'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.', + 'description': 'Index/Match versus VLOOKUP.', 'title': 'This is what separates the Excel masters from the wannabes', 'duration': 191.933, }, @@ -1408,22 +1501,6 @@ class GenericIE(InfoExtractor): 'timestamp': 1432570283, }, }, - # Dailymotion Cloud video - { - 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', - 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38', - 'info_dict': { - 'id': 'x2uy8t3', - 'ext': 'mp4', - 'title': 'Sauvons les abeilles ! - Le débat', - 'description': 'md5:d9082128b1c5277987825d684939ca26', - 'thumbnail': r're:^https?://.*\.jpe?g$', - 'timestamp': 1434970506, - 'upload_date': '20150622', - 'uploader': 'Public Sénat', - 'uploader_id': 'xa9gza', - } - }, # OnionStudios embed { 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', @@ -1518,14 +1595,27 @@ class GenericIE(InfoExtractor): # LiveLeak embed { 'url': 'http://www.wykop.pl/link/3088787/', - 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d', + 'md5': '7619da8c820e835bef21a1efa2a0fc71', 'info_dict': { 'id': '874_1459135191', 'ext': 'mp4', 'title': 'Man shows poor quality of new apartment building', 'description': 'The wall is like a sand pile.', 'uploader': 'Lake8737', - } + }, + 'add_ie': [LiveLeakIE.ie_key()], + }, + # Another LiveLeak embed pattern (#13336) + { + 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/', + 'info_dict': { + 'id': '2eb_1496309988', + 'ext': 'mp4', + 'title': 'Thief robs place where everyone was armed', + 'description': 'md5:694d73ee79e535953cf2488562288eee', + 'uploader': 'brazilwtf', + }, + 'add_ie': [LiveLeakIE.ie_key()], }, # Duplicated embedded video URLs { @@ -1567,22 +1657,6 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['BrightcoveLegacy'], }, - # Nexx embed - { - 'url': 'https://www.funk.net/serien/5940e15073f6120001657956/items/593efbb173f6120001657503', - 'info_dict': { - 'id': '247746', - 'ext': 'mp4', - 'title': "Yesterday's Jam (OV)", - 'description': 'md5:09bc0984723fed34e2581624a84e05f0', - 'timestamp': 1492594816, - 'upload_date': '20170419', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - }, # Facebook <iframe> embed { 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', @@ -1784,6 +1858,21 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 5, }, + { + # Limelight embed (LimelightPlayerUtil.embed) + 'url': 'https://tv5.ca/videos?v=xuu8qowr291ri', + 'info_dict': { + 'id': '95d035dc5c8a401588e9c0e6bd1e9c92', + 'ext': 'mp4', + 'title': '07448641', + 'timestamp': 1499890639, + 'upload_date': '20170712', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['LimelightMedia'], + }, { 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', 'info_dict': { @@ -1840,6 +1929,100 @@ class GenericIE(InfoExtractor): 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', }, }, + { + # vzaar embed + 'url': 'http://help.vzaar.com/article/165-embedding-video', + 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4', + 'info_dict': { + 'id': '8707641', + 'ext': 'mp4', + 'title': 'Building A Business Online: Principal Chairs Q & A', + }, + }, + { + # multiple HTML5 videos on one page + 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html', + 'info_dict': { + 'id': 'keyscenarios', + 'title': 'Rescue Kit 14 Free Edition - Getting started', + }, + 'playlist_count': 4, + }, + { + # vshare embed + 'url': 'https://youtube-dl-demo.neocities.org/vshare.html', + 'md5': '17b39f55b5497ae8b59f5fbce8e35886', + 'info_dict': { + 'id': '0f64ce6', + 'title': 'vl14062007715967', + 'ext': 'mp4', + } + }, + { + 'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/', + 'md5': 'aecd089f55b1cb5a59032cb049d3a356', + 'info_dict': { + 'id': '90227f51a80c4d8f86c345a7fa62bd9a1d', + 'ext': 'mp4', + 'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare', + 'description': 'md5:5a51db84a62def7b7054df2ade403c6c', + 'timestamp': 1474354800, + 'upload_date': '20160920', + } + }, + { + 'url': 'http://www.kidzworld.com/article/30935-trolls-the-beat-goes-on-interview-skylar-astin-and-amanda-leighton', + 'info_dict': { + 'id': '1731611', + 'ext': 'mp4', + 'title': 'Official Trailer | TROLLS: THE BEAT GOES ON!', + 'description': 'md5:eb5f23826a027ba95277d105f248b825', + 'timestamp': 1516100691, + 'upload_date': '20180116', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [SpringboardPlatformIE.ie_key()], + }, + { + 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', + 'info_dict': { + 'id': 'uPDB5I9wfp8', + 'ext': 'webm', + 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', + 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', + 'upload_date': '20160219', + 'uploader': 'Pocoyo - Português (BR)', + 'uploader_id': 'PocoyoBrazil', + }, + 'add_ie': [YoutubeIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html', + 'info_dict': { + 'id': 'vMDE4NzI1Mjgt690b', + 'ext': 'mp4', + 'title': 'Котята', + }, + 'add_ie': [YapFilesIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://share-videos.se/auto/video/83645793?uid=13', + 'md5': 'b68d276de422ab07ee1d49388103f457', + 'info_dict': { + 'id': '83645793', + 'title': 'Lock up and get excited', + 'ext': 'mp4' + }, + 'skip': 'TODO: fix nested playlists processing in tests', + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1870,13 +2053,15 @@ class GenericIE(InfoExtractor): entries = [] for it in doc.findall('./channel/item'): - next_url = xpath_text(it, 'link', fatal=False) + next_url = None + enclosure_nodes = it.findall('./enclosure') + for e in enclosure_nodes: + next_url = e.attrib.get('url') + if next_url: + break + if not next_url: - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break + next_url = xpath_text(it, 'link', fatal=False) if not next_url: continue @@ -1989,7 +2174,7 @@ class GenericIE(InfoExtractor): if head_response is not False: # Check for redirect - new_url = head_response.geturl() + new_url = compat_str(head_response.geturl()) if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -2086,11 +2271,15 @@ class GenericIE(InfoExtractor): self._sort_formats(smil['formats']) return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': - return self.playlist_result(self._parse_xspf(doc, video_id), video_id) + return self.playlist_result( + self._parse_xspf( + doc, video_id, xspf_url=url, + xspf_base_url=compat_str(full_response.geturl())), + video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( - doc, video_id, - mpd_base_url=full_response.geturl().rpartition('/')[0], + doc, + mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], mpd_url=url) self._sort_formats(info_dict['formats']) return info_dict @@ -2127,7 +2316,7 @@ class GenericIE(InfoExtractor): # And then there are the jokers who advertise that they use RTA, # but actually don't. AGE_LIMIT_MARKERS = [ - r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>', + r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', ] if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): age_limit = 18 @@ -2165,7 +2354,10 @@ class GenericIE(InfoExtractor): # Look for Brightcove New Studio embeds bc_urls = BrightcoveNewIE._extract_urls(self, webpage) if bc_urls: - return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') + return self.playlist_from_matches( + bc_urls, video_id, video_title, + getter=lambda x: smuggle_url(x, {'referrer': url}), + ie='BrightcoveNew') # Look for Nexx embeds nexx_urls = NexxIE._extract_urls(webpage) @@ -2189,7 +2381,7 @@ class GenericIE(InfoExtractor): # Look for embedded rtl.nl player matches = re.findall( - r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', + r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', webpage) if matches: return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') @@ -2204,36 +2396,11 @@ class GenericIE(InfoExtractor): if vid_me_embed_url is not None: return self.url_result(vid_me_embed_url, 'Vidme') - # Look for embedded YouTube player - matches = re.findall(r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/.+?) - \1''', webpage) - if matches: + # Look for YouTube embeds + youtube_urls = YoutubeIE._extract_urls(webpage) + if youtube_urls: return self.playlist_from_matches( - matches, video_id, video_title, lambda m: unescapeHTML(m[1])) - - # Look for lazyYT YouTube embed - matches = re.findall( - r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) - - # Look for Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) + youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) matches = DailymotionIE._extract_urls(webpage) if matches: @@ -2436,6 +2603,11 @@ class GenericIE(InfoExtractor): if redtube_urls: return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) + # Look for embedded Tube8 player + tube8_urls = Tube8IE._extract_urls(webpage) + if tube8_urls: + return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) + # Look for embedded Tvigle player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) @@ -2613,7 +2785,7 @@ class GenericIE(InfoExtractor): # Look for UDN embeds mobj = re.search( - r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) + r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) if mobj is not None: return self.url_result( compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') @@ -2623,11 +2795,6 @@ class GenericIE(InfoExtractor): if senate_isvp_url: return self.url_result(senate_isvp_url, 'SenateISVP') - # Look for Dailymotion Cloud videos - dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) - if dmcloud_url: - return self.url_result(dmcloud_url, 'DailymotionCloud') - # Look for OnionStudios embeds onionstudios_url = OnionStudiosIE._extract_url(webpage) if onionstudios_url: @@ -2639,9 +2806,9 @@ class GenericIE(InfoExtractor): return self.url_result(viewlift_url) # Look for JWPlatform embeds - jwplatform_url = JWPlatformIE._extract_url(webpage) - if jwplatform_url: - return self.url_result(jwplatform_url, 'JWPlatform') + jwplatform_urls = JWPlatformIE._extract_urls(webpage) + if jwplatform_urls: + return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) # Look for Digiteka embeds digiteka_url = DigitekaIE._extract_url(webpage) @@ -2731,9 +2898,9 @@ class GenericIE(InfoExtractor): self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) # Look for LiveLeak embeds - liveleak_url = LiveLeakIE._extract_url(webpage) - if liveleak_url: - return self.url_result(liveleak_url, 'LiveLeak') + liveleak_urls = LiveLeakIE._extract_urls(webpage) + if liveleak_urls: + return self.playlist_from_matches(liveleak_urls, video_id, video_title) # Look for 3Q SDN embeds threeqsdn_url = ThreeQSDNIE._extract_url(webpage) @@ -2811,37 +2978,77 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) - def merge_dicts(dict1, dict2): - merged = {} - for k, v in dict1.items(): - if v is not None: - merged[k] = v - for k, v in dict2.items(): - if v is None: - continue - if (k not in merged or - (isinstance(v, compat_str) and v and - isinstance(merged[k], compat_str) and - not merged[k])): - merged[k] = v - return merged + # Look for vzaar embeds + vzaar_urls = VzaarIE._extract_urls(webpage) + if vzaar_urls: + return self.playlist_from_matches( + vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) - # Looking for http://schema.org/VideoObject - json_ld = self._search_json_ld( - webpage, video_id, default={}, expected_type='VideoObject') - if json_ld.get('url'): - return merge_dicts(json_ld, info_dict) + channel9_urls = Channel9IE._extract_urls(webpage) + if channel9_urls: + return self.playlist_from_matches( + channel9_urls, video_id, video_title, ie=Channel9IE.ie_key()) + + vshare_urls = VShareIE._extract_urls(webpage) + if vshare_urls: + return self.playlist_from_matches( + vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) + + # Look for Mediasite embeds + mediasite_urls = MediasiteIE._extract_urls(webpage) + if mediasite_urls: + entries = [ + self.url_result(smuggle_url( + compat_urlparse.urljoin(url, mediasite_url), + {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) + for mediasite_url in mediasite_urls] + return self.playlist_result(entries, video_id, video_title) + + springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage) + if springboardplatform_urls: + return self.playlist_from_matches( + springboardplatform_urls, video_id, video_title, + ie=SpringboardPlatformIE.ie_key()) + + yapfiles_urls = YapFilesIE._extract_urls(webpage) + if yapfiles_urls: + return self.playlist_from_matches( + yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key()) + + vice_urls = ViceIE._extract_urls(webpage) + if vice_urls: + return self.playlist_from_matches( + vice_urls, video_id, video_title, ie=ViceIE.ie_key()) + + xfileshare_urls = XFileShareIE._extract_urls(webpage) + if xfileshare_urls: + return self.playlist_from_matches( + xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) + + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( + r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', + webpage)] + if sharevideos_urls: + return self.playlist_from_matches( + sharevideos_urls, video_id, video_title) # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: - for entry in entries: - entry.update({ + if len(entries) == 1: + entries[0].update({ 'id': video_id, 'title': video_title, }) + else: + for num, entry in enumerate(entries, start=1): + entry.update({ + 'id': '%s-%s' % (video_id, num), + 'title': '%s (%d)' % (video_title, num), + }) + for entry in entries: self._sort_formats(entry['formats']) - return self.playlist_result(entries) + return self.playlist_result(entries, video_id, video_title) jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) @@ -2850,6 +3057,52 @@ class GenericIE(InfoExtractor): jwplayer_data, video_id, require_title=False, base_url=url) return merge_dicts(info, info_dict) + # Video.js embed + mobj = re.search( + r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', + webpage) + if mobj is not None: + sources = self._parse_json( + mobj.group(1), video_id, transform_source=js_to_json, + fatal=False) or [] + if not isinstance(sources, list): + sources = [sources] + formats = [] + for source in sources: + src = source.get('src') + if not src or not isinstance(src, compat_str): + continue + src = compat_urlparse.urljoin(url, src) + src_type = source.get('type') + if isinstance(src_type, compat_str): + src_type = src_type.lower() + ext = determine_ext(src).lower() + if src_type == 'video/youtube': + return self.url_result(src, YoutubeIE.ie_key()) + if src_type == 'application/dash+xml' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False)) + elif src_type == 'application/x-mpegurl' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + 'ext': (mimetype2ext(src_type) or + ext if ext in KNOWN_EXTENSIONS else 'mp4'), + }) + if formats: + self._sort_formats(formats) + info_dict['formats'] = formats + return info_dict + + # Looking for http://schema.org/VideoObject + json_ld = self._search_json_ld( + webpage, video_id, default={}, expected_type='VideoObject') + if json_ld.get('url'): + return merge_dicts(json_ld, info_dict) + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -2937,7 +3190,7 @@ class GenericIE(InfoExtractor): # be supported by youtube-dl thus this is checked the very last (see # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) embed_url = self._html_search_meta('twitter:player', webpage, default=None) - if embed_url: + if embed_url and embed_url != url: return self.url_result(embed_url) if not found: diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py index 45ccc11c1..a0670b645 100644 --- a/youtube_dl/extractor/gfycat.py +++ b/youtube_dl/extractor/gfycat.py @@ -11,7 +11,7 @@ from ..utils import ( class GfycatIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?:ifr/)?(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?:ifr/|gifs/detail/)?(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher', 'info_dict': { @@ -44,6 +44,9 @@ class GfycatIE(InfoExtractor): 'categories': list, 'age_limit': 0, } + }, { + 'url': 'https://gfycat.com/gifs/detail/UnconsciousLankyIvorygull', + 'only_matching': True }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/gigya.py b/youtube_dl/extractor/gigya.py new file mode 100644 index 000000000..412178492 --- /dev/null +++ b/youtube_dl/extractor/gigya.py @@ -0,0 +1,22 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + urlencode_postdata, +) + + +class GigyaBaseIE(InfoExtractor): + def _gigya_login(self, auth_data): + auth_info = self._download_json( + 'https://accounts.eu1.gigya.com/accounts.login', None, + note='Logging in', errnote='Unable to log in', + data=urlencode_postdata(auth_data)) + + error_message = auth_info.get('errorDetails') or auth_info.get('errorMessage') + if error_message: + raise ExtractorError( + 'Unable to login: %s' % error_message, expected=True) + return auth_info diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index c40da85c5..3bf462d63 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,26 +4,61 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, int_or_none, lowercase_escape, + update_url_query, ) class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:docs|drive)\.google\.com/ + (?: + (?:uc|open)\?.*?id=| + file/d/ + )| + video\.google\.com/get_player\?.*?docid= + ) + (?P<id>[a-zA-Z0-9_-]{28,}) + ''' _TESTS = [{ 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', - 'md5': 'd109872761f7e7ecf353fa108c0dbe1e', + 'md5': '5c602afbbf2c1db91831f5d82f678554', 'info_dict': { 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', 'title': 'Big Buck Bunny.mp4', 'duration': 45, } + }, { + # video can't be watched anonymously due to view count limit reached, + # but can be downloaded (see https://github.com/rg3/youtube-dl/issues/14046) + 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', + 'md5': 'bfbd670d03a470bb1e6d4a257adec12e', + 'info_dict': { + 'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ', + 'ext': 'mp4', + 'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4', + } }, { # video id is longer than 28 characters 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', + 'info_dict': { + 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', + 'ext': 'mp4', + 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', + 'duration': 189, + }, + 'only_matching': True, + }, { + 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', + 'only_matching': True, + }, { + 'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28', 'only_matching': True, }] _FORMATS_EXT = { @@ -44,6 +79,13 @@ class GoogleDriveIE(InfoExtractor): '46': 'webm', '59': 'mp4', } + _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext' + _CAPTIONS_ENTRY_TAG = { + 'subtitles': 'track', + 'automatic_captions': 'target', + } + _caption_formats_ext = [] + _captions_xml = None @staticmethod def _extract_url(webpage): @@ -53,54 +95,183 @@ class GoogleDriveIE(InfoExtractor): if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id') + def _download_subtitles_xml(self, video_id, subtitles_id, hl): + if self._captions_xml: + return + self._captions_xml = self._download_xml( + self._BASE_URL_CAPTIONS, video_id, query={ + 'id': video_id, + 'vid': subtitles_id, + 'hl': hl, + 'v': video_id, + 'type': 'list', + 'tlangs': '1', + 'fmts': '1', + 'vssids': '1', + }, note='Downloading subtitles XML', + errnote='Unable to download subtitles XML', fatal=False) + if self._captions_xml: + for f in self._captions_xml.findall('format'): + if f.attrib.get('fmt_code') and not f.attrib.get('default'): + self._caption_formats_ext.append(f.attrib['fmt_code']) + + def _get_captions_by_type(self, video_id, subtitles_id, caption_type, + origin_lang_code=None): + if not subtitles_id or not caption_type: + return + captions = {} + for caption_entry in self._captions_xml.findall( + self._CAPTIONS_ENTRY_TAG[caption_type]): + caption_lang_code = caption_entry.attrib.get('lang_code') + if not caption_lang_code: + continue + caption_format_data = [] + for caption_format in self._caption_formats_ext: + query = { + 'vid': subtitles_id, + 'v': video_id, + 'fmt': caption_format, + 'lang': (caption_lang_code if origin_lang_code is None + else origin_lang_code), + 'type': 'track', + 'name': '', + 'kind': '', + } + if origin_lang_code is not None: + query.update({'tlang': caption_lang_code}) + caption_format_data.append({ + 'url': update_url_query(self._BASE_URL_CAPTIONS, query), + 'ext': caption_format, + }) + captions[caption_lang_code] = caption_format_data + return captions + + def _get_subtitles(self, video_id, subtitles_id, hl): + if not subtitles_id or not hl: + return + self._download_subtitles_xml(video_id, subtitles_id, hl) + if not self._captions_xml: + return + return self._get_captions_by_type(video_id, subtitles_id, 'subtitles') + + def _get_automatic_captions(self, video_id, subtitles_id, hl): + if not subtitles_id or not hl: + return + self._download_subtitles_xml(video_id, subtitles_id, hl) + if not self._captions_xml: + return + track = self._captions_xml.find('track') + if track is None: + return + origin_lang_code = track.attrib.get('lang_code') + if not origin_lang_code: + return + return self._get_captions_by_type( + video_id, subtitles_id, 'automatic_captions', origin_lang_code) + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://docs.google.com/file/d/%s' % video_id, video_id) - reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) - if reason: - raise ExtractorError(reason) - - title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') + title = self._search_regex( + r'"title"\s*,\s*"([^"]+)', webpage, 'title', + default=None) or self._og_search_title(webpage) duration = int_or_none(self._search_regex( - r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) - fmt_stream_map = self._search_regex( - r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',') - fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') - - resolutions = {} - for fmt in fmt_list: - mobj = re.search( - r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt) - if mobj: - resolutions[mobj.group('format_id')] = ( - int(mobj.group('width')), int(mobj.group('height'))) + r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', + default=None)) formats = [] - for fmt_stream in fmt_stream_map: - fmt_stream_split = fmt_stream.split('|') - if len(fmt_stream_split) < 2: - continue - format_id, format_url = fmt_stream_split[:2] - f = { - 'url': lowercase_escape(format_url), - 'format_id': format_id, - 'ext': self._FORMATS_EXT[format_id], - } - resolution = resolutions.get(format_id) - if resolution: - f.update({ - 'width': resolution[0], - 'height': resolution[1], + fmt_stream_map = self._search_regex( + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, + 'fmt stream map', default='').split(',') + fmt_list = self._search_regex( + r'"fmt_list"\s*,\s*"([^"]+)', webpage, + 'fmt_list', default='').split(',') + if fmt_stream_map and fmt_list: + resolutions = {} + for fmt in fmt_list: + mobj = re.search( + r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt) + if mobj: + resolutions[mobj.group('format_id')] = ( + int(mobj.group('width')), int(mobj.group('height'))) + + for fmt_stream in fmt_stream_map: + fmt_stream_split = fmt_stream.split('|') + if len(fmt_stream_split) < 2: + continue + format_id, format_url = fmt_stream_split[:2] + f = { + 'url': lowercase_escape(format_url), + 'format_id': format_id, + 'ext': self._FORMATS_EXT[format_id], + } + resolution = resolutions.get(format_id) + if resolution: + f.update({ + 'width': resolution[0], + 'height': resolution[1], + }) + formats.append(f) + + source_url = update_url_query( + 'https://drive.google.com/uc', { + 'id': video_id, + 'export': 'download', + }) + urlh = self._request_webpage( + source_url, video_id, note='Requesting source file', + errnote='Unable to request source file', fatal=False) + if urlh: + def add_source_format(src_url): + formats.append({ + 'url': src_url, + 'ext': determine_ext(title, 'mp4').lower(), + 'format_id': 'source', + 'quality': 1, }) - formats.append(f) + if urlh.headers.get('Content-Disposition'): + add_source_format(source_url) + else: + confirmation_webpage = self._webpage_read_content( + urlh, url, video_id, note='Downloading confirmation page', + errnote='Unable to confirm download', fatal=False) + if confirmation_webpage: + confirm = self._search_regex( + r'confirm=([^&"\']+)', confirmation_webpage, + 'confirmation code', fatal=False) + if confirm: + add_source_format(update_url_query(source_url, { + 'confirm': confirm, + })) + + if not formats: + reason = self._search_regex( + r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) + if reason: + raise ExtractorError(reason, expected=True) + self._sort_formats(formats) + hl = self._search_regex( + r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None) + subtitles_id = None + ttsurl = self._search_regex( + r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None) + if ttsurl: + # the video Id for subtitles will be the last value in the ttsurl + # query string + subtitles_id = ttsurl.encode('utf-8').decode( + 'unicode_escape').split('=')[-1] + return { 'id': video_id, 'title': title, 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'duration': duration, 'formats': formats, + 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl), + 'automatic_captions': self.extract_automatic_captions( + video_id, subtitles_id, hl), } diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index 427499b11..6b927bb44 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -61,7 +61,7 @@ class GooglePlusIE(InfoExtractor): 'width': int(width), 'height': int(height), } for width, height, video_url in re.findall( - r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent.com.*?)"', webpage)] + r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent\.com.*?)"', webpage)] self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 382f32771..5c03780a3 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -2,10 +2,14 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .kaltura import KalturaIE +from .youtube import YoutubeIE from ..utils import ( determine_ext, int_or_none, + NO_DEFAULT, parse_iso8601, + smuggle_url, xpath_text, ) @@ -13,18 +17,61 @@ from ..utils import ( class HeiseIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?heise\.de/(?:[^/]+/)+[^/]+-(?P<id>[0-9]+)\.html' _TESTS = [{ + # kaltura embed 'url': 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html', - 'md5': 'ffed432483e922e88545ad9f2f15d30e', 'info_dict': { - 'id': '2404147', + 'id': '1_kkrq94sm', 'ext': 'mp4', 'title': "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone", - 'format_id': 'mp4_720p', - 'timestamp': 1411812600, - 'upload_date': '20140927', + 'timestamp': 1512734959, + 'upload_date': '20171208', 'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20', - 'thumbnail': r're:^https?://.*/gallery/$', - } + }, + 'params': { + 'skip_download': True, + }, + }, { + # YouTube embed + 'url': 'http://www.heise.de/newsticker/meldung/Netflix-In-20-Jahren-vom-Videoverleih-zum-TV-Revolutionaer-3814130.html', + 'md5': 'e403d2b43fea8e405e88e3f8623909f1', + 'info_dict': { + 'id': '6kmWbXleKW4', + 'ext': 'mp4', + 'title': 'NEU IM SEPTEMBER | Netflix', + 'description': 'md5:2131f3c7525e540d5fd841de938bd452', + 'upload_date': '20170830', + 'uploader': 'Netflix Deutschland, Österreich und Schweiz', + 'uploader_id': 'netflixdach', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.heise.de/video/artikel/nachgehakt-Wie-sichert-das-c-t-Tool-Restric-tor-Windows-10-ab-3700244.html', + 'info_dict': { + 'id': '1_ntrmio2s', + 'ext': 'mp4', + 'title': "nachgehakt: Wie sichert das c't-Tool Restric'tor Windows 10 ab?", + 'description': 'md5:47e8ffb6c46d85c92c310a512d6db271', + 'timestamp': 1512470717, + 'upload_date': '20171205', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.heise.de/ct/artikel/c-t-uplink-20-8-Staubsaugerroboter-Xiaomi-Vacuum-2-AR-Brille-Meta-2-und-Android-rooten-3959893.html', + 'info_dict': { + 'id': '1_59mk80sf', + 'ext': 'mp4', + 'title': "c't uplink 20.8: Staubsaugerroboter Xiaomi Vacuum 2, AR-Brille Meta 2 und Android rooten", + 'description': 'md5:f50fe044d3371ec73a8f79fcebd74afc', + 'timestamp': 1517567237, + 'upload_date': '20180202', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 'only_matching': True, @@ -40,19 +87,49 @@ class HeiseIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + def extract_title(default=NO_DEFAULT): + title = self._html_search_meta( + ('fulltitle', 'title'), webpage, default=None) + if not title or title == "c't": + title = self._search_regex( + r'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"', + webpage, 'title', default=None) + if not title: + title = self._html_search_regex( + r'<h1[^>]+\bclass=["\']article_page_title[^>]+>(.+?)<', + webpage, 'title', default=default) + return title + + title = extract_title(default=None) + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'description', webpage) + + kaltura_url = KalturaIE._extract_url(webpage) + if kaltura_url: + return { + '_type': 'url_transparent', + 'url': smuggle_url(kaltura_url, {'source_url': url}), + 'ie_key': KalturaIE.ie_key(), + 'title': title, + 'description': description, + } + + yt_urls = YoutubeIE._extract_urls(webpage) + if yt_urls: + return self.playlist_from_matches( + yt_urls, video_id, title, ie=YoutubeIE.ie_key()) + + title = extract_title() + container_id = self._search_regex( r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"', webpage, 'container ID') + sequenz_id = self._search_regex( r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"', webpage, 'sequenz ID') - title = self._html_search_meta('fulltitle', webpage, default=None) - if not title or title == "c't": - title = self._search_regex( - r'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"', - webpage, 'title') - doc = self._download_xml( 'http://www.heise.de/videout/feed', video_id, query={ 'container': container_id, @@ -74,10 +151,6 @@ class HeiseIE(InfoExtractor): }) self._sort_formats(formats) - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'description', webpage) - return { 'id': video_id, 'title': title, diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py new file mode 100644 index 000000000..eee517071 --- /dev/null +++ b/youtube_dl/extractor/hidive.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + urlencode_postdata, +) + + +class HiDiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hidive\.com/stream/(?P<title>[^/]+)/(?P<key>[^/?#&]+)' + # Using X-Forwarded-For results in 403 HTTP error for HLS fragments, + # so disabling geo bypass completely + _GEO_BYPASS = False + + _TESTS = [{ + 'url': 'https://www.hidive.com/stream/the-comic-artist-and-his-assistants/s01e001', + 'info_dict': { + 'id': 'the-comic-artist-and-his-assistants/s01e001', + 'ext': 'mp4', + 'title': 'the-comic-artist-and-his-assistants/s01e001', + 'series': 'the-comic-artist-and-his-assistants', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title, key = mobj.group('title', 'key') + video_id = '%s/%s' % (title, key) + + settings = self._download_json( + 'https://www.hidive.com/play/settings', video_id, + data=urlencode_postdata({ + 'Title': title, + 'Key': key, + })) + + restriction = settings.get('restrictionReason') + if restriction == 'RegionRestricted': + self.raise_geo_restricted() + + if restriction and restriction != 'None': + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, restriction), expected=True) + + formats = [] + subtitles = {} + for rendition_id, rendition in settings['renditions'].items(): + bitrates = rendition.get('bitrates') + if not isinstance(bitrates, dict): + continue + m3u8_url = bitrates.get('hls') + if not isinstance(m3u8_url, compat_str): + continue + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='%s-hls' % rendition_id, fatal=False)) + cc_files = rendition.get('ccFiles') + if not isinstance(cc_files, list): + continue + for cc_file in cc_files: + if not isinstance(cc_file, list) or len(cc_file) < 3: + continue + cc_lang = cc_file[0] + cc_url = cc_file[2] + if not isinstance(cc_lang, compat_str) or not isinstance( + cc_url, compat_str): + continue + subtitles.setdefault(cc_lang, []).append({ + 'url': cc_url, + }) + + season_number = int_or_none(self._search_regex( + r's(\d+)', key, 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'e(\d+)', key, 'episode number', default=None)) + + return { + 'id': video_id, + 'title': video_id, + 'subtitles': subtitles, + 'formats': formats, + 'series': title, + 'season_number': season_number, + 'episode_number': episode_number, + } diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 34163725f..4703e1894 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -1,8 +1,7 @@ from __future__ import unicode_literals -import base64 - from .common import InfoExtractor +from ..compat import compat_b64decode from ..utils import ( ExtractorError, HEADRequest, @@ -48,7 +47,7 @@ class HotNewHipHopIE(InfoExtractor): if 'mediaKey' not in mkd: raise ExtractorError('Did not get a media key') - redirect_url = base64.b64decode(video_url_base64).decode('utf-8') + redirect_url = compat_b64decode(video_url_base64).decode('utf-8') redirect_req = HEADRequest(redirect_url) req = self._request_webpage( redirect_req, video_id, diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index 3a7a66a34..d28af36ec 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -1,22 +1,47 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - ExtractorError, determine_ext, + ExtractorError, int_or_none, ) -class HotStarIE(InfoExtractor): +class HotStarBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['IN'] + + def _download_json(self, *args, **kwargs): + response = super(HotStarBaseIE, self)._download_json(*args, **kwargs) + if response['resultCode'] != 'OK': + if kwargs.get('fatal'): + raise ExtractorError( + response['errorDescription'], expected=True) + return None + return response['resultObj'] + + def _download_content_info(self, content_id): + return self._download_json( + 'https://account.hotstar.com/AVS/besc', content_id, query={ + 'action': 'GetAggregatedContentDetails', + 'appVersion': '5.0.40', + 'channel': 'PCTV', + 'contentId': content_id, + })['contentInfo'][0] + + +class HotStarIE(HotStarBaseIE): _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})' _TESTS = [{ 'url': 'http://www.hotstar.com/on-air-with-aib--english-1000076273', 'info_dict': { 'id': '1000076273', 'ext': 'mp4', - 'title': 'On Air With AIB - English', + 'title': 'On Air With AIB', 'description': 'md5:c957d8868e9bc793ccb813691cc4c434', 'timestamp': 1447227000, 'upload_date': '20151111', @@ -34,23 +59,11 @@ class HotStarIE(InfoExtractor): 'only_matching': True, }] - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', fatal=True, query=None): - json_data = super(HotStarIE, self)._download_json( - url_or_request, video_id, note, fatal=fatal, query=query) - if json_data['resultCode'] != 'OK': - if fatal: - raise ExtractorError(json_data['errorDescription']) - return None - return json_data['resultObj'] - def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://account.hotstar.com/AVS/besc', video_id, query={ - 'action': 'GetAggregatedContentDetails', - 'channel': 'PCTV', - 'contentId': video_id, - })['contentInfo'][0] + + video_data = self._download_content_info(video_id) + title = video_data['episodeTitle'] if video_data.get('encrypted') == 'Y': @@ -99,3 +112,51 @@ class HotStarIE(InfoExtractor): 'episode_number': int_or_none(video_data.get('episodeNumber')), 'series': video_data.get('contentTitle'), } + + +class HotStarPlaylistIE(HotStarBaseIE): + IE_NAME = 'hotstar:playlist' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com/tv/[^/]+/(?P<content_id>\d+))/(?P<type>[^/]+)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.hotstar.com/tv/pratidaan/14982/episodes/14812/9993', + 'info_dict': { + 'id': '14812', + }, + 'playlist_mincount': 75, + }, { + 'url': 'http://www.hotstar.com/tv/pratidaan/14982/popular-clips/9998/9998', + 'only_matching': True, + }] + _ITEM_TYPES = { + 'episodes': 'EPISODE', + 'popular-clips': 'CLIPS', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + base_url = mobj.group('url') + content_id = mobj.group('content_id') + playlist_type = mobj.group('type') + + content_info = self._download_content_info(content_id) + playlist_id = compat_str(content_info['categoryId']) + + collection = self._download_json( + 'https://search.hotstar.com/AVS/besc', playlist_id, query={ + 'action': 'SearchContents', + 'appVersion': '5.0.40', + 'channel': 'PCTV', + 'moreFilters': 'series:%s;' % playlist_id, + 'query': '*', + 'searchOrder': 'last_broadcast_date desc,year desc,title asc', + 'type': self._ITEM_TYPES.get(playlist_type, 'EPISODE'), + }) + + entries = [ + self.url_result( + '%s/_/%s' % (base_url, video['contentId']), + ie=HotStarIE.ie_key(), video_id=video['contentId']) + for video in collection['response']['docs'] + if video.get('contentId')] + + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py index 2be68abad..cf90ab3c9 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/youtube_dl/extractor/howstuffworks.py @@ -11,45 +11,20 @@ from ..utils import ( class HowStuffWorksIE(InfoExtractor): - _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*(?:\d+-)?(?P<id>.+?)-video\.htm' + _VALID_URL = r'https?://[\da-z-]+\.(?:howstuffworks|stuff(?:(?:youshould|theydontwantyouto)know|toblowyourmind|momnevertoldyou)|(?:brain|car)stuffshow|fwthinking|geniusstuff)\.com/(?:[^/]+/)*(?:\d+-)?(?P<id>.+?)-video\.htm' _TESTS = [ { - 'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm', + 'url': 'http://www.stufftoblowyourmind.com/videos/optical-illusions-video.htm', + 'md5': '76646a5acc0c92bf7cd66751ca5db94d', 'info_dict': { - 'id': '450221', - 'ext': 'flv', - 'title': 'Cool Jobs - Iditarod Musher', - 'description': 'Cold sleds, freezing temps and warm dog breath... an Iditarod musher\'s dream. Kasey-Dee Gardner jumps on a sled to find out what the big deal is.', - 'display_id': 'cool-jobs-iditarod-musher', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 161, - }, - 'skip': 'Video broken', - }, - { - 'url': 'http://adventure.howstuffworks.com/7199-survival-zone-food-and-water-in-the-savanna-video.htm', - 'info_dict': { - 'id': '453464', + 'id': '855410', 'ext': 'mp4', - 'title': 'Survival Zone: Food and Water In the Savanna', - 'description': 'Learn how to find both food and water while trekking in the African savannah. In this video from the Discovery Channel.', - 'display_id': 'survival-zone-food-and-water-in-the-savanna', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Your Trickster Brain: Optical Illusions -- Science on the Web', + 'description': 'md5:e374ff9561f6833ad076a8cc0a5ab2fb', }, }, { - 'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm', - 'info_dict': { - 'id': '440011', - 'ext': 'mp4', - 'title': 'Sword Swallowing #1 by Dan Meyer', - 'description': 'Video footage (1 of 3) used by permission of the owner Dan Meyer through Sword Swallowers Association International <www.swordswallow.org>', - 'display_id': 'sword-swallowing-1-by-dan-meyer', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, - { - 'url': 'http://shows.howstuffworks.com/stuff-to-blow-your-mind/optical-illusions-video.htm', + 'url': 'http://shows.howstuffworks.com/more-shows/why-does-balloon-stick-to-hair-video.htm', 'only_matching': True, } ] diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py index 656ce6d05..6424d34ac 100644 --- a/youtube_dl/extractor/hrti.py +++ b/youtube_dl/extractor/hrti.py @@ -104,7 +104,7 @@ class HRTiIE(HRTiBaseIE): (?: hrti:(?P<short_id>[0-9]+)| https?:// - hrti\.hrt\.hr/\#/video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)? + hrti\.hrt\.hr/(?:\#/)?video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)? ) ''' _TESTS = [{ @@ -129,6 +129,9 @@ class HRTiIE(HRTiBaseIE): }, { 'url': 'hrti:2181385', 'only_matching': True, + }, { + 'url': 'https://hrti.hrt.hr/video/show/3873068/cuvar-dvorca-dramska-serija-14', + 'only_matching': True, }] def _real_extract(self, url): @@ -170,7 +173,7 @@ class HRTiIE(HRTiBaseIE): class HRTiPlaylistIE(HRTiBaseIE): - _VALID_URL = r'https?://hrti.hrt.hr/#/video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?' + _VALID_URL = r'https?://hrti\.hrt\.hr/(?:#/)?video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?' _TESTS = [{ 'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena', 'info_dict': { @@ -182,6 +185,9 @@ class HRTiPlaylistIE(HRTiBaseIE): }, { 'url': 'https://hrti.hrt.hr/#/video/list/category/212/', 'only_matching': True, + }, { + 'url': 'https://hrti.hrt.hr/video/list/category/212/ekumena', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index c1367cf51..a96ea8010 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -203,7 +203,7 @@ class PCMagIE(IGNIE): _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?P<type>videos|article2)(/.+)?/(?P<name_or_id>.+)' IE_NAME = 'pcmag' - _EMBED_RE = r'iframe.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content.html?[^"]*url=([^"]+)["&]' + _EMBED_RE = r'iframe\.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content\.html?[^"]*url=([^"]+)["&]' _TESTS = [{ 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 3ff672a89..425421968 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -3,7 +3,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + determine_ext, mimetype2ext, qualities, remove_end, @@ -73,19 +75,25 @@ class ImdbIE(InfoExtractor): video_info_list = format_info.get('videoInfoList') if not video_info_list or not isinstance(video_info_list, list): continue - video_info = video_info_list[0] - if not video_info or not isinstance(video_info, dict): - continue - video_url = video_info.get('videoUrl') - if not video_url: - continue - format_id = format_info.get('ffname') - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': mimetype2ext(video_info.get('videoMimeType')), - 'quality': quality(format_id), - }) + for video_info in video_info_list: + if not video_info or not isinstance(video_info, dict): + continue + video_url = video_info.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue + if (video_info.get('videoMimeType') == 'application/x-mpegURL' or + determine_ext(video_url) == 'm3u8'): + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + format_id = format_info.get('ffname') + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': mimetype2ext(video_info.get('videoMimeType')), + 'quality': quality(format_id), + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index fe425e786..391c2f5d0 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -2,13 +2,15 @@ from __future__ import unicode_literals -import base64 - from ..compat import ( + compat_b64decode, compat_urllib_parse_unquote, compat_urlparse, ) -from ..utils import determine_ext +from ..utils import ( + determine_ext, + update_url_query, +) from .bokecc import BokeCCBaseIE @@ -58,7 +60,7 @@ class InfoQIE(BokeCCBaseIE): encoded_id = self._search_regex( r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id', default=None) - real_id = compat_urllib_parse_unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')) + real_id = compat_urllib_parse_unquote(compat_b64decode(encoded_id).decode('utf-8')) playpath = 'mp4:' + real_id return [{ @@ -68,21 +70,22 @@ class InfoQIE(BokeCCBaseIE): 'play_path': playpath, }] - def _extract_cookies(self, webpage): - policy = self._search_regex(r'InfoQConstants.scp\s*=\s*\'([^\']+)\'', webpage, 'policy') - signature = self._search_regex(r'InfoQConstants.scs\s*=\s*\'([^\']+)\'', webpage, 'signature') - key_pair_id = self._search_regex(r'InfoQConstants.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id') - return 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( - policy, signature, key_pair_id) + def _extract_cf_auth(self, webpage): + policy = self._search_regex(r'InfoQConstants\.scp\s*=\s*\'([^\']+)\'', webpage, 'policy') + signature = self._search_regex(r'InfoQConstants\.scs\s*=\s*\'([^\']+)\'', webpage, 'signature') + key_pair_id = self._search_regex(r'InfoQConstants\.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id') + return { + 'Policy': policy, + 'Signature': signature, + 'Key-Pair-Id': key_pair_id, + } def _extract_http_video(self, webpage): http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') + http_video_url = update_url_query(http_video_url, self._extract_cf_auth(webpage)) return [{ 'format_id': 'http_video', 'url': http_video_url, - 'http_headers': { - 'Cookie': self._extract_cookies(webpage) - }, }] def _extract_http_audio(self, webpage, video_id): @@ -91,22 +94,20 @@ class InfoQIE(BokeCCBaseIE): if not http_audio_url: return [] - cookies_header = {'Cookie': self._extract_cookies(webpage)} - # base URL is found in the Location header in the response returned by # GET https://www.infoq.com/mp3download.action?filename=... when logged in. http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) + http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage)) # audio file seem to be missing some times even if there is a download link # so probe URL to make sure - if not self._is_valid_url(http_audio_url, video_id, headers=cookies_header): + if not self._is_valid_url(http_audio_url, video_id): return [] return [{ 'format_id': 'http_audio', 'url': http_audio_url, 'vcodec': 'none', - 'http_headers': cookies_header, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 4667335e0..0c13f54ee 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -1,14 +1,21 @@ from __future__ import unicode_literals +import itertools +import hashlib +import json import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_HTTPError, +) from ..utils import ( + ExtractorError, get_element_by_attribute, int_or_none, - limit_length, lowercase_escape, + std_headers, try_get, ) @@ -130,13 +137,21 @@ class InstagramIE(InfoExtractor): video_url = media.get('video_url') height = int_or_none(media.get('dimensions', {}).get('height')) width = int_or_none(media.get('dimensions', {}).get('width')) - description = media.get('caption') + description = try_get( + media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) or media.get('caption') thumbnail = media.get('display_src') - timestamp = int_or_none(media.get('date')) + timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') - like_count = int_or_none(media.get('likes', {}).get('count')) - comment_count = int_or_none(media.get('comments', {}).get('count')) + + def get_count(key, kind): + return int_or_none(try_get( + media, (lambda x: x['edge_media_%s' % key]['count'], + lambda x: x['%ss' % kind]['count']))) + like_count = get_count('preview_like', 'like') + comment_count = get_count('to_comment', 'comment') + comments = [{ 'author': comment.get('user', {}).get('username'), 'author_id': comment.get('user', {}).get('id'), @@ -212,7 +227,7 @@ class InstagramIE(InfoExtractor): class InstagramUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' _TEST = { @@ -221,82 +236,125 @@ class InstagramUserIE(InfoExtractor): 'id': 'porsche', 'title': 'porsche', }, - 'playlist_mincount': 2, - 'playlist': [{ - 'info_dict': { - 'id': '614605558512799803_462752227', - 'ext': 'mp4', - 'title': '#Porsche Intelligent Performance.', - 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'Porsche', - 'uploader_id': 'porsche', - 'timestamp': 1387486713, - 'upload_date': '20131219', - }, - }], + 'playlist_count': 5, 'params': { 'extract_flat': True, 'skip_download': True, + 'playlistend': 5, } } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader_id = mobj.group('username') + _gis_tmpl = None - entries = [] - page_count = 0 - media_url = 'http://instagram.com/%s/media' % uploader_id - while True: - page = self._download_json( - media_url, uploader_id, - note='Downloading page %d ' % (page_count + 1), - ) - page_count += 1 + def _entries(self, data): + def get_count(suffix): + return int_or_none(try_get( + node, lambda x: x['edge_media_' + suffix]['count'])) - for it in page['items']: - if it.get('type') != 'video': + uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + csrf_token = data['config']['csrf_token'] + rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' + + self._set_cookie('instagram.com', 'ig_pr', '1') + + cursor = '' + for page_num in itertools.count(1): + variables = json.dumps({ + 'id': uploader_id, + 'first': 12, + 'after': cursor, + }) + + if self._gis_tmpl: + gis_tmpls = [self._gis_tmpl] + else: + gis_tmpls = [ + '%s' % rhx_gis, + '', + '%s:%s' % (rhx_gis, csrf_token), + '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']), + ] + + for gis_tmpl in gis_tmpls: + try: + media = self._download_json( + 'https://www.instagram.com/graphql/query/', uploader_id, + 'Downloading JSON page %d' % page_num, headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-Instagram-GIS': hashlib.md5( + ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(), + }, query={ + 'query_hash': '42323d64886122307be10013ad2dcc44', + 'variables': variables, + })['data']['user']['edge_owner_to_timeline_media'] + self._gis_tmpl = gis_tmpl + break + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if gis_tmpl != gis_tmpls[-1]: + continue + raise + + edges = media.get('edges') + if not edges or not isinstance(edges, list): + break + + for edge in edges: + node = edge.get('node') + if not node or not isinstance(node, dict): + continue + if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: + continue + video_id = node.get('shortcode') + if not video_id: continue - like_count = int_or_none(it.get('likes', {}).get('count')) - user = it.get('user', {}) - formats = [{ - 'format_id': k, - 'height': v.get('height'), - 'width': v.get('width'), - 'url': v['url'], - } for k, v in it['videos'].items()] - self._sort_formats(formats) + info = self.url_result( + 'https://instagram.com/p/%s/' % video_id, + ie=InstagramIE.ie_key(), video_id=video_id) - thumbnails_el = it.get('images', {}) - thumbnail = thumbnails_el.get('thumbnail', {}).get('url') + description = try_get( + node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) + thumbnail = node.get('thumbnail_src') or node.get('display_src') + timestamp = int_or_none(node.get('taken_at_timestamp')) - # In some cases caption is null, which corresponds to None - # in python. As a result, it.get('caption', {}) gives None - title = (it.get('caption') or {}).get('text', it['id']) + comment_count = get_count('to_comment') + like_count = get_count('preview_like') + view_count = int_or_none(node.get('video_view_count')) - entries.append({ - 'id': it['id'], - 'title': limit_length(title, 80), - 'formats': formats, + info.update({ + 'description': description, 'thumbnail': thumbnail, - 'webpage_url': it.get('link'), - 'uploader': user.get('full_name'), - 'uploader_id': user.get('username'), + 'timestamp': timestamp, + 'comment_count': comment_count, 'like_count': like_count, - 'timestamp': int_or_none(it.get('created_time')), + 'view_count': view_count, }) - if not page['items']: - break - max_id = page['items'][-1]['id'].split('_')[0] - media_url = ( - 'http://instagram.com/%s/media?max_id=%s' % ( - uploader_id, max_id)) + yield info - return { - '_type': 'playlist', - 'entries': entries, - 'id': uploader_id, - 'title': uploader_id, - } + page_info = media.get('page_info') + if not page_info or not isinstance(page_info, dict): + break + + has_next_page = page_info.get('has_next_page') + if not has_next_page: + break + + cursor = page_info.get('end_cursor') + if not cursor or not isinstance(cursor, compat_str): + break + + def _real_extract(self, url): + username = self._match_id(url) + + webpage = self._download_webpage(url, username) + + data = self._parse_json( + self._search_regex( + r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), + username) + + return self.playlist_result( + self._entries(data), username, username) diff --git a/youtube_dl/extractor/internazionale.py b/youtube_dl/extractor/internazionale.py new file mode 100644 index 000000000..10ba1f6cf --- /dev/null +++ b/youtube_dl/extractor/internazionale.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_timestamp + + +class InternazionaleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?internazionale\.it/video/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.internazionale.it/video/2015/02/19/richard-linklater-racconta-una-scena-di-boyhood', + 'md5': '3e39d32b66882c1218e305acbf8348ca', + 'info_dict': { + 'id': '265968', + 'display_id': 'richard-linklater-racconta-una-scena-di-boyhood', + 'ext': 'mp4', + 'title': 'Richard Linklater racconta una scena di Boyhood', + 'description': 'md5:efb7e5bbfb1a54ae2ed5a4a015f0e665', + 'timestamp': 1424354635, + 'upload_date': '20150219', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'format': 'bestvideo', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + DATA_RE = r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' + + title = self._search_regex( + DATA_RE % 'video-title', webpage, 'title', default=None, + group='value') or self._og_search_title(webpage) + + video_id = self._search_regex( + DATA_RE % 'job-id', webpage, 'video id', group='value') + video_path = self._search_regex( + DATA_RE % 'video-path', webpage, 'video path', group='value') + + video_base = 'https://video.internazionale.it/%s/%s.' % (video_path, video_id) + + formats = self._extract_m3u8_formats( + video_base + 'm3u8', display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + video_base + 'mpd', display_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + timestamp = unified_timestamp(self._html_search_meta( + 'article:published_time', webpage, 'timestamp')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 26c48e4b8..18a7d7f8c 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import uuid import xml.etree.ElementTree as etree import json +import re from .common import InfoExtractor from ..compat import ( @@ -25,7 +26,7 @@ from ..utils import ( class ITVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)' _GEO_COUNTRIES = ['GB'] - _TEST = { + _TESTS = [{ 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', 'info_dict': { 'id': '2a2936a0053', @@ -36,7 +37,11 @@ class ITVIE(InfoExtractor): # rtmp download 'skip_download': True, }, - } + }, { + # unavailable via data-playlist-url + 'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -100,6 +105,18 @@ class ITVIE(InfoExtractor): 'Content-Type': 'text/xml; charset=utf-8', 'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist', }) + + info = self._search_json_ld(webpage, video_id, default={}) + formats = [] + subtitles = {} + + def extract_subtitle(sub_url): + ext = determine_ext(sub_url, 'ttml') + subtitles.setdefault('en', []).append({ + 'url': sub_url, + 'ext': 'ttml' if ext == 'xml' else ext, + }) + resp_env = self._download_xml( params['data-playlist-url'], video_id, headers=headers, data=etree.tostring(req_env)) @@ -110,41 +127,59 @@ class ITVIE(InfoExtractor): if fault_code == 'InvalidGeoRegion': self.raise_geo_restricted( msg=fault_string, countries=self._GEO_COUNTRIES) - raise ExtractorError('%s said: %s' % (self.IE_NAME, fault_string)) - title = xpath_text(playlist, 'EpisodeTitle', fatal=True) - video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) - media_files = xpath_element(video_element, 'MediaFiles', fatal=True) - rtmp_url = media_files.attrib['base'] + elif fault_code != 'InvalidEntity': + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, fault_string), expected=True) + info.update({ + 'title': self._og_search_title(webpage), + 'episode_title': params.get('data-video-episode'), + 'series': params.get('data-video-title'), + }) + else: + title = xpath_text(playlist, 'EpisodeTitle', default=None) + info.update({ + 'title': title, + 'episode_title': title, + 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), + 'series': xpath_text(playlist, 'ProgrammeTitle'), + 'duration': parse_duration(xpath_text(playlist, 'Duration')), + }) + video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) + media_files = xpath_element(video_element, 'MediaFiles', fatal=True) + rtmp_url = media_files.attrib['base'] - formats = [] - for media_file in media_files.findall('MediaFile'): - play_path = xpath_text(media_file, 'URL') - if not play_path: - continue - tbr = int_or_none(media_file.get('bitrate'), 1000) - f = { - 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'play_path': play_path, - # Providing this swfVfy allows to avoid truncated downloads - 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', - 'page_url': url, - 'tbr': tbr, - 'ext': 'flv', - } - app = self._search_regex( - 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) - if app: - f.update({ - 'url': rtmp_url.split('?', 1)[0], - 'app': app, - }) - else: - f['url'] = rtmp_url - formats.append(f) + for media_file in media_files.findall('MediaFile'): + play_path = xpath_text(media_file, 'URL') + if not play_path: + continue + tbr = int_or_none(media_file.get('bitrate'), 1000) + f = { + 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), + 'play_path': play_path, + # Providing this swfVfy allows to avoid truncated downloads + 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', + 'page_url': url, + 'tbr': tbr, + 'ext': 'flv', + } + app = self._search_regex( + 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) + if app: + f.update({ + 'url': rtmp_url.split('?', 1)[0], + 'app': app, + }) + else: + f['url'] = rtmp_url + formats.append(f) - ios_playlist_url = params.get('data-video-playlist') + for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): + if caption_url.text: + extract_subtitle(caption_url.text) + + ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') hmac = params.get('data-video-hmac') - if ios_playlist_url and hmac: + if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url): headers = self.geo_verification_headers() headers.update({ 'Accept': 'application/vnd.itv.vod.playlist.v2+json', @@ -159,12 +194,12 @@ class ITVIE(InfoExtractor): 'token': '' }, 'device': { - 'manufacturer': 'Apple', - 'model': 'iPad', + 'manufacturer': 'Safari', + 'model': '5', 'os': { - 'name': 'iPhone OS', - 'version': '9.3', - 'type': 'ios' + 'name': 'Windows NT', + 'version': '6.1', + 'type': 'desktop' } }, 'client': { @@ -173,10 +208,10 @@ class ITVIE(InfoExtractor): }, 'variantAvailability': { 'featureset': { - 'min': ['hls', 'aes'], - 'max': ['hls', 'aes'] + 'min': ['hls', 'aes', 'outband-webvtt'], + 'max': ['hls', 'aes', 'outband-webvtt'] }, - 'platformTag': 'mobile' + 'platformTag': 'dotcom' } }).encode(), headers=headers, fatal=False) if ios_playlist: @@ -197,27 +232,22 @@ class ITVIE(InfoExtractor): formats.append({ 'url': href, }) + subs = video_data.get('Subtitles') + if isinstance(subs, list): + for sub in subs: + if not isinstance(sub, dict): + continue + href = sub.get('Href') + if isinstance(href, compat_str): + extract_subtitle(href) + if not info.get('duration'): + info['duration'] = parse_duration(video_data.get('Duration')) + self._sort_formats(formats) - subtitles = {} - for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): - if not caption_url.text: - continue - ext = determine_ext(caption_url.text, 'ttml') - subtitles.setdefault('en', []).append({ - 'url': caption_url.text, - 'ext': 'ttml' if ext == 'xml' else ext, - }) - - info = self._search_json_ld(webpage, video_id, default={}) info.update({ 'id': video_id, - 'title': title, 'formats': formats, 'subtitles': subtitles, - 'episode_title': title, - 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), - 'series': xpath_text(playlist, 'ProgrammeTitle'), - 'duartion': parse_duration(xpath_text(playlist, 'Duration')), }) return info diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 1a4227f6b..e9f4ed738 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -30,7 +30,7 @@ class JeuxVideoIE(InfoExtractor): webpage = self._download_webpage(url, title) title = self._html_search_meta('name', webpage) or self._og_search_title(webpage) config_url = self._html_search_regex( - r'data-src(?:set-video)?="(/contenu/medias/video.php.*?)"', + r'data-src(?:set-video)?="(/contenu/medias/video\.php.*?)"', webpage, 'config URL') config_url = 'http://www.jeuxvideo.com' + config_url diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py old mode 100755 new mode 100644 diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 33d55f770..63d0dc998 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -23,11 +23,14 @@ class JWPlatformIE(InfoExtractor): @staticmethod def _extract_url(webpage): - mobj = re.search( - r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', + urls = JWPlatformIE._extract_urls(webpage) + return urls[0] if urls else None + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//content\.jwplatform\.com/players/[a-zA-Z0-9]{8})', webpage) - if mobj: - return mobj.group('url') def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py new file mode 100644 index 000000000..7fa140b0c --- /dev/null +++ b/youtube_dl/extractor/kakao.py @@ -0,0 +1,149 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + unified_timestamp, + update_url_query, +) + + +class KakaoIE(InfoExtractor): + _VALID_URL = r'https?://tv\.kakao\.com/channel/(?P<channel>\d+)/cliplink/(?P<id>\d+)' + _API_BASE = 'http://tv.kakao.com/api/v1/ft/cliplinks' + + _TESTS = [{ + 'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083', + 'md5': '702b2fbdeb51ad82f5c904e8c0766340', + 'info_dict': { + 'id': '301965083', + 'ext': 'mp4', + 'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』', + 'uploader_id': 2671005, + 'uploader': '그랑그랑이', + 'timestamp': 1488160199, + 'upload_date': '20170227', + } + }, { + 'url': 'http://tv.kakao.com/channel/2653210/cliplink/300103180', + 'md5': 'a8917742069a4dd442516b86e7d66529', + 'info_dict': { + 'id': '300103180', + 'ext': 'mp4', + 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', + 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', + 'uploader_id': 2653210, + 'uploader': '쇼 음악중심', + 'timestamp': 1485684628, + 'upload_date': '20170129', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + player_header = { + 'Referer': update_url_query( + 'http://tv.kakao.com/embed/player/cliplink/%s' % video_id, { + 'service': 'kakao_tv', + 'autoplay': '1', + 'profile': 'HIGH', + 'wmode': 'transparent', + }) + } + + QUERY_COMMON = { + 'player': 'monet_html5', + 'referer': url, + 'uuid': '', + 'service': 'kakao_tv', + 'section': '', + 'dteType': 'PC', + } + + query = QUERY_COMMON.copy() + query['fields'] = 'clipLink,clip,channel,hasPlusFriend,-service,-tagList' + impress = self._download_json( + '%s/%s/impress' % (self._API_BASE, video_id), + video_id, 'Downloading video info', + query=query, headers=player_header) + + clip_link = impress['clipLink'] + clip = clip_link['clip'] + + title = clip.get('title') or clip_link.get('displayTitle') + + tid = impress.get('tid', '') + + query = QUERY_COMMON.copy() + query.update({ + 'tid': tid, + 'profile': 'HIGH', + }) + raw = self._download_json( + '%s/%s/raw' % (self._API_BASE, video_id), + video_id, 'Downloading video formats info', + query=query, headers=player_header) + + formats = [] + for fmt in raw.get('outputList', []): + try: + profile_name = fmt['profile'] + fmt_url_json = self._download_json( + '%s/%s/raw/videolocation' % (self._API_BASE, video_id), + video_id, + 'Downloading video URL for profile %s' % profile_name, + query={ + 'service': 'kakao_tv', + 'section': '', + 'tid': tid, + 'profile': profile_name + }, headers=player_header, fatal=False) + + if fmt_url_json is None: + continue + + fmt_url = fmt_url_json['url'] + formats.append({ + 'url': fmt_url, + 'format_id': profile_name, + 'width': int_or_none(fmt.get('width')), + 'height': int_or_none(fmt.get('height')), + 'format_note': fmt.get('label'), + 'filesize': int_or_none(fmt.get('filesize')) + }) + except KeyError: + pass + self._sort_formats(formats) + + thumbs = [] + for thumb in clip.get('clipChapterThumbnailList', []): + thumbs.append({ + 'url': thumb.get('thumbnailUrl'), + 'id': compat_str(thumb.get('timeInSec')), + 'preference': -1 if thumb.get('isDefault') else 0 + }) + top_thumbnail = clip.get('thumbnailUrl') + if top_thumbnail: + thumbs.append({ + 'url': top_thumbnail, + 'preference': 10, + }) + + return { + 'id': video_id, + 'title': title, + 'description': clip.get('description'), + 'uploader': clip_link.get('channel', {}).get('name'), + 'uploader_id': clip_link.get('channelId'), + 'thumbnails': thumbs, + 'timestamp': unified_timestamp(clip_link.get('createTime')), + 'duration': int_or_none(clip.get('duration')), + 'view_count': int_or_none(clip.get('playCount')), + 'like_count': int_or_none(clip.get('likeCount')), + 'comment_count': int_or_none(clip.get('commentCount')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 138d4844d..04f68fce4 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -125,17 +125,21 @@ class KalturaIE(InfoExtractor): (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)* (?P=q1).*? (?: - entry_?[Ii]d| - (?P<q2>["'])entry_?[Ii]d(?P=q2) - )\s*:\s* + (?: + entry_?[Ii]d| + (?P<q2>["'])entry_?[Ii]d(?P=q2) + )\s*:\s*| + \[\s*(?P<q2_1>["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* + ) (?P<q3>["'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3) ''', webpage) or re.search( r'''(?xs) - <iframe[^>]+src=(?P<q1>["']) - (?:https?:)?//(?:www\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) + <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["']) + (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) + (?:(?!(?P=q1)).)* + [?&;]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+) (?:(?!(?P=q1)).)* - [?&]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+) (?P=q1) ''', webpage) ) @@ -287,6 +291,9 @@ class KalturaIE(InfoExtractor): # skip for now. if f.get('fileExt') == 'chun': continue + # DRM-protected video, cannot be decrypted + if f.get('fileExt') == 'wvm': + continue if not f.get('fileExt'): # QT indicates QuickTime; some videos have broken fileExt if f.get('containerFormat') == 'qt': diff --git a/youtube_dl/extractor/kamcord.py b/youtube_dl/extractor/kamcord.py deleted file mode 100644 index b50120d98..000000000 --- a/youtube_dl/extractor/kamcord.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - qualities, -) - - -class KamcordIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kamcord\.com/v/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'https://www.kamcord.com/v/hNYRduDgWb4', - 'md5': 'c3180e8a9cfac2e86e1b88cb8751b54c', - 'info_dict': { - 'id': 'hNYRduDgWb4', - 'ext': 'mp4', - 'title': 'Drinking Madness', - 'uploader': 'jacksfilms', - 'uploader_id': '3044562', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video = self._parse_json( - self._search_regex( - r'window\.__props\s*=\s*({.+?});?(?:\n|\s*</script)', - webpage, 'video'), - video_id)['video'] - - title = video['title'] - - formats = self._extract_m3u8_formats( - video['play']['hls'], video_id, 'mp4', entry_protocol='m3u8_native') - self._sort_formats(formats) - - uploader = video.get('user', {}).get('username') - uploader_id = video.get('user', {}).get('id') - - view_count = int_or_none(video.get('viewCount')) - like_count = int_or_none(video.get('heartCount')) - comment_count = int_or_none(video.get('messageCount')) - - preference_key = qualities(('small', 'medium', 'large')) - - thumbnails = [{ - 'url': thumbnail_url, - 'id': thumbnail_id, - 'preference': preference_key(thumbnail_id), - } for thumbnail_id, thumbnail_url in (video.get('thumbnail') or {}).items() - if isinstance(thumbnail_id, compat_str) and isinstance(thumbnail_url, compat_str)] - - return { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'like_count': like_count, - 'comment_count': comment_count, - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index e83115e2a..d4e6f7ac1 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -20,23 +20,23 @@ from ..utils import ( class KeezMoviesIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', - 'md5': '1c1e75d22ffa53320f45eeb07bc4cdc0', + 'url': 'https://www.keezmovies.com/video/arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money-18070681', + 'md5': '2ac69cdb882055f71d82db4311732a1a', 'info_dict': { - 'id': '1214711', - 'display_id': 'petite-asian-lady-mai-playing-in-bathtub', + 'id': '18070681', + 'display_id': 'arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money', 'ext': 'mp4', - 'title': 'Petite Asian Lady Mai Playing In Bathtub', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Arab wife want it so bad I see she thirsty and has tiny money.', + 'thumbnail': None, 'view_count': int, 'age_limit': 18, } }, { - 'url': 'http://www.keezmovies.com/video/1214711', + 'url': 'http://www.keezmovies.com/video/18070681', 'only_matching': True, }] - def _extract_info(self, url): + def _extract_info(self, url, fatal=True): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = (mobj.group('display_id') @@ -55,7 +55,7 @@ class KeezMoviesIE(InfoExtractor): encrypted = False def extract_format(format_url, height=None): - if not isinstance(format_url, compat_str) or not format_url.startswith('http'): + if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//')): return if format_url in format_urls: return @@ -105,7 +105,11 @@ class KeezMoviesIE(InfoExtractor): raise ExtractorError( 'Video %s is no longer available' % video_id, expected=True) - self._sort_formats(formats) + try: + self._sort_formats(formats) + except ExtractorError: + if fatal: + raise if not title: title = self._html_search_regex( @@ -122,7 +126,9 @@ class KeezMoviesIE(InfoExtractor): } def _real_extract(self, url): - webpage, info = self._extract_info(url) + webpage, info = self._extract_info(url, fatal=False) + if not info['formats']: + return self.url_result(url, 'Generic') info['view_count'] = str_to_int(self._search_regex( r'<b>([\d,.]+)</b> Views?', webpage, 'view count', fatal=False)) return info diff --git a/youtube_dl/extractor/ketnet.py b/youtube_dl/extractor/ketnet.py index fb9c2dbd4..93a98e1e0 100644 --- a/youtube_dl/extractor/ketnet.py +++ b/youtube_dl/extractor/ketnet.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +from .canvas import CanvasIE from .common import InfoExtractor @@ -7,7 +8,7 @@ class KetnetIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.ketnet.be/kijken/zomerse-filmpjes', - 'md5': 'd907f7b1814ef0fa285c0475d9994ed7', + 'md5': '6bdeb65998930251bbd1c510750edba9', 'info_dict': { 'id': 'zomerse-filmpjes', 'ext': 'mp4', @@ -15,6 +16,20 @@ class KetnetIE(InfoExtractor): 'description': 'Gluur mee met Ghost Rockers op de filmset', 'thumbnail': r're:^https?://.*\.jpg$', } + }, { + # mzid in playerConfig instead of sources + 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-greystook', + 'md5': '90139b746a0a9bd7bb631283f6e2a64e', + 'info_dict': { + 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'ext': 'flv', + 'title': 'Nachtwacht: De Greystook', + 'description': 'md5:1db3f5dc4c7109c821261e7512975be7', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1468.03, + }, + 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], }, { 'url': 'https://www.ketnet.be/kijken/karrewiet/uitzending-8-september-2016', 'only_matching': True, @@ -38,6 +53,12 @@ class KetnetIE(InfoExtractor): 'player config'), video_id) + mzid = config.get('mzid') + if mzid: + return self.url_result( + 'https://mediazone.vrt.be/api/v1/ketnet/assets/%s' % mzid, + CanvasIE.ie_key(), video_id=mzid) + title = config['title'] formats = [] diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py index da5a5de4a..6373268c4 100644 --- a/youtube_dl/extractor/la7.py +++ b/youtube_dl/extractor/la7.py @@ -49,7 +49,9 @@ class LA7IE(InfoExtractor): webpage = self._download_webpage(url, video_id) player_data = self._parse_json( - self._search_regex(r'videoLa7\(({[^;]+})\);', webpage, 'player data'), + self._search_regex( + [r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'], + webpage, 'player data'), video_id, transform_source=js_to_json) return { diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 1f91ba017..c7f813370 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -215,3 +215,21 @@ class Laola1TvIE(Laola1TvEmbedIE): 'formats': formats, 'is_live': is_live, } + + +class ITTFIE(InfoExtractor): + _VALID_URL = r'https?://tv\.ittf\.com/video/[^/]+/(?P<id>\d+)' + _TEST = { + 'url': 'https://tv.ittf.com/video/peng-wang-wei-matsudaira-kenta/951802', + 'only_matching': True, + } + + def _real_extract(self, url): + return self.url_result( + update_url_query('https://www.laola1.tv/titanplayer.php', { + 'videoid': self._match_id(url), + 'type': 'V', + 'lang': 'en', + 'portal': 'int', + 'customer': 1024, + }), Laola1TvEmbedIE.ie_key()) diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 0a07c1320..ffe10154b 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import datetime import hashlib import re @@ -9,6 +8,7 @@ import time from .common import InfoExtractor from ..compat import ( + compat_b64decode, compat_ord, compat_str, compat_urllib_parse_urlencode, @@ -329,7 +329,7 @@ class LetvCloudIE(InfoExtractor): raise ExtractorError('Letv cloud returned an unknwon error') def b64decode(s): - return base64.b64decode(s.encode('utf-8')).decode('utf-8') + return compat_b64decode(s).decode('utf-8') formats = [] for media in play_json['data']['video_info']['media'].values(): diff --git a/youtube_dl/extractor/lenta.py b/youtube_dl/extractor/lenta.py new file mode 100644 index 000000000..2ebd4e577 --- /dev/null +++ b/youtube_dl/extractor/lenta.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class LentaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lenta\.ru/[^/]+/\d+/\d+/\d+/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://lenta.ru/news/2018/03/22/savshenko_go/', + 'info_dict': { + 'id': '964400', + 'ext': 'mp4', + 'title': 'Надежду Савченко задержали', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 61, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + # EaglePlatform iframe embed + 'url': 'http://lenta.ru/news/2015/03/06/navalny/', + 'info_dict': { + 'id': '227304', + 'ext': 'mp4', + 'title': 'Навальный вышел на свободу', + 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 87, + 'view_count': int, + 'age_limit': 0, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'vid\s*:\s*["\']?(\d+)', webpage, 'eagleplatform id', + default=None) + if video_id: + return self.url_result( + 'eagleplatform:lentaru.media.eagleplatform.com:%s' % video_id, + ie='EaglePlatform', video_id=video_id) + + return self.url_result(url, ie='Generic') diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index 4750b03a3..f7311f483 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -1,24 +1,28 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + parse_duration, + unified_strdate, +) class LibsynIE(InfoExtractor): _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))' _TESTS = [{ - 'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/', - 'md5': '443360ee1b58007bc3dcf09b41d093bb', + 'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/', + 'md5': '2a55e75496c790cdeb058e7e6c087746', 'info_dict': { - 'id': '3377616', + 'id': '6385796', 'ext': 'mp3', - 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", - 'description': 'md5:601cb790edd05908957dae8aaa866465', - 'upload_date': '20150220', + 'title': "Champion Minded - Developing a Growth Mindset", + 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.', + 'upload_date': '20180320', 'thumbnail': 're:^https?://.*', }, }, { @@ -39,31 +43,45 @@ class LibsynIE(InfoExtractor): url = m.group('mainurl') webpage = self._download_webpage(url, video_id) - formats = [{ - 'url': media_url, - } for media_url in set(re.findall(r'var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))] - podcast_title = self._search_regex( - r'<h2>([^<]+)</h2>', webpage, 'podcast title', default=None) + r'<h3>([^<]+)</h3>', webpage, 'podcast title', default=None) + if podcast_title: + podcast_title = podcast_title.strip() episode_title = self._search_regex( - r'(?:<div class="episode-title">|<h3>)([^<]+)</', webpage, 'episode title') + r'(?:<div class="episode-title">|<h4>)([^<]+)</', webpage, 'episode title') + if episode_title: + episode_title = episode_title.strip() title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title description = self._html_search_regex( - r'<div id="info_text_body">(.+?)</div>', webpage, + r'<p\s+id="info_text_body">(.+?)</p>', webpage, 'description', default=None) - thumbnail = self._search_regex( - r'<img[^>]+class="info-show-icon"[^>]+src="([^"]+)"', - webpage, 'thumbnail', fatal=False) + if description: + # Strip non-breaking and normal spaces + description = description.replace('\u00A0', ' ').strip() release_date = unified_strdate(self._search_regex( r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False)) + data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block') + data = json.loads(data_json) + + formats = [{ + 'url': data['media_url'], + 'format_id': 'main', + }, { + 'url': data['media_url_libsyn'], + 'format_id': 'libsyn', + }] + thumbnail = data.get('thumbnail_url') + duration = parse_duration(data.get('duration')) + return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': release_date, + 'duration': duration, 'formats': formats, } diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 0a5a3956c..2803d7e8d 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -10,6 +10,7 @@ from ..utils import ( float_or_none, int_or_none, smuggle_url, + try_get, unsmuggle_url, ExtractorError, ) @@ -26,14 +27,16 @@ class LimelightBaseIE(InfoExtractor): 'Channel': 'channel', 'ChannelList': 'channel_list', } + + def smuggle(url): + return smuggle_url(url, {'source_url': source_url}) + entries = [] for kind, video_id in re.findall( r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage): entries.append(cls.url_result( - smuggle_url( - 'limelight:%s:%s' % (lm[kind], video_id), - {'source_url': source_url}), + smuggle('limelight:%s:%s' % (lm[kind], video_id)), 'Limelight%s' % kind, video_id)) for mobj in re.finditer( # As per [1] class attribute should be exactly equal to @@ -49,10 +52,15 @@ class LimelightBaseIE(InfoExtractor): ''', webpage): kind, video_id = mobj.group('kind'), mobj.group('id') entries.append(cls.url_result( - smuggle_url( - 'limelight:%s:%s' % (kind, video_id), - {'source_url': source_url}), + smuggle('limelight:%s:%s' % (kind, video_id)), 'Limelight%s' % kind.capitalize(), video_id)) + # http://support.3playmedia.com/hc/en-us/articles/115009517327-Limelight-Embedding-the-Audio-Description-Plugin-with-the-Limelight-Player-on-Your-Web-Page) + for video_id in re.findall( + r'(?s)LimelightPlayerUtil\.embed\s*\(\s*{.*?\bmediaId["\']\s*:\s*["\'](?P<id>[a-z0-9]{32})', + webpage): + entries.append(cls.url_result( + smuggle('limelight:media:%s' % video_id), + LimelightMediaIE.ie_key(), video_id)) return entries def _call_playlist_service(self, item_id, method, fatal=True, referer=None): @@ -213,6 +221,12 @@ class LimelightBaseIE(InfoExtractor): 'subtitles': subtitles, } + def _extract_info_helper(self, pc, mobile, i, metadata): + return self._extract_info( + try_get(pc, lambda x: x['playlistItems'][i]['streams'], list) or [], + try_get(mobile, lambda x: x['mediaList'][i]['mobileUrls'], list) or [], + metadata) + class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' @@ -275,10 +289,7 @@ class LimelightMediaIE(LimelightBaseIE): 'getMobilePlaylistByMediaId', 'properties', smuggled_data.get('source_url')) - return self._extract_info( - pc['playlistItems'][0].get('streams', []), - mobile['mediaList'][0].get('mobileUrls', []) if mobile else [], - metadata) + return self._extract_info_helper(pc, mobile, 0, metadata) class LimelightChannelIE(LimelightBaseIE): @@ -319,10 +330,7 @@ class LimelightChannelIE(LimelightBaseIE): 'media', smuggled_data.get('source_url')) entries = [ - self._extract_info( - pc['playlistItems'][i].get('streams', []), - mobile['mediaList'][i].get('mobileUrls', []) if mobile else [], - medias['media_list'][i]) + self._extract_info_helper(pc, mobile, i, medias['media_list'][i]) for i in range(len(medias['media_list']))] return self.playlist_result(entries, channel_id, pc['title']) diff --git a/youtube_dl/extractor/line.py b/youtube_dl/extractor/line.py new file mode 100644 index 000000000..7f5fa446e --- /dev/null +++ b/youtube_dl/extractor/line.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import js_to_json + + +class LineTVIE(InfoExtractor): + _VALID_URL = r'https?://tv\.line\.me/v/(?P<id>\d+)_[^/]+-(?P<segment>ep\d+-\d+)' + + _TESTS = [{ + 'url': 'https://tv.line.me/v/793123_goodbye-mrblack-ep1-1/list/69246', + 'info_dict': { + 'id': '793123_ep1-1', + 'ext': 'mp4', + 'title': 'Goodbye Mr.Black | EP.1-1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 998.509, + 'view_count': int, + }, + }, { + 'url': 'https://tv.line.me/v/2587507_%E6%B4%BE%E9%81%A3%E5%A5%B3%E9%86%ABx-ep1-02/list/185245', + 'only_matching': True, + }] + + def _real_extract(self, url): + series_id, segment = re.match(self._VALID_URL, url).groups() + video_id = '%s_%s' % (series_id, segment) + + webpage = self._download_webpage(url, video_id) + + player_params = self._parse_json(self._search_regex( + r'naver\.WebPlayer\(({[^}]+})\)', webpage, 'player parameters'), + video_id, transform_source=js_to_json) + + video_info = self._download_json( + 'https://global-nvapis.line.me/linetv/rmcnmv/vod_play_videoInfo.json', + video_id, query={ + 'videoId': player_params['videoId'], + 'key': player_params['key'], + }) + + stream = video_info['streams'][0] + extra_query = '?__gda__=' + stream['key']['value'] + formats = self._extract_m3u8_formats( + stream['source'] + extra_query, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + + for a_format in formats: + a_format['url'] += extra_query + + duration = None + for video in video_info.get('videos', {}).get('list', []): + encoding_option = video.get('encodingOption', {}) + abr = video['bitrate']['audio'] + vbr = video['bitrate']['video'] + tbr = abr + vbr + formats.append({ + 'url': video['source'], + 'format_id': 'http-%d' % int(tbr), + 'height': encoding_option.get('height'), + 'width': encoding_option.get('width'), + 'abr': abr, + 'vbr': vbr, + 'filesize': video.get('size'), + }) + if video.get('duration') and duration is None: + duration = video['duration'] + + self._sort_formats(formats) + + if not formats[0].get('width'): + formats[0]['vcodec'] = 'none' + + title = self._og_search_title(webpage) + + # like_count requires an additional API request https://tv.line.me/api/likeit/getCount + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'extra_param_to_segment_url': extra_query[1:], + 'duration': duration, + 'thumbnails': [{'url': thumbnail['source']} + for thumbnail in video_info.get('thumbnails', {}).get('list', [])], + 'view_count': video_info.get('meta', {}).get('count'), + } diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index b2247a84d..26671753c 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -7,7 +7,7 @@ from ..utils import int_or_none class LiveLeakIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)' + _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?.*?\b[it]=(?P<id>[\w_]+)' _TESTS = [{ 'url': 'http://www.liveleak.com/view?i=757_1364311680', 'md5': '0813c2430bea7a46bf13acf3406992f4', @@ -72,15 +72,23 @@ class LiveLeakIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.liveleak.com/view?i=677_1439397581', + 'info_dict': { + 'id': '677_1439397581', + 'title': 'Fuel Depot in China Explosion caught on video', + }, + 'playlist_count': 3, + }, { + 'url': 'https://www.liveleak.com/view?t=HvHi_1523016227', + 'only_matching': True, }] @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src="https?://(?:\w+\.)?liveleak\.com/ll_embed\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)', + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[if]=[\w_]+[^"]+)"', webpage) - if mobj: - return 'http://www.liveleak.com/view?i=%s' % mobj.group('id') def _real_extract(self, url): video_id = self._match_id(url) @@ -111,23 +119,54 @@ class LiveLeakIE(InfoExtractor): 'age_limit': age_limit, } - info_dict = entries[0] + for idx, info_dict in enumerate(entries): + for a_format in info_dict['formats']: + if not a_format.get('height'): + a_format['height'] = int_or_none(self._search_regex( + r'([0-9]+)p\.mp4', a_format['url'], 'height label', + default=None)) - for a_format in info_dict['formats']: - if not a_format.get('height'): - a_format['height'] = int_or_none(self._search_regex( - r'([0-9]+)p\.mp4', a_format['url'], 'height label', - default=None)) + self._sort_formats(info_dict['formats']) - self._sort_formats(info_dict['formats']) + # Don't append entry ID for one-video pages to keep backward compatibility + if len(entries) > 1: + info_dict['id'] = '%s_%s' % (video_id, idx + 1) + else: + info_dict['id'] = video_id - info_dict.update({ - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - 'thumbnail': video_thumbnail, - }) + info_dict.update({ + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader, + 'age_limit': age_limit, + 'thumbnail': video_thumbnail, + }) - return info_dict + return self.playlist_result(entries, video_id, video_title) + + +class LiveLeakEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[if])=(?P<id>[\w_]+)' + + # See generic.py for actual test cases + _TESTS = [{ + 'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191', + 'only_matching': True, + }, { + 'url': 'https://www.liveleak.com/ll_embed?f=ab065df993c1', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + kind, video_id = mobj.group('kind', 'id') + + if kind == 'f': + webpage = self._download_webpage(url, video_id) + liveleak_url = self._search_regex( + r'logourl\s*:\s*(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL, + webpage, 'LiveLeak URL', group='url') + elif kind == 'i': + liveleak_url = 'http://www.liveleak.com/view?i=%s' % video_id + + return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key()) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 7f946c6ed..c4776bbf3 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -114,7 +114,7 @@ class LivestreamIE(InfoExtractor): smil_url = video_data.get('smil_url') if smil_url: - formats.extend(self._extract_smil_formats(smil_url, video_id)) + formats.extend(self._extract_smil_formats(smil_url, video_id, fatal=False)) m3u8_url = video_data.get('m3u8_url') if m3u8_url: @@ -338,7 +338,7 @@ class LivestreamOriginalIE(InfoExtractor): info = { 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), - 'thumbnail': self._search_regex(r'channelLogo.src\s*=\s*"([^"]+)"', webpage, 'thumbnail', None), + 'thumbnail': self._search_regex(r'channelLogo\.src\s*=\s*"([^"]+)"', webpage, 'thumbnail', None), } video_data = self._download_json(stream_url, content_id) is_live = video_data.get('isLive') diff --git a/youtube_dl/extractor/lnkgo.py b/youtube_dl/extractor/lnkgo.py index 068378c9c..cfec0d3d0 100644 --- a/youtube_dl/extractor/lnkgo.py +++ b/youtube_dl/extractor/lnkgo.py @@ -11,7 +11,7 @@ from ..utils import ( class LnkGoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?lnkgo\.alfa\.lt/visi-video/(?P<show>[^/]+)/ziurek-(?P<id>[A-Za-z0-9-]+)' + _VALID_URL = r'https?://(?:www\.)?lnkgo\.(?:alfa\.)?lt/visi-video/(?P<show>[^/]+)/ziurek-(?P<id>[A-Za-z0-9-]+)' _TESTS = [{ 'url': 'http://lnkgo.alfa.lt/visi-video/yra-kaip-yra/ziurek-yra-kaip-yra-162', 'info_dict': { @@ -42,6 +42,9 @@ class LnkGoIE(InfoExtractor): 'params': { 'skip_download': True, # HLS download }, + }, { + 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai', + 'only_matching': True, }] _AGE_LIMITS = { 'N-7': 7, diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index d2f75296a..f5c7abc13 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -94,7 +94,15 @@ class LyndaBaseIE(InfoExtractor): class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' - _VALID_URL = r'https?://(?:www\.)?lynda\.com/(?:[^/]+/[^/]+/(?P<course_id>\d+)|player/embed)/(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?:lynda\.com|educourse\.ga)/ + (?: + (?:[^/]+/){2,3}(?P<course_id>\d+)| + player/embed + )/ + (?P<id>\d+) + ''' _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]' @@ -110,6 +118,12 @@ class LyndaIE(LyndaBaseIE): }, { 'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0', 'only_matching': True, + }, { + 'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', + 'only_matching': True, + }, { + 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html', + 'only_matching': True, }] def _raise_unavailable(self, video_id): @@ -241,8 +255,9 @@ class LyndaIE(LyndaBaseIE): def _get_subtitles(self, video_id): url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id subs = self._download_json(url, None, False) - if subs: - return {'en': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} + fixed_subs = self._fix_subtitles(subs) + if fixed_subs: + return {'en': [{'ext': 'srt', 'data': fixed_subs}]} else: return {} @@ -253,7 +268,15 @@ class LyndaCourseIE(LyndaBaseIE): # Course link equals to welcome/introduction video link of same course # We will recognize it as course link - _VALID_URL = r'https?://(?:www|m)\.lynda\.com/(?P<coursepath>[^/]+/[^/]+/(?P<courseid>\d+))-\d\.html' + _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P<coursepath>(?:[^/]+/){2,3}(?P<courseid>\d+))-2\.html' + + _TESTS = [{ + 'url': 'https://www.lynda.com/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', + 'only_matching': True, + }, { + 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index f7cc3c832..6b0e64b7f 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -1,20 +1,32 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools +import json import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, + parse_duration, remove_end, + try_get, ) class MailRuIE(InfoExtractor): IE_NAME = 'mailru' IE_DESC = 'Видео@Mail.Ru' - _VALID_URL = r'https?://(?:(?:www|m)\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)' - + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m)\.)?my\.mail\.ru/ + (?: + video/.*\#video=/?(?P<idv1>(?:[^/]+/){3}\d+)| + (?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html| + (?:video/embed|\+/video/meta)/(?P<metaid>\d+) + ) + ''' _TESTS = [ { 'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76', @@ -23,7 +35,7 @@ class MailRuIE(InfoExtractor): 'id': '46301138_76', 'ext': 'mp4', 'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро', - 'timestamp': 1393232740, + 'timestamp': 1393235077, 'upload_date': '20140224', 'uploader': 'sonypicturesrus', 'uploader_id': 'sonypicturesrus@mail.ru', @@ -40,7 +52,7 @@ class MailRuIE(InfoExtractor): 'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion', 'timestamp': 1397039888, 'upload_date': '20140409', - 'uploader': 'hitech@corp.mail.ru', + 'uploader': 'hitech', 'uploader_id': 'hitech@corp.mail.ru', 'duration': 245, }, @@ -65,28 +77,42 @@ class MailRuIE(InfoExtractor): { 'url': 'http://m.my.mail.ru/mail/3sktvtr/video/_myvideo/138.html', 'only_matching': True, + }, + { + 'url': 'https://my.mail.ru/video/embed/7949340477499637815', + 'only_matching': True, + }, + { + 'url': 'http://my.mail.ru/+/video/meta/7949340477499637815', + 'only_matching': True, } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('idv1') + meta_id = mobj.group('metaid') - if not video_id: - video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix') - - webpage = self._download_webpage(url, video_id) + video_id = None + if meta_id: + meta_url = 'https://my.mail.ru/+/video/meta/%s' % meta_id + else: + video_id = mobj.group('idv1') + if not video_id: + video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix') + webpage = self._download_webpage(url, video_id) + page_config = self._parse_json(self._search_regex( + r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>', + webpage, 'page config', default='{}'), video_id, fatal=False) + if page_config: + meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') + else: + meta_url = None video_data = None - - page_config = self._parse_json(self._search_regex( - r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>', - webpage, 'page config', default='{}'), video_id, fatal=False) - if page_config: - meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') - if meta_url: - video_data = self._download_json( - meta_url, video_id, 'Downloading video meta JSON', fatal=False) + if meta_url: + video_data = self._download_json( + meta_url, video_id or meta_id, 'Downloading video meta JSON', + fatal=not video_id) # Fallback old approach if not video_data: @@ -136,3 +162,153 @@ class MailRuIE(InfoExtractor): 'view_count': view_count, 'formats': formats, } + + +class MailRuMusicSearchBaseIE(InfoExtractor): + def _search(self, query, url, audio_id, limit=100, offset=0): + search = self._download_json( + 'https://my.mail.ru/cgi-bin/my/ajax', audio_id, + 'Downloading songs JSON page %d' % (offset // limit + 1), + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + }, query={ + 'xemail': '', + 'ajax_call': '1', + 'func_name': 'music.search', + 'mna': '', + 'mnb': '', + 'arg_query': query, + 'arg_extended': '1', + 'arg_search_params': json.dumps({ + 'music': { + 'limit': limit, + 'offset': offset, + }, + }), + 'arg_limit': limit, + 'arg_offset': offset, + }) + return next(e for e in search if isinstance(e, dict)) + + @staticmethod + def _extract_track(t, fatal=True): + audio_url = t['URL'] if fatal else t.get('URL') + if not audio_url: + return + + audio_id = t['File'] if fatal else t.get('File') + if not audio_id: + return + + thumbnail = t.get('AlbumCoverURL') or t.get('FiledAlbumCover') + uploader = t.get('OwnerName') or t.get('OwnerName_Text_HTML') + uploader_id = t.get('UploaderID') + duration = int_or_none(t.get('DurationInSeconds')) or parse_duration( + t.get('Duration') or t.get('DurationStr')) + view_count = int_or_none(t.get('PlayCount') or t.get('PlayCount_hr')) + + track = t.get('Name') or t.get('Name_Text_HTML') + artist = t.get('Author') or t.get('Author_Text_HTML') + + if track: + title = '%s - %s' % (artist, track) if artist else track + else: + title = audio_id + + return { + 'extractor_key': MailRuMusicIE.ie_key(), + 'id': audio_id, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'duration': duration, + 'view_count': view_count, + 'vcodec': 'none', + 'abr': int_or_none(t.get('BitRate')), + 'track': track, + 'artist': artist, + 'album': t.get('Album'), + 'url': audio_url, + } + + +class MailRuMusicIE(MailRuMusicSearchBaseIE): + IE_NAME = 'mailru:music' + IE_DESC = 'Музыка@Mail.Ru' + _VALID_URL = r'https?://my\.mail\.ru/music/songs/[^/?#&]+-(?P<id>[\da-f]+)' + _TESTS = [{ + 'url': 'https://my.mail.ru/music/songs/%D0%BC8%D0%BB8%D1%82%D1%85-l-a-h-luciferian-aesthetics-of-herrschaft-single-2017-4e31f7125d0dfaef505d947642366893', + 'md5': '0f8c22ef8c5d665b13ac709e63025610', + 'info_dict': { + 'id': '4e31f7125d0dfaef505d947642366893', + 'ext': 'mp3', + 'title': 'L.A.H. (Luciferian Aesthetics of Herrschaft) single, 2017 - М8Л8ТХ', + 'uploader': 'Игорь Мудрый', + 'uploader_id': '1459196328', + 'duration': 280, + 'view_count': int, + 'vcodec': 'none', + 'abr': 320, + 'track': 'L.A.H. (Luciferian Aesthetics of Herrschaft) single, 2017', + 'artist': 'М8Л8ТХ', + }, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage(url, audio_id) + + title = self._og_search_title(webpage) + music_data = self._search(title, url, audio_id)['MusicData'] + t = next(t for t in music_data if t.get('File') == audio_id) + + info = self._extract_track(t) + info['title'] = title + return info + + +class MailRuMusicSearchIE(MailRuMusicSearchBaseIE): + IE_NAME = 'mailru:music:search' + IE_DESC = 'Музыка@Mail.Ru' + _VALID_URL = r'https?://my\.mail\.ru/music/search/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://my.mail.ru/music/search/black%20shadow', + 'info_dict': { + 'id': 'black shadow', + }, + 'playlist_mincount': 532, + }] + + def _real_extract(self, url): + query = compat_urllib_parse_unquote(self._match_id(url)) + + entries = [] + + LIMIT = 100 + offset = 0 + + for _ in itertools.count(1): + search = self._search(query, url, query, LIMIT, offset) + + music_data = search.get('MusicData') + if not music_data or not isinstance(music_data, list): + break + + for t in music_data: + track = self._extract_track(t, fatal=False) + if track: + entries.append(track) + + total = try_get( + search, lambda x: x['Results']['music']['Total'], int) + + if total is not None: + if offset > total: + break + + offset += LIMIT + + return self.playlist_result(entries, query) diff --git a/youtube_dl/extractor/makertv.py b/youtube_dl/extractor/makertv.py index 3c34d4604..8eda69cfc 100644 --- a/youtube_dl/extractor/makertv.py +++ b/youtube_dl/extractor/makertv.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class MakerTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)*video|makerplayer.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})' + _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)*video|makerplayer\.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})' _TEST = { 'url': 'http://www.maker.tv/video/Fh3QgymL9gsc', 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', diff --git a/youtube_dl/extractor/mangomolo.py b/youtube_dl/extractor/mangomolo.py index 1885ac7df..482175a34 100644 --- a/youtube_dl/extractor/mangomolo.py +++ b/youtube_dl/extractor/mangomolo.py @@ -1,13 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 - from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import ( - int_or_none, +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote, ) +from ..utils import int_or_none class MangomoloBaseIE(InfoExtractor): @@ -22,7 +21,7 @@ class MangomoloBaseIE(InfoExtractor): format_url = self._html_search_regex( [ - r'file\s*:\s*"(https?://[^"]+?/playlist.m3u8)', + r'file\s*:\s*"(https?://[^"]+?/playlist\.m3u8)', r'<a[^>]+href="(rtsp://[^"]+)"' ], webpage, 'format url') formats = self._extract_wowza_formats( @@ -51,4 +50,4 @@ class MangomoloLiveIE(MangomoloBaseIE): _IS_LIVE = True def _get_real_id(self, page_id): - return base64.b64decode(compat_urllib_parse_unquote(page_id).encode()).decode() + return compat_b64decode(compat_urllib_parse_unquote(page_id)).decode() diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py new file mode 100644 index 000000000..b94b3c2ab --- /dev/null +++ b/youtube_dl/extractor/manyvids.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class ManyVidsIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P<id>\d+)' + _TEST = { + 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', + 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', + 'info_dict': { + 'id': '133957', + 'ext': 'mp4', + 'title': 'everthing about me (Preview)', + 'view_count': int, + 'like_count': int, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'data-(?:video-filepath|meta-video)\s*=s*(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'video URL', group='url') + + title = '%s (Preview)' % self._html_search_regex( + r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title') + + like_count = int_or_none(self._search_regex( + r'data-likes=["\'](\d+)', webpage, 'like count', default=None)) + view_count = int_or_none(self._html_search_regex( + r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage, + 'view count', default=None)) + + return { + 'id': video_id, + 'title': title, + 'view_count': view_count, + 'like_count': like_count, + 'formats': [{ + 'url': video_url, + }], + } diff --git a/youtube_dl/extractor/massengeschmacktv.py b/youtube_dl/extractor/massengeschmacktv.py new file mode 100644 index 000000000..cfcc6b224 --- /dev/null +++ b/youtube_dl/extractor/massengeschmacktv.py @@ -0,0 +1,77 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + js_to_json, + mimetype2ext, + parse_filesize, +) + + +class MassengeschmackTVIE(InfoExtractor): + IE_NAME = 'massengeschmack.tv' + _VALID_URL = r'https?://(?:www\.)?massengeschmack\.tv/play/(?P<id>[^?&#]+)' + + _TEST = { + 'url': 'https://massengeschmack.tv/play/fktv202', + 'md5': 'a9e054db9c2b5a08f0a0527cc201e8d3', + 'info_dict': { + 'id': 'fktv202', + 'ext': 'mp4', + 'title': 'Fernsehkritik-TV - Folge 202', + }, + } + + def _real_extract(self, url): + episode = self._match_id(url) + + webpage = self._download_webpage(url, episode) + title = clean_html(self._html_search_regex( + '<h3>([^<]+)</h3>', webpage, 'title')) + thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False) + sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json) + + formats = [] + for source in sources: + furl = source.get('src') + if not furl: + continue + furl = self._proto_relative_url(furl) + ext = determine_ext(furl) or mimetype2ext(source.get('type')) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + furl, episode, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': furl, + 'format_id': determine_ext(furl), + }) + + for (durl, format_id, width, height, filesize) in re.findall(r'''(?x) + <a[^>]+?href="(?P<url>(?:https:)?//[^"]+)".*? + <strong>(?P<format_id>.+?)</strong>.*? + <small>(?:(?P<width>\d+)x(?P<height>\d+))?\s+?\((?P<filesize>[\d,]+\s*[GM]iB)\)</small> + ''', webpage): + formats.append({ + 'url': durl, + 'format_id': format_id, + 'width': int_or_none(width), + 'height': int_or_none(height), + 'filesize': parse_filesize(filesize), + 'vcodec': 'none' if format_id.startswith('Audio') else None, + }) + + self._sort_formats(formats, ('width', 'height', 'filesize', 'tbr')) + + return { + 'id': episode, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py index 4c32fbc2c..50d5db802 100644 --- a/youtube_dl/extractor/medialaan.py +++ b/youtube_dl/extractor/medialaan.py @@ -2,19 +2,18 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .gigya import GigyaBaseIE + from ..compat import compat_str from ..utils import ( - ExtractorError, int_or_none, parse_duration, try_get, unified_timestamp, - urlencode_postdata, ) -class MedialaanIE(InfoExtractor): +class MedialaanIE(GigyaBaseIE): _VALID_URL = r'''(?x) https?:// (?:www\.|nieuws\.)? @@ -119,15 +118,7 @@ class MedialaanIE(InfoExtractor): 'password': password, } - auth_info = self._download_json( - 'https://accounts.eu1.gigya.com/accounts.login', None, - note='Logging in', errnote='Unable to log in', - data=urlencode_postdata(auth_data)) - - error_message = auth_info.get('errorDetails') or auth_info.get('errorMessage') - if error_message: - raise ExtractorError( - 'Unable to login: %s' % error_message, expected=True) + auth_info = self._gigya_login(auth_data) self._uid = auth_info['UID'] self._uid_signature = auth_info['UIDSignature'] @@ -150,6 +141,7 @@ class MedialaanIE(InfoExtractor): vod_id = config.get('vodId') or self._search_regex( (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"', + r'"vodId"\s*:\s*"(.+?)"', r'<[^>]+id=["\']vod-(\d+)'), webpage, 'video_id', default=None) diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py new file mode 100644 index 000000000..0e2645c55 --- /dev/null +++ b/youtube_dl/extractor/mediasite.py @@ -0,0 +1,214 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + float_or_none, + mimetype2ext, + unescapeHTML, + unsmuggle_url, + urljoin, +) + + +class MediasiteIE(InfoExtractor): + _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/Play/(?P<id>[0-9a-f]{32,34})(?P<query>\?[^#]+|)' + _TESTS = [ + { + 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', + 'info_dict': { + 'id': '2db6c271681e4f199af3c60d1f82869b1d', + 'ext': 'mp4', + 'title': 'Lecture: Tuesday, September 20, 2016 - Sir Andrew Wiles', + 'description': 'Sir Andrew Wiles: “Equations in arithmetic”\\n\\nI will describe some of the interactions between modern number theory and the problem of solving equations in rational numbers or integers\\u0027.', + 'timestamp': 1474268400.0, + 'upload_date': '20160919', + }, + }, + { + 'url': 'http://mediasite.uib.no/Mediasite/Play/90bb363295d945d6b548c867d01181361d?catalog=a452b7df-9ae1-46b7-a3ba-aceeb285f3eb', + 'info_dict': { + 'id': '90bb363295d945d6b548c867d01181361d', + 'ext': 'mp4', + 'upload_date': '20150429', + 'title': '5) IT-forum 2015-Dag 1 - Dungbeetle - How and why Rain created a tiny bug tracker for Unity', + 'timestamp': 1430311380.0, + }, + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d', + 'md5': '481fda1c11f67588c0d9d8fbdced4e39', + 'info_dict': { + 'id': '585a43626e544bdd97aeb71a0ec907a01d', + 'ext': 'mp4', + 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*)?$', + 'duration': 7713.088, + 'timestamp': 1413309600, + 'upload_date': '20141014', + }, + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4', + 'md5': 'ef1fdded95bdf19b12c5999949419c92', + 'info_dict': { + 'id': '86a9ea9f53e149079fbdb4202b521ed21d', + 'ext': 'wmv', + 'title': '64ste Vakantiecursus: Afvalwater', + 'description': 'md5:7fd774865cc69d972f542b157c328305', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', + 'duration': 10853, + 'timestamp': 1326446400, + 'upload_date': '20120113', + }, + }, + { + 'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d', + 'md5': '9422edc9b9a60151727e4b6d8bef393d', + 'info_dict': { + 'id': '24aace4429fc450fb5b38cdbf424a66e1d', + 'ext': 'mp4', + 'title': 'Xyce Software Training - Section 1', + 'description': r're:(?s)SAND Number: SAND 2013-7800.{200,}', + 'upload_date': '20120409', + 'timestamp': 1333983600, + 'duration': 7794, + } + } + ] + + # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) + _STREAM_TYPES = { + 0: 'video1', # the main video + 2: 'slide', + 3: 'presentation', + 4: 'video2', # screencast? + 5: 'video3', + } + + @staticmethod + def _extract_urls(webpage): + return [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer( + r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)\1', + webpage)] + + def _real_extract(self, url): + url, data = unsmuggle_url(url, {}) + mobj = re.match(self._VALID_URL, url) + resource_id = mobj.group('id') + query = mobj.group('query') + + webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? + redirect_url = compat_str(urlh.geturl()) + + # XXX: might have also extracted UrlReferrer and QueryString from the html + service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex( + r'<div[^>]+\bid=["\']ServicePath[^>]+>(.+?)</div>', webpage, resource_id, + default='/Mediasite/PlayerService/PlayerService.svc/json')) + + player_options = self._download_json( + '%s/GetPlayerOptions' % service_path, resource_id, + headers={ + 'Content-type': 'application/json; charset=utf-8', + 'X-Requested-With': 'XMLHttpRequest', + }, + data=json.dumps({ + 'getPlayerOptionsRequest': { + 'ResourceId': resource_id, + 'QueryString': query, + 'UrlReferrer': data.get('UrlReferrer', ''), + 'UseScreenReader': False, + } + }).encode('utf-8'))['d'] + + presentation = player_options['Presentation'] + title = presentation['Title'] + + if presentation is None: + raise ExtractorError( + 'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'], + expected=True) + + thumbnails = [] + formats = [] + for snum, Stream in enumerate(presentation['Streams']): + stream_type = Stream.get('StreamType') + if stream_type is None: + continue + + video_urls = Stream.get('VideoUrls') + if not isinstance(video_urls, list): + video_urls = [] + + stream_id = self._STREAM_TYPES.get( + stream_type, 'type%u' % stream_type) + + stream_formats = [] + for unum, VideoUrl in enumerate(video_urls): + video_url = VideoUrl.get('Location') + if not video_url or not isinstance(video_url, compat_str): + continue + # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS + + media_type = VideoUrl.get('MediaType') + if media_type == 'SS': + stream_formats.extend(self._extract_ism_formats( + video_url, resource_id, + ism_id='%s-%u.%u' % (stream_id, snum, unum), + fatal=False)) + elif media_type == 'Dash': + stream_formats.extend(self._extract_mpd_formats( + video_url, resource_id, + mpd_id='%s-%u.%u' % (stream_id, snum, unum), + fatal=False)) + else: + stream_formats.append({ + 'format_id': '%s-%u.%u' % (stream_id, snum, unum), + 'url': video_url, + 'ext': mimetype2ext(VideoUrl.get('MimeType')), + }) + + # TODO: if Stream['HasSlideContent']: + # synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum) + # from Stream['Slides'] + # this will require writing a custom downloader... + + # disprefer 'secondary' streams + if stream_type != 0: + for fmt in stream_formats: + fmt['preference'] = -1 + + thumbnail_url = Stream.get('ThumbnailUrl') + if thumbnail_url: + thumbnails.append({ + 'id': '%s-%u' % (stream_id, snum), + 'url': urljoin(redirect_url, thumbnail_url), + 'preference': -1 if stream_type != 0 else 0, + }) + formats.extend(stream_formats) + + self._sort_formats(formats) + + # XXX: Presentation['Presenters'] + # XXX: Presentation['Transcript'] + + return { + 'id': resource_id, + 'title': title, + 'description': presentation.get('Description'), + 'duration': float_or_none(presentation.get('Duration'), 1000), + 'timestamp': float_or_none(presentation.get('UnixTime'), 1000), + 'formats': formats, + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/megaphone.py b/youtube_dl/extractor/megaphone.py index 60e3caf0d..5bafa6cf4 100644 --- a/youtube_dl/extractor/megaphone.py +++ b/youtube_dl/extractor/megaphone.py @@ -18,7 +18,7 @@ class MegaphoneIE(InfoExtractor): 'id': 'GLT9749789991', 'ext': 'mp3', 'title': '#97 What Kind Of Idiot Gets Phished?', - 'thumbnail': 're:^https://.*\.png.*$', + 'thumbnail': r're:^https://.*\.png.*$', 'duration': 1776.26375, 'author': 'Reply All', }, diff --git a/youtube_dl/extractor/meipai.py b/youtube_dl/extractor/meipai.py index c8eacb4f4..2445b8b39 100644 --- a/youtube_dl/extractor/meipai.py +++ b/youtube_dl/extractor/meipai.py @@ -11,7 +11,7 @@ from ..utils import ( class MeipaiIE(InfoExtractor): IE_DESC = '美拍' - _VALID_URL = r'https?://(?:www\.)?meipai.com/media/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?meipai\.com/media/(?P<id>[0-9]+)' _TESTS = [{ # regular uploaded video 'url': 'http://www.meipai.com/media/531697625', diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 964dc542c..42759eae8 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,13 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import json import uuid from .common import InfoExtractor from .ooyala import OoyalaIE from ..compat import ( compat_str, - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -42,31 +42,33 @@ class MiTeleBaseIE(InfoExtractor): duration = int_or_none(mmc.get('duration')) for location in mmc['locations']: gat = self._proto_relative_url(location.get('gat'), 'http:') - bas = location.get('bas') - loc = location.get('loc') + gcp = location.get('gcp') ogn = location.get('ogn') - if None in (gat, bas, loc, ogn): + if None in (gat, gcp, ogn): continue token_data = { - 'bas': bas, - 'icd': loc, + 'gcp': gcp, 'ogn': ogn, - 'sta': '0', + 'sta': 0, } media = self._download_json( - '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), - video_id, 'Downloading %s JSON' % location['loc']) - file_ = media.get('file') - if not file_: + gat, video_id, data=json.dumps(token_data).encode('utf-8'), + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }) + stream = media.get('stream') or media.get('file') + if not stream: continue - ext = determine_ext(file_) + ext = determine_ext(stream) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + stream, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 40cd2e389..a56b7690f 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,22 +1,27 @@ from __future__ import unicode_literals -import base64 import functools import itertools import re from .common import InfoExtractor from ..compat import ( + compat_b64decode, compat_chr, compat_ord, + compat_str, compat_urllib_parse_unquote, compat_urlparse, + compat_zip ) from ..utils import ( clean_html, ExtractorError, + int_or_none, OnDemandPagedList, str_to_int, + try_get, + urljoin, ) @@ -53,23 +58,12 @@ class MixcloudIE(InfoExtractor): 'only_matching': True, }] - # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js - def _decrypt_play_info(self, play_info, video_id): - KEYS = ( - 'pleasedontdownloadourmusictheartistswontgetpaid', - '(function() { return new Date().toLocaleDateString(); })()' - ) - play_info = base64.b64decode(play_info.encode('ascii')) - for num, key in enumerate(KEYS, start=1): - try: - return self._parse_json( - ''.join([ - compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)])) - for idx, ch in enumerate(play_info)]), - video_id) - except ExtractorError: - if num == len(KEYS): - raise + @staticmethod + def _decrypt_xor_cipher(key, ciphertext): + """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR.""" + return ''.join([ + compat_chr(compat_ord(ch) ^ compat_ord(k)) + for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -79,38 +73,119 @@ class MixcloudIE(InfoExtractor): webpage = self._download_webpage(url, track_id) + # Legacy path + encrypted_play_info = self._search_regex( + r'm-play-info="([^"]+)"', webpage, 'play info', default=None) + + if encrypted_play_info is not None: + # Decode + encrypted_play_info = compat_b64decode(encrypted_play_info) + else: + # New path + full_info_json = self._parse_json(self._html_search_regex( + r'<script id="relay-data" type="text/x-mixcloud">([^<]+)</script>', + webpage, 'play info'), 'play info') + for item in full_info_json: + item_data = try_get( + item, lambda x: x['cloudcast']['data']['cloudcastLookup'], + dict) + if try_get(item_data, lambda x: x['streamInfo']['url']): + info_json = item_data + break + else: + raise ExtractorError('Failed to extract matching stream info') + message = self._html_search_regex( r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', webpage, 'error message', default=None) - encrypted_play_info = self._search_regex( - r'm-play-info="([^"]+)"', webpage, 'play info') + js_url = self._search_regex( + r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/(?:js2/www_js_4|js/www)\.[^>]+\.js)', + webpage, 'js url') + js = self._download_webpage(js_url, track_id, 'Downloading JS') + # Known plaintext attack + if encrypted_play_info: + kps = ['{"stream_url":'] + kpa_target = encrypted_play_info + else: + kps = ['https://', 'http://'] + kpa_target = compat_b64decode(info_json['streamInfo']['url']) + for kp in kps: + partial_key = self._decrypt_xor_cipher(kpa_target, kp) + for quote in ["'", '"']: + key = self._search_regex( + r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), + js, 'encryption key', default=None) + if key is not None: + break + else: + continue + break + else: + raise ExtractorError('Failed to extract encryption key') - play_info = self._decrypt_play_info(encrypted_play_info, track_id) + if encrypted_play_info is not None: + play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info') + if message and 'stream_url' not in play_info: + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + song_url = play_info['stream_url'] + formats = [{ + 'format_id': 'normal', + 'url': song_url + }] - if message and 'stream_url' not in play_info: - raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') + thumbnail = self._proto_relative_url(self._html_search_regex( + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) + uploader = self._html_search_regex( + r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) + uploader_id = self._search_regex( + r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) + description = self._og_search_description(webpage) + view_count = str_to_int(self._search_regex( + [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', + r'/listeners/?">([0-9,.]+)</a>', + r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], + webpage, 'play count', default=None)) - song_url = play_info['stream_url'] + else: + title = info_json['name'] + thumbnail = urljoin( + 'https://thumbnailer.mixcloud.com/unsafe/600x600/', + try_get(info_json, lambda x: x['picture']['urlRoot'], compat_str)) + uploader = try_get(info_json, lambda x: x['owner']['displayName']) + uploader_id = try_get(info_json, lambda x: x['owner']['username']) + description = try_get(info_json, lambda x: x['description']) + view_count = int_or_none(try_get(info_json, lambda x: x['plays'])) - title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') - thumbnail = self._proto_relative_url(self._html_search_regex( - r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) - uploader = self._html_search_regex( - r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) - uploader_id = self._search_regex( - r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) - description = self._og_search_description(webpage) - view_count = str_to_int(self._search_regex( - [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', - r'/listeners/?">([0-9,.]+)</a>', - r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], - webpage, 'play count', default=None)) + stream_info = info_json['streamInfo'] + formats = [] + + for url_key in ('url', 'hlsUrl', 'dashUrl'): + format_url = stream_info.get(url_key) + if not format_url: + continue + decrypted = self._decrypt_xor_cipher(key, compat_b64decode(format_url)) + if not decrypted: + continue + if url_key == 'hlsUrl': + formats.extend(self._extract_m3u8_formats( + decrypted, track_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif url_key == 'dashUrl': + formats.extend(self._extract_mpd_formats( + decrypted, track_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'format_id': 'http', + 'url': decrypted, + }) + self._sort_formats(formats) return { 'id': track_id, 'title': title, - 'url': song_url, + 'formats': formats, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, @@ -216,7 +291,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): functools.partial( self._tracks_page_func, '%s/%s' % (user_id, list_type), video_id, 'list of %s' % list_type), - self._PAGE_SIZE, use_cache=True) + self._PAGE_SIZE) return self.playlist_result( entries, video_id, '%s (%s)' % (username, list_type), description) diff --git a/youtube_dl/extractor/mnet.py b/youtube_dl/extractor/mnet.py index 6a85dcbd5..0e26ca1b3 100644 --- a/youtube_dl/extractor/mnet.py +++ b/youtube_dl/extractor/mnet.py @@ -40,21 +40,29 @@ class MnetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + # TODO: extract rtmp formats + # no stype -> rtmp url + # stype=H -> m3u8 url + # stype=M -> mpd url info = self._download_json( - 'http://content.api.mnet.com/player/vodConfig?id=%s&ctype=CLIP' % video_id, - video_id, 'Downloading vod config JSON')['data']['info'] + 'http://content.api.mnet.com/player/vodConfig', + video_id, 'Downloading vod config JSON', query={ + 'id': video_id, + 'ctype': 'CLIP', + 'stype': 'H', + })['data']['info'] title = info['title'] - rtmp_info = self._download_json( - info['cdn'], video_id, 'Downloading vod cdn JSON') - - formats = [{ - 'url': rtmp_info['serverurl'] + rtmp_info['fileurl'], - 'ext': 'flv', - 'page_url': url, - 'player_url': 'http://flvfile.mnet.com/service/player/201602/cjem_player_tv.swf?v=201602191318', - }] + cdn_data = self._download_json( + info['cdn'], video_id, 'Downloading vod cdn JSON')['data'][0] + m3u8_url = cdn_data['url'] + token = cdn_data.get('token') + if token and token != '-': + m3u8_url += '?' + token + formats = self._extract_wowza_formats( + m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp', 'f4m']) + self._sort_formats(formats) description = info.get('ment') duration = parse_duration(info.get('time')) diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index 54716f5c7..1c652813a 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -12,7 +12,7 @@ class MofosexIE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P<id>\d+)/(?P<display_id>[^/?#&.]+)\.html' _TESTS = [{ 'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html', - 'md5': '39a15853632b7b2e5679f92f69b78e91', + 'md5': '558fcdafbb63a87c019218d6e49daf8a', 'info_dict': { 'id': '318131', 'display_id': 'amateur-teen-playing-and-masturbating-318131', diff --git a/youtube_dl/extractor/morningstar.py b/youtube_dl/extractor/morningstar.py index 320d27bdd..0093bcd6c 100644 --- a/youtube_dl/extractor/morningstar.py +++ b/youtube_dl/extractor/morningstar.py @@ -8,8 +8,8 @@ from .common import InfoExtractor class MorningstarIE(InfoExtractor): IE_DESC = 'morningstar.com' - _VALID_URL = r'https?://(?:www\.)?morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www|news)\.)morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869', 'md5': '6c0acface7a787aadc8391e4bbf7b0f5', 'info_dict': { @@ -19,7 +19,10 @@ class MorningstarIE(InfoExtractor): 'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.", 'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$' } - } + }, { + 'url': 'http://news.morningstar.com/cover/videocenter.aspx?id=825556', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 6fe3b6049..e24396e79 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -4,8 +4,11 @@ import datetime import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( ExtractorError, + InAdvancePagedList, + orderedSet, str_to_int, unified_strdate, ) @@ -114,3 +117,86 @@ class MotherlessIE(InfoExtractor): 'age_limit': age_limit, 'url': video_url, } + + +class MotherlessGroupIE(InfoExtractor): + _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' + _TESTS = [{ + 'url': 'http://motherless.com/g/movie_scenes', + 'info_dict': { + 'id': 'movie_scenes', + 'title': 'Movie Scenes', + 'description': 'Hot and sexy scenes from "regular" movies... ' + 'Beautiful actresses fully nude... A looot of ' + 'skin! :)Enjoy!', + }, + 'playlist_mincount': 662, + }, { + 'url': 'http://motherless.com/gv/sex_must_be_funny', + 'info_dict': { + 'id': 'sex_must_be_funny', + 'title': 'Sex must be funny', + 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' + 'any kind!' + }, + 'playlist_mincount': 9, + }] + + @classmethod + def suitable(cls, url): + return (False if MotherlessIE.suitable(url) + else super(MotherlessGroupIE, cls).suitable(url)) + + def _extract_entries(self, webpage, base): + entries = [] + for mobj in re.finditer( + r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?', + webpage): + video_url = compat_urlparse.urljoin(base, mobj.group('href')) + if not MotherlessIE.suitable(video_url): + continue + video_id = MotherlessIE._match_id(video_url) + title = mobj.group('title') + entries.append(self.url_result( + video_url, ie=MotherlessIE.ie_key(), video_id=video_id, + video_title=title)) + # Alternative fallback + if not entries: + entries = [ + self.url_result( + compat_urlparse.urljoin(base, '/' + video_id), + ie=MotherlessIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'data-codename=["\']([A-Z0-9]+)', webpage))] + return entries + + def _real_extract(self, url): + group_id = self._match_id(url) + page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id) + webpage = self._download_webpage(page_url, group_id) + title = self._search_regex( + r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False) + description = self._html_search_meta( + 'description', webpage, fatal=False) + page_count = self._int(self._search_regex( + r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT', + webpage, 'page_count'), 'page_count') + PAGE_SIZE = 80 + + def _get_page(idx): + webpage = self._download_webpage( + page_url, group_id, query={'page': idx + 1}, + note='Downloading page %d/%d' % (idx + 1, page_count) + ) + for entry in self._extract_entries(webpage, url): + yield entry + + playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': group_id, + 'title': title, + 'description': description, + 'entries': playlist + } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 25af5ddfd..7a3b57abd 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -115,10 +115,17 @@ class MTVServicesInfoExtractor(InfoExtractor): if transcript.get('kind') != 'captions': continue lang = transcript.get('srclang') - subtitles[lang] = [{ - 'url': compat_str(typographic.get('src')), - 'ext': typographic.get('format') - } for typographic in transcript.findall('./typographic')] + for typographic in transcript.findall('./typographic'): + sub_src = typographic.get('src') + if not sub_src: + continue + ext = typographic.get('format') + if ext == 'cea-608': + ext = 'scc' + subtitles.setdefault(lang, []).append({ + 'url': compat_str(sub_src), + 'ext': ext + }) return subtitles def _get_video_info(self, itemdoc, use_hls=True): @@ -258,7 +265,7 @@ class MTVServicesInfoExtractor(InfoExtractor): if mgid is None or ':' not in mgid: mgid = self._search_regex( - [r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'], + [r'data-mgid="(.*?)"', r'swfobject\.embedSWF\(".*?(mgid:.*?)"'], webpage, 'mgid', default=None) if not mgid: diff --git a/youtube_dl/extractor/myvi.py b/youtube_dl/extractor/myvi.py index 621ae74a7..75d286365 100644 --- a/youtube_dl/extractor/myvi.py +++ b/youtube_dl/extractor/myvi.py @@ -3,22 +3,31 @@ from __future__ import unicode_literals import re +from .common import InfoExtractor from .vimple import SprutoBaseIE class MyviIE(SprutoBaseIE): _VALID_URL = r'''(?x) - https?:// - myvi\.(?:ru/player|tv)/ - (?: + (?: + https?:// + (?:www\.)? + myvi\. (?: - embed/html| - flash| - api/Video/Get - )/| - content/preloader\.swf\?.*\bid= - ) - (?P<id>[\da-zA-Z_-]+) + (?:ru/player|tv)/ + (?: + (?: + embed/html| + flash| + api/Video/Get + )/| + content/preloader\.swf\?.*\bid= + )| + ru/watch/ + )| + myvi: + ) + (?P<id>[\da-zA-Z_-]+) ''' _TESTS = [{ 'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', @@ -42,6 +51,12 @@ class MyviIE(SprutoBaseIE): }, { 'url': 'http://myvi.ru/player/flash/ocp2qZrHI-eZnHKQBK4cZV60hslH8LALnk0uBfKsB-Q4WnY26SeGoYPi8HWHxu0O30', 'only_matching': True, + }, { + 'url': 'https://www.myvi.ru/watch/YwbqszQynUaHPn_s82sx0Q2', + 'only_matching': True, + }, { + 'url': 'myvi:YwbqszQynUaHPn_s82sx0Q2', + 'only_matching': True, }] @classmethod @@ -58,3 +73,39 @@ class MyviIE(SprutoBaseIE): 'http://myvi.ru/player/api/Video/Get/%s?sig' % video_id, video_id)['sprutoData'] return self._extract_spruto(spruto, video_id) + + +class MyviEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?myvi\.tv/(?:[^?]+\?.*?\bv=|embed/)(?P<id>[\da-z]+)' + _TESTS = [{ + 'url': 'https://www.myvi.tv/embed/ccdqic3wgkqwpb36x9sxg43t4r', + 'info_dict': { + 'id': 'b3ea0663-3234-469d-873e-7fecf36b31d1', + 'ext': 'mp4', + 'title': 'Твоя (original song).mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 277, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.myvi.tv/idmi6o?v=ccdqic3wgkqwpb36x9sxg43t4r#watch', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MyviIE.suitable(url) else super(MyviEmbedIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.myvi.tv/embed/%s' % video_id, video_id) + + myvi_id = self._search_regex( + r'CreatePlayer\s*\(\s*["\'].*?\bv=([\da-zA-Z_]+)', + webpage, 'video id') + + return self.url_result('myvi:%s' % myvi_id, ie=MyviIE.ie_key()) diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py deleted file mode 100644 index 6bb64eb63..000000000 --- a/youtube_dl/extractor/myvideo.py +++ /dev/null @@ -1,177 +0,0 @@ -from __future__ import unicode_literals - -import binascii -import base64 -import hashlib -import re -import json - -from .common import InfoExtractor -from ..compat import ( - compat_ord, - compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, -) -from ..utils import ( - ExtractorError, - sanitized_Request, -) - - -class MyVideoIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*' - IE_NAME = 'myvideo' - _TEST = { - 'url': 'http://www.myvideo.de/watch/8229274/bowling_fail_or_win', - 'md5': '2d2753e8130479ba2cb7e0a37002053e', - 'info_dict': { - 'id': '8229274', - 'ext': 'flv', - 'title': 'bowling-fail-or-win', - } - } - - # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git - # Released into the Public Domain by Tristan Fischer on 2013-05-19 - # https://github.com/rg3/youtube-dl/pull/842 - def __rc4crypt(self, data, key): - x = 0 - box = list(range(256)) - for i in list(range(256)): - x = (x + box[i] + compat_ord(key[i % len(key)])) % 256 - box[i], box[x] = box[x], box[i] - x = 0 - y = 0 - out = '' - for char in data: - x = (x + 1) % 256 - y = (y + box[x]) % 256 - box[x], box[y] = box[y], box[x] - out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256]) - return out - - def __md5(self, s): - return hashlib.md5(s).hexdigest().encode() - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - GK = ( - b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt' - b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3' - b'TnpsbA0KTVRkbU1tSTRNdz09' - ) - - # Get video webpage - webpage_url = 'http://www.myvideo.de/watch/%s' % video_id - webpage = self._download_webpage(webpage_url, video_id) - - mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage) - if mobj is not None: - self.report_extraction(video_id) - video_url = mobj.group(1) + '.flv' - - video_title = self._html_search_regex('<title>([^<]+)', - webpage, 'title') - - return { - 'id': video_id, - 'url': video_url, - 'title': video_title, - } - - mobj = re.search(r'data-video-service="/service/data/video/%s/config' % video_id, webpage) - if mobj is not None: - request = sanitized_Request('http://www.myvideo.de/service/data/video/%s/config' % video_id, '') - response = self._download_webpage(request, video_id, - 'Downloading video info') - info = json.loads(base64.b64decode(response).decode('utf-8')) - return { - 'id': video_id, - 'title': info['title'], - 'url': info['streaming_url'].replace('rtmpe', 'rtmpt'), - 'play_path': info['filename'], - 'ext': 'flv', - 'thumbnail': info['thumbnail'][0]['url'], - } - - # try encxml - mobj = re.search('var flashvars={(.+?)}', webpage) - if mobj is None: - raise ExtractorError('Unable to extract video') - - params = {} - encxml = '' - sec = mobj.group(1) - for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec): - if not a == '_encxml': - params[a] = b - else: - encxml = compat_urllib_parse_unquote(b) - if not params.get('domain'): - params['domain'] = 'www.myvideo.de' - xmldata_url = '%s?%s' % (encxml, compat_urllib_parse_urlencode(params)) - if 'flash_playertype=MTV' in xmldata_url: - self._downloader.report_warning('avoiding MTV player') - xmldata_url = ( - 'http://www.myvideo.de/dynamic/get_player_video_xml.php' - '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes' - ) % video_id - - # get enc data - enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1] - enc_data_b = binascii.unhexlify(enc_data) - sk = self.__md5( - base64.b64decode(base64.b64decode(GK)) + - self.__md5( - str(video_id).encode('utf-8') - ) - ) - dec_data = self.__rc4crypt(enc_data_b, sk) - - # extracting infos - self.report_extraction(video_id) - - video_url = None - mobj = re.search('connectionurl=\'(.*?)\'', dec_data) - if mobj: - video_url = compat_urllib_parse_unquote(mobj.group(1)) - if 'myvideo2flash' in video_url: - self.report_warning( - 'Rewriting URL to use unencrypted rtmp:// ...', - video_id) - video_url = video_url.replace('rtmpe://', 'rtmp://') - - if not video_url: - # extract non rtmp videos - mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data) - if mobj is None: - raise ExtractorError('unable to extract url') - video_url = compat_urllib_parse_unquote(mobj.group(1)) + compat_urllib_parse_unquote(mobj.group(2)) - - video_file = self._search_regex('source=\'(.*?)\'', dec_data, 'video file') - video_file = compat_urllib_parse_unquote(video_file) - - if not video_file.endswith('f4m'): - ppath, prefix = video_file.split('.') - video_playpath = '%s:%s' % (prefix, ppath) - else: - video_playpath = '' - - video_swfobj = self._search_regex(r'swfobject.embedSWF\(\'(.+?)\'', webpage, 'swfobj') - video_swfobj = compat_urllib_parse_unquote(video_swfobj) - - video_title = self._html_search_regex("(.*?)", - webpage, 'title') - - return { - 'id': video_id, - 'url': video_url, - 'tc_url': video_url, - 'title': video_title, - 'ext': 'flv', - 'play_path': video_playpath, - 'player_url': video_swfobj, - } diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index b91d86528..4d2ee6408 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -68,11 +68,11 @@ class NationalGeographicVideoIE(InfoExtractor): class NationalGeographicIE(ThePlatformIE, AdobePassIE): IE_NAME = 'natgeo' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/(?:videos|episodes)/(?P[^/?]+)' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:(?:(?:wild/)?[^/]+/)?(?:videos|episodes)|u)/(?P[^/?]+)' _TESTS = [ { - 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', + 'url': 'http://channel.nationalgeographic.com/u/kdi9Ld0PN2molUUIMSBGxoeDhD729KRjQcnxtetilWPMevo8ZwUBIDuPR0Q3D2LVaTsk0MPRkRWDB8ZhqWVeyoxfsZZm36yRp1j-zPfsHEyI_EgAeFY/', 'md5': '518c9aa655686cf81493af5cc21e2a04', 'info_dict': { 'id': 'vKInpacll2pC', @@ -86,7 +86,7 @@ class NationalGeographicIE(ThePlatformIE, AdobePassIE): 'add_ie': ['ThePlatform'], }, { - 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', + 'url': 'http://channel.nationalgeographic.com/u/kdvOstqYaBY-vSBPyYgAZRUL4sWUJ5XUUPEhc7ISyBHqoIO4_dzfY3K6EjHIC0hmFXoQ7Cpzm6RkET7S3oMlm6CFnrQwSUwo/', 'md5': 'c4912f656b4cbe58f3e000c489360989', 'info_dict': { 'id': 'Pok5lWCkiEFA', @@ -102,6 +102,18 @@ class NationalGeographicIE(ThePlatformIE, AdobePassIE): { 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episodes/the-power-of-miracles/', 'only_matching': True, + }, + { + 'url': 'http://channel.nationalgeographic.com/videos/treasures-rediscovered/', + 'only_matching': True, + }, + { + 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', + 'only_matching': True, + }, + { + 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', + 'only_matching': True, } ] @@ -111,7 +123,7 @@ class NationalGeographicIE(ThePlatformIE, AdobePassIE): release_url = self._search_regex( r'video_auth_playlist_url\s*=\s*"([^"]+)"', webpage, 'release url') - theplatform_path = self._search_regex(r'https?://link.theplatform.com/s/([^?]+)', release_url, 'theplatform path') + theplatform_path = self._search_regex(r'https?://link\.theplatform\.com/s/([^?]+)', release_url, 'theplatform path') video_id = theplatform_path.split('/')[-1] query = { 'mbr': 'true', diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index e8131333f..bb3d94413 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -43,9 +41,14 @@ class NaverIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"', - webpage) - if m_id is None: + vid = self._search_regex( + r'videoId["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'video id', fatal=None, group='value') + in_key = self._search_regex( + r'inKey["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'key', default=None, group='value') + + if not vid or not in_key: error = self._html_search_regex( r'(?s)
\s*(?:)?\s*

(?P.+?)

\s*
', webpage, 'error', default=None) @@ -53,9 +56,9 @@ class NaverIE(InfoExtractor): raise ExtractorError(error, expected=True) raise ExtractorError('couldn\'t extract vid and key') video_data = self._download_json( - 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + m_id.group(1), + 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + vid, video_id, query={ - 'key': m_id.group(2), + 'key': in_key, }) meta = video_data['meta'] title = meta['subject'] diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 53561961c..be295a7a3 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -122,7 +122,7 @@ class NBAIE(TurnerBaseIE): playlist_title = self._og_search_title(webpage, fatal=False) entries = OnDemandPagedList( functools.partial(self._fetch_page, team, video_id), - self._PAGE_SIZE, use_cache=True) + self._PAGE_SIZE) return self.playlist_result(entries, team, playlist_title) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 62db70b43..9dc8f9ebc 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import re +import base64 from .common import InfoExtractor from .theplatform import ThePlatformIE @@ -15,7 +16,7 @@ from ..utils import ( class NBCIE(AdobePassIE): - _VALID_URL = r'(?Phttps?://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?Pn?\d+))' + _VALID_URL = r'https?(?P://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?Pn?\d+))' _TESTS = [ { @@ -67,11 +68,16 @@ class NBCIE(AdobePassIE): 'skip_download': True, }, 'skip': 'Only works from US', - } + }, + { + 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', + 'only_matching': True, + }, ] def _real_extract(self, url): permalink, video_id = re.match(self._VALID_URL, url).groups() + permalink = 'http' + permalink video_data = self._download_json( 'https://api.nbc.com/v3/videos', video_id, query={ 'filter[permalink]': permalink, @@ -109,10 +115,10 @@ class NBCSportsVPlayerIE(InfoExtractor): _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' _TESTS = [{ - 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI', + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', 'info_dict': { 'id': '9CsDKds0kvHI', - 'ext': 'flv', + 'ext': 'mp4', 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', 'timestamp': 1426270238, @@ -120,7 +126,7 @@ class NBCSportsVPlayerIE(InfoExtractor): 'uploader': 'NBCU-SPORTS', } }, { - 'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z', + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', 'only_matching': True, }] @@ -134,7 +140,8 @@ class NBCSportsVPlayerIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - theplatform_url = self._og_search_video_url(webpage) + theplatform_url = self._og_search_video_url(webpage).replace( + 'vplayer.nbcsports.com', 'player.theplatform.com') return self.url_result(theplatform_url, 'ThePlatform') @@ -352,6 +359,7 @@ class NBCNewsIE(ThePlatformIE): class NBCOlympicsIE(InfoExtractor): + IE_NAME = 'nbcolympics' _VALID_URL = r'https?://www\.nbcolympics\.com/video/(?P[a-z-]+)' _TEST = { @@ -389,3 +397,54 @@ class NBCOlympicsIE(InfoExtractor): 'ie_key': ThePlatformIE.ie_key(), 'display_id': display_id, } + + +class NBCOlympicsStreamIE(AdobePassIE): + IE_NAME = 'nbcolympics:stream' + _VALID_URL = r'https?://stream\.nbcolympics\.com/(?P[0-9a-z-]+)' + _TEST = { + 'url': 'http://stream.nbcolympics.com/2018-winter-olympics-nbcsn-evening-feb-8', + 'info_dict': { + 'id': '203493', + 'ext': 'mp4', + 'title': 're:Curling, Alpine, Luge [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + _DATA_URL_TEMPLATE = 'http://stream.nbcolympics.com/data/%s_%s.json' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + pid = self._search_regex(r'pid\s*=\s*(\d+);', webpage, 'pid') + resource = self._search_regex( + r"resource\s*=\s*'(.+)';", webpage, + 'resource').replace("' + pid + '", pid) + event_config = self._download_json( + self._DATA_URL_TEMPLATE % ('event_config', pid), + pid)['eventConfig'] + title = self._live_title(event_config['eventTitle']) + source_url = self._download_json( + self._DATA_URL_TEMPLATE % ('live_sources', pid), + pid)['videoSources'][0]['sourceUrl'] + media_token = self._extract_mvpd_auth( + url, pid, event_config.get('requestorId', 'NBCOlympics'), resource) + formats = self._extract_m3u8_formats(self._download_webpage( + 'http://sp.auth.adobe.com/tvs/v1/sign', pid, query={ + 'cdn': 'akamai', + 'mediaToken': base64.b64encode(media_token.encode()), + 'resource': base64.b64encode(resource.encode()), + 'url': source_url, + }), pid, 'mp4') + self._sort_formats(formats) + + return { + 'id': pid, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'is_live': True, + } diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 07528d140..aec2ea133 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -190,10 +190,12 @@ class NDREmbedBaseIE(InfoExtractor): ext = determine_ext(src, None) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id='hds')) + src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, + f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls', entry_protocol='m3u8_native')) + src, video_id, 'mp4', m3u8_id='hls', + entry_protocol='m3u8_native', fatal=False)) else: quality = f.get('quality') ff = { diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py index 255f60878..ddec89f2c 100644 --- a/youtube_dl/extractor/ndtv.py +++ b/youtube_dl/extractor/ndtv.py @@ -1,45 +1,106 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote_plus +) from ..utils import ( - int_or_none, + parse_duration, remove_end, unified_strdate, + urljoin ) class NDTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ndtv\.com/video/(?:[^/]+/)+[^/?^&]+-(?P\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?ndtv\.com/(?:[^/]+/)*videos?/?(?:[^/]+/)*[^/?^&]+-(?P\d+)' - _TEST = { - 'url': 'http://www.ndtv.com/video/news/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal-300710', - 'md5': '39f992dbe5fb531c395d8bbedb1e5e88', - 'info_dict': { - 'id': '300710', - 'ext': 'mp4', - 'title': "NDTV exclusive: Don't need character certificate from Rahul Gandhi, says Arvind Kejriwal", - 'description': 'md5:ab2d4b4a6056c5cb4caa6d729deabf02', - 'upload_date': '20131208', - 'duration': 1327, - 'thumbnail': r're:https?://.*\.jpg', + _TESTS = [ + { + 'url': 'https://khabar.ndtv.com/video/show/prime-time/prime-time-ill-system-and-poor-education-468818', + 'md5': '78efcf3880ef3fd9b83d405ca94a38eb', + 'info_dict': { + 'id': '468818', + 'ext': 'mp4', + 'title': "प्राइम टाइम: सिस्टम बीमार, स्कूल बदहाल", + 'description': 'md5:f410512f1b49672e5695dea16ef2731d', + 'upload_date': '20170928', + 'duration': 2218, + 'thumbnail': r're:https?://.*\.jpg', + } }, - } + { + # __filename is url + 'url': 'http://movies.ndtv.com/videos/cracker-free-diwali-wishes-from-karan-johar-kriti-sanon-other-stars-470304', + 'md5': 'f1d709352305b44443515ac56b45aa46', + 'info_dict': { + 'id': '470304', + 'ext': 'mp4', + 'title': "Cracker-Free Diwali Wishes From Karan Johar, Kriti Sanon & Other Stars", + 'description': 'md5:f115bba1adf2f6433fa7c1ade5feb465', + 'upload_date': '20171019', + 'duration': 137, + 'thumbnail': r're:https?://.*\.jpg', + } + }, + { + 'url': 'https://www.ndtv.com/video/news/news/delhi-s-air-quality-status-report-after-diwali-is-very-poor-470372', + 'only_matching': True + }, + { + 'url': 'https://auto.ndtv.com/videos/the-cnb-daily-october-13-2017-469935', + 'only_matching': True + }, + { + 'url': 'https://sports.ndtv.com/cricket/videos/2nd-t20i-rock-thrown-at-australia-cricket-team-bus-after-win-over-india-469764', + 'only_matching': True + }, + { + 'url': 'http://gadgets.ndtv.com/videos/uncharted-the-lost-legacy-review-465568', + 'only_matching': True + }, + { + 'url': 'http://profit.ndtv.com/videos/news/video-indian-economy-on-very-solid-track-international-monetary-fund-chief-470040', + 'only_matching': True + }, + { + 'url': 'http://food.ndtv.com/video-basil-seeds-coconut-porridge-419083', + 'only_matching': True + }, + { + 'url': 'https://doctor.ndtv.com/videos/top-health-stories-of-the-week-467396', + 'only_matching': True + }, + { + 'url': 'https://swirlster.ndtv.com/video/how-to-make-friends-at-work-469324', + 'only_matching': True + } + ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' - NDTV') + # '__title' does not contain extra words such as sub-site name, "Video" etc. + title = compat_urllib_parse_unquote_plus( + self._search_regex(r"__title\s*=\s*'([^']+)'", webpage, 'title', default=None) or + self._og_search_title(webpage)) filename = self._search_regex( - r"__filename='([^']+)'", webpage, 'video filename') - video_url = 'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % filename + r"(?:__)?filename\s*[:=]\s*'([^']+)'", webpage, 'video filename') + # in "movies" sub-site pages, filename is URL + video_url = urljoin('https://ndtvod.bc-ssl.cdn.bitgravity.com/23372/ndtv/', filename.lstrip('/')) - duration = int_or_none(self._search_regex( - r"__duration='([^']+)'", webpage, 'duration', fatal=False)) + # "doctor" sub-site has MM:SS format + duration = parse_duration(self._search_regex( + r"(?:__)?duration\s*[:=]\s*'([^']+)'", webpage, 'duration', fatal=False)) + # "sports", "doctor", "swirlster" sub-sites don't have 'publish-date' upload_date = unified_strdate(self._html_search_meta( - 'publish-date', webpage, 'upload date', fatal=False)) + 'publish-date', webpage, 'upload date', default=None) or self._html_search_meta( + 'uploadDate', webpage, 'upload date', default=None) or self._search_regex( + r'datePublished"\s*:\s*"([^"]+)"', webpage, 'upload date', fatal=False)) description = remove_end(self._og_search_description(webpage), ' (Read more)') diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py index 0e26f8399..82e7cf522 100644 --- a/youtube_dl/extractor/newgrounds.py +++ b/youtube_dl/extractor/newgrounds.py @@ -87,19 +87,21 @@ class NewgroundsIE(InfoExtractor): self._check_formats(formats, media_id) self._sort_formats(formats) - uploader = self._search_regex( - r'(?:Author|Writer)\s*]+>([^<]+)', webpage, 'uploader', + uploader = self._html_search_regex( + (r'(?s)]*>(.+?).*?\s*Author\s*', + r'(?:Author|Writer)\s*]+>([^<]+)'), webpage, 'uploader', fatal=False) - timestamp = unified_timestamp(self._search_regex( - r'
Uploaded
\s*
([^<]+)', webpage, 'timestamp', + timestamp = unified_timestamp(self._html_search_regex( + (r'
\s*Uploaded\s*
\s*
([^<]+
\s*
[^<]+)', + r'
\s*Uploaded\s*
\s*
([^<]+)'), webpage, 'timestamp', default=None)) duration = parse_duration(self._search_regex( - r'
Song\s*
.+?
([^<]+)', webpage, 'duration', - default=None)) + r'(?s)
\s*Song\s*
\s*
.+?
\s*
([^<]+)', webpage, + 'duration', default=None)) filesize_approx = parse_filesize(self._html_search_regex( - r'
Song\s*
(.+?)
', webpage, 'filesize', + r'(?s)
\s*Song\s*
\s*
(.+?)
', webpage, 'filesize', default=None)) if len(formats) == 1: formats[0]['filesize_approx'] = filesize_approx diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index d0235fdfe..5e46a75c0 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -18,11 +18,18 @@ from ..utils import ( class NexxIE(InfoExtractor): - _VALID_URL = r'https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P\d+)/videos/byid/(?P\d+)' + _VALID_URL = r'''(?x) + (?: + https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P\d+)/videos/byid/| + nexx:(?:(?P\d+):)?| + https?://arc\.nexx\.cloud/api/video/ + ) + (?P\d+) + ''' _TESTS = [{ # movie 'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907', - 'md5': '16746bfc28c42049492385c989b26c4a', + 'md5': '828cea195be04e66057b846288295ba1', 'info_dict': { 'id': '128907', 'ext': 'mp4', @@ -36,9 +43,6 @@ class NexxIE(InfoExtractor): 'timestamp': 1384264416, 'upload_date': '20131112', }, - 'params': { - 'format': 'bestvideo', - }, }, { # episode 'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858', @@ -56,14 +60,44 @@ class NexxIE(InfoExtractor): 'season_number': 2, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, + }, { + # does not work via arc + 'url': 'nexx:741:1269984', + 'md5': 'c714b5b238b2958dc8d5642addba6886', + 'info_dict': { + 'id': '1269984', + 'ext': 'mp4', + 'title': '1 TAG ohne KLO... wortwörtlich! 😑', + 'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑', + 'description': 'md5:4604539793c49eda9443ab5c5b1d612f', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 607, + 'timestamp': 1518614955, + 'upload_date': '20180214', + }, }, { 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', 'only_matching': True, + }, { + 'url': 'nexx:748:128907', + 'only_matching': True, + }, { + 'url': 'nexx:128907', + 'only_matching': True, + }, { + 'url': 'https://arc.nexx.cloud/api/video/128907.json', + 'only_matching': True, }] + @staticmethod + def _extract_domain_id(webpage): + mobj = re.search( + r']+\bsrc=["\'](?:https?:)?//require\.nexx(?:\.cloud|cdn\.com)/(?P\d+)', + webpage) + return mobj.group('id') if mobj else None + @staticmethod def _extract_urls(webpage): # Reference: @@ -72,11 +106,8 @@ class NexxIE(InfoExtractor): entries = [] # JavaScript Integration - mobj = re.search( - r']+\bsrc=["\']https?://require\.nexx(?:\.cloud|cdn\.com)/(?P\d+)', - webpage) - if mobj: - domain_id = mobj.group('id') + domain_id = NexxIE._extract_domain_id(webpage) + if domain_id: for video_id in re.findall( r'(?is)onPLAYReady.+?_play\.init\s*\(.+?\s*,\s*["\']?(\d+)', webpage): @@ -112,102 +143,150 @@ class NexxIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - domain_id, video_id = mobj.group('domain_id', 'id') + domain_id = mobj.group('domain_id') or mobj.group('domain_id_s') + video_id = mobj.group('id') - # Reverse engineered from JS code (see getDeviceID function) - device_id = '%d:%d:%d%d' % ( - random.randint(1, 4), int(time.time()), - random.randint(1e4, 99999), random.randint(1, 9)) + video = None - result = self._call_api(domain_id, 'session/init', video_id, data={ - 'nxp_devh': device_id, - 'nxp_userh': '', - 'precid': '0', - 'playlicense': '0', - 'screenx': '1920', - 'screeny': '1080', - 'playerversion': '6.0.00', - 'gateway': 'html5', - 'adGateway': '', - 'explicitlanguage': 'en-US', - 'addTextTemplates': '1', - 'addDomainData': '1', - 'addAdModel': '1', - }, headers={ - 'X-Request-Enable-Auth-Fallback': '1', - }) + response = self._download_json( + 'https://arc.nexx.cloud/api/video/%s.json' % video_id, + video_id, fatal=False) + if response and isinstance(response, dict): + result = response.get('result') + if result and isinstance(result, dict): + video = result - cid = result['general']['cid'] + # not all videos work via arc, e.g. nexx:741:1269984 + if not video: + # Reverse engineered from JS code (see getDeviceID function) + device_id = '%d:%d:%d%d' % ( + random.randint(1, 4), int(time.time()), + random.randint(1e4, 99999), random.randint(1, 9)) - # As described in [1] X-Request-Token generation algorithm is - # as follows: - # md5( operation + domain_id + domain_secret ) - # where domain_secret is a static value that will be given by nexx.tv - # as per [1]. Here is how this "secret" is generated (reversed - # from _play.api.init function, search for clienttoken). So it's - # actually not static and not that much of a secret. - # 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf - secret = result['device']['clienttoken'][int(device_id[0]):] - secret = secret[0:len(secret) - int(device_id[-1])] - - op = 'byid' - - # Reversed from JS code for _play.api.call function (search for - # X-Request-Token) - request_token = hashlib.md5( - ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest() - - video = self._call_api( - domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={ - 'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description', - 'addInteractionOptions': '1', - 'addStatusDetails': '1', - 'addStreamDetails': '1', - 'addCaptions': '1', - 'addScenes': '1', - 'addHotSpots': '1', - 'addBumpers': '1', - 'captionFormat': 'data', + result = self._call_api(domain_id, 'session/init', video_id, data={ + 'nxp_devh': device_id, + 'nxp_userh': '', + 'precid': '0', + 'playlicense': '0', + 'screenx': '1920', + 'screeny': '1080', + 'playerversion': '6.0.00', + 'gateway': 'html5', + 'adGateway': '', + 'explicitlanguage': 'en-US', + 'addTextTemplates': '1', + 'addDomainData': '1', + 'addAdModel': '1', }, headers={ - 'X-Request-CID': cid, - 'X-Request-Token': request_token, + 'X-Request-Enable-Auth-Fallback': '1', }) + cid = result['general']['cid'] + + # As described in [1] X-Request-Token generation algorithm is + # as follows: + # md5( operation + domain_id + domain_secret ) + # where domain_secret is a static value that will be given by nexx.tv + # as per [1]. Here is how this "secret" is generated (reversed + # from _play.api.init function, search for clienttoken). So it's + # actually not static and not that much of a secret. + # 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf + secret = result['device']['clienttoken'][int(device_id[0]):] + secret = secret[0:len(secret) - int(device_id[-1])] + + op = 'byid' + + # Reversed from JS code for _play.api.call function (search for + # X-Request-Token) + request_token = hashlib.md5( + ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest() + + video = self._call_api( + domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={ + 'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description', + 'addInteractionOptions': '1', + 'addStatusDetails': '1', + 'addStreamDetails': '1', + 'addCaptions': '1', + 'addScenes': '1', + 'addHotSpots': '1', + 'addBumpers': '1', + 'captionFormat': 'data', + }, headers={ + 'X-Request-CID': cid, + 'X-Request-Token': request_token, + }) + general = video['general'] title = general['title'] stream_data = video['streamdata'] language = general.get('language_raw') or '' - # TODO: reverse more cdns and formats + # TODO: reverse more cdns cdn = stream_data['cdnType'] assert cdn == 'azure' azure_locator = stream_data['azureLocator'] - AZURE_URL = 'http://nx-p%02d.akamaized.net/' - - for secure in ('s', ''): - cdn_shield = stream_data.get('cdnShieldHTTP%s' % secure.upper()) - if cdn_shield: - azure_base = 'http%s://%s' % (secure, cdn_shield) - break - else: - azure_base = AZURE_URL % int(stream_data['azureAccount'].replace('nexxplayplus', '')) + def get_cdn_shield_base(shield_type='', static=False): + for secure in ('', 's'): + cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) + if cdn_shield: + return 'http%s://%s' % (secure, cdn_shield) + else: + if 'fb' in stream_data['azureAccount']: + prefix = 'df' if static else 'f' + else: + prefix = 'd' if static else 'p' + account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) + return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) + azure_stream_base = get_cdn_shield_base() is_ml = ',' in language - azure_m3u8_url = '%s%s/%s_src%s.ism/Manifest(format=m3u8-aapl)' % ( - azure_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( + azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' protection_token = try_get( video, lambda x: x['protectiondata']['token'], compat_str) if protection_token: - azure_m3u8_url += '?hdnts=%s' % protection_token + azure_manifest_url += '?hdnts=%s' % protection_token formats = self._extract_m3u8_formats( - azure_m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='%s-hls' % cdn) + azure_manifest_url % '(format=m3u8-aapl)', + video_id, 'mp4', 'm3u8_native', + m3u8_id='%s-hls' % cdn, fatal=False) + formats.extend(self._extract_mpd_formats( + azure_manifest_url % '(format=mpd-time-csf)', + video_id, mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_ism_formats( + azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) + + azure_progressive_base = get_cdn_shield_base('Prog', True) + azure_file_distribution = stream_data.get('azureFileDistribution') + if azure_file_distribution: + fds = azure_file_distribution.split(',') + if fds: + for fd in fds: + ss = fd.split(':') + if len(ss) == 2: + tbr = int_or_none(ss[0]) + if tbr: + f = { + 'url': '%s%s/%s_src_%s_%d.mp4' % ( + azure_progressive_base, azure_locator, video_id, ss[1], tbr), + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + width_height = ss[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 510b1c41f..256a24d86 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -10,7 +10,7 @@ from ..utils import update_url_query class NickIE(MTVServicesInfoExtractor): # None of videos on the website are still alive? IE_NAME = 'nick.com' - _VALID_URL = r'https?://(?:(?:www|beta)\.)?nick(?:jr)?\.com/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P[^/?#.]+)' + _VALID_URL = r'https?://(?P(?:(?:www|beta)\.)?nick(?:jr)?\.com)/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P[^/?#.]+)' _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' _GEO_COUNTRIES = ['US'] _TESTS = [{ @@ -69,13 +69,74 @@ class NickIE(MTVServicesInfoExtractor): 'mgid': uri, } - def _extract_mgid(self, webpage): - return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid') + def _real_extract(self, url): + domain, display_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'http://%s/data/video.endLevel.json' % domain, + display_id, query={ + 'urlKey': display_id, + }) + return self._get_videos_info(video_data['player'] + video_data['id']) + + +class NickBrIE(MTVServicesInfoExtractor): + IE_NAME = 'nickelodeon:br' + _VALID_URL = r'''(?x) + https?:// + (?: + (?P(?:www\.)?nickjr|mundonick\.uol)\.com\.br| + (?:www\.)?nickjr\.nl + ) + /(?:programas/)?[^/]+/videos/(?:episodios/)?(?P[^/?\#.]+) + ''' + _TESTS = [{ + 'url': 'http://www.nickjr.com.br/patrulha-canina/videos/210-labirinto-de-pipoca/', + 'only_matching': True, + }, { + 'url': 'http://mundonick.uol.com.br/programas/the-loud-house/videos/muitas-irmas/7ljo9j', + 'only_matching': True, + }, { + 'url': 'http://www.nickjr.nl/paw-patrol/videos/311-ge-wol-dig-om-terug-te-zijn/', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + uri = self._search_regex( + r'data-(?:contenturi|mgid)="([^"]+)', webpage, 'mgid') + video_id = self._id_from_uri(uri) + config = self._download_json( + 'http://media.mtvnservices.com/pmt/e1/access/index.html', + video_id, query={ + 'uri': uri, + 'configtype': 'edge', + }, headers={ + 'Referer': url, + }) + info_url = self._remove_template_parameter(config['feedWithQueryParams']) + if info_url == 'None': + if domain.startswith('www.'): + domain = domain[4:] + content_domain = { + 'mundonick.uol': 'mundonick.com.br', + 'nickjr': 'br.nickelodeonjunior.tv', + }[domain] + query = { + 'mgid': uri, + 'imageEp': content_domain, + 'arcEp': content_domain, + } + if domain == 'nickjr.com.br': + query['ep'] = 'c4b16088' + info_url = update_url_query( + 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed', query) + return self._get_videos_info_from_url(info_url, video_id) class NickDeIE(MTVServicesInfoExtractor): IE_NAME = 'nick.de' - _VALID_URL = r'https?://(?:www\.)?(?Pnick\.(?:de|com\.pl)|nickelodeon\.(?:nl|at))/[^/]+/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pnick\.(?:de|com\.pl|ch)|nickelodeon\.(?:nl|be|at|dk|no|se))/[^/]+/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse', 'only_matching': True, @@ -91,6 +152,21 @@ class NickDeIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nick.com.pl/seriale/474-spongebob-kanciastoporty/wideo/17412-teatr-to-jest-to-rodeo-oszolom', 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.no/program/2626-bulderhuset/videoer/90947-femteklasse-veronica-vs-vanzilla', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.dk/serier/2626-hojs-hus/videoer/761-tissepause', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.se/serier/2626-lugn-i-stormen/videos/998-', + 'only_matching': True, + }, { + 'url': 'http://www.nick.ch/shows/2304-adventure-time-abenteuerzeit-mit-finn-und-jake', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.be/afspeellijst/4530-top-videos/videos/episode/73917-inval-broodschapper-lariekoek-arie', + 'only_matching': True, }] def _extract_mrss_url(self, webpage, host): @@ -132,13 +208,31 @@ class NickNightIE(NickDeIE): class NickRuIE(MTVServicesInfoExtractor): IE_NAME = 'nickelodeonru' - _VALID_URL = r'https?://(?:www\.)nickelodeon\.ru/(?:playlist|shows|videos)/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)nickelodeon\.(?:ru|fr|es|pt|ro|hu|com\.tr)/[^/]+/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.nickelodeon.ru/shows/henrydanger/videos/episodes/3-sezon-15-seriya-licenziya-na-polyot/pmomfb#playlist/7airc6', 'only_matching': True, }, { 'url': 'http://www.nickelodeon.ru/videos/smotri-na-nickelodeon-v-iyule/g9hvh7', 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.fr/programmes/bob-l-eponge/videos/le-marathon-de-booh-kini-bottom-mardi-31-octobre/nfn7z0', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.es/videos/nickelodeon-consejos-tortitas/f7w7xy', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.pt/series/spongebob-squarepants/videos/a-bolha-de-tinta-gigante/xutq1b', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.ro/emisiuni/shimmer-si-shine/video/nahal-din-bomboane/uw5u2k', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.hu/musorok/spongyabob-kockanadrag/videok/episodes/buborekfujas-az-elszakadt-nadrag/q57iob#playlist/k6te4y', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.com.tr/programlar/sunger-bob/videolar/kayip-yatak/mgqbjy', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 026329d3e..df7f528be 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -40,7 +40,7 @@ class NiconicoIE(InfoExtractor): 'uploader': 'takuya0301', 'uploader_id': '2698420', 'upload_date': '20131123', - 'timestamp': 1385182762, + 'timestamp': int, # timestamp is unstable 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', 'duration': 33, 'view_count': int, @@ -115,8 +115,8 @@ class NiconicoIE(InfoExtractor): 'skip': 'Requires an account', }, { # "New" HTML5 video + # md5 is unstable 'url': 'http://www.nicovideo.jp/watch/sm31464864', - 'md5': '351647b4917660986dc0fa8864085135', 'info_dict': { 'id': 'sm31464864', 'ext': 'mp4', @@ -124,7 +124,7 @@ class NiconicoIE(InfoExtractor): 'description': 'md5:e52974af9a96e739196b2c1ca72b5feb', 'timestamp': 1498514060, 'upload_date': '20170626', - 'uploader': 'ゲス', + 'uploader': 'ゲスト', 'uploader_id': '40826363', 'thumbnail': r're:https?://.*', 'duration': 198, @@ -132,6 +132,25 @@ class NiconicoIE(InfoExtractor): 'comment_count': int, }, 'skip': 'Requires an account', + }, { + # Video without owner + 'url': 'http://www.nicovideo.jp/watch/sm18238488', + 'md5': 'd265680a1f92bdcbbd2a507fc9e78a9e', + 'info_dict': { + 'id': 'sm18238488', + 'ext': 'mp4', + 'title': '【実写版】ミュータントタートルズ', + 'description': 'md5:15df8988e47a86f9e978af2064bf6d8e', + 'timestamp': 1341160408, + 'upload_date': '20120701', + 'uploader': None, + 'uploader_id': None, + 'thumbnail': r're:https?://.*', + 'duration': 5271, + 'view_count': int, + 'comment_count': int, + }, + 'skip': 'Requires an account', }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, @@ -395,7 +414,9 @@ class NiconicoIE(InfoExtractor): webpage_url = get_video_info('watch_url') or url - owner = api_data.get('owner', {}) + # Note: cannot use api_data.get('owner', {}) because owner may be set to "null" + # in the JSON, which will cause None to be returned instead of {}. + owner = try_get(api_data, lambda x: x.get('owner'), dict) or {} uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id') uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname') diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index a06d38afd..dc6a27d36 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -13,7 +13,7 @@ class NineGagIE(InfoExtractor): _TESTS = [{ 'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome', 'info_dict': { - 'id': 'Kk2X5', + 'id': 'kXzwOKyGlSA', 'ext': 'mp4', 'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)', 'title': '\"People Are Awesome 2013\" Is Absolutely Awesome', diff --git a/youtube_dl/extractor/ninenow.py b/youtube_dl/extractor/ninenow.py index 351bea7ba..f32f530f7 100644 --- a/youtube_dl/extractor/ninenow.py +++ b/youtube_dl/extractor/ninenow.py @@ -4,15 +4,17 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + ExtractorError, int_or_none, float_or_none, - ExtractorError, + smuggle_url, ) class NineNowIE(InfoExtractor): IE_NAME = '9now.com.au' _VALID_URL = r'https?://(?:www\.)?9now\.com\.au/(?:[^/]+/){2}(?P[^/?#]+)' + _GEO_COUNTRIES = ['AU'] _TESTS = [{ # clip 'url': 'https://www.9now.com.au/afl-footy-show/2016/clip-ciql02091000g0hp5oktrnytc', @@ -75,7 +77,9 @@ class NineNowIE(InfoExtractor): return { '_type': 'url_transparent', - 'url': self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': self._GEO_COUNTRIES}), 'id': video_id, 'title': title, 'description': common_data.get('description'), diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index 9b5ad5a9f..febef097a 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -43,7 +43,8 @@ class NJPWWorldIE(InfoExtractor): webpage, urlh = self._download_webpage_handle( 'https://njpwworld.com/auth/login', None, note='Logging in', errnote='Unable to login', - data=urlencode_postdata({'login_id': username, 'pw': password})) + data=urlencode_postdata({'login_id': username, 'pw': password}), + headers={'Referer': 'https://njpwworld.com/auth'}) # /auth/login will return 302 for successful logins if urlh.geturl() == 'https://njpwworld.com/auth/login': self.report_warning('unable to login') diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 8b83e1f76..a9f9b10c4 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -70,7 +70,7 @@ class NocoIE(InfoExtractor): return login = self._download_json( - self._LOGIN_URL, None, 'Logging in as %s' % username, + self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata({ 'a': 'login', 'cookie': '1', diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py index f7fa098a5..974de3c3e 100644 --- a/youtube_dl/extractor/noovo.py +++ b/youtube_dl/extractor/noovo.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, + js_to_json, smuggle_url, try_get, ) @@ -24,8 +25,6 @@ class NoovoIE(InfoExtractor): 'timestamp': 1491399228, 'upload_date': '20170405', 'uploader_id': '618566855001', - 'creator': 'vtele', - 'view_count': int, 'series': 'RPM+', }, 'params': { @@ -37,13 +36,11 @@ class NoovoIE(InfoExtractor): 'info_dict': { 'id': '5395865725001', 'title': 'Épisode 13 : Les retrouvailles', - 'description': 'md5:336d5ebc5436534e61d16e63ddfca327', + 'description': 'md5:888c3330f0c1b4476c5bc99a1c040473', 'ext': 'mp4', 'timestamp': 1492019320, 'upload_date': '20170412', 'uploader_id': '618566855001', - 'creator': 'vtele', - 'view_count': int, 'series': "L'amour est dans le pré", 'season_number': 5, 'episode': 'Épisode 13', @@ -58,40 +55,46 @@ class NoovoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id, - video_id)['data'] + webpage = self._download_webpage(url, video_id) - content = try_get(data, lambda x: x['contents'][0]) + bc_url = BrightcoveNewIE._extract_url(self, webpage) - brightcove_id = data.get('brightcoveId') or content['brightcoveId'] + data = self._parse_json( + self._search_regex( + r'(?s)dataLayer\.push\(\s*({.+?})\s*\);', webpage, 'data', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) + + title = try_get( + data, lambda x: x['video']['nom'], + compat_str) or self._html_search_meta( + 'dcterms.Title', webpage, 'title', fatal=True) + + description = self._html_search_meta( + ('dcterms.Description', 'description'), webpage, 'description') series = try_get( - data, ( - lambda x: x['show']['title'], - lambda x: x['season']['show']['title']), - compat_str) + data, lambda x: x['emission']['nom']) or self._search_regex( + r']+class="banner-card__subtitle h4"[^>]*>([^<]+)', + webpage, 'series', default=None) - episode = None - og = data.get('og') - if isinstance(og, dict) and og.get('type') == 'video.episode': - episode = og.get('title') + season_el = try_get(data, lambda x: x['emission']['saison'], dict) or {} + season = try_get(season_el, lambda x: x['nom'], compat_str) + season_number = int_or_none(try_get(season_el, lambda x: x['numero'])) - video = content or data + episode_el = try_get(season_el, lambda x: x['episode'], dict) or {} + episode = try_get(episode_el, lambda x: x['nom'], compat_str) + episode_number = int_or_none(try_get(episode_el, lambda x: x['numero'])) return { '_type': 'url_transparent', 'ie_key': BrightcoveNewIE.ie_key(), - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': ['CA']}), - 'id': brightcove_id, - 'title': video.get('title'), - 'creator': video.get('source'), - 'view_count': int_or_none(video.get('viewsCount')), + 'url': smuggle_url(bc_url, {'geo_countries': ['CA']}), + 'title': title, + 'description': description, 'series': series, - 'season_number': int_or_none(try_get( - data, lambda x: x['season']['seasonNumber'])), + 'season': season, + 'season_number': season_number, 'episode': episode, - 'episode_number': int_or_none(data.get('episodeNumber')), + 'episode_number': episode_number, } diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py deleted file mode 100644 index e43b37136..000000000 --- a/youtube_dl/extractor/nowtv.py +++ /dev/null @@ -1,261 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - determine_ext, - int_or_none, - parse_iso8601, - parse_duration, - remove_start, -) - - -class NowTVBaseIE(InfoExtractor): - _VIDEO_FIELDS = ( - 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', - 'broadcastStartDate', 'seoUrl', 'duration', 'files', - 'format.defaultImage169Format', 'format.defaultImage169Logo') - - def _extract_video(self, info, display_id=None): - video_id = compat_str(info['id']) - - files = info['files'] - if not files: - if info.get('geoblocked', False): - raise ExtractorError( - 'Video %s is not available from your location due to geo restriction' % video_id, - expected=True) - if not info.get('free', True): - raise ExtractorError( - 'Video %s is not available for free' % video_id, expected=True) - - formats = [] - for item in files['items']: - if determine_ext(item['path']) != 'f4v': - continue - app, play_path = remove_start(item['path'], '/').split('/', 1) - formats.append({ - 'url': 'rtmpe://fms.rtl.de', - 'app': app, - 'play_path': 'mp4:%s' % play_path, - 'ext': 'flv', - 'page_url': 'http://rtlnow.rtl.de', - 'player_url': 'http://cdn.static-fra.de/now/vodplayer.swf', - 'tbr': int_or_none(item.get('bitrate')), - }) - self._sort_formats(formats) - - title = info['title'] - description = info.get('articleLong') or info.get('articleShort') - timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') - duration = parse_duration(info.get('duration')) - - f = info.get('format', {}) - thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') - - return { - 'id': video_id, - 'display_id': display_id or info.get('seoUrl'), - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - } - - -class NowTVIE(NowTVBaseIE): - _WORKING = False - _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P[^/]+)/(?:player|preview)' - - _TESTS = [{ - # rtl - 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player', - 'info_dict': { - 'id': '203519', - 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', - 'ext': 'flv', - 'title': 'Inka Bause stellt die neuen Bauern vor', - 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1432580700, - 'upload_date': '20150525', - 'duration': 2786, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - # rtl2 - 'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player', - 'info_dict': { - 'id': '203481', - 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', - 'ext': 'flv', - 'title': 'Berlin - Tag & Nacht (Folge 934)', - 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1432666800, - 'upload_date': '20150526', - 'duration': 2641, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - # rtlnitro - 'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player', - 'info_dict': { - 'id': '165780', - 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', - 'ext': 'flv', - 'title': 'Hals- und Beinbruch', - 'description': 'md5:b50d248efffe244e6f56737f0911ca57', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1432415400, - 'upload_date': '20150523', - 'duration': 2742, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - # superrtl - 'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player', - 'info_dict': { - 'id': '99205', - 'display_id': 'medicopter-117/angst', - 'ext': 'flv', - 'title': 'Angst!', - 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1222632900, - 'upload_date': '20080928', - 'duration': 3025, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - # ntv - 'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player', - 'info_dict': { - 'id': '203521', - 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', - 'ext': 'flv', - 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', - 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1432751700, - 'upload_date': '20150527', - 'duration': 1083, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - # vox - 'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player', - 'info_dict': { - 'id': '128953', - 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', - 'ext': 'flv', - 'title': "Büro-Fall / Chihuahua 'Joel'", - 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1432408200, - 'upload_date': '20150523', - 'duration': 3092, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview', - 'only_matching': True, - }, { - 'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', - 'only_matching': True, - }, { - 'url': 'http://www.nowtv.de/rtl2/echtzeit/list/aktuell/schnelles-geld-am-ende-der-welt/player', - 'only_matching': True, - }, { - 'url': 'http://www.nowtv.de/rtl2/zuhause-im-glueck/jahr/2015/11/eine-erschuetternde-diagnose/player', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = '%s/%s' % (mobj.group('show_id'), mobj.group('id')) - - info = self._download_json( - 'https://api.nowtv.de/v3/movies/%s?fields=%s' - % (display_id, ','.join(self._VIDEO_FIELDS)), display_id) - - return self._extract_video(info, display_id) - - -class NowTVListIE(NowTVBaseIE): - _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P[^/]+)/list/(?P[^?/#&]+)$' - - _SHOW_FIELDS = ('title', ) - _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) - - _TESTS = [{ - 'url': 'http://www.nowtv.at/rtl/stern-tv/list/aktuell', - 'info_dict': { - 'id': '17006', - 'title': 'stern TV - Aktuell', - }, - 'playlist_count': 1, - }, { - 'url': 'http://www.nowtv.at/rtl/das-supertalent/list/free-staffel-8', - 'info_dict': { - 'id': '20716', - 'title': 'Das Supertalent - FREE Staffel 8', - }, - 'playlist_count': 14, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - show_id = mobj.group('show_id') - season_id = mobj.group('id') - - fields = [] - fields.extend(self._SHOW_FIELDS) - fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) - fields.extend( - 'formatTabs.formatTabPages.container.movies.%s' % field - for field in self._VIDEO_FIELDS) - - list_info = self._download_json( - 'https://api.nowtv.de/v3/formats/seo?fields=%s&name=%s.php' - % (','.join(fields), show_id), - season_id) - - season = next( - season for season in list_info['formatTabs']['items'] - if season.get('seoheadline') == season_id) - - title = '%s - %s' % (list_info['title'], season['headline']) - - entries = [] - for container in season['formatTabPages']['items']: - for info in ((container.get('container') or {}).get('movies') or {}).get('items') or []: - entries.append(self._extract_video(info)) - - return self.playlist_result( - entries, compat_str(season.get('id') or season_id), title) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index fa4ef20c5..ff2153387 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -11,6 +11,7 @@ from ..utils import ( determine_ext, ExtractorError, fix_xml_ampersands, + int_or_none, orderedSet, parse_duration, qualities, @@ -38,7 +39,7 @@ class NPOIE(NPOBaseIE): npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}| ntr\.nl/(?:[^/]+/){2,}| omroepwnl\.nl/video/fragment/[^/]+__| - (?:zapp|npo3)\.nl/(?:[^/]+/){2} + (?:zapp|npo3)\.nl/(?:[^/]+/){2,} ) ) (?P[^/?#]+) @@ -156,6 +157,9 @@ class NPOIE(NPOBaseIE): }, { 'url': 'http://www.npo.nl/radio-gaga/13-06-2017/BNN_101383373', 'only_matching': True, + }, { + 'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927', + 'only_matching': True, }] def _real_extract(self, url): @@ -170,6 +174,10 @@ class NPOIE(NPOBaseIE): transform_source=strip_jsonp, ) + error = metadata.get('error') + if error: + raise ExtractorError(error, expected=True) + # For some videos actual video id (prid) is different (e.g. for # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698 # video id is POMS_WNL_853698 but prid is POW_00996502) @@ -187,7 +195,15 @@ class NPOIE(NPOBaseIE): formats = [] urls = set() - quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std']) + def is_legal_url(format_url): + return format_url and format_url not in urls and re.match( + r'^(?:https?:)?//', format_url) + + QUALITY_LABELS = ('Laag', 'Normaal', 'Hoog') + QUALITY_FORMATS = ('adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std') + + quality_from_label = qualities(QUALITY_LABELS) + quality_from_format_id = qualities(QUALITY_FORMATS) items = self._download_json( 'http://ida.omroep.nl/app.php/%s' % video_id, video_id, 'Downloading formats JSON', query={ @@ -196,18 +212,34 @@ class NPOIE(NPOBaseIE): })['items'][0] for num, item in enumerate(items): item_url = item.get('url') - if not item_url or item_url in urls: + if not is_legal_url(item_url): continue urls.add(item_url) format_id = self._search_regex( r'video/ida/([^/]+)', item_url, 'format id', default=None) + item_label = item.get('label') + def add_format_url(format_url): + width = int_or_none(self._search_regex( + r'(\d+)[xX]\d+', format_url, 'width', default=None)) + height = int_or_none(self._search_regex( + r'\d+[xX](\d+)', format_url, 'height', default=None)) + if item_label in QUALITY_LABELS: + quality = quality_from_label(item_label) + f_id = item_label + elif item_label in QUALITY_FORMATS: + quality = quality_from_format_id(format_id) + f_id = format_id + else: + quality, f_id = [None] * 2 formats.append({ 'url': format_url, - 'format_id': format_id, - 'quality': quality(format_id), + 'format_id': f_id, + 'width': width, + 'height': height, + 'quality': quality, }) # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 @@ -219,7 +251,7 @@ class NPOIE(NPOBaseIE): stream_info = self._download_json( item_url + '&type=json', video_id, 'Downloading %s stream JSON' - % item.get('label') or item.get('format') or format_id or num) + % item_label or item.get('format') or format_id or num) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: error = (self._parse_json( @@ -251,7 +283,7 @@ class NPOIE(NPOBaseIE): if not is_live: for num, stream in enumerate(metadata.get('streams', [])): stream_url = stream.get('url') - if not stream_url or stream_url in urls: + if not is_legal_url(stream_url): continue urls.add(stream_url) # smooth streaming is not supported @@ -469,7 +501,7 @@ class SchoolTVIE(NPODataMidEmbedIE): class HetKlokhuisIE(NPODataMidEmbedIE): IE_NAME = 'hetklokhuis' - _VALID_URL = r'https?://(?:www\.)?hetklokhuis.nl/[^/]+/\d+/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?hetklokhuis\.nl/[^/]+/\d+/(?P[^/?#&]+)' _TEST = { 'url': 'http://hetklokhuis.nl/tv-uitzending/3471/Zwaartekrachtsgolven', diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 18ead9426..3b4f51f61 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -237,7 +237,7 @@ class NRKTVIE(NRKBaseIE): (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P\d+))? ''' % _EPISODE_RE - _API_HOST = 'psapi-ne.nrk.no' + _API_HOST = 'psapi-we.nrk.no' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 854b6800c..190d8af4d 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -14,15 +14,27 @@ from ..utils import ( int_or_none, qualities, unescapeHTML, + urlencode_postdata, ) class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P[\d-]+)' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m|mobile)\.)? + (?:odnoklassniki|ok)\.ru/ + (?: + video(?:embed)?/| + web-api/video/moviePlayer/| + live/| + dk\?.*?st\.mvId= + ) + (?P[\d-]+) + ''' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', - 'md5': '6ba728d85d60aa2e6dd37c9e70fdc6bc', + 'md5': '0b62089b479e06681abaaca9d204f152', 'info_dict': { 'id': '20079905452', 'ext': 'mp4', @@ -34,7 +46,6 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': int, 'age_limit': 0, }, - 'skip': 'Video has been blocked', }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', @@ -56,7 +67,7 @@ class OdnoklassnikiIE(InfoExtractor): 'url': 'http://ok.ru/video/64211978996595-1', 'md5': '2f206894ffb5dbfcce2c5a14b909eea5', 'info_dict': { - 'id': '64211978996595-1', + 'id': 'V_VztHT5BzY', 'ext': 'mp4', 'title': 'Космическая среда от 26 августа 2015', 'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0', @@ -98,6 +109,12 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'http://mobile.ok.ru/video/20079905452', 'only_matching': True, + }, { + 'url': 'https://www.ok.ru/live/484531969818', + 'only_matching': True, + }, { + 'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#', + 'only_matching': True, }] def _real_extract(self, url): @@ -127,9 +144,14 @@ class OdnoklassnikiIE(InfoExtractor): if metadata: metadata = self._parse_json(metadata, video_id) else: + data = {} + st_location = flashvars.get('location') + if st_location: + data['st.location'] = st_location metadata = self._download_json( compat_urllib_parse_unquote(flashvars['metadataUrl']), - video_id, 'Downloading metadata JSON') + video_id, 'Downloading metadata JSON', + data=urlencode_postdata(data)) movie = metadata['movie'] @@ -178,6 +200,10 @@ class OdnoklassnikiIE(InfoExtractor): }) return info + assert title + if provider == 'LIVE_TV_APP': + info['title'] = self._live_title(title) + quality = qualities(('4', '0', '1', '2', '3', '5')) formats = [{ @@ -204,6 +230,20 @@ class OdnoklassnikiIE(InfoExtractor): if fmt_type: fmt['quality'] = quality(fmt_type) + # Live formats + m3u8_url = metadata.get('hlsMasterPlaylistUrl') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8', + m3u8_id='hls', fatal=False)) + rtmp_url = metadata.get('rtmpUrl') + if rtmp_url: + formats.append({ + 'url': rtmp_url, + 'format_id': 'rtmp', + 'ext': 'flv', + }) + self._sort_formats(formats) info['formats'] = formats diff --git a/youtube_dl/extractor/once.py b/youtube_dl/extractor/once.py index 1bf96ea56..8ae5fadd8 100644 --- a/youtube_dl/extractor/once.py +++ b/youtube_dl/extractor/once.py @@ -7,11 +7,11 @@ from .common import InfoExtractor class OnceIE(InfoExtractor): - _VALID_URL = r'https?://.+?\.unicornmedia\.com/now/[^/]+/[^/]+/(?P[^/]+)/(?P[^/]+)/(?:[^/]+/)?(?P[^/]+)/content\.(?:once|m3u8|mp4)' + _VALID_URL = r'https?://.+?\.unicornmedia\.com/now/(?:ads/vmap/)?[^/]+/[^/]+/(?P[^/]+)/(?P[^/]+)/(?:[^/]+/)?(?P[^/]+)/content\.(?:once|m3u8|mp4)' ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8' PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4' - def _extract_once_formats(self, url): + def _extract_once_formats(self, url, http_formats_preference=None): domain_id, application_id, media_item_id = re.match( OnceIE._VALID_URL, url).groups() formats = self._extract_m3u8_formats( @@ -35,6 +35,7 @@ class OnceIE(InfoExtractor): 'format_id': adaptive_format['format_id'].replace( 'hls', 'http'), 'protocol': 'http', + 'preference': http_formats_preference, }) progressive_formats.append(progressive_format) self._check_formats(progressive_formats, media_item_id) diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index 1d336cf30..c6e3d5640 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -13,11 +13,11 @@ from ..utils import ( class OnionStudiosIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:videos/[^/]+-|embed\?.*\bid=)(?P\d+)(?!-)' + _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:video(?:s/[^/]+-|/)|embed\?.*\bid=)(?P\d+)(?!-)' _TESTS = [{ 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', - 'md5': 'e49f947c105b8a78a675a0ee1bddedfe', + 'md5': '719d1f8c32094b8c33902c17bcae5e34', 'info_dict': { 'id': '2937', 'ext': 'mp4', @@ -29,12 +29,15 @@ class OnionStudiosIE(InfoExtractor): }, { 'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true', 'only_matching': True, + }, { + 'url': 'http://www.onionstudios.com/video/6139.json', + 'only_matching': True, }] @staticmethod def _extract_url(webpage): mobj = re.search( - r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?onionstudios\.com/embed.+?)\1', webpage) + r'(?s)<(?:iframe|bulbs-video)[^>]+?src=(["\'])(?P(?:https?:)?//(?:www\.)?onionstudios\.com/(?:embed.+?|video/\d+\.json))\1', webpage) if mobj: return mobj.group('url') diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 52580baed..ad8bf03f8 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -1,9 +1,13 @@ from __future__ import unicode_literals + import re -import base64 from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_b64decode, + compat_str, + compat_urllib_parse_urlencode, +) from ..utils import ( determine_ext, ExtractorError, @@ -12,7 +16,6 @@ from ..utils import ( try_get, unsmuggle_url, ) -from ..compat import compat_urllib_parse_urlencode class OoyalaBaseIE(InfoExtractor): @@ -44,7 +47,7 @@ class OoyalaBaseIE(InfoExtractor): url_data = try_get(stream, lambda x: x['url']['data'], compat_str) if not url_data: continue - s_url = base64.b64decode(url_data.encode('ascii')).decode('utf-8') + s_url = compat_b64decode(url_data).decode('utf-8') if not s_url or s_url in urls: continue urls.append(s_url) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d8036b54a..d0bdd60b8 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,18 +1,249 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import os import re +import subprocess +import tempfile from .common import InfoExtractor -from ..compat import compat_chr +from ..compat import ( + compat_urlparse, + compat_kwargs, +) from ..utils import ( + check_executable, determine_ext, + encodeArgument, ExtractorError, + get_element_by_id, + get_exe_version, + is_outdated_version, + std_headers, ) +def cookie_to_dict(cookie): + cookie_dict = { + 'name': cookie.name, + 'value': cookie.value, + } + if cookie.port_specified: + cookie_dict['port'] = cookie.port + if cookie.domain_specified: + cookie_dict['domain'] = cookie.domain + if cookie.path_specified: + cookie_dict['path'] = cookie.path + if cookie.expires is not None: + cookie_dict['expires'] = cookie.expires + if cookie.secure is not None: + cookie_dict['secure'] = cookie.secure + if cookie.discard is not None: + cookie_dict['discard'] = cookie.discard + try: + if (cookie.has_nonstandard_attr('httpOnly') or + cookie.has_nonstandard_attr('httponly') or + cookie.has_nonstandard_attr('HttpOnly')): + cookie_dict['httponly'] = True + except TypeError: + pass + return cookie_dict + + +def cookie_jar_to_list(cookie_jar): + return [cookie_to_dict(cookie) for cookie in cookie_jar] + + +class PhantomJSwrapper(object): + """PhantomJS wrapper class + + This class is experimental. + """ + + _TEMPLATE = r''' + phantom.onError = function(msg, trace) {{ + var msgStack = ['PHANTOM ERROR: ' + msg]; + if(trace && trace.length) {{ + msgStack.push('TRACE:'); + trace.forEach(function(t) {{ + msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line + + (t.function ? ' (in function ' + t.function +')' : '')); + }}); + }} + console.error(msgStack.join('\n')); + phantom.exit(1); + }}; + var page = require('webpage').create(); + var fs = require('fs'); + var read = {{ mode: 'r', charset: 'utf-8' }}; + var write = {{ mode: 'w', charset: 'utf-8' }}; + JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{ + phantom.addCookie(x); + }}); + page.settings.resourceTimeout = {timeout}; + page.settings.userAgent = "{ua}"; + page.onLoadStarted = function() {{ + page.evaluate(function() {{ + delete window._phantom; + delete window.callPhantom; + }}); + }}; + var saveAndExit = function() {{ + fs.write("{html}", page.content, write); + fs.write("{cookies}", JSON.stringify(phantom.cookies), write); + phantom.exit(); + }}; + page.onLoadFinished = function(status) {{ + if(page.url === "") {{ + page.setContent(fs.read("{html}", read), "{url}"); + }} + else {{ + {jscode} + }} + }}; + page.open(""); + ''' + + _TMP_FILE_NAMES = ['script', 'html', 'cookies'] + + @staticmethod + def _version(): + return get_exe_version('phantomjs', version_re=r'([0-9.]+)') + + def __init__(self, extractor, required_version=None, timeout=10000): + self._TMP_FILES = {} + + self.exe = check_executable('phantomjs', ['-v']) + if not self.exe: + raise ExtractorError('PhantomJS executable not found in PATH, ' + 'download it from http://phantomjs.org', + expected=True) + + self.extractor = extractor + + if required_version: + version = self._version() + if is_outdated_version(version, required_version): + self.extractor._downloader.report_warning( + 'Your copy of PhantomJS is outdated, update it to version ' + '%s or newer if you encounter any errors.' % required_version) + + self.options = { + 'timeout': timeout, + } + for name in self._TMP_FILE_NAMES: + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.close() + self._TMP_FILES[name] = tmp + + def __del__(self): + for name in self._TMP_FILE_NAMES: + try: + os.remove(self._TMP_FILES[name].name) + except (IOError, OSError, KeyError): + pass + + def _save_cookies(self, url): + cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar) + for cookie in cookies: + if 'path' not in cookie: + cookie['path'] = '/' + if 'domain' not in cookie: + cookie['domain'] = compat_urlparse.urlparse(url).netloc + with open(self._TMP_FILES['cookies'].name, 'wb') as f: + f.write(json.dumps(cookies).encode('utf-8')) + + def _load_cookies(self): + with open(self._TMP_FILES['cookies'].name, 'rb') as f: + cookies = json.loads(f.read().decode('utf-8')) + for cookie in cookies: + if cookie['httponly'] is True: + cookie['rest'] = {'httpOnly': None} + if 'expiry' in cookie: + cookie['expire_time'] = cookie['expiry'] + self.extractor._set_cookie(**compat_kwargs(cookie)) + + def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): + """ + Downloads webpage (if needed) and executes JS + + Params: + url: website url + html: optional, html code of website + video_id: video id + note: optional, displayed when downloading webpage + note2: optional, displayed when executing JS + headers: custom http headers + jscode: code to be executed when page is loaded + + Returns tuple with: + * downloaded website (after JS execution) + * anything you print with `console.log` (but not inside `page.execute`!) + + In most cases you don't need to add any `jscode`. + It is executed in `page.onLoadFinished`. + `saveAndExit();` is mandatory, use it instead of `phantom.exit()` + It is possible to wait for some element on the webpage, for example: + var check = function() { + var elementFound = page.evaluate(function() { + return document.querySelector('#b.done') !== null; + }); + if(elementFound) + saveAndExit(); + else + window.setTimeout(check, 500); + } + + page.evaluate(function(){ + document.querySelector('#a').click(); + }); + check(); + """ + if 'saveAndExit();' not in jscode: + raise ExtractorError('`saveAndExit();` not found in `jscode`') + if not html: + html = self.extractor._download_webpage(url, video_id, note=note, headers=headers) + with open(self._TMP_FILES['html'].name, 'wb') as f: + f.write(html.encode('utf-8')) + + self._save_cookies(url) + + replaces = self.options + replaces['url'] = url + user_agent = headers.get('User-Agent') or std_headers['User-Agent'] + replaces['ua'] = user_agent.replace('"', '\\"') + replaces['jscode'] = jscode + + for x in self._TMP_FILE_NAMES: + replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') + + with open(self._TMP_FILES['script'].name, 'wb') as f: + f.write(self._TEMPLATE.format(**replaces).encode('utf-8')) + + if video_id is None: + self.extractor.to_screen('%s' % (note2,)) + else: + self.extractor.to_screen('%s: %s' % (video_id, note2)) + + p = subprocess.Popen([ + self.exe, '--ssl-protocol=any', + self._TMP_FILES['script'].name + ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = p.communicate() + if p.returncode != 0: + raise ExtractorError( + 'Executing JS failed\n:' + encodeArgument(err)) + with open(self._TMP_FILES['html'].name, 'rb') as f: + html = f.read().decode('utf-8') + + self._load_cookies() + + return (html, encodeArgument(out)) + + class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:openload\.(?:co|io)|oload\.tv)/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -53,11 +284,27 @@ class OpenloadIE(InfoExtractor): # for title and ext 'url': 'https://openload.co/embed/Sxz5sADo82g/', 'only_matching': True, + }, { + # unavailable via https://openload.co/embed/e-Ixz9ZR5L0/ but available + # via https://openload.co/f/e-Ixz9ZR5L0/ + 'url': 'https://openload.co/f/e-Ixz9ZR5L0/', + 'only_matching': True, }, { 'url': 'https://oload.tv/embed/KnG-kKZdcfY/', 'only_matching': True, + }, { + 'url': 'http://www.openload.link/f/KnG-kKZdcfY', + 'only_matching': True, + }, { + 'url': 'https://oload.stream/f/KnG-kKZdcfY', + 'only_matching': True, + }, { + 'url': 'https://oload.xyz/f/WwRBpzW8Wtk', + 'only_matching': True, }] + _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' + @staticmethod def _extract_urls(webpage): return re.findall( @@ -66,63 +313,58 @@ class OpenloadIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('https://openload.co/embed/%s/' % video_id, video_id) + url_pattern = 'https://openload.co/%%s/%s/' % video_id + headers = { + 'User-Agent': self._USER_AGENT, + } - if 'File not found' in webpage or 'deleted by the owner' in webpage: - raise ExtractorError('File not found', expected=True) + for path in ('embed', 'f'): + page_url = url_pattern % path + last = path == 'f' + webpage = self._download_webpage( + page_url, video_id, 'Downloading %s webpage' % path, + headers=headers, fatal=last) + if not webpage: + continue + if 'File not found' in webpage or 'deleted by the owner' in webpage: + if not last: + continue + raise ExtractorError('File not found', expected=True, video_id=video_id) + break - ol_id = self._search_regex( - ']+id="[^"]+"[^>]*>([0-9A-Za-z]+)', - webpage, 'openload ID') + phantom = PhantomJSwrapper(self, required_version='2.0') + webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id, headers=headers) - decoded = '' - a = ol_id[0:24] - b = [] - for i in range(0, len(a), 8): - b.append(int(a[i:i + 8] or '0', 16)) - ol_id = ol_id[24:] - j = 0 - k = 0 - while j < len(ol_id): - c = 128 - d = 0 - e = 0 - f = 0 - _more = True - while _more: - if j + 1 >= len(ol_id): - c = 143 - f = int(ol_id[j:j + 2] or '0', 16) - j += 2 - d += (f & 127) << e - e += 7 - _more = f >= c - g = d ^ b[k % 3] - for i in range(4): - char_dec = (g >> 8 * i) & (c + 127) - char = compat_chr(char_dec) - if char != '#': - decoded += char - k += 1 + decoded_id = (get_element_by_id('streamurl', webpage) or + get_element_by_id('streamuri', webpage) or + get_element_by_id('streamurj', webpage) or + self._search_regex( + (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<', + r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)', + r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<', + r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<', + r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, + 'stream URL')) - video_url = 'https://openload.co/stream/%s?mime=true' - video_url = video_url % decoded + video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id title = self._og_search_title(webpage, default=None) or self._search_regex( r']+class=["\']title["\'][^>]*>([^<]+)', webpage, 'title', default=None) or self._html_search_meta( 'description', webpage, 'title', fatal=True) - entries = self._parse_html5_media_entries(url, webpage, video_id) - subtitles = entries[0]['subtitles'] if entries else None + entries = self._parse_html5_media_entries(page_url, webpage, video_id) + entry = entries[0] if entries else {} + subtitles = entry.get('subtitles') info_dict = { 'id': video_id, 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), 'url': video_url, # Seems all videos have extensions in their titles 'ext': determine_ext(title, 'mp4'), 'subtitles': subtitles, + 'http_headers': headers, } return info_dict diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index cc296eabd..c1fb580ca 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -6,14 +6,15 @@ import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - HEADRequest, - unified_strdate, - strip_jsonp, - int_or_none, - float_or_none, determine_ext, + float_or_none, + HEADRequest, + int_or_none, + orderedSet, remove_end, + strip_jsonp, unescapeHTML, + unified_strdate, ) @@ -48,13 +49,13 @@ class ORFTVthekIE(InfoExtractor): 'params': { 'skip_download': True, # rtsp downloads }, - '_skip': 'Blocked outside of Austria / Germany', + 'skip': 'Blocked outside of Austria / Germany', }, { 'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141', - 'skip_download': True, + 'only_matching': True, }, { 'url': 'http://tvthek.orf.at/profile/Universum/35429', - 'skip_download': True, + 'only_matching': True, }] def _real_extract(self, url): @@ -307,3 +308,108 @@ class ORFIPTVIE(InfoExtractor): 'upload_date': upload_date, 'formats': formats, } + + +class ORFFM4StoryIE(InfoExtractor): + IE_NAME = 'orf:fm4:story' + IE_DESC = 'fm4.orf.at stories' + _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P\d+)' + + _TEST = { + 'url': 'http://fm4.orf.at/stories/2865738/', + 'playlist': [{ + 'md5': 'e1c2c706c45c7b34cf478bbf409907ca', + 'info_dict': { + 'id': '547792', + 'ext': 'flv', + 'title': 'Manu Delago und Inner Tongue live', + 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', + 'duration': 1748.52, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + }, + }, { + 'md5': 'c6dd2179731f86f4f55a7b49899d515f', + 'info_dict': { + 'id': '547798', + 'ext': 'flv', + 'title': 'Manu Delago und Inner Tongue live (2)', + 'duration': 1504.08, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', + }, + }], + } + + def _real_extract(self, url): + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + + entries = [] + all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage)) + for idx, video_id in enumerate(all_ids): + data = self._download_json( + 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, + video_id)[0] + + duration = float_or_none(data['duration'], 1000) + + video = data['sources']['q8c'] + load_balancer_url = video['loadBalancerUrl'] + abr = int_or_none(video.get('audioBitrate')) + vbr = int_or_none(video.get('bitrate')) + fps = int_or_none(video.get('videoFps')) + width = int_or_none(video.get('videoWidth')) + height = int_or_none(video.get('videoHeight')) + thumbnail = video.get('preview') + + rendition = self._download_json( + load_balancer_url, video_id, transform_source=strip_jsonp) + + f = { + 'abr': abr, + 'vbr': vbr, + 'fps': fps, + 'width': width, + 'height': height, + } + + formats = [] + for format_id, format_url in rendition['redirect'].items(): + if format_id == 'rtmp': + ff = f.copy() + ff.update({ + 'url': format_url, + 'format_id': format_id, + }) + formats.append(ff) + elif determine_ext(format_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id)) + elif determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id)) + else: + continue + self._sort_formats(formats) + + title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at') + if idx >= 1: + # Titles are duplicates, make them unique + title += ' (' + str(idx + 1) + ')' + description = self._og_search_description(webpage) + upload_date = unified_strdate(self._html_search_meta( + 'dc.date', webpage, 'upload date')) + + entries.append({ + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + }) + + return self.playlist_result(entries) diff --git a/youtube_dl/extractor/pandatv.py b/youtube_dl/extractor/pandatv.py index c86d70771..13a2e7efc 100644 --- a/youtube_dl/extractor/pandatv.py +++ b/youtube_dl/extractor/pandatv.py @@ -33,7 +33,7 @@ class PandaTVIE(InfoExtractor): video_id = self._match_id(url) config = self._download_json( - 'https://www.panda.tv/api_room?roomid=%s' % video_id, video_id) + 'https://www.panda.tv/api_room_v2?roomid=%s' % video_id, video_id) error_code = config.get('errno', 0) if error_code is not 0: @@ -66,6 +66,11 @@ class PandaTVIE(InfoExtractor): plflag1 = '4' live_panda = 'live_panda' if plflag0 < 1 else '' + plflag_auth = self._parse_json(video_info['plflag_list'], video_id) + sign = plflag_auth['auth']['sign'] + ts = plflag_auth['auth']['time'] + rid = plflag_auth['auth']['rid'] + quality_key = qualities(['OD', 'HD', 'SD']) suffix = ['_small', '_mid', ''] formats = [] @@ -77,8 +82,8 @@ class PandaTVIE(InfoExtractor): continue for pref, (ext, pl) in enumerate((('m3u8', '-hls'), ('flv', ''))): formats.append({ - 'url': 'https://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s' - % (pl, plflag1, room_key, live_panda, suffix[quality], ext), + 'url': 'https://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s?sign=%s&ts=%s&rid=%s' + % (pl, plflag1, room_key, live_panda, suffix[quality], ext, sign, ts, rid), 'format_id': '%s-%s' % (k, ext), 'quality': quality, 'source_preference': pref, diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py index fc7bd3411..538738c09 100644 --- a/youtube_dl/extractor/pandoratv.py +++ b/youtube_dl/extractor/pandoratv.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_str, @@ -18,7 +20,14 @@ from ..utils import ( class PandoraTVIE(InfoExtractor): IE_NAME = 'pandora.tv' IE_DESC = '판도라TV' - _VALID_URL = r'https?://(?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?pandora\.tv/view/(?P[^/]+)/(?P\d+)| # new format + (?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?| # old format + m\.pandora\.tv/?\? # mobile + ) + ''' _TESTS = [{ 'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2', 'info_dict': { @@ -53,14 +62,25 @@ class PandoraTVIE(InfoExtractor): # Test metadata only 'skip_download': True, }, + }, { + 'url': 'http://www.pandora.tv/view/mikakim/53294230#36797454_new', + 'only_matching': True, + }, { + 'url': 'http://m.pandora.tv/?c=view&ch_userid=mikakim&prgid=54600346', + 'only_matching': True, }] def _real_extract(self, url): - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = qs.get('prgid', [None])[0] - user_id = qs.get('ch_userid', [None])[0] - if any(not f for f in (video_id, user_id,)): - raise ExtractorError('Invalid URL', expected=True) + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('user_id') + video_id = mobj.group('id') + + if not user_id or not video_id: + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = qs.get('prgid', [None])[0] + user_id = qs.get('ch_userid', [None])[0] + if any(not f for f in (video_id, user_id,)): + raise ExtractorError('Invalid URL', expected=True) data = self._download_json( 'http://m.pandora.tv/?c=view&m=viewJsonApi&ch_userid=%s&prgid=%s' diff --git a/youtube_dl/extractor/parliamentliveuk.py b/youtube_dl/extractor/parliamentliveuk.py index ebdab8db9..bdd5ff565 100644 --- a/youtube_dl/extractor/parliamentliveuk.py +++ b/youtube_dl/extractor/parliamentliveuk.py @@ -11,7 +11,7 @@ class ParliamentLiveUKIE(InfoExtractor): _TESTS = [{ 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', 'info_dict': { - 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'id': '1_af9nv9ym', 'ext': 'mp4', 'title': 'Home Affairs Committee', 'uploader_id': 'FFMPEG-01', @@ -28,14 +28,14 @@ class ParliamentLiveUKIE(InfoExtractor): webpage = self._download_webpage( 'http://vodplayer.parliamentlive.tv/?mid=' + video_id, video_id) widget_config = self._parse_json(self._search_regex( - r'kWidgetConfig\s*=\s*({.+});', + r'(?s)kWidgetConfig\s*=\s*({.+});', webpage, 'kaltura widget config'), video_id) - kaltura_url = 'kaltura:%s:%s' % (widget_config['wid'][1:], widget_config['entry_id']) + kaltura_url = 'kaltura:%s:%s' % ( + widget_config['wid'][1:], widget_config['entry_id']) event_title = self._download_json( 'http://parliamentlive.tv/Event/GetShareVideo/' + video_id, video_id)['event']['title'] return { '_type': 'url_transparent', - 'id': video_id, 'title': event_title, 'description': '', 'url': kaltura_url, diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index a6a2c273f..d4b1d34ca 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -67,7 +67,7 @@ class PatreonIE(InfoExtractor): 'https://www.patreon.com/processLogin', compat_urllib_parse_urlencode(login_form).encode('utf-8') ) - login_page = self._download_webpage(request, None, note='Logging in as %s' % username) + login_page = self._download_webpage(request, None, note='Logging in') if re.search(r'onLoginFailed', login_page): raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8889e4a1a..f11d5da52 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -187,7 +187,7 @@ class PBSIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?: # Direct video URL - (?:%s)/(?:viralplayer|video)/(?P[0-9]+)/? | + (?:%s)/(?:(?:vir|port)alplayer|video)/(?P[0-9]+)(?:[?/]|$) | # Article with embedded player (or direct video) (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player @@ -367,6 +367,10 @@ class PBSIE(InfoExtractor): { 'url': 'http://watch.knpb.org/video/2365616055/', 'only_matching': True, + }, + { + 'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=', + 'only_matching': True, } ] _ERRORS = { @@ -417,6 +421,7 @@ class PBSIE(InfoExtractor): r'class="coveplayerid">([^<]+)<', # coveplayer r']+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ r'', # jwplayer + r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", ] media_id = self._search_regex( diff --git a/youtube_dl/extractor/performgroup.py b/youtube_dl/extractor/performgroup.py new file mode 100644 index 000000000..26942bfb3 --- /dev/null +++ b/youtube_dl/extractor/performgroup.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PerformGroupIE(InfoExtractor): + _VALID_URL = r'https?://player\.performgroup\.com/eplayer(?:/eplayer\.html|\.js)#/?(?P[0-9a-f]{26})\.(?P[0-9a-z]{26})' + _TESTS = [{ + # http://www.faz.net/aktuell/sport/fussball/wm-2018-playoffs-schweiz-besiegt-nordirland-1-0-15286104.html + 'url': 'http://player.performgroup.com/eplayer/eplayer.html#d478c41c5d192f56b9aa859de8.1w4crrej5w14e1ed4s1ce4ykab', + 'md5': '259cb03d142e2e52471e8837ecacb29f', + 'info_dict': { + 'id': 'xgrwobuzumes1lwjxtcdpwgxd', + 'ext': 'mp4', + 'title': 'Liga MX: Keine Einsicht nach Horrorfoul', + 'description': 'md5:7cd3b459c82725b021e046ab10bf1c5b', + 'timestamp': 1511533477, + 'upload_date': '20171124', + } + }] + + def _call_api(self, service, auth_token, content_id, referer_url): + return self._download_json( + 'http://ep3.performfeeds.com/ep%s/%s/%s/' % (service, auth_token, content_id), + content_id, headers={ + 'Referer': referer_url, + 'Origin': 'http://player.performgroup.com', + }, query={ + '_fmt': 'json', + }) + + def _real_extract(self, url): + player_id, auth_token = re.search(self._VALID_URL, url).groups() + bootstrap = self._call_api('bootstrap', auth_token, player_id, url) + video = bootstrap['config']['dataSource']['sourceItems'][0]['videos'][0] + video_id = video['uuid'] + vod = self._call_api('vod', auth_token, video_id, url) + media = vod['videos']['video'][0]['media'] + + formats = [] + hls_url = media.get('hls', {}).get('url') + if hls_url: + formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + hds_url = media.get('hds', {}).get('url') + if hds_url: + formats.extend(self._extract_f4m_formats(hds_url + '?hdcore', video_id, f4m_id='hds', fatal=False)) + + for c in media.get('content', []): + c_url = c.get('url') + if not c_url: + continue + tbr = int_or_none(c.get('bitrate'), 1000) + format_id = 'http' + if tbr: + format_id += '-%d' % tbr + formats.append({ + 'format_id': format_id, + 'url': c_url, + 'tbr': tbr, + 'width': int_or_none(c.get('width')), + 'height': int_or_none(c.get('height')), + 'filesize': int_or_none(c.get('fileSize')), + 'vcodec': c.get('type'), + 'fps': int_or_none(c.get('videoFrameRate')), + 'vbr': int_or_none(c.get('videoRate'), 1000), + 'abr': int_or_none(c.get('audioRate'), 1000), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video['title'], + 'description': video.get('description'), + 'thumbnail': video.get('poster'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': int_or_none(video.get('publishedTime'), 1000), + 'formats': formats, + } diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index bfa12edc9..8afe541ec 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -56,18 +56,16 @@ class PeriscopeIE(PeriscopeBaseIE): def _real_extract(self, url): token = self._match_id(url) - broadcast_data = self._call_api( - 'getBroadcastPublic', {'broadcast_id': token}, token) - broadcast = broadcast_data['broadcast'] - status = broadcast['status'] + stream = self._call_api( + 'accessVideoPublic', {'broadcast_id': token}, token) - user = broadcast_data.get('user', {}) + broadcast = stream['broadcast'] + title = broadcast['status'] - uploader = broadcast.get('user_display_name') or user.get('display_name') - uploader_id = (broadcast.get('username') or user.get('username') or - broadcast.get('user_id') or user.get('id')) + uploader = broadcast.get('user_display_name') or broadcast.get('username') + uploader_id = (broadcast.get('user_id') or broadcast.get('username')) - title = '%s - %s' % (uploader, status) if uploader else status + title = '%s - %s' % (uploader, title) if uploader else title state = broadcast.get('state').lower() if state == 'running': title = self._live_title(title) @@ -77,21 +75,24 @@ class PeriscopeIE(PeriscopeBaseIE): 'url': broadcast[image], } for image in ('image_url', 'image_url_small') if broadcast.get(image)] - stream = self._call_api( - 'getAccessPublic', {'broadcast_id': token}, token) - + video_urls = set() formats = [] - for format_id in ('replay', 'rtmp', 'hls', 'https_hls'): + for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'): video_url = stream.get(format_id + '_url') - if not video_url: + if not video_url or video_url in video_urls: continue - f = { + video_urls.add(video_url) + if format_id != 'rtmp': + formats.extend(self._extract_m3u8_formats( + video_url, token, 'mp4', + entry_protocol='m3u8_native' + if state in ('ended', 'timed_out') else 'm3u8', + m3u8_id=format_id, fatal=False)) + continue + formats.append({ 'url': video_url, 'ext': 'flv' if format_id == 'rtmp' else 'mp4', - } - if format_id != 'rtmp': - f['protocol'] = 'm3u8_native' if state in ('ended', 'timed_out') else 'm3u8' - formats.append(f) + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py new file mode 100644 index 000000000..2366dfb34 --- /dev/null +++ b/youtube_dl/extractor/picarto.py @@ -0,0 +1,165 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import time + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + js_to_json, + try_get, + update_url_query, + urlencode_postdata, +) + + +class PicartoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[a-zA-Z0-9]+)' + _TEST = { + 'url': 'https://picarto.tv/Setz', + 'info_dict': { + 'id': 'Setz', + 'ext': 'mp4', + 'title': 're:^Setz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'timestamp': int, + 'is_live': True + }, + 'skip': 'Stream is offline', + } + + @classmethod + def suitable(cls, url): + return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) + + def _real_extract(self, url): + channel_id = self._match_id(url) + stream_page = self._download_webpage(url, channel_id) + + if '>This channel does not exist' in stream_page: + raise ExtractorError( + 'Channel %s does not exist' % channel_id, expected=True) + + player = self._parse_json( + self._search_regex( + r'(?s)playerSettings\[\d+\]\s*=\s*(\{.+?\}\s*\n)', stream_page, + 'player settings'), + channel_id, transform_source=js_to_json) + + if player.get('online') is False: + raise ExtractorError('Stream is offline', expected=True) + + cdn_data = self._download_json( + 'https://picarto.tv/process/channel', channel_id, + data=urlencode_postdata({'loadbalancinginfo': channel_id}), + note='Downloading load balancing info') + + def get_event(key): + return try_get(player, lambda x: x['event'][key], compat_str) or '' + + params = { + 'token': player.get('token') or '', + 'ticket': get_event('ticket'), + 'con': int(time.time() * 1000), + 'type': get_event('ticket'), + 'scope': get_event('scope'), + } + + prefered_edge = cdn_data.get('preferedEdge') + default_tech = player.get('defaultTech') + + formats = [] + + for edge in cdn_data['edges']: + edge_ep = edge.get('ep') + if not edge_ep or not isinstance(edge_ep, compat_str): + continue + edge_id = edge.get('id') + for tech in cdn_data['techs']: + tech_label = tech.get('label') + tech_type = tech.get('type') + preference = 0 + if edge_id == prefered_edge: + preference += 1 + if tech_type == default_tech: + preference += 1 + format_id = [] + if edge_id: + format_id.append(edge_id) + if tech_type == 'application/x-mpegurl' or tech_label == 'HLS': + format_id.append('hls') + formats.extend(self._extract_m3u8_formats( + update_url_query( + 'https://%s/hls/%s/index.m3u8' + % (edge_ep, channel_id), params), + channel_id, 'mp4', preference=preference, + m3u8_id='-'.join(format_id), fatal=False)) + continue + elif tech_type == 'video/mp4' or tech_label == 'MP4': + format_id.append('mp4') + formats.append({ + 'url': update_url_query( + 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id), + params), + 'format_id': '-'.join(format_id), + 'preference': preference, + }) + else: + # rtmp format does not seem to work + continue + self._sort_formats(formats) + + mature = player.get('mature') + if mature is None: + age_limit = None + else: + age_limit = 18 if mature is True else 0 + + return { + 'id': channel_id, + 'title': self._live_title(channel_id), + 'is_live': True, + 'thumbnail': player.get('vodThumb'), + 'age_limit': age_limit, + 'formats': formats, + } + + +class PicartoVodIE(InfoExtractor): + _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', + 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', + 'info_dict': { + 'id': 'ArtofZod_2017.12.12.00.13.23.flv', + 'ext': 'mp4', + 'title': 'ArtofZod_2017.12.12.00.13.23.flv', + 'thumbnail': r're:^https?://.*\.jpg' + }, + }, { + 'url': 'https://picarto.tv/videopopout/Plague', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + vod_info = self._parse_json( + self._search_regex( + r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, + video_id), + video_id, transform_source=js_to_json) + + formats = self._extract_m3u8_formats( + vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': vod_info.get('vodThumb'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py index e38c7618e..e86c65396 100644 --- a/youtube_dl/extractor/pladform.py +++ b/youtube_dl/extractor/pladform.py @@ -4,7 +4,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( + determine_ext, ExtractorError, int_or_none, xpath_text, @@ -26,17 +28,15 @@ class PladformIE(InfoExtractor): (?P\d+) ''' _TESTS = [{ - # http://muz-tv.ru/kinozal/view/7400/ - 'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293', - 'md5': '61f37b575dd27f1bb2e1854777fe31f4', + 'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0', + 'md5': '53362fac3a27352da20fa2803cc5cd6f', 'info_dict': { - 'id': '100183293', + 'id': '3777899', 'ext': 'mp4', - 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', - 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', + 'title': 'СТУДИЯ СОЮЗ • Шоу Студия Союз, 24 выпуск (01.02.2018) Нурлан Сабуров и Слава Комиссаренко', + 'description': 'md5:05140e8bf1b7e2d46e7ba140be57fd95', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 694, - 'age_limit': 0, + 'duration': 3190, }, }, { 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', @@ -56,22 +56,48 @@ class PladformIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + pl = qs.get('pl', ['1'])[0] + video = self._download_xml( - 'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id, - video_id) + 'http://out.pladform.ru/getVideo', video_id, query={ + 'pl': pl, + 'videoid': video_id, + }) + + def fail(text): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, text), + expected=True) if video.tag == 'error': - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, video.text), - expected=True) + fail(video.text) quality = qualities(('ld', 'sd', 'hd')) - formats = [{ - 'url': src.text, - 'format_id': src.get('quality'), - 'quality': quality(src.get('quality')), - } for src in video.findall('./src')] + formats = [] + for src in video.findall('./src'): + if src is None: + continue + format_url = src.text + if not format_url: + continue + if src.get('type') == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src.text, + 'format_id': src.get('quality'), + 'quality': quality(src.get('quality')), + }) + + if not formats: + error = xpath_text(video, './cap', 'error', default=None) + if error: + fail(error) + self._sort_formats(formats) webpage = self._download_webpage( diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py index 391e1bd09..4c5f57919 100644 --- a/youtube_dl/extractor/playtvak.py +++ b/youtube_dl/extractor/playtvak.py @@ -24,7 +24,7 @@ class PlaytvakIE(InfoExtractor): 'id': 'A150730_150323_hodinovy-manzel_kuko', 'ext': 'mp4', 'title': 'Vyžeňte vosy a sršně ze zahrady', - 'description': 'md5:f93d398691044d303bc4a3de62f3e976', + 'description': 'md5:4436e61b7df227a093778efb7e373571', 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', 'duration': 279, 'timestamp': 1438732860, @@ -36,9 +36,19 @@ class PlaytvakIE(InfoExtractor): 'info_dict': { 'id': 'A150624_164934_planespotting_cat', 'ext': 'flv', - 'title': 're:^Přímý přenos iDNES.cz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^Planespotting [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', - 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # another live stream, this one without Misc.videoFLV + 'url': 'https://slowtv.playtvak.cz/zive-sledujte-vlaky-v-primem-prenosu-dwi-/hlavni-nadrazi.aspx?c=A151218_145728_hlavni-nadrazi_plap', + 'info_dict': { + 'id': 'A151218_145728_hlavni-nadrazi_plap', + 'ext': 'flv', + 'title': 're:^Hlavní nádraží [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, }, 'params': { @@ -95,7 +105,7 @@ class PlaytvakIE(InfoExtractor): webpage = self._download_webpage(url, video_id) info_url = self._html_search_regex( - r'Misc\.videoFLV\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') + r'Misc\.video(?:FLV)?\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') parsed_url = compat_urlparse.urlparse(info_url) @@ -160,7 +170,7 @@ class PlaytvakIE(InfoExtractor): if is_live: title = self._live_title(title) description = self._og_search_description(webpage, default=None) or self._html_search_meta( - 'description', webpage, 'description') + 'description', webpage, 'description', default=None) timestamp = None duration = None if not is_live: diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index d35f54ce8..aacc5d4bb 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -18,6 +18,7 @@ from ..utils import ( parse_duration, qualities, srt_subtitles_timecode, + try_get, update_url_query, urlencode_postdata, ) @@ -26,6 +27,39 @@ from ..utils import ( class PluralsightBaseIE(InfoExtractor): _API_BASE = 'https://app.pluralsight.com' + def _download_course(self, course_id, url, display_id): + try: + return self._download_course_rpc(course_id, url, display_id) + except ExtractorError: + # Old API fallback + return self._download_json( + 'https://app.pluralsight.com/player/user/api/v1/player/payload', + display_id, data=urlencode_postdata({'courseId': course_id}), + headers={'Referer': url}) + + def _download_course_rpc(self, course_id, url, display_id): + response = self._download_json( + '%s/player/functions/rpc' % self._API_BASE, display_id, + 'Downloading course JSON', + data=json.dumps({ + 'fn': 'bootstrapPlayer', + 'payload': { + 'courseId': course_id, + }, + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }) + + course = try_get(response, lambda x: x['payload']['course'], dict) + if course: + return course + + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['error']['message']), + expected=True) + class PluralsightIE(PluralsightBaseIE): IE_NAME = 'pluralsight' @@ -82,7 +116,7 @@ class PluralsightIE(PluralsightBaseIE): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) response = self._download_webpage( - post_url, None, 'Logging in as %s' % username, + post_url, None, 'Logging in', data=urlencode_postdata(login_form), headers={'Content-Type': 'application/x-www-form-urlencoded'}) @@ -97,6 +131,13 @@ class PluralsightIE(PluralsightBaseIE): if BLOCKED in response: raise ExtractorError( 'Unable to login: %s' % BLOCKED, expected=True) + MUST_AGREE = 'To continue using Pluralsight, you must agree to' + if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')): + raise ExtractorError( + 'Unable to login: %s some documents. Go to pluralsight.com, ' + 'log in and agree with what Pluralsight requires.' + % MUST_AGREE, expected=True) + raise ExtractorError('Unable to log in') def _get_subtitles(self, author, clip_id, lang, name, duration, video_id): @@ -130,12 +171,12 @@ class PluralsightIE(PluralsightBaseIE): for num, current in enumerate(subs): current = subs[num] start, text = ( - float_or_none(dict_get(current, TIME_OFFSET_KEYS)), + float_or_none(dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False)), dict_get(current, TEXT_KEYS)) if start is None or text is None: continue end = duration if num == len(subs) - 1 else float_or_none( - dict_get(subs[num + 1], TIME_OFFSET_KEYS)) + dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False)) if end is None: continue srt += os.linesep.join( @@ -162,10 +203,7 @@ class PluralsightIE(PluralsightBaseIE): display_id = '%s-%s' % (name, clip_id) - course = self._download_json( - 'https://app.pluralsight.com/player/user/api/v1/player/payload', - display_id, data=urlencode_postdata({'courseId': course_name}), - headers={'Referer': url}) + course = self._download_course(course_name, url, display_id) collection = course['modules'] @@ -331,18 +369,7 @@ class PluralsightCourseIE(PluralsightBaseIE): # TODO: PSM cookie - course = self._download_json( - '%s/player/functions/rpc' % self._API_BASE, course_id, - 'Downloading course JSON', - data=json.dumps({ - 'fn': 'bootstrapPlayer', - 'payload': { - 'courseId': course_id, - } - }).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8' - })['payload']['course'] + course = self._download_course(course_id, url, course_id) title = course['title'] course_name = course['name'] diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py index 2d87e7e70..dd5f17f11 100644 --- a/youtube_dl/extractor/pokemon.py +++ b/youtube_dl/extractor/pokemon.py @@ -11,19 +11,34 @@ from ..utils import ( class PokemonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P[a-z0-9]{32})|/[^/]+/\d+_\d+-(?P[^/?#]+))' + _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P[a-z0-9]{32})|/(?:[^/]+/)+(?P[^/?#&]+))' _TESTS = [{ - 'url': 'http://www.pokemon.com/us/pokemon-episodes/19_01-from-a-to-z/?play=true', - 'md5': '9fb209ae3a569aac25de0f5afc4ee08f', + 'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/', + 'md5': '2fe8eaec69768b25ef898cda9c43062e', 'info_dict': { - 'id': 'd0436c00c3ce4071ac6cee8130ac54a1', + 'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4', 'ext': 'mp4', - 'title': 'From A to Z!', - 'description': 'Bonnie makes a new friend, Ash runs into an old friend, and a terrifying premonition begins to unfold!', - 'timestamp': 1460478136, - 'upload_date': '20160412', + 'title': 'The Ol’ Raise and Switch!', + 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', + 'timestamp': 1511824728, + 'upload_date': '20171127', + }, + 'add_id': ['LimelightMedia'], + }, { + # no data-video-title + 'url': 'https://www.pokemon.com/us/pokemon-episodes/pokemon-movies/pokemon-the-rise-of-darkrai-2008', + 'info_dict': { + 'id': '99f3bae270bf4e5097274817239ce9c8', + 'ext': 'mp4', + 'title': 'Pokémon: The Rise of Darkrai', + 'description': 'md5:ea8fbbf942e1e497d54b19025dd57d9d', + 'timestamp': 1417778347, + 'upload_date': '20141205', + }, + 'add_id': ['LimelightMedia'], + 'params': { + 'skip_download': True, }, - 'add_id': ['LimelightMedia'] }, { 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', 'only_matching': True, @@ -42,7 +57,9 @@ class PokemonIE(InfoExtractor): r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'), webpage, 'video data element')) video_id = video_data['data-video-id'] - title = video_data['data-video-title'] + title = video_data.get('data-video-title') or self._html_search_meta( + 'pkm-title', webpage, ' title', default=None) or self._search_regex( + r']+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title') return { '_type': 'url_transparent', 'id': video_id, diff --git a/youtube_dl/extractor/popcorntv.py b/youtube_dl/extractor/popcorntv.py new file mode 100644 index 000000000..ac901f426 --- /dev/null +++ b/youtube_dl/extractor/popcorntv.py @@ -0,0 +1,78 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + unified_timestamp, +) + + +class PopcornTVIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.popcorntv\.it/guarda/(?P[^/]+)/(?P\d+)' + _TESTS = [{ + 'url': 'https://animemanga.popcorntv.it/guarda/food-wars-battaglie-culinarie-episodio-01/9183', + 'md5': '47d65a48d147caf692ab8562fe630b45', + 'info_dict': { + 'id': '9183', + 'display_id': 'food-wars-battaglie-culinarie-episodio-01', + 'ext': 'mp4', + 'title': 'Food Wars, Battaglie Culinarie | Episodio 01', + 'description': 'md5:b8bea378faae4651d3b34c6e112463d0', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1497610857, + 'upload_date': '20170616', + 'duration': 1440, + 'view_count': int, + }, + }, { + 'url': 'https://cinema.popcorntv.it/guarda/smash-cut/10433', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + m3u8_url = extract_attributes( + self._search_regex( + r'(]+itemprop=["\'](?:content|embed)Url[^>]*>)', + webpage, 'content' + ))['href'] + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + title = self._search_regex( + r']+itemprop=["\']name[^>]*>([^<]+)', webpage, + 'title', default=None) or self._og_search_title(webpage) + + description = self._html_search_regex( + r'(?s)]+itemprop=["\']description[^>]*>(.+?)', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + timestamp = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + print(self._html_search_meta( + 'duration', webpage)) + duration = int_or_none(self._html_search_meta( + 'duration', webpage), invscale=60) + view_count = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py index 8218c7d3b..60ade06da 100644 --- a/youtube_dl/extractor/porncom.py +++ b/youtube_dl/extractor/porncom.py @@ -77,12 +77,14 @@ class PornComIE(InfoExtractor): self._sort_formats(formats) view_count = str_to_int(self._search_regex( - r'class=["\']views["\'][^>]*>

([\d,.]+)', webpage, + (r'Views:\s*\s*\s*([\d,.]+)', + r'class=["\']views["\'][^>]*>

([\d,.]+)'), webpage, 'view count', fatal=False)) def extract_list(kind): s = self._search_regex( - r'(?s)]*>%s:(.+?)

' % kind.capitalize(), + (r'(?s)%s:\s*\s*(.+?)' % kind.capitalize(), + r'(?s)]*>%s:(.+?)

' % kind.capitalize()), webpage, kind, fatal=False) return re.findall(r']+>([^<]+)', s or '') diff --git a/youtube_dl/extractor/pornflip.py b/youtube_dl/extractor/pornflip.py index a4a5d390e..025985fbc 100644 --- a/youtube_dl/extractor/pornflip.py +++ b/youtube_dl/extractor/pornflip.py @@ -14,7 +14,7 @@ from ..utils import ( class PornFlipIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:v|embed)/(?P[0-9A-Za-z]{11})' + _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:v|embed)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pornflip.com/v/wz7DfNhMmep', 'md5': '98c46639849145ae1fd77af532a9278c', @@ -34,6 +34,15 @@ class PornFlipIE(InfoExtractor): }, { 'url': 'https://www.pornflip.com/embed/wz7DfNhMmep', 'only_matching': True, + }, { + 'url': 'https://www.pornflip.com/v/EkRD6-vS2-s', + 'only_matching': True, + }, { + 'url': 'https://www.pornflip.com/embed/EkRD6-vS2-s', + 'only_matching': True, + }, { + 'url': 'https://www.pornflip.com/v/NG9q6Pb_iK8', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 36761788d..b52879c7a 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -54,7 +54,7 @@ class PornHdIE(InfoExtractor): r'(.+?) - .*?[Pp]ornHD.*?'], webpage, 'title') sources = self._parse_json(js_to_json(self._search_regex( - r"(?s)sources'?\s*:\s*(\{.+?\})\s*\}[;,)]", + r"(?s)sources'?\s*[:=]\s*(\{.+?\})", webpage, 'sources', default='{}')), video_id) if not sources: @@ -82,7 +82,8 @@ class PornHdIE(InfoExtractor): view_count = int_or_none(self._html_search_regex( r'(\d+) views\s*<', webpage, 'view count', fatal=False)) thumbnail = self._search_regex( - r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) + r"poster'?\s*:\s*([\"'])(?P(?:(?!\1).)+)\1", webpage, + 'thumbnail', fatal=False, group='url') return { 'id': video_id, diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index e032817f2..23e24d216 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,7 +33,7 @@ class PornHubIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) @@ -115,12 +115,13 @@ class PornHubIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + self._set_cookie('pornhub.com', 'age_verified', '1') + def dl_webpage(platform): + self._set_cookie('pornhub.com', 'platform', platform) return self._download_webpage( 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id, - video_id, headers={ - 'Cookie': 'age_verified=1; platform=%s' % platform, - }) + video_id) webpage = dl_webpage('pc') @@ -186,7 +187,7 @@ class PornHubIE(InfoExtractor): title, thumbnail, duration = [None] * 3 video_uploader = self._html_search_regex( - r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<', + r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', webpage, 'uploader', fatal=False) view_count = self._extract_count( @@ -227,20 +228,6 @@ class PornHubIE(InfoExtractor): class PornHubPlaylistBaseIE(InfoExtractor): def _extract_entries(self, webpage): - return [ - self.url_result( - 'http://www.pornhub.com/%s' % video_url, - PornHubIE.ie_key(), video_title=title) - for video_url, title in orderedSet(re.findall( - r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', - webpage)) - ] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see # https://github.com/rg3/youtube-dl/issues/11594). @@ -248,7 +235,21 @@ class PornHubPlaylistBaseIE(InfoExtractor): r'(?s)(]+class=["\']container.+)', webpage, 'container', default=webpage) - entries = self._extract_entries(container) + return [ + self.url_result( + 'http://www.pornhub.com/%s' % video_url, + PornHubIE.ie_key(), video_title=title) + for video_url, title in orderedSet(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', + container)) + ] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = self._extract_entries(webpage) playlist = self._parse_json( self._search_regex( @@ -263,7 +264,7 @@ class PornHubPlaylistBaseIE(InfoExtractor): class PornHubPlaylistIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/playlist/(?P\d+)' _TESTS = [{ 'url': 'http://www.pornhub.com/playlist/4667351', 'info_dict': { @@ -271,11 +272,14 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE): 'title': 'Nataly Hot', }, 'playlist_mincount': 2, + }, { + 'url': 'https://de.pornhub.com/playlist/4667351', + 'only_matching': True, }] class PornHubUserVideosIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P[^/]+)/videos' + _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:user|channel)s/(?P[^/]+)/videos' _TESTS = [{ 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'info_dict': { @@ -285,6 +289,28 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): }, { 'url': 'http://www.pornhub.com/users/rushandlia/videos', 'only_matching': True, + }, { + # default sorting as Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos', + 'info_dict': { + 'id': 'povd', + }, + 'playlist_mincount': 293, + }, { + # Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=ra', + 'only_matching': True, + }, { + # Most Recent Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=da', + 'only_matching': True, + }, { + # Most Viewed Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index d8a4bd244..d0955d079 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -129,10 +129,11 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): https?:// (?:www\.)? (?: + (?:beta\.)? (?: prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|7tv|advopedia )\.(?:de|at|ch)| - ran\.de|fem\.com|advopedia\.de + ran\.de|fem\.com|advopedia\.de|galileo\.tv/video ) /(?P.+) ''' @@ -325,6 +326,11 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'url': 'http://www.sat1gold.de/tv/edel-starck/video/11-staffel-1-episode-1-partner-wider-willen-ganze-folge', 'only_matching': True, }, + { + # geo restricted to Germany + 'url': 'https://www.galileo.tv/video/diese-emojis-werden-oft-missverstanden', + 'only_matching': True, + }, { 'url': 'http://www.sat1gold.de/tv/edel-starck/playlist/die-gesamte-1-staffel', 'only_matching': True, @@ -342,8 +348,10 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): r'"clip_id"\s*:\s+"(\d+)"', r'clipid: "(\d+)"', r'clip[iI]d=(\d+)', - r'clip[iI]d\s*=\s*["\'](\d+)', + r'clip[iI][dD]\s*=\s*["\'](\d+)', r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", + r'proMamsId"\s*:\s*"(\d+)', + r'proMamsId"\s*:\s*"(\d+)', ] _TITLE_REGEXES = [ r'

\s*(.+?)

', diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 17c27da46..084308aeb 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -2,38 +2,37 @@ from __future__ import unicode_literals import random -import time import re +import time from .common import InfoExtractor from ..utils import ( - sanitized_Request, - strip_jsonp, - unescapeHTML, clean_html, ExtractorError, + strip_jsonp, + unescapeHTML, ) class QQMusicIE(InfoExtractor): IE_NAME = 'qqmusic' IE_DESC = 'QQ音乐' - _VALID_URL = r'https?://y\.qq\.com/#type=song&mid=(?P[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P[0-9A-Za-z]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', - 'md5': '9ce1c1c8445f561506d2e3cfb0255705', + 'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html', + 'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8', 'info_dict': { 'id': '004295Et37taLD', 'ext': 'mp3', 'title': '可惜没如果', 'release_date': '20141227', 'creator': '林俊杰', - 'description': 'md5:d327722d0361576fde558f1ac68a7065', + 'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac', 'thumbnail': r're:^https?://.*\.jpg$', } }, { 'note': 'There is no mp3-320 version of this song.', - 'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV', + 'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html', 'md5': 'fa3926f0c585cda0af8fa4f796482e3e', 'info_dict': { 'id': '004MsGEo3DdNxV', @@ -46,14 +45,14 @@ class QQMusicIE(InfoExtractor): } }, { 'note': 'lyrics not in .lrc format', - 'url': 'http://y.qq.com/#type=song&mid=001JyApY11tIp6', + 'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html', 'info_dict': { 'id': '001JyApY11tIp6', 'ext': 'mp3', 'title': 'Shadows Over Transylvania', 'release_date': '19970225', 'creator': 'Dark Funeral', - 'description': 'md5:ed14d5bd7ecec19609108052c25b2c11', + 'description': 'md5:c9b20210587cbcd6836a1c597bab4525', 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { @@ -105,7 +104,7 @@ class QQMusicIE(InfoExtractor): [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], detail_info_page, 'album mid', default=None) if albummid: - thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \ + thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \ % (albummid[-2:-1], albummid[-1], albummid) guid = self.m_r_get_ruin() @@ -156,15 +155,39 @@ class QQPlaylistBaseIE(InfoExtractor): def qq_static_url(category, mid): return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid) - @classmethod - def get_entries_from_page(cls, page): + def get_singer_all_songs(self, singmid, num): + return self._download_webpage( + r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid, + query={ + 'format': 'json', + 'inCharset': 'utf8', + 'outCharset': 'utf-8', + 'platform': 'yqq', + 'needNewCode': 0, + 'singermid': singmid, + 'order': 'listen', + 'begin': 0, + 'num': num, + 'songstatus': 1, + }) + + def get_entries_from_page(self, singmid): entries = [] - for item in re.findall(r'class="data"[^<>]*>([^<>]+)[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P[0-9A-Za-z]+)\.html' _TEST = { - 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', + 'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html', 'info_dict': { 'id': '001BLpXF2DyJe2', 'title': '林俊杰', 'description': 'md5:870ec08f7d8547c29c93010899103751', }, - 'playlist_count': 12, + 'playlist_mincount': 12, } def _real_extract(self, url): mid = self._match_id(url) - singer_page = self._download_webpage( - self.qq_static_url('singer', mid), mid, 'Download singer page') - - entries = self.get_entries_from_page(singer_page) - + entries = self.get_entries_from_page(mid) + singer_page = self._download_webpage(url, mid, 'Download singer page') singer_name = self._html_search_regex( - r"singername\s*:\s*'([^']+)'", singer_page, 'singer name', - default=None) - - singer_id = self._html_search_regex( - r"singerid\s*:\s*'([0-9]+)'", singer_page, 'singer id', - default=None) - + r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None) singer_desc = None - if singer_id: - req = sanitized_Request( - 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg?utf8=1&outCharset=utf-8&format=xml&singerid=%s' % singer_id) - req.add_header( - 'Referer', 'http://s.plcloud.music.qq.com/xhr_proxy_utf8.html') + if mid: singer_desc_page = self._download_xml( - req, mid, 'Donwload singer description XML') + 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid, + 'Donwload singer description XML', + query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid}, + headers={'Referer': 'https://y.qq.com/n/yqq/singer/'}) singer_desc = singer_desc_page.find('./data/info/desc').text @@ -217,10 +230,10 @@ class QQMusicSingerIE(QQPlaylistBaseIE): class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' IE_DESC = 'QQ音乐 - 专辑' - _VALID_URL = r'https?://y\.qq\.com/#type=album&mid=(?P[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P[0-9A-Za-z]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', + 'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html', 'info_dict': { 'id': '000gXCTb2AhRR1', 'title': '我们都是这样长大的', @@ -228,7 +241,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): }, 'playlist_count': 4, }, { - 'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3', + 'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html', 'info_dict': { 'id': '002Y5a3b3AlCu3', 'title': '그리고...', @@ -246,7 +259,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): entries = [ self.url_result( - 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'] ) for song in album['list'] ] album_name = album.get('name') @@ -260,31 +273,30 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): class QQMusicToplistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:toplist' IE_DESC = 'QQ音乐 - 排行榜' - _VALID_URL = r'https?://y\.qq\.com/#type=toplist&p=(?P(top|global)_[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P[0-9]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=toplist&p=global_123', + 'url': 'https://y.qq.com/n/yqq/toplist/123.html', 'info_dict': { - 'id': 'global_123', + 'id': '123', 'title': '美国iTunes榜', - }, - 'playlist_count': 10, - }, { - 'url': 'http://y.qq.com/#type=toplist&p=top_3', - 'info_dict': { - 'id': 'top_3', - 'title': '巅峰榜·欧美', - 'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统' - '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据' - '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:' - '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放' + 'description': 'md5:89db2335fdbb10678dee2d43fe9aba08', }, 'playlist_count': 100, }, { - 'url': 'http://y.qq.com/#type=toplist&p=global_106', + 'url': 'https://y.qq.com/n/yqq/toplist/3.html', 'info_dict': { - 'id': 'global_106', + 'id': '3', + 'title': '巅峰榜·欧美', + 'description': 'md5:5a600d42c01696b26b71f8c4d43407da', + }, + 'playlist_count': 100, + }, { + 'url': 'https://y.qq.com/n/yqq/toplist/106.html', + 'info_dict': { + 'id': '106', 'title': '韩国Mnet榜', + 'description': 'md5:cb84b325215e1d21708c615cac82a6e7', }, 'playlist_count': 50, }] @@ -292,18 +304,15 @@ class QQMusicToplistIE(QQPlaylistBaseIE): def _real_extract(self, url): list_id = self._match_id(url) - list_type, num_id = list_id.split("_") - toplist_json = self._download_json( - 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json' - % (list_type, num_id), - list_id, 'Download toplist page') + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id, + note='Download toplist page', + query={'type': 'toplist', 'topid': list_id, 'format': 'json'}) - entries = [ - self.url_result( - 'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid'] - ) for song in toplist_json['songlist'] - ] + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', + song['data']['songmid']) + for song in toplist_json['songlist']] topinfo = toplist_json.get('topinfo', {}) list_name = topinfo.get('ListName') @@ -314,10 +323,10 @@ class QQMusicToplistIE(QQPlaylistBaseIE): class QQMusicPlaylistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:playlist' IE_DESC = 'QQ音乐 - 歌单' - _VALID_URL = r'https?://y\.qq\.com/#type=taoge&id=(?P[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P[0-9]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=taoge&id=3462654915', + 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html', 'info_dict': { 'id': '3462654915', 'title': '韩国5月新歌精选下旬', @@ -326,7 +335,7 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): 'playlist_count': 40, 'skip': 'playlist gone', }, { - 'url': 'http://y.qq.com/#type=taoge&id=1374105607', + 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html', 'info_dict': { 'id': '1374105607', 'title': '易入人心的华语民谣', @@ -339,8 +348,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): list_id = self._match_id(url) list_json = self._download_json( - 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s' - % list_id, list_id, 'Download list page', + 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg', + list_id, 'Download list page', + query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id}, transform_source=strip_jsonp) if not len(list_json.get('cdlist', [])): if list_json.get('code'): @@ -350,11 +360,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): raise ExtractorError('Unable to get playlist info') cdlist = list_json['cdlist'][0] - entries = [ - self.url_result( - 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] - ) for song in cdlist['songlist'] - ] + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']) + for song in cdlist['songlist']] list_name = cdlist.get('dissname') list_description = clean_html(unescapeHTML(cdlist.get('desc'))) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 3b40002a8..b952e59b4 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -20,20 +20,37 @@ from ..utils import ( class RadioCanadaIE(InfoExtractor): IE_NAME = 'radiocanada' _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P[^:/]+)[:/](?P[0-9]+)' - _TEST = { - 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', - 'info_dict': { - 'id': '7184272', - 'ext': 'mp4', - 'title': 'Le parcours du tireur capté sur vidéo', - 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', - 'upload_date': '20141023', + _TESTS = [ + { + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', + 'info_dict': { + 'id': '7184272', + 'ext': 'mp4', + 'title': 'Le parcours du tireur capté sur vidéo', + 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', + 'upload_date': '20141023', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } + { + # empty Title + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/', + 'info_dict': { + 'id': '7754998', + 'ext': 'mp4', + 'title': 'letelejournal22h', + 'description': 'INTEGRALE WEB 22H-TJ', + 'upload_date': '20170720', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + ] def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -59,6 +76,7 @@ class RadioCanadaIE(InfoExtractor): device_types.append('android') formats = [] + error = None # TODO: extract f4m formats # f4m formats can be extracted using flashhd device_type but they produce unplayable file for device_type in device_types: @@ -84,8 +102,8 @@ class RadioCanadaIE(InfoExtractor): if not v_url: continue if v_url == 'null': - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, xpath_text(v_data, 'message')), expected=True) + error = xpath_text(v_data, 'message') + continue ext = determine_ext(v_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -129,6 +147,9 @@ class RadioCanadaIE(InfoExtractor): formats.extend(self._extract_f4m_formats( base_url + '/manifest.f4m', video_id, f4m_id='hds', fatal=False)) + if not formats and error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) self._sort_formats(formats) subtitles = {} @@ -141,7 +162,7 @@ class RadioCanadaIE(InfoExtractor): return { 'id': video_id, - 'title': get_meta('Title'), + 'title': get_meta('Title') or get_meta('AV-nomEmission'), 'description': get_meta('Description') or get_meta('ShortDescription'), 'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'), 'duration': int_or_none(get_meta('length')), diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index e11bf8f9a..d22311031 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -17,6 +17,7 @@ from ..utils import ( parse_duration, strip_or_none, try_get, + unescapeHTML, unified_strdate, unified_timestamp, update_url_query, @@ -249,6 +250,41 @@ class RaiPlayLiveIE(RaiBaseIE): } +class RaiPlayPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', + 'info_dict': { + 'id': 'nondirloalmiocapo', + 'title': 'Non dirlo al mio capo', + 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + }, + 'playlist_mincount': 12, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._html_search_meta( + ('programma', 'nomeProgramma'), webpage, 'title') + description = unescapeHTML(self._html_search_meta( + ('description', 'og:description'), webpage, 'description')) + print(description) + + entries = [] + for mobj in re.finditer( + r']+\bhref=(["\'])(?P/raiplay/video/.+?)\1', + webpage): + video_url = urljoin(url, mobj.group('path')) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) + + return self.playlist_result(entries, playlist_id, title, description) + + class RaiIE(RaiBaseIE): _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _TESTS = [{ @@ -345,11 +381,11 @@ class RaiIE(RaiBaseIE): media_type = media['type'] if 'Audio' in media_type: relinker_info = { - 'formats': { + 'formats': [{ 'format_id': media.get('formatoAudio'), 'url': media['audioUrl'], 'ext': media.get('formatoAudio'), - } + }] } elif 'Video' in media_type: relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) diff --git a/youtube_dl/extractor/raywenderlich.py b/youtube_dl/extractor/raywenderlich.py new file mode 100644 index 000000000..640c3ee23 --- /dev/null +++ b/youtube_dl/extractor/raywenderlich.py @@ -0,0 +1,102 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .vimeo import VimeoIE +from ..utils import ( + extract_attributes, + ExtractorError, + smuggle_url, + unsmuggle_url, + urljoin, +) + + +class RayWenderlichIE(InfoExtractor): + _VALID_URL = r'https?://videos\.raywenderlich\.com/courses/(?P[^/]+)/lessons/(?P\d+)' + + _TESTS = [{ + 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'info_dict': { + 'id': '248377018', + 'ext': 'mp4', + 'title': 'Testing In iOS Episode 1: Introduction', + 'duration': 133, + 'uploader': 'Ray Wenderlich', + 'uploader_id': 'user3304672', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + }, + 'add_ie': [VimeoIE.ie_key()], + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { + 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'info_dict': { + 'title': 'Testing in iOS', + 'id': '105-testing-in-ios', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 29, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = re.match(self._VALID_URL, url) + course_id, lesson_id = mobj.group('course_id', 'id') + video_id = '%s/%s' % (course_id, lesson_id) + + webpage = self._download_webpage(url, video_id) + + no_playlist = self._downloader.params.get('noplaylist') + if no_playlist or smuggled_data.get('force_video', False): + if no_playlist: + self.to_screen( + 'Downloading just video %s because of --no-playlist' + % video_id) + if '>Subscribe to unlock' in webpage: + raise ExtractorError( + 'This content is only available for subscribers', + expected=True) + vimeo_id = self._search_regex( + r'data-vimeo-id=["\'](\d+)', webpage, 'video id') + return self.url_result( + VimeoIE._smuggle_referrer( + 'https://player.vimeo.com/video/%s' % vimeo_id, url), + ie=VimeoIE.ie_key(), video_id=vimeo_id) + + self.to_screen( + 'Downloading playlist %s - add --no-playlist to just download video' + % course_id) + + lesson_ids = set((lesson_id, )) + for lesson in re.findall( + r'(]+\bclass=["\']lesson-link[^>]+>)', webpage): + attrs = extract_attributes(lesson) + if not attrs: + continue + lesson_url = attrs.get('href') + if not lesson_url: + continue + lesson_id = self._search_regex( + r'/lessons/(\d+)', lesson_url, 'lesson id', default=None) + if not lesson_id: + continue + lesson_ids.add(lesson_id) + + entries = [] + for lesson_id in sorted(lesson_ids): + entries.append(self.url_result( + smuggle_url(urljoin(url, lesson_id), {'force_video': True}), + ie=RayWenderlichIE.ie_key())) + + title = self._search_regex( + r'class=["\']course-title[^>]+>([^<]+)', webpage, 'course title', + default=None) + + return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py index 5d6cc3610..243603676 100644 --- a/youtube_dl/extractor/redbulltv.py +++ b/youtube_dl/extractor/redbulltv.py @@ -5,135 +5,93 @@ from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( float_or_none, - int_or_none, - try_get, - # unified_timestamp, ExtractorError, ) class RedBullTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film|live)/(?:AP-\w+/segment/)?(?PAP-\w+)' + _VALID_URL = r'https?://(?:www\.)?redbull\.tv/video/(?PAP-\w+)' _TESTS = [{ # film - 'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc', + 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', 'md5': 'fb0445b98aa4394e504b413d98031d1f', 'info_dict': { - 'id': 'AP-1Q756YYX51W11', + 'id': 'AP-1Q6XCDTAN1W11', 'ext': 'mp4', - 'title': 'ABC of...WRC', + 'title': 'ABC of... WRC - ABC of... S1E6', 'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', 'duration': 1582.04, - # 'timestamp': 1488405786, - # 'upload_date': '20170301', }, }, { # episode - 'url': 'https://www.redbull.tv/video/AP-1PMT5JCWH1W11/grime?playlist=shows:shows-playall:web', + 'url': 'https://www.redbull.tv/video/AP-1PMHKJFCW1W11', 'info_dict': { - 'id': 'AP-1PMT5JCWH1W11', + 'id': 'AP-1PMHKJFCW1W11', 'ext': 'mp4', - 'title': 'Grime - Hashtags S2 E4', - 'description': 'md5:334b741c8c1ce65be057eab6773c1cf5', + 'title': 'Grime - Hashtags S2E4', + 'description': 'md5:b5f522b89b72e1e23216e5018810bb25', 'duration': 904.6, - # 'timestamp': 1487290093, - # 'upload_date': '20170217', - 'series': 'Hashtags', - 'season_number': 2, - 'episode_number': 4, }, 'params': { 'skip_download': True, }, - }, { - # segment - 'url': 'https://www.redbull.tv/live/AP-1R5DX49XS1W11/segment/AP-1QSAQJ6V52111/semi-finals', - 'info_dict': { - 'id': 'AP-1QSAQJ6V52111', - 'ext': 'mp4', - 'title': 'Semi Finals - Vans Park Series Pro Tour', - 'description': 'md5:306a2783cdafa9e65e39aa62f514fd97', - 'duration': 11791.991, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion', - 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) session = self._download_json( - 'https://api-v2.redbull.tv/session', video_id, + 'https://api.redbull.tv/v3/session', video_id, note='Downloading access token', query={ - 'build': '4.370.0', 'category': 'personal_computer', - 'os_version': '1.0', 'os_family': 'http', }) if session.get('code') == 'error': raise ExtractorError('%s said: %s' % ( self.IE_NAME, session['message'])) - auth = '%s %s' % (session.get('token_type', 'Bearer'), session['access_token']) + token = session['token'] try: - info = self._download_json( - 'https://api-v2.redbull.tv/content/%s' % video_id, + video = self._download_json( + 'https://api.redbull.tv/v3/products/' + video_id, video_id, note='Downloading video information', - headers={'Authorization': auth} + headers={'Authorization': token} ) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: error_message = self._parse_json( - e.cause.read().decode(), video_id)['message'] + e.cause.read().decode(), video_id)['error'] raise ExtractorError('%s said: %s' % ( self.IE_NAME, error_message), expected=True) raise - video = info['video_product'] - - title = info['title'].strip() + title = video['title'].strip() formats = self._extract_m3u8_formats( - video['url'], video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + 'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token), + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') self._sort_formats(formats) subtitles = {} - for _, captions in (try_get( - video, lambda x: x['attachments']['captions'], - dict) or {}).items(): - if not captions or not isinstance(captions, list): - continue - for caption in captions: - caption_url = caption.get('url') - if not caption_url: - continue - ext = caption.get('format') - if ext == 'xml': - ext = 'ttml' - subtitles.setdefault(caption.get('lang') or 'en', []).append({ - 'url': caption_url, - 'ext': ext, - }) + for resource in video.get('resources', []): + if resource.startswith('closed_caption_'): + splitted_resource = resource.split('_') + if splitted_resource[2]: + subtitles.setdefault('en', []).append({ + 'url': 'https://resources.redbull.tv/%s/%s' % (video_id, resource), + 'ext': splitted_resource[2], + }) - subheading = info.get('subheading') + subheading = video.get('subheading') if subheading: title += ' - %s' % subheading return { 'id': video_id, 'title': title, - 'description': info.get('long_description') or info.get( + 'description': video.get('long_description') or video.get( 'short_description'), 'duration': float_or_none(video.get('duration'), scale=1000), - # 'timestamp': unified_timestamp(info.get('published')), - 'series': info.get('show_title'), - 'season_number': int_or_none(info.get('season_number')), - 'episode_number': int_or_none(info.get('episode_number')), 'formats': formats, 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py new file mode 100644 index 000000000..8372925be --- /dev/null +++ b/youtube_dl/extractor/reddit.py @@ -0,0 +1,125 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, +) + + +class RedditIE(InfoExtractor): + _VALID_URL = r'https?://v\.redd\.it/(?P[^/?#&]+)' + _TEST = { + # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ + 'url': 'https://v.redd.it/zv89llsvexdz', + 'md5': '0a070c53eba7ec4534d95a5a1259e253', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'title': 'zv89llsvexdz', + }, + 'params': { + 'format': 'bestvideo', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats = self._extract_m3u8_formats( + 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id, + 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + + formats.extend(self._extract_mpd_formats( + 'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id, + mpd_id='dash', fatal=False)) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + +class RedditRIE(InfoExtractor): + _VALID_URL = r'(?Phttps?://(?:(?:www|old)\.)?reddit\.com/r/[^/]+/comments/(?P[^/?#&]+))' + _TESTS = [{ + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'title': 'That small heart attack.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1501941939, + 'upload_date': '20170805', + 'uploader': 'Antw87', + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', + 'only_matching': True, + }, { + # imgur + 'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, + }, { + # imgur @ old reddit + 'url': 'https://old.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, + }, { + # streamable + 'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/', + 'only_matching': True, + }, { + # youtube + 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + url, video_id = mobj.group('url', 'id') + + video_id = self._match_id(url) + + data = self._download_json( + url + '/.json', video_id)[0]['data']['children'][0]['data'] + + video_url = data['url'] + + # Avoid recursing into the same reddit URL + if 'reddit.com/' in video_url and '/%s/' % video_id in video_url: + raise ExtractorError('No media found', expected=True) + + over_18 = data.get('over_18') + if over_18 is True: + age_limit = 18 + elif over_18 is False: + age_limit = 0 + else: + age_limit = None + + return { + '_type': 'url_transparent', + 'url': video_url, + 'title': data.get('title'), + 'thumbnail': data.get('thumbnail'), + 'timestamp': float_or_none(data.get('created_utc')), + 'uploader': data.get('author'), + 'like_count': int_or_none(data.get('ups')), + 'dislike_count': int_or_none(data.get('downs')), + 'comment_count': int_or_none(data.get('num_comments')), + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index c367a6ae7..879bcf81d 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -15,12 +16,12 @@ class RedTubeIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P[0-9]+)' _TESTS = [{ 'url': 'http://www.redtube.com/66418', - 'md5': '7b8c22b5e7098a3e1c09709df1126d2d', + 'md5': 'fc08071233725f26b8f014dba9590005', 'info_dict': { 'id': '66418', 'ext': 'mp4', 'title': 'Sucked on a toilet', - 'upload_date': '20120831', + 'upload_date': '20110811', 'duration': 596, 'view_count': int, 'age_limit': 18, @@ -45,9 +46,10 @@ class RedTubeIE(InfoExtractor): raise ExtractorError('Video %s has been removed' % video_id, expected=True) title = self._html_search_regex( - (r'

(?P.+?)</h1>', - r'videoTitle\s*:\s*(["\'])(?P<title>)\1'), - webpage, 'title', group='title') + (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>', + r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',), + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) formats = [] sources = self._parse_json( @@ -62,7 +64,23 @@ class RedTubeIE(InfoExtractor): 'format_id': format_id, 'height': int_or_none(format_id), }) - else: + medias = self._parse_json( + self._search_regex( + r'mediaDefinition\s*:\s*(\[.+?\])', webpage, + 'media definitions', default='{}'), + video_id, fatal=False) + if medias and isinstance(medias, list): + for media in medias: + format_url = media.get('videoUrl') + if not format_url or not isinstance(format_url, compat_str): + continue + format_id = media.get('quality') + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + if not formats: video_url = self._html_search_regex( r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') formats.append({'url': video_url}) @@ -70,12 +88,14 @@ class RedTubeIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._search_regex( - r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<', + r'<span[^>]+>ADDED ([^<]+)<', webpage, 'upload date', fatal=False)) - duration = int_or_none(self._search_regex( - r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, default=None) or self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) view_count = str_to_int(self._search_regex( - r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', + (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)', + r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)'), webpage, 'view count', fatal=False)) # No self-labeling, but they describe themselves as diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py index d338b3a93..8bcf87126 100644 --- a/youtube_dl/extractor/rentv.py +++ b/youtube_dl/extractor/rentv.py @@ -3,6 +3,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, +) class RENTVIE(InfoExtractor): @@ -13,7 +17,9 @@ class RENTVIE(InfoExtractor): 'info_dict': { 'id': '118577', 'ext': 'mp4', - 'title': 'Документальный спецпроект: "Промывка мозгов. Технологии XXI века"' + 'title': 'Документальный спецпроект: "Промывка мозгов. Технологии XXI века"', + 'timestamp': 1472230800, + 'upload_date': '20160826', } }, { 'url': 'http://ren.tv/player/118577', @@ -26,9 +32,33 @@ class RENTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage('http://ren.tv/player/' + video_id, video_id) - jw_config = self._parse_json(self._search_regex( - r'config\s*=\s*({.+});', webpage, 'jw config'), video_id) - return self._parse_jwplayer_data(jw_config, video_id, m3u8_id='hls') + config = self._parse_json(self._search_regex( + r'config\s*=\s*({.+})\s*;', webpage, 'config'), video_id) + title = config['title'] + formats = [] + for video in config['src']: + src = video.get('src') + if not src or not isinstance(src, compat_str): + continue + ext = determine_ext(src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + }) + self._sort_formats(formats) + return { + 'id': video_id, + 'title': title, + 'description': config.get('description'), + 'thumbnail': config.get('image'), + 'duration': int_or_none(config.get('duration')), + 'timestamp': int_or_none(config.get('date')), + 'formats': formats, + } class RENTVArticleIE(InfoExtractor): diff --git a/youtube_dl/extractor/restudy.py b/youtube_dl/extractor/restudy.py index fd50065d4..d47fb45ca 100644 --- a/youtube_dl/extractor/restudy.py +++ b/youtube_dl/extractor/restudy.py @@ -5,8 +5,8 @@ from .common import InfoExtractor class RestudyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?restudy\.dk/video/play/id/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www|portal)\.)?restudy\.dk/video/[^/]+/id/(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'https://www.restudy.dk/video/play/id/1637', 'info_dict': { 'id': '1637', @@ -18,7 +18,10 @@ class RestudyIE(InfoExtractor): # rtmp download 'skip_download': True, } - } + }, { + 'url': 'https://portal.restudy.dk/video/leiden-frosteffekt/id/1637', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -29,7 +32,7 @@ class RestudyIE(InfoExtractor): description = self._og_search_description(webpage).strip() formats = self._extract_smil_formats( - 'https://www.restudy.dk/awsmedia/SmilDirectory/video_%s.xml' % video_id, + 'https://cdn.portal.restudy.dk/dynamic/themes/front/awsmedia/SmilDirectory/video_%s.xml' % video_id, video_id) self._sort_formats(formats) diff --git a/youtube_dl/extractor/ringtv.py b/youtube_dl/extractor/ringtv.py deleted file mode 100644 index 2c2c707bd..000000000 --- a/youtube_dl/extractor/ringtv.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class RingTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30', - 'md5': 'd25945f5df41cdca2d2587165ac28720', - 'info_dict': { - 'id': '857645', - 'ext': 'mp4', - 'title': 'Video: Luis Collazo says Victor Ortiz "better not quit on Jan. 30" - Ring TV', - 'description': 'Luis Collazo is excited about his Jan. 30 showdown with fellow former welterweight titleholder Victor Ortiz at Barclays Center in his hometown of Brooklyn. The SuperBowl week fight headlines a Golden Boy Live! card on Fox Sports 1.', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id').split('-')[0] - webpage = self._download_webpage(url, video_id) - - if mobj.group('type') == 'news': - video_id = self._search_regex( - r'''(?x)<iframe[^>]+src="http://cms\.springboardplatform\.com/ - embed_iframe/[0-9]+/video/([0-9]+)/''', - webpage, 'real video ID') - title = self._og_search_title(webpage) - description = self._html_search_regex( - r'addthis:description="([^"]+)"', - webpage, 'description', fatal=False) - final_url = 'http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/conversion/%s.mp4' % video_id - thumbnail_url = 'http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/snapshots/%s.jpg' % video_id - - return { - 'id': video_id, - 'url': final_url, - 'title': title, - 'thumbnail': thumbnail_url, - 'description': description, - } diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 46dfc78f5..8b703800e 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -68,7 +68,7 @@ class RoosterTeethIE(InfoExtractor): login_request = self._download_webpage( self._LOGIN_URL, None, - note='Logging in as %s' % username, + note='Logging in', data=urlencode_postdata(login_form), headers={ 'Referer': self._LOGIN_URL, diff --git a/youtube_dl/extractor/rozhlas.py b/youtube_dl/extractor/rozhlas.py index f8eda8dea..fccf69401 100644 --- a/youtube_dl/extractor/rozhlas.py +++ b/youtube_dl/extractor/rozhlas.py @@ -21,7 +21,7 @@ class RozhlasIE(InfoExtractor): } }, { 'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed', - 'skip_download': True, + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py index 666e90e90..18a327d81 100644 --- a/youtube_dl/extractor/rtl2.py +++ b/youtube_dl/extractor/rtl2.py @@ -1,12 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import re from .common import InfoExtractor from ..aes import aes_cbc_decrypt from ..compat import ( + compat_b64decode, compat_ord, compat_str, ) @@ -142,11 +142,11 @@ class RTL2YouIE(RTL2YouBaseIE): stream_data = self._download_json( self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id) - data, iv = base64.b64decode(stream_data['streamUrl']).decode().split(':') + data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':') stream_url = intlist_to_bytes(aes_cbc_decrypt( - bytes_to_intlist(base64.b64decode(data)), + bytes_to_intlist(compat_b64decode(data)), bytes_to_intlist(self._AES_KEY), - bytes_to_intlist(base64.b64decode(iv)) + bytes_to_intlist(compat_b64decode(iv)) )) if b'rtl2_you_video_not_found' in stream_url: raise ExtractorError('video not found', expected=True) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 3e22998c6..be36acc46 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -12,10 +12,10 @@ class RtlNlIE(InfoExtractor): IE_NAME = 'rtl.nl' IE_DESC = 'rtl.nl and rtlxl.nl' _VALID_URL = r'''(?x) - https?://(?:www\.)? + https?://(?:(?:www|static)\.)? (?: rtlxl\.nl/[^\#]*\#!/[^/]+/| - rtl\.nl/(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid=|video/) + rtl\.nl/(?:(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html|embed)\b.+?\buuid=|video/) ) (?P<id>[0-9a-f-]+)''' @@ -73,6 +73,9 @@ class RtlNlIE(InfoExtractor): }, { 'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/', 'only_matching': True, + }, { + 'url': 'https://static.rtl.nl/embed/?uuid=1a2970fc-5c0b-43ff-9fdc-927e39e6d1bc&autoplay=false&publicatiepunt=rtlnieuwsnl', + 'only_matching': True, }] def _real_extract(self, url): @@ -90,58 +93,11 @@ class RtlNlIE(InfoExtractor): meta = info.get('meta', {}) - # m3u8 streams are encrypted and may not be handled properly by older ffmpeg/avconv. - # To workaround this previously adaptive -> flash trick was used to obtain - # unencrypted m3u8 streams (see https://github.com/rg3/youtube-dl/issues/4118) - # and bypass georestrictions as well. - # Currently, unencrypted m3u8 playlists are (intentionally?) invalid and therefore - # unusable albeit can be fixed by simple string replacement (see - # https://github.com/rg3/youtube-dl/pull/6337) - # Since recent ffmpeg and avconv handle encrypted streams just fine encrypted - # streams are used now. videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath formats = self._extract_m3u8_formats( m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False) - - video_urlpart = videopath.split('/adaptive/')[1][:-5] - PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' - - PG_FORMATS = ( - ('a2t', 512, 288), - ('a3t', 704, 400), - ('nettv', 1280, 720), - ) - - def pg_format(format_id, width, height): - return { - 'url': PG_URL_TEMPLATE % (format_id, video_urlpart), - 'format_id': 'pg-%s' % format_id, - 'protocol': 'http', - 'width': width, - 'height': height, - } - - if not formats: - formats = [pg_format(*pg_tuple) for pg_tuple in PG_FORMATS] - else: - pg_formats = [] - for format_id, width, height in PG_FORMATS: - try: - # Find hls format with the same width and height corresponding - # to progressive format and copy metadata from it. - f = next(f for f in formats if f.get('height') == height) - # hls formats may have invalid width - f['width'] = width - f_copy = f.copy() - f_copy.update(pg_format(format_id, width, height)) - pg_formats.append(f_copy) - except StopIteration: - # Missing hls format does mean that no progressive format with - # such width and height exists either. - pass - formats.extend(pg_formats) self._sort_formats(formats) thumbnails = [] diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 746677a24..ce9db0629 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -7,9 +7,11 @@ import time from .common import InfoExtractor from ..compat import ( + compat_b64decode, compat_struct_unpack, ) from ..utils import ( + determine_ext, ExtractorError, float_or_none, remove_end, @@ -20,7 +22,7 @@ from ..utils import ( def _decrypt_url(png): - encrypted_data = base64.b64decode(png.encode('utf-8')) + encrypted_data = compat_b64decode(png) text_index = encrypted_data.find(b'tEXt') text_chunk = encrypted_data[text_index - 4:] length = compat_struct_unpack('!I', text_chunk[:4])[0] @@ -30,6 +32,9 @@ def _decrypt_url(png): hash_index = data.index('#') alphabet_data = data[:hash_index] url_data = data[hash_index + 1:] + if url_data[0] == 'H' and url_data[3] == '%': + # remove useless HQ%% at the start + url_data = url_data[4:] alphabet = [] e = 0 @@ -84,6 +89,18 @@ class RTVEALaCartaIE(InfoExtractor): 'title': 'TODO', }, 'skip': 'The f4m manifest can\'t be used yet', + }, { + 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', + 'md5': 'e55e162379ad587e9640eda4f7353c0f', + 'info_dict': { + 'id': '4236788', + 'ext': 'mp4', + 'title': 'Servir y proteger - Capítulo 104 ', + 'duration': 3222.0, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, @@ -107,24 +124,41 @@ class RTVEALaCartaIE(InfoExtractor): video_id)['page']['items'][0] if info['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) + title = info['title'] png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) png_request = sanitized_Request(png_url) png_request.add_header('Referer', url) png = self._download_webpage(png_request, video_id, 'Downloading url information') video_url = _decrypt_url(png) - if not video_url.endswith('.f4m'): + ext = determine_ext(video_url) + + formats = [] + if not video_url.endswith('.f4m') and ext != 'm3u8': if '?' not in video_url: video_url = video_url.replace('resources/', 'auth/resources/') video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds', fatal=False)) + else: + formats.append({ + 'url': video_url, + }) + self._sort_formats(formats) + subtitles = None if info.get('sbtFile') is not None: subtitles = self.extract_subtitles(video_id, info['sbtFile']) return { 'id': video_id, - 'title': info['title'], - 'url': video_url, + 'title': title, + 'formats': formats, 'thumbnail': info.get('image'), 'page_url': url, 'subtitles': subtitles, diff --git a/youtube_dl/extractor/rtvs.py b/youtube_dl/extractor/rtvs.py new file mode 100644 index 000000000..6573b260d --- /dev/null +++ b/youtube_dl/extractor/rtvs.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RTVSIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv/\d+/(?P<id>\d+)' + _TESTS = [{ + # radio archive + 'url': 'http://www.rtvs.sk/radio/archiv/11224/414872', + 'md5': '134d5d6debdeddf8a5d761cbc9edacb8', + 'info_dict': { + 'id': '414872', + 'ext': 'mp3', + 'title': 'Ostrov pokladov 1 časť.mp3' + }, + 'params': { + 'skip_download': True, + } + }, { + # tv archive + 'url': 'http://www.rtvs.sk/televizia/archiv/8249/63118', + 'md5': '85e2c55cf988403b70cac24f5c086dc6', + 'info_dict': { + 'id': '63118', + 'ext': 'mp4', + 'title': 'Amaro Džives - Náš deň', + 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.' + }, + 'params': { + 'skip_download': True, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + playlist_url = self._search_regex( + r'playlist["\']?\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'playlist url', group='url') + + data = self._download_json( + playlist_url, video_id, 'Downloading playlist')[0] + return self._parse_jwplayer_data(data, video_id=video_id) diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py index 2b830cf47..3c8053a26 100644 --- a/youtube_dl/extractor/ruhd.py +++ b/youtube_dl/extractor/ruhd.py @@ -25,7 +25,7 @@ class RUHDIE(InfoExtractor): video_url = self._html_search_regex( r'<param name="src" value="([^"]+)"', webpage, 'video url') title = self._html_search_regex( - r'<title>([^<]+)   RUHD.ru - Видео Высокого качества №1 в России!', + r'([^<]+)   RUHD\.ru - Видео Высокого качества №1 в России!', webpage, 'title') description = self._html_search_regex( r'(?s)
(.+?)', diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 889fa7628..89d89b65a 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -7,43 +7,84 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_str, + compat_parse_qs, + compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, - unified_strdate, + bool_or_none, + int_or_none, + try_get, + unified_timestamp, ) -class RutubeIE(InfoExtractor): +class RutubeBaseIE(InfoExtractor): + def _extract_video(self, video, video_id=None, require_title=True): + title = video['title'] if require_title else video.get('title') + + age_limit = video.get('is_adult') + if age_limit is not None: + age_limit = 18 if age_limit is True else 0 + + uploader_id = try_get(video, lambda x: x['author']['id']) + category = try_get(video, lambda x: x['category']['name']) + + return { + 'id': video.get('id') or video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('thumbnail_url'), + 'duration': int_or_none(video.get('duration')), + 'uploader': try_get(video, lambda x: x['author']['name']), + 'uploader_id': compat_str(uploader_id) if uploader_id else None, + 'timestamp': unified_timestamp(video.get('created_ts')), + 'category': [category] if category else None, + 'age_limit': age_limit, + 'view_count': int_or_none(video.get('hits')), + 'comment_count': int_or_none(video.get('comments_count')), + 'is_live': bool_or_none(video.get('is_livestream')), + } + + +class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P[\da-z]{32})' _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', + 'md5': '79938ade01294ef7e27574890d0d3769', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Раненный кенгуру забежал в аптеку', 'description': 'http://www.ntdtv.ru ', 'duration': 80, 'uploader': 'NTDRussian', 'uploader_id': '29790', + 'timestamp': 1381943602, 'upload_date': '20131016', 'age_limit': 0, }, - 'params': { - # It requires ffmpeg (m3u8 download) - 'skip_download': True, - }, }, { 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, }, { 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, + }, { + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', + 'only_matching': True, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url) + @staticmethod def _extract_urls(webpage): return [mobj.group('url') for mobj in re.finditer( @@ -52,12 +93,12 @@ class RutubeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + video = self._download_json( 'http://rutube.ru/api/video/%s/?format=json' % video_id, video_id, 'Downloading video JSON') - # Some videos don't have the author field - author = video.get('author') or {} + info = self._extract_video(video, video_id) options = self._download_json( 'http://rutube.ru/api/play/options/%s/?format=json' % video_id, @@ -79,19 +120,8 @@ class RutubeIE(InfoExtractor): }) self._sort_formats(formats) - return { - 'id': video['id'], - 'title': video['title'], - 'description': video['description'], - 'duration': video['duration'], - 'view_count': video['hits'], - 'formats': formats, - 'thumbnail': video['thumbnail_url'], - 'uploader': author.get('name'), - 'uploader_id': compat_str(author['id']) if author else None, - 'upload_date': unified_strdate(video['created_ts']), - 'age_limit': 18 if video['is_adult'] else 0, - } + info['formats'] = formats + return info class RutubeEmbedIE(InfoExtractor): @@ -103,7 +133,8 @@ class RutubeEmbedIE(InfoExtractor): 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', 'info_dict': { 'id': 'a10e53b86e8f349080f718582ce4c661', - 'ext': 'mp4', + 'ext': 'flv', + 'timestamp': 1387830582, 'upload_date': '20131223', 'uploader_id': '297833', 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix

восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', @@ -111,7 +142,7 @@ class RutubeEmbedIE(InfoExtractor): 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', }, 'params': { - 'skip_download': 'Requires ffmpeg', + 'skip_download': True, }, }, { 'url': 'http://rutube.ru/play/embed/8083783', @@ -125,10 +156,51 @@ class RutubeEmbedIE(InfoExtractor): canonical_url = self._html_search_regex( r'\d+)' @@ -142,27 +214,8 @@ class RutubeChannelIE(InfoExtractor): _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' - def _extract_videos(self, channel_id, channel_title=None): - entries = [] - for pagenum in itertools.count(1): - page = self._download_json( - self._PAGE_TEMPLATE % (channel_id, pagenum), - channel_id, 'Downloading page %s' % pagenum) - results = page['results'] - if not results: - break - entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results) - if not page['has_next']: - break - return self.playlist_result(entries, channel_id, channel_title) - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - return self._extract_videos(channel_id) - - -class RutubeMovieIE(RutubeChannelIE): +class RutubeMovieIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:movie' IE_DESC = 'Rutube movies' _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P\d+)' @@ -176,11 +229,11 @@ class RutubeMovieIE(RutubeChannelIE): movie = self._download_json( self._MOVIE_TEMPLATE % movie_id, movie_id, 'Downloading movie JSON') - movie_name = movie['name'] - return self._extract_videos(movie_id, movie_name) + return self._extract_playlist( + movie_id, playlist_name=movie.get('name')) -class RutubePersonIE(RutubeChannelIE): +class RutubePersonIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:person' IE_DESC = 'Rutube person videos' _VALID_URL = r'https?://rutube\.ru/video/person/(?P\d+)' @@ -193,3 +246,37 @@ class RutubePersonIE(RutubeChannelIE): }] _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' + + +class RutubePlaylistIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:playlist' + IE_DESC = 'Rutube playlists' + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P\d+)' + _TESTS = [{ + 'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag', + 'info_dict': { + 'id': '3097', + }, + 'playlist_count': 27, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', + 'only_matching': True, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' + + @classmethod + def suitable(cls, url): + if not super(RutubePlaylistIE, cls).suitable(url): + return False + params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) + + def _next_page_url(self, page_num, playlist_id, item_kind): + return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num) + + def _real_extract(self, url): + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + playlist_kind = qs['pl_type'][0] + playlist_id = qs['pl_id'][0] + return self._extract_playlist(playlist_id, item_kind=playlist_kind) diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 6c09df25a..9fa8688f8 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -53,6 +53,12 @@ class RuutuIE(InfoExtractor): 'age_limit': 0, }, }, + # Episode where is "NOT-USED", but has other + # downloadable sources available. + { + 'url': 'http://www.ruutu.fi/video/3193728', + 'only_matching': True, + }, ] def _real_extract(self, url): @@ -72,7 +78,7 @@ class RuutuIE(InfoExtractor): video_url = child.text if (not video_url or video_url in processed_urls or any(p in video_url for p in ('NOT_USED', 'NOT-USED'))): - return + continue processed_urls.append(video_url) ext = determine_ext(video_url) if ext == 'm3u8': diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 909a6ba97..cc6698f88 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -61,7 +61,7 @@ class SafariBaseIE(InfoExtractor): request = sanitized_Request( self._LOGIN_URL, urlencode_postdata(login_form), headers=headers) login_page = self._download_webpage( - request, None, 'Logging in as %s' % username) + request, None, 'Logging in') if not is_logged(login_page): raise ExtractorError( diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py deleted file mode 100644 index 96e43af84..000000000 --- a/youtube_dl/extractor/sandia.py +++ /dev/null @@ -1,65 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - mimetype2ext, -) - - -class SandiaIE(InfoExtractor): - IE_DESC = 'Sandia National Laboratories' - _VALID_URL = r'https?://digitalops\.sandia\.gov/Mediasite/Play/(?P[0-9a-f]+)' - _TEST = { - 'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d', - 'md5': '9422edc9b9a60151727e4b6d8bef393d', - 'info_dict': { - 'id': '24aace4429fc450fb5b38cdbf424a66e1d', - 'ext': 'mp4', - 'title': 'Xyce Software Training - Section 1', - 'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}', - 'upload_date': '20120409', - 'timestamp': 1333983600, - 'duration': 7794, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - presentation_data = self._download_json( - 'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', - video_id, data=json.dumps({ - 'getPlayerOptionsRequest': { - 'ResourceId': video_id, - 'QueryString': '', - } - }), headers={ - 'Content-Type': 'application/json; charset=utf-8', - })['d']['Presentation'] - - title = presentation_data['Title'] - - formats = [] - for stream in presentation_data.get('Streams', []): - for fd in stream.get('VideoUrls', []): - formats.append({ - 'format_id': fd['MediaType'], - 'format_note': fd['MimeType'].partition('/')[2], - 'ext': mimetype2ext(fd['MimeType']), - 'url': fd['Location'], - 'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': presentation_data.get('Description'), - 'formats': formats, - 'timestamp': int_or_none(presentation_data.get('UnixTime'), 1000), - 'duration': int_or_none(presentation_data.get('Duration'), 1000), - } diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py index 597d6f543..4023aeef8 100644 --- a/youtube_dl/extractor/scrippsnetworks.py +++ b/youtube_dl/extractor/scrippsnetworks.py @@ -1,60 +1,119 @@ # coding: utf-8 from __future__ import unicode_literals -from .adobepass import AdobePassIE +import json +import hashlib +import re + +from .aws import AWSIE +from .anvato import AnvatoIE from ..utils import ( - int_or_none, smuggle_url, - update_url_query, + urlencode_postdata, + xpath_text, ) -class ScrippsNetworksWatchIE(AdobePassIE): +class ScrippsNetworksWatchIE(AWSIE): IE_NAME = 'scrippsnetworks:watch' - _VALID_URL = r'https?://watch\.(?:hgtv|foodnetwork|travelchannel|diynetwork|cookingchanneltv)\.com/player\.[A-Z0-9]+\.html#(?P\d+)' - _TEST = { - 'url': 'http://watch.hgtv.com/player.HNT.html#0256538', + _VALID_URL = r'''(?x) + https?:// + watch\. + (?Phgtv|foodnetwork|travelchannel|diynetwork|cookingchanneltv|geniuskitchen)\.com/ + (?: + player\.[A-Z0-9]+\.html\#| + show/(?:[^/]+/){2}| + player/ + ) + (?P\d+) + ''' + _TESTS = [{ + 'url': 'http://watch.hgtv.com/show/HGTVE/Best-Ever-Treehouses/2241515/Best-Ever-Treehouses/', 'md5': '26545fd676d939954c6808274bdb905a', 'info_dict': { - 'id': '0256538', + 'id': '4173834', 'ext': 'mp4', - 'title': 'Seeking a Wow House', - 'description': 'Buyers retiring in Palm Springs, California, want a modern house with major wow factor. They\'re also looking for a pool and a large, open floorplan with tall windows looking out at the views.', - 'uploader': 'SCNI', - 'upload_date': '20170207', - 'timestamp': 1486450493, + 'title': 'Best Ever Treehouses', + 'description': "We're searching for the most over the top treehouses.", + 'uploader': 'ANV', + 'upload_date': '20170922', + 'timestamp': 1506056400, }, - 'skip': 'requires TV provider authentication', + 'params': { + 'skip_download': True, + }, + 'add_ie': [AnvatoIE.ie_key()], + }, { + 'url': 'http://watch.diynetwork.com/show/DSAL/Salvage-Dawgs/2656646/Covington-Church/', + 'only_matching': True, + }, { + 'url': 'http://watch.diynetwork.com/player.HNT.html#2656646', + 'only_matching': True, + }, { + 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', + 'only_matching': True, + }] + + _SNI_TABLE = { + 'hgtv': 'hgtv', + 'diynetwork': 'diy', + 'foodnetwork': 'food', + 'cookingchanneltv': 'cook', + 'travelchannel': 'trav', + 'geniuskitchen': 'genius', } - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - channel = self._parse_json(self._search_regex( - r'"channels"\s*:\s*(\[.+\])', - webpage, 'channels'), video_id)[0] - video_data = next(v for v in channel['videos'] if v.get('nlvid') == video_id) - title = video_data['title'] - release_url = video_data['releaseUrl'] - if video_data.get('restricted'): - requestor_id = self._search_regex( - r'requestorId\s*=\s*"([^"]+)";', webpage, 'requestor id') - resource = self._get_mvpd_resource( - requestor_id, title, video_id, - video_data.get('ratings', [{}])[0].get('rating')) - auth = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - release_url = update_url_query(release_url, {'auth': auth}) + _AWS_API_KEY = 'E7wSQmq0qK6xPrF13WmzKiHo4BQ7tip4pQcSXVl1' + _AWS_PROXY_HOST = 'web.api.video.snidigital.com' - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': smuggle_url(release_url, {'force_smil_url': True}), - 'description': video_data.get('description'), - 'thumbnail': video_data.get('thumbnailUrl'), - 'series': video_data.get('showTitle'), - 'season_number': int_or_none(video_data.get('season')), - 'episode_number': int_or_none(video_data.get('episodeNumber')), - 'ie_key': 'ThePlatform', - } + _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site_id, video_id = mobj.group('site', 'id') + + aws_identity_id_json = json.dumps({ + 'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % self._AWS_REGION + }).encode('utf-8') + token = self._download_json( + 'https://cognito-identity.%s.amazonaws.com/' % self._AWS_REGION, video_id, + data=aws_identity_id_json, + headers={ + 'Accept': '*/*', + 'Content-Type': 'application/x-amz-json-1.1', + 'Referer': url, + 'X-Amz-Content-Sha256': hashlib.sha256(aws_identity_id_json).hexdigest(), + 'X-Amz-Target': 'AWSCognitoIdentityService.GetOpenIdToken', + 'X-Amz-User-Agent': self._AWS_USER_AGENT, + })['Token'] + + sts = self._download_xml( + 'https://sts.amazonaws.com/', video_id, data=urlencode_postdata({ + 'Action': 'AssumeRoleWithWebIdentity', + 'RoleArn': 'arn:aws:iam::710330595350:role/Cognito_WebAPIUnauth_Role', + 'RoleSessionName': 'web-identity', + 'Version': '2011-06-15', + 'WebIdentityToken': token, + }), headers={ + 'Referer': url, + 'X-Amz-User-Agent': self._AWS_USER_AGENT, + 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', + }) + + def get(key): + return xpath_text( + sts, './/{https://sts.amazonaws.com/doc/2011-06-15/}%s' % key, + fatal=True) + + mcp_id = self._aws_execute_api({ + 'uri': '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id), + 'access_key': get('AccessKeyId'), + 'secret_key': get('SecretAccessKey'), + 'session_token': get('SessionToken'), + }, video_id)['results'][0]['mcpId'] + + return self.url_result( + smuggle_url( + 'anvato:anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a:%s' % mcp_id, + {'geo_countries': ['US']}), + AnvatoIE.ie_key(), video_id=mcp_id) diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py new file mode 100644 index 000000000..264e1dd8b --- /dev/null +++ b/youtube_dl/extractor/servus.py @@ -0,0 +1,43 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class ServusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:at|de)/p/[^/]+/(?PAA-\w+|\d+-\d+)' + _TESTS = [{ + 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + 'md5': '046dee641cda1c4cabe13baef3be2c1c', + 'info_dict': { + 'id': 'AA-1T6VBU5PW1W12', + 'ext': 'mp4', + 'title': 'Die Grünen aus Volkssicht', + 'description': 'md5:052b5da1cb2cd7d562ef1f19be5a5cba', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + formats = self._extract_m3u8_formats( + 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/sevenplus.py b/youtube_dl/extractor/sevenplus.py new file mode 100644 index 000000000..84568ac69 --- /dev/null +++ b/youtube_dl/extractor/sevenplus.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .brightcove import BrightcoveNewIE +from ..compat import compat_str +from ..utils import ( + try_get, + update_url_query, +) + + +class SevenPlusIE(BrightcoveNewIE): + IE_NAME = '7plus' + _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P[^?]+\?.*?\bepisode-id=(?P[^&#]+))' + _TESTS = [{ + 'url': 'https://7plus.com.au/MTYS?episode-id=MTYS7-003', + 'info_dict': { + 'id': 'MTYS7-003', + 'ext': 'mp4', + 'title': 'S7 E3 - Wind Surf', + 'description': 'md5:29c6a69f21accda7601278f81b46483d', + 'uploader_id': '5303576322001', + 'upload_date': '20171201', + 'timestamp': 1512106377, + 'series': 'Mighty Ships', + 'season_number': 7, + 'episode_number': 3, + 'episode': 'Wind Surf', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + } + }, { + 'url': 'https://7plus.com.au/UUUU?episode-id=AUMS43-001', + 'only_matching': True, + }] + + def _real_extract(self, url): + path, episode_id = re.match(self._VALID_URL, url).groups() + + media = self._download_json( + 'https://videoservice.swm.digital/playback', episode_id, query={ + 'appId': '7plus', + 'deviceType': 'web', + 'platformType': 'web', + 'accountId': 5303576322001, + 'referenceId': 'ref:' + episode_id, + 'deliveryId': 'csai', + 'videoType': 'vod', + })['media'] + + for source in media.get('sources', {}): + src = source.get('src') + if not src: + continue + source['src'] = update_url_query(src, {'rule': ''}) + + info = self._parse_brightcove_metadata(media, episode_id) + + content = self._download_json( + 'https://component-cdn.swm.digital/content/' + path, + episode_id, headers={ + 'market-id': 4, + }, fatal=False) or {} + for item in content.get('items', {}): + if item.get('componentData', {}).get('componentType') == 'infoPanel': + for src_key, dst_key in [('title', 'title'), ('shortSynopsis', 'description')]: + value = item.get(src_key) + if value: + info[dst_key] = value + info['series'] = try_get( + item, lambda x: x['seriesLogo']['name'], compat_str) + mobj = re.search(r'^S(\d+)\s+E(\d+)\s+-\s+(.+)$', info['title']) + if mobj: + info.update({ + 'season_number': int(mobj.group(1)), + 'episode_number': int(mobj.group(2)), + 'episode': mobj.group(3), + }) + + return info diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py new file mode 100644 index 000000000..6d4e3b76d --- /dev/null +++ b/youtube_dl/extractor/seznamzpravy.py @@ -0,0 +1,169 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, +) +from ..utils import ( + urljoin, + int_or_none, + parse_codecs, + try_get, +) + + +def _raw_id(src_url): + return compat_urllib_parse_urlparse(src_url).path.split('/')[-1] + + +class SeznamZpravyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc=' + _TESTS = [{ + 'url': 'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5§ionPrefixPreroll=%2Fzpravy', + 'info_dict': { + 'id': '170889', + 'ext': 'mp4', + 'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'duration': 241, + 'series': 'Svět bez obalu', + }, + 'params': { + 'skip_download': True, + }, + }, { + # with Location key + 'url': 'https://www.seznamzpravy.cz/iframe/player?duration=null&serviceSlug=zpravy&src=https%3A%2F%2Flive-a.sdn.szn.cz%2Fv_39%2F59e468fe454f8472a96af9fa%3Ffl%3Dmdk%2C5c1e2840%7C&itemType=livevod&autoPlay=false&title=P%C5%99edseda%20KDU-%C4%8CSL%20Pavel%20B%C4%9Blobr%C3%A1dek%20ve%20volebn%C3%AD%20V%C3%BDzv%C4%9B%20Seznamu&series=V%C3%BDzva&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_G_J%2FjTBCs.jpeg%3Ffl%3Dcro%2C0%2C0%2C1280%2C720%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=16&height=9&cutFrom=0&cutTo=0&splVersion=VOD&contentId=185688&contextId=38489&showAdvert=true&collocation=&hideFullScreen=false&hideSubtitles=false&embed=&isVideoTooShortForPreroll=false&isVideoTooShortForPreroll2=false&isVideoTooLongForPostroll=false&fakePostrollZoneID=seznam.clanky.zpravy.preroll&fakePrerollZoneID=seznam.clanky.zpravy.preroll&videoCommentId=&trim=default_16x9&noPrerollVideoLength=30&noPreroll2VideoLength=undefined&noMidrollVideoLength=0&noPostrollVideoLength=999999&autoplayPossible=true&version=5.0.41&dotService=zpravy&gemiusPrismIdentifier=zD3g7byfW5ekpXmxTVLaq5Srjw5i4hsYo0HY1aBwIe..27&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5§ionPrefixPreroll=%2Fzpravy%2Fvyzva&zoneIdPostroll=seznam.pack.videospot&skipOffsetPostroll=5§ionPrefixPostroll=%2Fzpravy%2Fvyzva®ression=false', + 'info_dict': { + 'id': '185688', + 'ext': 'mp4', + 'title': 'Předseda KDU-ČSL Pavel Bělobrádek ve volební Výzvě Seznamu', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'series': 'Výzva', + }, + 'params': { + 'skip_download': True, + }, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1', + webpage)] + + def _extract_sdn_formats(self, sdn_url, video_id): + sdn_data = self._download_json(sdn_url, video_id) + + if sdn_data.get('Location'): + sdn_url = sdn_data['Location'] + sdn_data = self._download_json(sdn_url, video_id) + + formats = [] + mp4_formats = try_get(sdn_data, lambda x: x['data']['mp4'], dict) or {} + for format_id, format_data in mp4_formats.items(): + relative_url = format_data.get('url') + if not relative_url: + continue + + try: + width, height = format_data.get('resolution') + except (TypeError, ValueError): + width, height = None, None + + f = { + 'url': urljoin(sdn_url, relative_url), + 'format_id': 'http-%s' % format_id, + 'tbr': int_or_none(format_data.get('bandwidth'), scale=1000), + 'width': int_or_none(width), + 'height': int_or_none(height), + } + f.update(parse_codecs(format_data.get('codec'))) + formats.append(f) + + pls = sdn_data.get('pls', {}) + + def get_url(format_id): + return try_get(pls, lambda x: x[format_id]['url'], compat_str) + + dash_rel_url = get_url('dash') + if dash_rel_url: + formats.extend(self._extract_mpd_formats( + urljoin(sdn_url, dash_rel_url), video_id, mpd_id='dash', + fatal=False)) + + hls_rel_url = get_url('hls') + if hls_rel_url: + formats.extend(self._extract_m3u8_formats( + urljoin(sdn_url, hls_rel_url), video_id, ext='mp4', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + + src = params['src'][0] + title = params['title'][0] + video_id = params.get('contentId', [_raw_id(src)])[0] + formats = self._extract_sdn_formats(src + 'spl2,2,VOD', video_id) + + duration = int_or_none(params.get('duration', [None])[0]) + series = params.get('series', [None])[0] + thumbnail = params.get('poster', [None])[0] + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'series': series, + 'formats': formats, + } + + +class SeznamZpravyArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/clanek/(?:[^/?#&]+)-(?P\d+)' + _API_URL = 'https://apizpravy.seznam.cz/' + + _TESTS = [{ + # two videos on one page, with SDN URL + 'url': 'https://www.seznamzpravy.cz/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', + 'info_dict': { + 'id': '35990', + 'title': 'md5:6011c877a36905f28f271fcd8dcdb0f2', + 'description': 'md5:933f7b06fa337a814ba199d3596d27ba', + }, + 'playlist_count': 2, + }, { + # video with live stream URL + 'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489', + 'info_dict': { + 'id': '38489', + 'title': 'md5:8fa1afdc36fd378cf0eba2b74c5aca60', + 'description': 'md5:428e7926a1a81986ec7eb23078004fb4', + }, + 'playlist_count': 1, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + webpage = self._download_webpage(url, article_id) + + info = self._search_json_ld(webpage, article_id, default={}) + + title = info.get('title') or self._og_search_title(webpage, fatal=False) + description = info.get('description') or self._og_search_description(webpage) + + return self.playlist_result([ + self.url_result(url, ie=SeznamZpravyIE.ie_key()) + for url in SeznamZpravyIE._extract_urls(webpage)], + article_id, title, description) diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 62d41e88a..5c2a6206b 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -1,45 +1,74 @@ # coding: utf-8 from __future__ import unicode_literals -import re import json +import math +import re -from .common import InfoExtractor +from .aws import AWSIE from ..compat import compat_HTTPError from ..utils import ( + clean_html, ExtractorError, + InAdvancePagedList, int_or_none, parse_iso8601, str_or_none, urlencode_postdata, - clean_html, ) -class ShahidIE(InfoExtractor): +class ShahidBaseIE(AWSIE): + _AWS_PROXY_HOST = 'api2.shahid.net' + _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh' + + def _handle_error(self, e): + fail_data = self._parse_json( + e.cause.read().decode('utf-8'), None, fatal=False) + if fail_data: + faults = fail_data.get('faults', []) + faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) + if faults_message: + raise ExtractorError(faults_message, expected=True) + + def _call_api(self, path, video_id, request=None): + query = {} + if request: + query['request'] = json.dumps(request) + try: + return self._aws_execute_api({ + 'uri': '/proxy/v2/' + path, + 'access_key': 'AKIAI6X4TYCIXM2B7MUQ', + 'secret_key': '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', + }, video_id, query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise + + +class ShahidIE(ShahidBaseIE): _NETRC_MACHINE = 'shahid' - _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?Pepisode|movie)/(?P\d+)' + _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?Pepisode|clip|movie)-(?P\d+)' _TESTS = [{ - 'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html', + 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286', 'info_dict': { - 'id': '90574', + 'id': '275286', 'ext': 'mp4', - 'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3', - 'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان', - 'duration': 2972, - 'timestamp': 1422057420, - 'upload_date': '20150123', + 'title': 'مجلس الشباب الموسم 1 كليب 1', + 'timestamp': 1506988800, + 'upload_date': '20171003', }, 'params': { # m3u8 download 'skip_download': True, } }, { - 'url': 'https://shahid.mbc.net/ar/movie/151746/%D8%A7%D9%84%D9%82%D9%86%D8%A7%D8%B5%D8%A9.html', + 'url': 'https://shahid.mbc.net/ar/movies/%D8%A7%D9%84%D9%82%D9%86%D8%A7%D8%B5%D8%A9/movie-151746', 'only_matching': True }, { # shahid plus subscriber only - 'url': 'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html', + 'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511', 'only_matching': True }] @@ -60,13 +89,7 @@ class ShahidIE(InfoExtractor): })['user'] except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): - fail_data = self._parse_json( - e.cause.read().decode('utf-8'), None, fatal=False) - if fail_data: - faults = fail_data.get('faults', []) - faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) - if faults_message: - raise ExtractorError(faults_message, expected=True) + self._handle_error(e) raise self._download_webpage( @@ -80,37 +103,41 @@ class ShahidIE(InfoExtractor): 'sessionId': user_data['sessionId'], })) - def _get_api_data(self, response): - data = response.get('data', {}) + def _real_extract(self, url): + page_type, video_id = re.match(self._VALID_URL, url).groups() + if page_type == 'clip': + page_type = 'episode' + playout = self._call_api( + 'playout/url/' + video_id, video_id)['playout'] + + if playout.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) + + formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4') + self._sort_formats(formats) + + # video = self._call_api( + # 'product/id', video_id, { + # 'id': video_id, + # 'productType': 'ASSET', + # 'productSubType': page_type.upper() + # })['productModel'] + + response = self._download_json( + 'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id), + video_id, 'Downloading video JSON', query={ + 'apiKey': 'sh@hid0nlin3', + 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', + }) + data = response.get('data', {}) error = data.get('error') if error: raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), expected=True) - return data - - def _real_extract(self, url): - page_type, video_id = re.match(self._VALID_URL, url).groups() - - player = self._get_api_data(self._download_json( - 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-player.html' % video_id, - video_id, 'Downloading player JSON')) - - if player.get('drm'): - raise ExtractorError('This video is DRM protected.', expected=True) - - formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4') - self._sort_formats(formats) - - video = self._get_api_data(self._download_json( - 'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id), - video_id, 'Downloading video JSON', query={ - 'apiKey': 'sh@hid0nlin3', - 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', - }))[page_type] - + video = data[page_type] title = video['title'] categories = [ category['name'] @@ -132,3 +159,57 @@ class ShahidIE(InfoExtractor): 'episode_id': video_id, 'formats': formats, } + + +class ShahidShowIE(ShahidBaseIE): + _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P\d+)' + _TESTS = [{ + 'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187', + 'info_dict': { + 'id': '79187', + 'title': 'رامز قرش البحر', + 'description': 'md5:c88fa7e0f02b0abd39d417aee0d046ff', + }, + 'playlist_mincount': 32, + }, { + 'url': 'https://shahid.mbc.net/ar/series/How-to-live-Longer-(The-Big-Think)/series-291861', + 'only_matching': True + }] + _PAGE_SIZE = 30 + + def _real_extract(self, url): + show_id = self._match_id(url) + + product = self._call_api( + 'playableAsset', show_id, {'showId': show_id})['productModel'] + playlist = product['playlist'] + playlist_id = playlist['id'] + show = product.get('show', {}) + + def page_func(page_num): + playlist = self._call_api( + 'product/playlist', show_id, { + 'playListId': playlist_id, + 'pageNumber': page_num, + 'pageSize': 30, + 'sorts': [{ + 'order': 'DESC', + 'type': 'SORTDATE' + }], + }) + for product in playlist.get('productList', {}).get('products', []): + product_url = product.get('productUrl', []).get('url') + if not product_url: + continue + yield self.url_result( + product_url, 'Shahid', + str_or_none(product.get('id')), + product.get('title')) + + entries = InAdvancePagedList( + page_func, + math.ceil(playlist['count'] / self._PAGE_SIZE), + self._PAGE_SIZE) + + return self.playlist_result( + entries, show_id, show.get('title'), show.get('description')) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 89e19e927..b2250afdd 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -1,8 +1,7 @@ from __future__ import unicode_literals -import base64 - from .common import InfoExtractor +from ..compat import compat_b64decode from ..utils import ( ExtractorError, int_or_none, @@ -22,8 +21,8 @@ class SharedBaseIE(InfoExtractor): video_url = self._extract_video_url(webpage, video_id, url) - title = base64.b64decode(self._html_search_meta( - 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8') + title = compat_b64decode(self._html_search_meta( + 'full:title', webpage, 'title')).decode('utf-8') filesize = int_or_none(self._html_search_meta( 'full:size', webpage, 'file size', fatal=False)) @@ -92,5 +91,4 @@ class VivoIE(SharedBaseIE): r'InitializeStream\s*\(\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'stream', group='url'), video_id, - transform_source=lambda x: base64.b64decode( - x.encode('ascii')).decode('utf-8'))[0] + transform_source=lambda x: compat_b64decode(x).decode('utf-8'))[0] diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 547be8f95..69951e387 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, +) from ..utils import ( determine_ext, int_or_none, @@ -48,6 +52,7 @@ class SixPlayIE(InfoExtractor): urls = [] quality_key = qualities(['lq', 'sd', 'hq', 'hd']) formats = [] + subtitles = {} for asset in clip_data['assets']: asset_url = asset.get('full_physical_path') protocol = asset.get('protocol') @@ -56,8 +61,11 @@ class SixPlayIE(InfoExtractor): urls.append(asset_url) container = asset.get('video_container') ext = determine_ext(asset_url) + if protocol == 'http_subtitle' or ext == 'vtt': + subtitles.setdefault('fr', []).append({'url': asset_url}) + continue if container == 'm3u8' or ext == 'm3u8': - if protocol == 'usp': + if protocol == 'usp' and not compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url) formats.extend(self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', @@ -98,4 +106,5 @@ class SixPlayIE(InfoExtractor): 'duration': int_or_none(clip_data.get('duration')), 'series': get(lambda x: x['program']['title']), 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/skysports.py b/youtube_dl/extractor/skysports.py index 4ca9f6b3c..efcbb36a9 100644 --- a/youtube_dl/extractor/skysports.py +++ b/youtube_dl/extractor/skysports.py @@ -2,7 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import strip_or_none +from ..utils import ( + extract_attributes, + smuggle_url, + strip_or_none, + urljoin, +) class SkySportsIE(InfoExtractor): @@ -22,12 +27,22 @@ class SkySportsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + video_data = extract_attributes(self._search_regex( + r'(]+>)', webpage, 'video data')) + + video_url = 'ooyala:%s' % video_data['data-video-id'] + if video_data.get('data-token-required') == 'true': + token_fetch_options = self._parse_json(video_data.get('data-token-fetch-options', '{}'), video_id, fatal=False) or {} + token_fetch_url = token_fetch_options.get('url') + if token_fetch_url: + embed_token = self._download_webpage(urljoin(url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url(video_url, {'embed_token': embed_token.strip('"')}) return { '_type': 'url_transparent', 'id': video_id, - 'url': 'ooyala:%s' % self._search_regex( - r'data-video-id="([^"]+)"', webpage, 'ooyala id'), + 'url': video_url, 'title': self._og_search_title(webpage), 'description': strip_or_none(self._og_search_description(webpage)), 'ie_key': 'Ooyala', diff --git a/youtube_dl/extractor/slideslive.py b/youtube_dl/extractor/slideslive.py new file mode 100644 index 000000000..104576033 --- /dev/null +++ b/youtube_dl/extractor/slideslive.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class SlidesLiveIE(InfoExtractor): + _VALID_URL = r'https?://slideslive\.com/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://slideslive.com/38902413/gcc-ia16-backend', + 'md5': 'b29fcd6c6952d0c79c5079b0e7a07e6f', + 'info_dict': { + 'id': 'LMtgR8ba0b0', + 'ext': 'mp4', + 'title': '38902413: external video', + 'description': '3890241320170925-9-1yd6ech.mp4', + 'uploader': 'SlidesLive Administrator', + 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', + 'upload_date': '20170925', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + url, video_id, headers={'Accept': 'application/json'}) + service_name = video_data['video_service_name'] + if service_name == 'YOUTUBE': + yt_video_id = video_data['video_service_id'] + return self.url_result(yt_video_id, 'Youtube', video_id=yt_video_id) + else: + raise ExtractorError( + 'Unsupported service name: {0}'.format(service_name), expected=True) diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py index 7145d285a..6fc2ff60d 100644 --- a/youtube_dl/extractor/slutload.py +++ b/youtube_dl/extractor/slutload.py @@ -1,11 +1,13 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor class SlutloadIE(InfoExtractor): _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P[^/]+)/?$' - _TEST = { + _TESTS = [{ 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/', 'md5': '868309628ba00fd488cf516a113fd717', 'info_dict': { @@ -15,11 +17,17 @@ class SlutloadIE(InfoExtractor): 'age_limit': 18, 'thumbnail': r're:https?://.*?\.jpg' } - } + }, { + # mobile site + 'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + desktop_url = re.sub(r'^(https?://)mobile\.', r'\1', url) + webpage = self._download_webpage(desktop_url, video_id) video_title = self._html_search_regex(r'

([^<]+)', webpage, 'title').strip() diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 370fa8879..45995f30f 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -310,6 +310,7 @@ class SmotriBroadcastIE(InfoExtractor): IE_DESC = 'Smotri.com broadcasts' IE_NAME = 'smotri:broadcast' _VALID_URL = r'https?://(?:www\.)?(?Psmotri\.com/live/(?P[^/]+))/?.*' + _NETRC_MACHINE = 'smotri' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -352,17 +353,18 @@ class SmotriBroadcastIE(InfoExtractor): adult_content = False ticket = self._html_search_regex( - r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'([^']+)'\)", - broadcast_page, 'broadcast ticket') + (r'data-user-file=(["\'])(?P(?!\1).+)\1', + r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'(?P[^']+)'\)"), + broadcast_page, 'broadcast ticket', group='ticket') - url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket + broadcast_url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket broadcast_password = self._downloader.params.get('videopassword') if broadcast_password: - url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() + broadcast_url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() broadcast_json_page = self._download_webpage( - url, broadcast_id, 'Downloading broadcast JSON') + broadcast_url, broadcast_id, 'Downloading broadcast JSON') try: broadcast_json = json.loads(broadcast_json_page) diff --git a/youtube_dl/extractor/sonyliv.py b/youtube_dl/extractor/sonyliv.py index accd112aa..58a8c0d4d 100644 --- a/youtube_dl/extractor/sonyliv.py +++ b/youtube_dl/extractor/sonyliv.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import smuggle_url class SonyLIVIE(InfoExtractor): @@ -10,12 +11,12 @@ class SonyLIVIE(InfoExtractor): 'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight", 'info_dict': { 'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight", - 'id': '5024612095001', + 'id': 'ref:5024612095001', 'ext': 'mp4', - 'upload_date': '20160707', + 'upload_date': '20170923', 'description': 'md5:7f28509a148d5be9d0782b4d5106410d', - 'uploader_id': '4338955589001', - 'timestamp': 1467870968, + 'uploader_id': '5182475815001', + 'timestamp': 1506200547, }, 'params': { 'skip_download': True, @@ -26,9 +27,14 @@ class SonyLIVIE(InfoExtractor): 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' + # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s' def _real_extract(self, url): brightcove_id = self._match_id(url) return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, { + 'geo_countries': ['IN'], + 'referrer': url, + }), + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 2e52e092b..46332e5c2 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,8 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals -import re import itertools +import re from .common import ( InfoExtractor, @@ -17,6 +17,7 @@ from ..utils import ( ExtractorError, int_or_none, unified_strdate, + update_url_query, ) @@ -120,10 +121,43 @@ class SoundcloudIE(InfoExtractor): 'license': 'cc-by-sa', }, }, + # private link, downloadable format + { + 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd', + 'md5': '64a60b16e617d41d0bef032b7f55441e', + 'info_dict': { + 'id': '340344461', + 'ext': 'wav', + 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', + 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', + 'uploader': 'Ori Uplift Music', + 'upload_date': '20170831', + 'duration': 7449, + 'license': 'all-rights-reserved', + }, + }, + # no album art, use avatar pic for thumbnail + { + 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real', + 'md5': '59c7872bc44e5d99b7211891664760c2', + 'info_dict': { + 'id': '309699954', + 'ext': 'mp3', + 'title': 'Sideways (Prod. Mad Real)', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'uploader': 'garyvee', + 'upload_date': '20170226', + 'duration': 207, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + }, + 'params': { + 'skip_download': True, + }, + }, ] - _CLIENT_ID = 'JlZIsxg2hY5WnBgtn3jfS0UYCl0K8DOg' - _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' + _CLIENT_ID = 'LvWovRaJZlWCHql0bISuum8Bd2KX79mb' @staticmethod def _extract_urls(webpage): @@ -144,7 +178,7 @@ class SoundcloudIE(InfoExtractor): name = full_title or track_id if quiet: self.report_extraction(name) - thumbnail = info.get('artwork_url') + thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') ext = 'mp3' @@ -160,11 +194,13 @@ class SoundcloudIE(InfoExtractor): 'license': info.get('license'), } formats = [] + query = {'client_id': self._CLIENT_ID} + if secret_token is not None: + query['secret_token'] = secret_token if info.get('downloadable', False): # We can build a direct link to the song - format_url = ( - 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format( - track_id, self._CLIENT_ID)) + format_url = update_url_query( + 'https://api.soundcloud.com/tracks/%s/download' % track_id, query) formats.append({ 'format_id': 'download', 'ext': info.get('original_format', 'mp3'), @@ -176,10 +212,7 @@ class SoundcloudIE(InfoExtractor): # We have to retrieve the url format_dict = self._download_json( 'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id, - track_id, 'Downloading track url', query={ - 'client_id': self._CLIENT_ID, - 'secret_token': secret_token, - }) + track_id, 'Downloading track url', query=query) for key, stream_url in format_dict.items(): abr = int_or_none(self._search_regex( @@ -216,7 +249,7 @@ class SoundcloudIE(InfoExtractor): # cannot be always used, sometimes it can give an HTTP 404 error formats.append({ 'format_id': 'fallback', - 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, + 'url': update_url_query(info['stream_url'], query), 'ext': ext, }) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index e004e2c5a..3d78a9d76 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -8,36 +8,49 @@ from .common import InfoExtractor class SoundgasmIE(InfoExtractor): IE_NAME = 'soundgasm' - _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[0-9a-zA-Z_\-]+)/(?P[0-9a-zA-Z_\-]+)' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_-]+)/(?P<display_id>[0-9a-zA-Z_-]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', 'md5': '010082a2c802c5275bb00030743e75ad', 'info_dict': { 'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9', 'ext': 'm4a', - 'title': 'ytdl_Piano-sample', - 'description': 'Royalty Free Sample Music' + 'title': 'Piano sample', + 'description': 'Royalty Free Sample Music', + 'uploader': 'ytdl', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('title') - audio_title = mobj.group('user') + '_' + mobj.group('title') + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) + audio_url = self._html_search_regex( - r'(?s)m4a\:\s"([^"]+)"', webpage, 'audio URL') - audio_id = re.split(r'\/|\.', audio_url)[-2] + r'(?s)m4a\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'audio URL', group='url') + + title = self._search_regex( + r'<div[^>]+\bclass=["\']jp-title[^>]+>([^<]+)', + webpage, 'title', default=display_id) + description = self._html_search_regex( - r'(?s)<li>Description:\s(.*?)<\/li>', webpage, 'description', - fatal=False) + (r'(?s)<div[^>]+\bclass=["\']jp-description[^>]+>(.+?)</div>', + r'(?s)<li>Description:\s(.*?)<\/li>'), + webpage, 'description', fatal=False) + + audio_id = self._search_regex( + r'/([^/]+)\.m4a', audio_url, 'audio id', default=display_id) return { 'id': audio_id, 'display_id': display_id, 'url': audio_url, - 'title': audio_title, - 'description': description + 'vcodec': 'none', + 'title': title, + 'description': description, + 'uploader': mobj.group('user'), } diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index d8ce416fc..da75a43a7 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -6,7 +6,7 @@ from .mtv import MTVServicesInfoExtractor class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|(?:full-)?episodes)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' @@ -20,6 +20,9 @@ class SouthParkIE(MTVServicesInfoExtractor): 'timestamp': 1112760000, 'upload_date': '20050406', }, + }, { + 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', + 'only_matching': True, }] @@ -41,7 +44,7 @@ class SouthParkEsIE(SouthParkIE): class SouthParkDeIE(SouthParkIE): IE_NAME = 'southpark.de' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' _TESTS = [{ @@ -70,12 +73,15 @@ class SouthParkDeIE(SouthParkIE): 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', }, 'playlist_count': 3, + }, { + 'url': 'http://www.southpark.de/collections/2476/superhero-showdown/1', + 'only_matching': True, }] class SouthParkNlIE(SouthParkIE): IE_NAME = 'southpark.nl' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/' _TESTS = [{ @@ -90,7 +96,7 @@ class SouthParkNlIE(SouthParkIE): class SouthParkDkIE(SouthParkIE): IE_NAME = 'southparkstudios.dk' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.dk/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.(?:dk|nu)/(?:clips|full-episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/' _TESTS = [{ @@ -100,4 +106,10 @@ class SouthParkDkIE(SouthParkIE): 'description': 'Butters is convinced he\'s living in a virtual reality.', }, 'playlist_mincount': 3, + }, { + 'url': 'http://www.southparkstudios.dk/collections/2476/superhero-showdown/1', + 'only_matching': True, + }, { + 'url': 'http://www.southparkstudios.nu/collections/2476/superhero-showdown/1', + 'only_matching': True, }] diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 3394c7e6b..67500b69c 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -3,10 +3,16 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_duration, + parse_resolution, + str_to_int, +) class SpankBangIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|[a-z]{2})\.)?spankbang\.com/(?P<id>[\da-z]+)/video' + _VALID_URL = r'https?://(?:(?:www|m|[a-z]{2})\.)?spankbang\.com/(?P<id>[\da-z]+)/video' _TESTS = [{ 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', 'md5': '1cc433e1d6aa14bc376535b8679302f7', @@ -14,7 +20,7 @@ class SpankBangIE(InfoExtractor): 'id': '3vvn', 'ext': 'mp4', 'title': 'fantasy solo', - 'description': 'Watch fantasy solo free HD porn video - 05 minutes - dillion harper masturbates on a bed free adult movies.', + 'description': 'dillion harper masturbates on a bed', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'silly2587', 'age_limit': 18, @@ -27,32 +33,53 @@ class SpankBangIE(InfoExtractor): # no uploader 'url': 'http://spankbang.com/lklg/video/sex+with+anyone+wedding+edition+2', 'only_matching': True, + }, { + # mobile page + 'url': 'http://m.spankbang.com/1o2de/video/can+t+remember+her+name', + 'only_matching': True, + }, { + # 4k + 'url': 'https://spankbang.com/1vwqx/video/jade+kush+solo+4k', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, video_id, headers={ + 'Cookie': 'country=US' + }) - stream_key = self._html_search_regex( - r'''var\s+stream_key\s*=\s*['"](.+?)['"]''', - webpage, 'stream key') + if re.search(r'<[^>]+\bid=["\']video_removed', webpage): + raise ExtractorError( + 'Video %s is not available' % video_id, expected=True) - formats = [{ - 'url': 'http://spankbang.com/_%s/%s/title/%sp__mp4' % (video_id, stream_key, height), - 'ext': 'mp4', - 'format_id': '%sp' % height, - 'height': int(height), - } for height in re.findall(r'<(?:span|li|p)[^>]+[qb]_(\d+)p', webpage)] - self._check_formats(formats, video_id) + formats = [] + for mobj in re.finditer( + r'stream_url_(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2', + webpage): + format_id, format_url = mobj.group('id', 'url') + f = parse_resolution(format_id) + f.update({ + 'url': format_url, + 'format_id': format_id, + }) + formats.append(f) self._sort_formats(formats) title = self._html_search_regex( r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title') - description = self._og_search_description(webpage) + description = self._search_regex( + r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)', + webpage, 'description', fatal=False) thumbnail = self._og_search_thumbnail(webpage) uploader = self._search_regex( r'class="user"[^>]*><img[^>]+>([^<]+)', webpage, 'uploader', default=None) + duration = parse_duration(self._search_regex( + r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)', + webpage, 'duration', fatal=False)) + view_count = str_to_int(self._search_regex( + r'([\d,.]+)\s+plays', webpage, 'view count', fatal=False)) age_limit = self._rta_search(webpage) @@ -62,6 +89,8 @@ class SpankBangIE(InfoExtractor): 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, + 'duration': duration, + 'view_count': view_count, 'formats': formats, 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 84298fee4..fc995e8c1 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .nexx import NexxEmbedIE +from .nexx import ( + NexxIE, + NexxEmbedIE, +) from .spiegeltv import SpiegeltvIE from ..compat import compat_urlparse from ..utils import ( @@ -51,6 +54,10 @@ class SpiegelIE(InfoExtractor): }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', 'only_matching': True, + }, { + # nexx video + 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -61,6 +68,14 @@ class SpiegelIE(InfoExtractor): if SpiegeltvIE.suitable(handle.geturl()): return self.url_result(handle.geturl(), 'Spiegeltv') + nexx_id = self._search_regex( + r'nexxOmniaId\s*:\s*(\d+)', webpage, 'nexx id', default=None) + if nexx_id: + domain_id = NexxIE._extract_domain_id(webpage) or '748' + return self.url_result( + 'nexx:%s:%s' % (domain_id, nexx_id), ie=NexxIE.ie_key(), + video_id=nexx_id) + video_data = extract_attributes(self._search_regex(r'(<div[^>]+id="spVideoElements"[^>]+>)', webpage, 'video element', default='')) title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage) diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index c59896a17..a7b1b3b5f 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -44,6 +44,7 @@ class SpikeIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.spike.com/feeds/mrss/' _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s' _CUSTOM_URL_REGEX = re.compile(r'spikenetworkapp://([^/]+/[-a-fA-F0-9]+)') + _GEO_COUNTRIES = ['US'] def _extract_mgid(self, webpage): mgid = super(SpikeIE, self)._extract_mgid(webpage) diff --git a/youtube_dl/extractor/sportschau.py b/youtube_dl/extractor/sportschau.py deleted file mode 100644 index 0d7925a08..000000000 --- a/youtube_dl/extractor/sportschau.py +++ /dev/null @@ -1,38 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .wdr import WDRBaseIE -from ..utils import get_element_by_attribute - - -class SportschauIE(WDRBaseIE): - IE_NAME = 'Sportschau' - _VALID_URL = r'https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video-?(?P<id>[^/#?]+)\.html' - _TEST = { - 'url': 'http://www.sportschau.de/uefaeuro2016/videos/video-dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100.html', - 'info_dict': { - 'id': 'mdb-1140188', - 'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100', - 'ext': 'mp4', - 'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen', - 'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.', - 'upload_date': '20160615', - }, - 'skip': 'Geo-restricted to Germany', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - title = get_element_by_attribute('class', 'headline', webpage) - description = self._html_search_meta('description', webpage, 'description') - - info = self._extract_wdr_video(webpage, video_id) - - info.update({ - 'title': title, - 'description': description, - }) - - return info diff --git a/youtube_dl/extractor/springboardplatform.py b/youtube_dl/extractor/springboardplatform.py new file mode 100644 index 000000000..07d99b579 --- /dev/null +++ b/youtube_dl/extractor/springboardplatform.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + xpath_attr, + xpath_text, + xpath_element, + unescapeHTML, + unified_timestamp, +) + + +class SpringboardPlatformIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + cms\.springboardplatform\.com/ + (?: + (?:previews|embed_iframe)/(?P<index>\d+)/video/(?P<id>\d+)| + xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+) + ) + ''' + _TESTS = [{ + 'url': 'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1', + 'md5': '5c3cb7b5c55740d482561099e920f192', + 'info_dict': { + 'id': '981017', + 'ext': 'mp4', + 'title': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX', + 'description': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1409132328, + 'upload_date': '20140827', + 'duration': 193, + }, + }, { + 'url': 'http://cms.springboardplatform.com/embed_iframe/159/video/981017/rab007/rapbasement.com/1/1', + 'only_matching': True, + }, { + 'url': 'http://cms.springboardplatform.com/embed_iframe/20/video/1731611/ki055/kidzworld.com/10', + 'only_matching': True, + }, { + 'url': 'http://cms.springboardplatform.com/xml_feeds_advanced/index/159/rss3/981017/0/0/1/', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1', + webpage)] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_2') + index = mobj.group('index') or mobj.group('index_2') + + video = self._download_xml( + 'http://cms.springboardplatform.com/xml_feeds_advanced/index/%s/rss3/%s' + % (index, video_id), video_id) + + item = xpath_element(video, './/item', 'item', fatal=True) + + content = xpath_element( + item, './{http://search.yahoo.com/mrss/}content', 'content', + fatal=True) + title = unescapeHTML(xpath_text(item, './title', 'title', fatal=True)) + + video_url = content.attrib['url'] + + if 'error_video.mp4' in video_url: + raise ExtractorError( + 'Video %s no longer exists' % video_id, expected=True) + + duration = int_or_none(content.get('duration')) + tbr = int_or_none(content.get('bitrate')) + filesize = int_or_none(content.get('fileSize')) + width = int_or_none(content.get('width')) + height = int_or_none(content.get('height')) + + description = unescapeHTML(xpath_text( + item, './description', 'description')) + thumbnail = xpath_attr( + item, './{http://search.yahoo.com/mrss/}thumbnail', 'url', + 'thumbnail') + + timestamp = unified_timestamp(xpath_text( + item, './{http://cms.springboardplatform.com/namespaces.html}created', + 'timestamp')) + + formats = [{ + 'url': video_url, + 'format_id': 'http', + 'tbr': tbr, + 'filesize': filesize, + 'width': width, + 'height': height, + }] + + m3u8_format = formats[0].copy() + m3u8_format.update({ + 'url': re.sub(r'(https?://)cdn\.', r'\1hls.', video_url) + '.m3u8', + 'ext': 'mp4', + 'format_id': 'hls', + 'protocol': 'm3u8_native', + }) + formats.append(m3u8_format) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py index cce65fb10..ae3dd1380 100644 --- a/youtube_dl/extractor/stanfordoc.py +++ b/youtube_dl/extractor/stanfordoc.py @@ -66,7 +66,7 @@ class StanfordOpenClassroomIE(InfoExtractor): r'(?s)<description>([^<]+)</description>', coursepage, 'description', fatal=False) - links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage)) + links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage)) info['entries'] = [self.url_result( 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) ) for l in links] @@ -84,7 +84,7 @@ class StanfordOpenClassroomIE(InfoExtractor): rootpage = self._download_webpage(rootURL, info['id'], errnote='Unable to download course info page') - links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage)) + links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage)) info['entries'] = [self.url_result( 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) ) for l in links] diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py index 1a831ef6d..a6a191ceb 100644 --- a/youtube_dl/extractor/steam.py +++ b/youtube_dl/extractor/steam.py @@ -4,8 +4,10 @@ import re from .common import InfoExtractor from ..utils import ( + extract_attributes, ExtractorError, - unescapeHTML, + get_element_by_class, + js_to_json, ) @@ -25,35 +27,39 @@ class SteamIE(InfoExtractor): 'url': 'http://store.steampowered.com/video/105600/', 'playlist': [ { - 'md5': 'f870007cee7065d7c76b88f0a45ecc07', + 'md5': '6a294ee0c4b1f47f5bb76a65e31e3592', 'info_dict': { - 'id': '81300', - 'ext': 'flv', - 'title': 'Terraria 1.1 Trailer', + 'id': '2040428', + 'ext': 'mp4', + 'title': 'Terraria 1.3 Trailer', 'playlist_index': 1, } }, { - 'md5': '61aaf31a5c5c3041afb58fb83cbb5751', + 'md5': '911672b20064ca3263fa89650ba5a7aa', 'info_dict': { - 'id': '80859', - 'ext': 'flv', - 'title': 'Terraria Trailer', + 'id': '2029566', + 'ext': 'mp4', + 'title': 'Terraria 1.2 Trailer', 'playlist_index': 2, } } ], + 'info_dict': { + 'id': '105600', + 'title': 'Terraria', + }, 'params': { 'playlistend': 2, } }, { 'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205', 'info_dict': { - 'id': 'WB5DvDOOvAY', + 'id': 'X8kpJBlzD2E', 'ext': 'mp4', - 'upload_date': '20140329', - 'title': 'FRONTIERS - Final Greenlight Trailer', - 'description': 'md5:dc96a773669d0ca1b36c13c1f30250d9', + 'upload_date': '20140617', + 'title': 'FRONTIERS - Trapping', + 'description': 'md5:bf6f7f773def614054089e5769c12a6e', 'uploader': 'AAD Productions', 'uploader_id': 'AtomicAgeDogGames', } @@ -69,6 +75,9 @@ class SteamIE(InfoExtractor): gameID = m.group('gameID') playlist_id = gameID videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id + + self._set_cookie('steampowered.com', 'mature_content', '1') + webpage = self._download_webpage(videourl, playlist_id) if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None: @@ -76,48 +85,65 @@ class SteamIE(InfoExtractor): self.report_age_confirmation() webpage = self._download_webpage(videourl, playlist_id) + flash_vars = self._parse_json(self._search_regex( + r'(?s)rgMovieFlashvars\s*=\s*({.+?});', webpage, + 'flash vars'), playlist_id, js_to_json) + + playlist_title = None + entries = [] if fileID: - playlist_title = self._html_search_regex( - r'<div class="workshopItemTitle">(.+)</div>', webpage, 'title') - mweb = re.finditer(r'''(?x) - 'movie_(?P<videoID>[0-9]+)':\s*\{\s* - YOUTUBE_VIDEO_ID:\s*"(?P<youtube_id>[^"]+)", - ''', webpage) - videos = [{ - '_type': 'url', - 'url': vid.group('youtube_id'), - 'ie_key': 'Youtube', - } for vid in mweb] - else: - playlist_title = self._html_search_regex( - r'<h2 class="pageheader">(.*?)</h2>', webpage, 'game title') - - mweb = re.finditer(r'''(?x) - 'movie_(?P<videoID>[0-9]+)':\s*\{\s* - FILENAME:\s*"(?P<videoURL>[\w:/\.\?=]+)" - (,\s*MOVIE_NAME:\s*\"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\}, - ''', webpage) - titles = re.finditer( - r'<span class="title">(?P<videoName>.+?)</span>', webpage) - thumbs = re.finditer( - r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage) - videos = [] - - for vid, vtitle, thumb in zip(mweb, titles, thumbs): - video_id = vid.group('videoID') - title = vtitle.group('videoName') - video_url = vid.group('videoURL') - video_thumb = thumb.group('thumbnail') - if not video_url: - raise ExtractorError('Cannot find video url for %s' % video_id) - videos.append({ - 'id': video_id, - 'url': video_url, - 'ext': 'flv', - 'title': unescapeHTML(title), - 'thumbnail': video_thumb + playlist_title = get_element_by_class('workshopItemTitle', webpage) + for movie in flash_vars.values(): + if not movie: + continue + youtube_id = movie.get('YOUTUBE_VIDEO_ID') + if not youtube_id: + continue + entries.append({ + '_type': 'url', + 'url': youtube_id, + 'ie_key': 'Youtube', }) - if not videos: + else: + playlist_title = get_element_by_class('apphub_AppName', webpage) + for movie_id, movie in flash_vars.items(): + if not movie: + continue + video_id = self._search_regex(r'movie_(\d+)', movie_id, 'video id', fatal=False) + title = movie.get('MOVIE_NAME') + if not title or not video_id: + continue + entry = { + 'id': video_id, + 'title': title.replace('+', ' '), + } + formats = [] + flv_url = movie.get('FILENAME') + if flv_url: + formats.append({ + 'format_id': 'flv', + 'url': flv_url, + }) + highlight_element = self._search_regex( + r'(<div[^>]+id="highlight_movie_%s"[^>]+>)' % video_id, + webpage, 'highlight element', fatal=False) + if highlight_element: + highlight_attribs = extract_attributes(highlight_element) + if highlight_attribs: + entry['thumbnail'] = highlight_attribs.get('data-poster') + for quality in ('', '-hd'): + for ext in ('webm', 'mp4'): + video_url = highlight_attribs.get('data-%s%s-source' % (ext, quality)) + if video_url: + formats.append({ + 'format_id': ext + quality, + 'url': video_url, + }) + if not formats: + continue + entry['formats'] = formats + entries.append(entry) + if not entries: raise ExtractorError('Could not find any videos') - return self.playlist_result(videos, playlist_id, playlist_title) + return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py index a9e34c027..fcaa5ac0b 100644 --- a/youtube_dl/extractor/streamango.py +++ b/youtube_dl/extractor/streamango.py @@ -4,8 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_chr from ..utils import ( determine_ext, + ExtractorError, int_or_none, js_to_json, ) @@ -32,12 +34,34 @@ class StreamangoIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'gone', }, { 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', 'only_matching': True, }] def _real_extract(self, url): + def decrypt_src(encoded, val): + ALPHABET = '=/+9876543210zyxwvutsrqponmlkjihgfedcbaZYXWVUTSRQPONMLKJIHGFEDCBA' + encoded = re.sub(r'[^A-Za-z0-9+/=]', '', encoded) + decoded = '' + sm = [None] * 4 + i = 0 + str_len = len(encoded) + while i < str_len: + for j in range(4): + sm[j % 4] = ALPHABET.index(encoded[i]) + i += 1 + char_code = ((sm[0] << 0x2) | (sm[1] >> 0x4)) ^ val + decoded += compat_chr(char_code) + if sm[2] != 0x40: + char_code = ((sm[1] & 0xf) << 0x4) | (sm[2] >> 0x2) + decoded += compat_chr(char_code) + if sm[3] != 0x40: + char_code = ((sm[2] & 0x3) << 0x6) | sm[3] + decoded += compat_chr(char_code) + return decoded + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -46,13 +70,26 @@ class StreamangoIE(InfoExtractor): formats = [] for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): - video = self._parse_json( - format_, video_id, transform_source=js_to_json, fatal=False) - if not video: + mobj = re.search(r'(src\s*:\s*[^(]+\(([^)]*)\)[\s,]*)', format_) + if mobj is None: continue - src = video.get('src') + + format_ = format_.replace(mobj.group(0), '') + + video = self._parse_json( + format_, video_id, transform_source=js_to_json, + fatal=False) or {} + + mobj = re.search( + r'([\'"])(?P<src>(?:(?!\1).)+)\1\s*,\s*(?P<val>\d+)', + mobj.group(1)) + if mobj is None: + continue + + src = decrypt_src(mobj.group('src'), int_or_none(mobj.group('val'))) if not src: continue + ext = determine_ext(src, default_ext=None) if video.get('type') == 'application/dash+xml' or ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -65,6 +102,16 @@ class StreamangoIE(InfoExtractor): 'height': int_or_none(video.get('height')), 'tbr': int_or_none(video.get('bitrate')), }) + + if not formats: + error = self._search_regex( + r'<p[^>]+\bclass=["\']lead[^>]+>(.+?)</p>', webpage, + 'error', default=None) + if not error and '>Sorry' in webpage: + error = 'Video %s is not available' % video_id + if error: + raise ExtractorError(error, expected=True) + self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py new file mode 100644 index 000000000..ae2ac1b42 --- /dev/null +++ b/youtube_dl/extractor/stretchinternet.py @@ -0,0 +1,48 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class StretchInternetIE(InfoExtractor): + _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/portal\.htm\?.*?\beventId=(?P<id>\d+)' + _TEST = { + 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=313900&streamType=video', + 'info_dict': { + 'id': '313900', + 'ext': 'mp4', + 'title': 'Augustana (S.D.) Baseball vs University of Mary', + 'description': 'md5:7578478614aae3bdd4a90f578f787438', + 'timestamp': 1490468400, + 'upload_date': '20170325', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + stream = self._download_json( + 'https://neo-client.stretchinternet.com/streamservice/v1/media/stream/v%s' + % video_id, video_id) + + video_url = 'https://%s' % stream['source'] + + event = self._download_json( + 'https://neo-client.stretchinternet.com/portal-ws/getEvent.json', + video_id, query={ + 'clientID': 99997, + 'eventID': video_id, + 'token': 'asdf', + })['event'] + + title = event.get('title') or event['mobileTitle'] + description = event.get('customText') + timestamp = int_or_none(event.get('longtime')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'url': video_url, + } diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 48bc4529e..f71eab8b2 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -4,11 +4,17 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( determine_ext, dict_get, int_or_none, try_get, + urljoin, + compat_str, ) @@ -16,6 +22,8 @@ class SVTBaseIE(InfoExtractor): _GEO_COUNTRIES = ['SE'] def _extract_video(self, video_info, video_id): + is_live = dict_get(video_info, ('live', 'simulcast'), default=False) + m3u8_protocol = 'm3u8' if is_live else 'm3u8_native' formats = [] for vr in video_info['videoReferences']: player_type = vr.get('playerType') or vr.get('format') @@ -24,7 +32,7 @@ class SVTBaseIE(InfoExtractor): if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( vurl, video_id, - ext='mp4', entry_protocol='m3u8_native', + ext='mp4', entry_protocol=m3u8_protocol, m3u8_id=player_type, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( @@ -84,6 +92,7 @@ class SVTBaseIE(InfoExtractor): 'season_number': season_number, 'episode': episode, 'episode_number': episode_number, + 'is_live': is_live, } @@ -122,9 +131,13 @@ class SVTIE(SVTBaseIE): return info_dict -class SVTPlayIE(SVTBaseIE): +class SVTPlayBaseIE(SVTBaseIE): + _SVTPLAY_RE = r'root\s*\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P<json>{.+?})\s*;\s*\n' + + +class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -148,6 +161,9 @@ class SVTPlayIE(SVTBaseIE): }, { 'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg', 'only_matching': True, + }, { + 'url': 'https://www.svtplay.se/kanaler/svt1', + 'only_matching': True, }] def _real_extract(self, url): @@ -157,12 +173,16 @@ class SVTPlayIE(SVTBaseIE): data = self._parse_json( self._search_regex( - r'root\["__svtplay"\]\s*=\s*([^;]+);', - webpage, 'embedded data', default='{}'), + self._SVTPLAY_RE, webpage, 'embedded data', default='{}', + group='json'), video_id, fatal=False) thumbnail = self._og_search_thumbnail(webpage) + def adjust_title(info): + if info['is_live']: + info['title'] = self._live_title(info['title']) + if data: video_info = try_get( data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], @@ -173,6 +193,7 @@ class SVTPlayIE(SVTBaseIE): 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], 'thumbnail': thumbnail, }) + adjust_title(info_dict) return info_dict video_id = self._search_regex( @@ -188,4 +209,86 @@ class SVTPlayIE(SVTBaseIE): info_dict['title'] = re.sub( r'\s*\|\s*.+?$', '', info_dict.get('episode') or self._og_search_title(webpage)) + adjust_title(info_dict) return info_dict + + +class SVTSeriesIE(SVTPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.svtplay.se/rederiet', + 'info_dict': { + 'id': 'rederiet', + 'title': 'Rederiet', + 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + }, + 'playlist_mincount': 318, + }, { + 'url': 'https://www.svtplay.se/rederiet?tab=sasong2', + 'info_dict': { + 'id': 'rederiet-sasong2', + 'title': 'Rederiet - Säsong 2', + 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + }, + 'playlist_count': 12, + }] + + @classmethod + def suitable(cls, url): + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url) + + def _real_extract(self, url): + series_id = self._match_id(url) + + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + season_slug = qs.get('tab', [None])[0] + + if season_slug: + series_id += '-%s' % season_slug + + webpage = self._download_webpage( + url, series_id, 'Downloading series page') + + root = self._parse_json( + self._search_regex( + self._SVTPLAY_RE, webpage, 'content', group='json'), + series_id) + + season_name = None + + entries = [] + for season in root['relatedVideoContent']['relatedVideosAccordion']: + if not isinstance(season, dict): + continue + if season_slug: + if season.get('slug') != season_slug: + continue + season_name = season.get('name') + videos = season.get('videos') + if not isinstance(videos, list): + continue + for video in videos: + content_url = video.get('contentUrl') + if not content_url or not isinstance(content_url, compat_str): + continue + entries.append( + self.url_result( + urljoin(url, content_url), + ie=SVTPlayIE.ie_key(), + video_title=video.get('title') + )) + + metadata = root.get('metaData') + if not isinstance(metadata, dict): + metadata = {} + + title = metadata.get('title') + season_name = season_name or season_slug + + if title and season_name: + title = '%s - %s' % (title, season_name) + elif season_slug: + title = season_slug + + return self.playlist_result( + entries, series_id, title, metadata.get('description')) diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index e9474533f..edc31729d 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -4,58 +4,109 @@ from __future__ import unicode_literals import re from .turner import TurnerBaseIE -from ..utils import extract_attributes +from ..utils import ( + float_or_none, + int_or_none, + strip_or_none, +) class TBSIE(TurnerBaseIE): - # https://github.com/rg3/youtube-dl/issues/13658 - _WORKING = False - - _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html' + _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+)' _TESTS = [{ - 'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html', - 'md5': '9e61d680e2285066ade7199e6408b2ee', + 'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster', 'info_dict': { - 'id': '2007318', + 'id': '8d384cde33b89f3a43ce5329de42903ed5099887', 'ext': 'mp4', - 'title': 'Theatrical Trailer', - 'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.', + 'title': 'Monster', + 'description': 'Get a first look at the theatrical trailer for TNT’s highly anticipated new psychological thriller The Alienist, which premieres January 22 on TNT.', + 'timestamp': 1508175329, + 'upload_date': '20171016', }, - 'skip': 'TBS videos are deleted after a while', + 'params': { + # m3u8 download + 'skip_download': True, + } }, { - 'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html', - 'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56', - 'info_dict': { - 'id': '1538823', - 'ext': 'mp4', - 'title': 'You Better Run', - 'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.', - }, - 'skip': 'TBS videos are deleted after a while', + 'url': 'http://www.tbs.com/shows/search-party/season-1/episode-1/explicit-the-mysterious-disappearance-of-the-girl-no-one-knew', + 'only_matching': True, + }, { + 'url': 'http://www.tntdrama.com/movies/star-wars-a-new-hope', + 'only_matching': True, }] def _real_extract(self, url): - domain, display_id = re.match(self._VALID_URL, url).groups() - site = domain[:3] + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - video_params = extract_attributes(self._search_regex(r'(<[^>]+id="page-video"[^>]*>)', webpage, 'video params')) - query = None - clip_id = video_params.get('clipid') - if clip_id: - query = 'id=' + clip_id - else: - query = 'titleId=' + video_params['titleid'] - return self._extract_cvp_info( - 'http://www.%s.com/service/cvpXml?%s' % (domain, query), display_id, { - 'default': { - 'media_src': 'http://ht.cdn.turner.com/%s/big' % site, - }, - 'secure': { - 'media_src': 'http://androidhls-secure.cdn.turner.com/%s/big' % site, - 'tokenizer_src': 'http://www.%s.com/video/processors/services/token_ipadAdobe.do' % domain, - }, - }, { - 'url': url, - 'site_name': site.upper(), - 'auth_required': video_params.get('isAuthRequired') != 'false', - }) + video_data = self._parse_json(self._search_regex( + r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>', + webpage, 'drupal setting'), display_id)['turner_playlist'][0] + + media_id = video_data['mediaID'] + title = video_data['title'] + + streams_data = self._download_json( + 'http://medium.ngtv.io/media/%s/tv' % media_id, + media_id)['media']['tv'] + duration = None + chapters = [] + formats = [] + for supported_type in ('unprotected', 'bulkaes'): + stream_data = streams_data.get(supported_type, {}) + m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') + if not m3u8_url: + continue + if stream_data.get('playlistProtection') == 'spe': + m3u8_url = self._add_akamai_spe_token( + 'http://token.vgtf.net/token/token_spe', + m3u8_url, media_id, { + 'url': url, + 'site_name': site[:3].upper(), + 'auth_required': video_data.get('authRequired') == '1', + }) + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + duration = float_or_none(stream_data.get('totalRuntime') or video_data.get('duration')) + + if not chapters: + for chapter in stream_data.get('contentSegments', []): + start_time = float_or_none(chapter.get('start')) + duration = float_or_none(chapter.get('duration')) + if start_time is None or duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + duration, + }) + self._sort_formats(formats) + + thumbnails = [] + for image_id, image in video_data.get('images', {}).items(): + image_url = image.get('url') + if not image_url or image.get('type') != 'video': + continue + i = { + 'id': image_id, + 'url': image_url, + } + mobj = re.search(r'(\d+)x(\d+)', image_url) + if mobj: + i.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + thumbnails.append(i) + + return { + 'id': media_id, + 'title': title, + 'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')), + 'duration': duration, + 'timestamp': int_or_none(video_data.get('created')), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'cahpters': chapters, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index f14713a78..1272078c5 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -5,8 +5,9 @@ import re from .common import InfoExtractor from ..utils import ( - qualities, determine_ext, + ExtractorError, + qualities, ) @@ -17,6 +18,7 @@ class TeacherTubeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=|video/(?:[\da-z-]+-)?|audio/)(?P<id>\d+)' _TESTS = [{ + # flowplayer 'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997', 'md5': 'f9434ef992fd65936d72999951ee254c', 'info_dict': { @@ -24,19 +26,10 @@ class TeacherTubeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Measures of dispersion from a frequency table', 'description': 'Measures of dispersion from a frequency table', - 'thumbnail': r're:http://.*\.jpg', - }, - }, { - 'url': 'http://www.teachertube.com/viewVideo.php?video_id=340064', - 'md5': '0d625ec6bc9bf50f70170942ad580676', - 'info_dict': { - 'id': '340064', - 'ext': 'mp4', - 'title': 'How to Make Paper Dolls _ Paper Art Projects', - 'description': 'Learn how to make paper dolls in this simple', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', }, }, { + # jwplayer 'url': 'http://www.teachertube.com/music.php?music_id=8805', 'md5': '01e8352006c65757caf7b961f6050e21', 'info_dict': { @@ -46,20 +39,21 @@ class TeacherTubeIE(InfoExtractor): 'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNI?KE ?KOLE P', }, }, { + # unavailable video 'url': 'http://www.teachertube.com/video/intro-video-schleicher-297790', - 'md5': '9c79fbb2dd7154823996fc28d4a26998', - 'info_dict': { - 'id': '297790', - 'ext': 'mp4', - 'title': 'Intro Video - Schleicher', - 'description': 'Intro Video - Why to flip, how flipping will', - }, + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + error = self._search_regex( + r'<div\b[^>]+\bclass=["\']msgBox error[^>]+>([^<]+)', webpage, + 'error', default=None) + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + title = self._html_search_meta('title', webpage, 'title', fatal=True) TITLE_SUFFIX = ' - TeacherTube' if title.endswith(TITLE_SUFFIX): @@ -84,12 +78,16 @@ class TeacherTubeIE(InfoExtractor): self._sort_formats(formats) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'thumbnail', webpage) + return { 'id': video_id, 'title': title, - 'thumbnail': self._html_search_regex(r'\'image\'\s*:\s*["\']([^"\']+)["\']', webpage, 'thumbnail'), - 'formats': formats, 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, } diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 75346393b..9056c8cbc 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,18 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import binascii import re import json from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_ord, +) from ..utils import ( ExtractorError, qualities, determine_ext, ) -from ..compat import compat_ord class TeamcocoIE(InfoExtractor): @@ -97,7 +99,7 @@ class TeamcocoIE(InfoExtractor): for i in range(len(cur_fragments)): cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii') try: - raw_data = base64.b64decode(cur_sequence) + raw_data = compat_b64decode(cur_sequence) if compat_ord(raw_data[0]) == compat_ord('{'): return json.loads(raw_data.decode('utf-8')) except (TypeError, binascii.Error, UnicodeDecodeError, ValueError): diff --git a/youtube_dl/extractor/telebruxelles.py b/youtube_dl/extractor/telebruxelles.py index 5886e9c1b..a0353fe3a 100644 --- a/youtube_dl/extractor/telebruxelles.py +++ b/youtube_dl/extractor/telebruxelles.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class TeleBruxellesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:telebruxelles|bx1)\.be/(news|sport|dernier-jt|emission)/?(?P<id>[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?(?:telebruxelles|bx1)\.be/(?:[^/]+/)*(?P<id>[^/#?]+)' _TESTS = [{ 'url': 'http://bx1.be/news/que-risque-lauteur-dune-fausse-alerte-a-la-bombe/', 'md5': 'a2a67a5b1c3e8c9d33109b902f474fd9', @@ -31,6 +31,16 @@ class TeleBruxellesIE(InfoExtractor): }, { 'url': 'http://bx1.be/emission/bxenf1-gastronomie/', 'only_matching': True, + }, { + 'url': 'https://bx1.be/berchem-sainte-agathe/personnel-carrefour-de-berchem-sainte-agathe-inquiet/', + 'only_matching': True, + }, { + 'url': 'https://bx1.be/dernier-jt/', + 'only_matching': True, + }, { + # live stream + 'url': 'https://bx1.be/lives/direct-tv/', + 'only_matching': True, }] def _real_extract(self, url): @@ -38,22 +48,29 @@ class TeleBruxellesIE(InfoExtractor): webpage = self._download_webpage(url, display_id) article_id = self._html_search_regex( - r"<article id=\"post-(\d+)\"", webpage, 'article ID', default=None) + r'<article[^>]+\bid=["\']post-(\d+)', webpage, 'article ID', default=None) title = self._html_search_regex( - r'<h1 class=\"entry-title\">(.*?)</h1>', webpage, 'title') + r'<h1[^>]*>(.+?)</h1>', webpage, 'title', + default=None) or self._og_search_title(webpage) description = self._og_search_description(webpage, default=None) rtmp_url = self._html_search_regex( - r'file\s*:\s*"(rtmp://[^/]+/vod/mp4:"\s*\+\s*"[^"]+"\s*\+\s*".mp4)"', + r'file["\']?\s*:\s*"(r(?:tm|mt)ps?://[^/]+/(?:vod/mp4:"\s*\+\s*"[^"]+"\s*\+\s*"\.mp4|stream/live))"', webpage, 'RTMP url') + # Yes, they have a typo in scheme name for live stream URLs (e.g. + # https://bx1.be/lives/direct-tv/) + rtmp_url = re.sub(r'^rmtp', 'rtmp', rtmp_url) rtmp_url = re.sub(r'"\s*\+\s*"', '', rtmp_url) formats = self._extract_wowza_formats(rtmp_url, article_id or display_id) self._sort_formats(formats) + is_live = 'stream/live' in rtmp_url + return { 'id': article_id or display_id, 'display_id': display_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': description, 'formats': formats, + 'is_live': is_live, } diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index fafaa826f..6965c127b 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -10,19 +10,33 @@ from ..utils import ( ) -class TeleQuebecIE(InfoExtractor): +class TeleQuebecBaseIE(InfoExtractor): + @staticmethod + def _limelight_result(media_id): + return { + '_type': 'url_transparent', + 'url': smuggle_url( + 'limelight:media:' + media_id, {'geo_countries': ['CA']}), + 'ie_key': 'LimelightMedia', + } + + +class TeleQuebecIE(TeleQuebecBaseIE): _VALID_URL = r'https?://zonevideo\.telequebec\.tv/media/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://zonevideo.telequebec.tv/media/20984/le-couronnement-de-new-york/couronnement-de-new-york', - 'md5': 'fe95a0957e5707b1b01f5013e725c90f', + # available till 01.01.2023 + 'url': 'http://zonevideo.telequebec.tv/media/37578/un-petit-choc-et-puis-repart/un-chef-a-la-cabane', 'info_dict': { - 'id': '20984', + 'id': '577116881b4b439084e6b1cf4ef8b1b3', 'ext': 'mp4', - 'title': 'Le couronnement de New York', - 'description': 'md5:f5b3d27a689ec6c1486132b2d687d432', - 'upload_date': '20170201', - 'timestamp': 1485972222, - } + 'title': 'Un petit choc et puis repart!', + 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', + 'upload_date': '20180222', + 'timestamp': 1519326631, + }, + 'params': { + 'skip_download': True, + }, }, { # no description 'url': 'http://zonevideo.telequebec.tv/media/30261', @@ -31,19 +45,107 @@ class TeleQuebecIE(InfoExtractor): def _real_extract(self, url): media_id = self._match_id(url) + media_data = self._download_json( 'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id, media_id)['media'] - return { - '_type': 'url_transparent', - 'id': media_id, - 'url': smuggle_url( - 'limelight:media:' + media_data['streamInfo']['sourceId'], - {'geo_countries': ['CA']}), - 'title': media_data['title'], + + info = self._limelight_result(media_data['streamInfo']['sourceId']) + info.update({ + 'title': media_data.get('title'), 'description': try_get( media_data, lambda x: x['descriptions'][0]['text'], compat_str), 'duration': int_or_none( media_data.get('durationInMilliseconds'), 1000), - 'ie_key': 'LimelightMedia', + }) + return info + + +class TeleQuebecEmissionIE(TeleQuebecBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + [^/]+\.telequebec\.tv/emissions/| + (?:www\.)?telequebec\.tv/ + ) + (?P<id>[^?#&]+) + ''' + _TESTS = [{ + 'url': 'http://lindicemcsween.telequebec.tv/emissions/100430013/des-soins-esthetiques-a-377-d-interets-annuels-ca-vous-tente', + 'info_dict': { + 'id': '66648a6aef914fe3badda25e81a4d50a', + 'ext': 'mp4', + 'title': "Des soins esthétiques à 377 % d'intérêts annuels, ça vous tente?", + 'description': 'md5:369e0d55d0083f1fc9b71ffb640ea014', + 'upload_date': '20171024', + 'timestamp': 1508862118, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression', + 'only_matching': True, + }, { + 'url': 'http://www.telequebec.tv/masha-et-michka/epi059masha-et-michka-3-053-078', + 'only_matching': True, + }, { + 'url': 'http://www.telequebec.tv/documentaire/bebes-sur-mesure/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + media_id = self._search_regex( + r'mediaUID\s*:\s*["\'][Ll]imelight_(?P<id>[a-z0-9]{32})', webpage, + 'limelight id') + + info = self._limelight_result(media_id) + info.update({ + 'title': self._og_search_title(webpage, default=None), + 'description': self._og_search_description(webpage, default=None), + }) + return info + + +class TeleQuebecLiveIE(InfoExtractor): + _VALID_URL = r'https?://zonevideo\.telequebec\.tv/(?P<id>endirect)' + _TEST = { + 'url': 'http://zonevideo.telequebec.tv/endirect/', + 'info_dict': { + 'id': 'endirect', + 'ext': 'mp4', + 'title': 're:^Télé-Québec - En direct [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + m3u8_url = None + webpage = self._download_webpage( + 'https://player.telequebec.tv/Tq_VideoPlayer.js', video_id, + fatal=False) + if webpage: + m3u8_url = self._search_regex( + r'm3U8Url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'm3u8 url', default=None, group='url') + if not m3u8_url: + m3u8_url = 'https://teleqmmd.mmdlive.lldns.net/teleqmmd/f386e3b206814e1f8c8c1c71c0f8e748/manifest.m3u8' + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title('Télé-Québec - En direct'), + 'is_live': True, + 'formats': formats, } diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py new file mode 100644 index 000000000..0c6f70784 --- /dev/null +++ b/youtube_dl/extractor/tennistv.py @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + unified_timestamp, +) + + +class TennisTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tennistv\.com/videos/(?P<id>[-a-z0-9]+)' + _TEST = { + 'url': 'https://www.tennistv.com/videos/indian-wells-2018-verdasco-fritz', + 'info_dict': { + 'id': 'indian-wells-2018-verdasco-fritz', + 'ext': 'mp4', + 'title': 'Fernando Verdasco v Taylor Fritz', + 'description': 're:^After his stunning victory.{174}$', + 'thumbnail': 'https://atp-prod.akamaized.net/api/images/v1/images/112831/landscape/1242/0', + 'timestamp': 1521017381, + 'upload_date': '20180314', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires email and password of a subscribed account', + } + _NETRC_MACHINE = 'tennistv' + + def _login(self): + (username, password) = self._get_login_info() + if not username or not password: + raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) + + login_form = { + 'Email': username, + 'Password': password, + } + login_json = json.dumps(login_form).encode('utf-8') + headers = { + 'content-type': 'application/json', + 'Referer': 'https://www.tennistv.com/login', + 'Origin': 'https://www.tennistv.com', + } + + login_result = self._download_json( + 'https://www.tennistv.com/api/users/v1/login', None, + note='Logging in', + errnote='Login failed (wrong password?)', + headers=headers, + data=login_json) + + if login_result['error']['errorCode']: + raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, login_result['error']['errorMessage'])) + + if login_result['entitlement'] != 'SUBSCRIBED': + self.report_warning('%s may not be subscribed to %s.' % (username, self.IE_NAME)) + + self._session_token = login_result['sessionToken'] + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + internal_id = self._search_regex(r'video=([0-9]+)', webpage, 'internal video id') + + headers = { + 'Origin': 'https://www.tennistv.com', + 'authorization': 'ATP %s' % self._session_token, + 'content-type': 'application/json', + 'Referer': url, + } + check_data = { + 'videoID': internal_id, + 'VideoUrlType': 'HLSV3', + } + check_json = json.dumps(check_data).encode('utf-8') + check_result = self._download_json( + 'https://www.tennistv.com/api/users/v1/entitlementchecknondiva', + video_id, note='Checking video authorization', headers=headers, data=check_json) + formats = self._extract_m3u8_formats(check_result['contentUrl'], video_id, ext='mp4') + + vdata_url = 'https://www.tennistv.com/api/channels/v1/de/none/video/%s' % video_id + vdata = self._download_json(vdata_url, video_id) + + timestamp = unified_timestamp(vdata['timestamp']) + thumbnail = vdata['video']['thumbnailUrl'] + description = vdata['displayText']['description'] + title = vdata['video']['title'] + + series = vdata['tour'] + venue = vdata['displayText']['venue'] + round_str = vdata['seo']['round'] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'series': series, + 'season': venue, + 'episode': round_str, + } diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index de236bbba..b1a985ff6 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -216,7 +216,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): def hex_to_bytes(hex): return binascii.a2b_hex(hex.encode('ascii')) - relative_path = re.match(r'https?://link.theplatform.com/s/([^?]+)', url).group(1) + relative_path = re.match(r'https?://link\.theplatform\.com/s/([^?]+)', url).group(1) clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path)) checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest() sig = flags + expiration_date + checksum + str_to_hex(sig_secret) diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py deleted file mode 100644 index d63aef5de..000000000 --- a/youtube_dl/extractor/thesixtyone.py +++ /dev/null @@ -1,106 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import unified_strdate - - -class TheSixtyOneIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?thesixtyone\.com/ - (?:.*?/)* - (?: - s| - song/comments/list| - song - )/(?:[^/]+/)?(?P<id>[A-Za-z0-9]+)/?$''' - _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}' - _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream' - _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop' - _TESTS = [ - { - 'url': 'http://www.thesixtyone.com/s/SrE3zD7s1jt/', - 'md5': '821cc43b0530d3222e3e2b70bb4622ea', - 'info_dict': { - 'id': 'SrE3zD7s1jt', - 'ext': 'mp3', - 'title': 'CASIO - Unicorn War Mixtape', - 'thumbnail': 're:^https?://.*_desktop$', - 'upload_date': '20071217', - 'duration': 3208, - } - }, - { - 'url': 'http://www.thesixtyone.com/song/comments/list/SrE3zD7s1jt', - 'only_matching': True, - }, - { - 'url': 'http://www.thesixtyone.com/s/ULoiyjuJWli#/s/SrE3zD7s1jt/', - 'only_matching': True, - }, - { - 'url': 'http://www.thesixtyone.com/#/s/SrE3zD7s1jt/', - 'only_matching': True, - }, - { - 'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/', - 'only_matching': True, - }, - { - 'url': 'http://www.thesixtyone.com/maryatmidnight/song/StrawberriesandCream/yvWtLp0c4GQ/', - 'only_matching': True, - }, - ] - - _DECODE_MAP = { - 'x': 'a', - 'm': 'b', - 'w': 'c', - 'q': 'd', - 'n': 'e', - 'p': 'f', - 'a': '0', - 'h': '1', - 'e': '2', - 'u': '3', - 's': '4', - 'i': '5', - 'o': '6', - 'y': '7', - 'r': '8', - 'c': '9' - } - - def _real_extract(self, url): - song_id = self._match_id(url) - - webpage = self._download_webpage( - self._SONG_URL_TEMPLATE.format(song_id), song_id) - - song_data = self._parse_json(self._search_regex( - r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'), song_id) - - if self._search_regex(r'(t61\.s3_audio_load\s*=\s*1\.0;)', webpage, 's3_audio_load marker', default=None): - song_data['audio_server'] = 's3.amazonaws.com' - else: - song_data['audio_server'] = song_data['audio_server'] + '.thesixtyone.com' - - keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']] - url = self._SONG_FILE_URL_TEMPLATE.format( - "".join(reversed(keys)), **song_data) - - formats = [{ - 'format_id': 'sd', - 'url': url, - 'ext': 'mp3', - }] - - return { - 'id': song_id, - 'title': '{artist:} - {name:}'.format(**song_data), - 'formats': formats, - 'comment_count': song_data.get('comments_count'), - 'duration': song_data.get('play_time'), - 'like_count': song_data.get('score'), - 'thumbnail': self._THUMBNAIL_URL_TEMPLATE.format(**song_data), - 'upload_date': unified_strdate(song_data.get('publish_date')), - } diff --git a/youtube_dl/extractor/thisav.py b/youtube_dl/extractor/thisav.py index 33683b139..dc3dd03c8 100644 --- a/youtube_dl/extractor/thisav.py +++ b/youtube_dl/extractor/thisav.py @@ -57,10 +57,10 @@ class ThisAVIE(InfoExtractor): info_dict = self._extract_jwplayer_data( webpage, video_id, require_title=False) uploader = self._html_search_regex( - r': <a href="http://www.thisav.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>', + r': <a href="http://www\.thisav\.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>', webpage, 'uploader name', fatal=False) uploader_id = self._html_search_regex( - r': <a href="http://www.thisav.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>', + r': <a href="http://www\.thisav\.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>', webpage, 'uploader id', fatal=False) info_dict.update({ diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 7e6ec3430..0c2f8f119 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -21,6 +21,8 @@ class TNAFlixNetworkBaseIE(InfoExtractor): r'flashvars\.config\s*=\s*escape\("([^"]+)"', r'<input[^>]+name="config\d?" value="([^"]+)"', ] + _HOST = 'tna' + _VKEY_SUFFIX = '' _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"' _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"' _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"' @@ -72,7 +74,13 @@ class TNAFlixNetworkBaseIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') if 'display_id' in mobj.groupdict() else video_id + for display_id_key in ('display_id', 'display_id_2'): + if display_id_key in mobj.groupdict(): + display_id = mobj.group(display_id_key) + if display_id: + break + else: + display_id = video_id webpage = self._download_webpage(url, display_id) @@ -81,8 +89,8 @@ class TNAFlixNetworkBaseIE(InfoExtractor): if not cfg_url: inputs = self._hidden_inputs(webpage) - cfg_url = ('https://cdn-fck.tnaflix.com/tnaflix/%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' - % (inputs['vkey'], inputs['nkey'], video_id)) + cfg_url = ('https://cdn-fck.%sflix.com/%sflix/%s%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' + % (self._HOST, self._HOST, inputs['vkey'], self._VKEY_SUFFIX, inputs['nkey'], video_id)) cfg_xml = self._download_xml( cfg_url, display_id, 'Downloading metadata', @@ -91,7 +99,8 @@ class TNAFlixNetworkBaseIE(InfoExtractor): formats = [] def extract_video_url(vl): - return re.sub(r'speed=\d+', 'speed=', unescapeHTML(vl.text)) + # Any URL modification now results in HTTP Error 403: Forbidden + return unescapeHTML(vl.text) video_link = cfg_xml.find('./videoLink') if video_link is not None: @@ -192,18 +201,21 @@ class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): webpage)] -class TNAFlixIE(TNAFlixNetworkBaseIE): +class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE): + _DESCRIPTION_REGEX = r'(?s)>Description:</[^>]+>(.+?)<' + _UPLOADER_REGEX = r'<span>by\s*<a[^>]+\bhref=["\']/profile/[^>]+>([^<]+)<' + _CATEGORIES_REGEX = r'(?s)<span[^>]*>Categories:</span>(.+?)</div>' + + +class TNAFlixIE(TNAEMPFlixBaseIE): _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' _TITLE_REGEX = r'<title>(.+?) - (?:TNAFlix Porn Videos|TNAFlix\.com)' - _DESCRIPTION_REGEX = r'(?s)>Description:]+>(.+?)<' - _UPLOADER_REGEX = r'\s*Verified Member\s*\s*(.+?)<' - _CATEGORIES_REGEX = r'(?s)]*>Categories:(.+?)

' _TESTS = [{ # anonymous uploader, no categories 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': 'ecf3498417d09216374fc5907f9c6ec0', + 'md5': '7e569419fe6d69543d01e6be22f5f7c4', 'info_dict': { 'id': '553878', 'display_id': 'Carmella-Decesare-striptease', @@ -228,7 +240,7 @@ class TNAFlixIE(TNAFlixNetworkBaseIE): 'duration': 164, 'age_limit': 18, 'uploader': 'bobwhite39', - 'categories': ['Amateur Porn', 'Squirting Videos', 'Teen Girls 18+'], + 'categories': list, } }, { 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', @@ -236,14 +248,15 @@ class TNAFlixIE(TNAFlixNetworkBaseIE): }] -class EMPFlixIE(TNAFlixNetworkBaseIE): - _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P.+?)-(?P[0-9]+)\.html' +class EMPFlixIE(TNAEMPFlixBaseIE): + _VALID_URL = r'https?://(?:www\.)?empflix\.com/(?:videos/(?P.+?)-|[^/]+/(?P[^/]+)/video)(?P[0-9]+)' - _UPLOADER_REGEX = r']+class="infoTitle"[^>]*>Uploaded By:(.+?)' + _HOST = 'emp' + _VKEY_SUFFIX = '-1' _TESTS = [{ 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'b1bc15b6412d33902d6e5952035fcabc', + 'md5': 'bc30d48b91a7179448a0bda465114676', 'info_dict': { 'id': '33051', 'display_id': 'Amateur-Finger-Fuck', @@ -259,6 +272,9 @@ class EMPFlixIE(TNAFlixNetworkBaseIE): }, { 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', 'only_matching': True, + }, { + 'url': 'https://www.empflix.com/amateur-porn/Amateur-Finger-Fuck/video33051', + 'only_matching': True, }] diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index 348d6ecdf..5e5efda0f 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -132,7 +132,7 @@ class ToggleIE(InfoExtractor): formats = [] for video_file in info.get('Files', []): video_url, vid_format = video_file.get('URL'), video_file.get('Format') - if not video_url or not vid_format: + if not video_url or video_url == 'NA' or not vid_format: continue ext = determine_ext(video_url) vid_format = vid_format.replace(' ', '') @@ -143,6 +143,18 @@ class ToggleIE(InfoExtractor): note='Downloading %s m3u8 information' % vid_format, errnote='Failed to download %s m3u8 information' % vid_format, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id=vid_format, + note='Downloading %s MPD manifest' % vid_format, + errnote='Failed to download %s MPD manifest' % vid_format, + fatal=False)) + elif ext == 'ism': + formats.extend(self._extract_ism_formats( + video_url, video_id, ism_id=vid_format, + note='Downloading %s ISM manifest' % vid_format, + errnote='Failed to download %s ISM manifest' % vid_format, + fatal=False)) elif ext in ('mp4', 'wvm'): # wvm are drm-protected files formats.append({ diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 26d770992..2e7876cc5 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -1,11 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( int_or_none, js_to_json, - ExtractorError, urlencode_postdata, extract_attributes, smuggle_url, @@ -15,7 +16,7 @@ from ..utils import ( class TouTvIE(InfoExtractor): _NETRC_MACHINE = 'toutv' IE_NAME = 'tou.tv' - _VALID_URL = r'https?://ici\.tou\.tv/(?P[a-zA-Z0-9_-]+(?:/S[0-9]+E[0-9]+)?)' + _VALID_URL = r'https?://ici\.tou\.tv/(?P[a-zA-Z0-9_-]+(?:/S[0-9]+[EC][0-9]+)?)' _access_token = None _claims = None @@ -36,13 +37,16 @@ class TouTvIE(InfoExtractor): }, { 'url': 'http://ici.tou.tv/hackers', 'only_matching': True, + }, { + 'url': 'https://ici.tou.tv/l-age-adulte/S01C501', + 'only_matching': True, }] def _real_initialize(self): email, password = self._get_login_info() if email is None: return - state = 'http://ici.tou.tv//' + state = 'http://ici.tou.tv/' webpage = self._download_webpage(state, None, 'Downloading homepage') toutvlogin = self._parse_json(self._search_regex( r'(?s)toutvlogin\s*=\s*({.+?});', webpage, 'toutvlogin'), None, js_to_json) @@ -55,16 +59,30 @@ class TouTvIE(InfoExtractor): 'scope': 'media-drmt openid profile email id.write media-validation.read.privileged', 'state': state, }) - login_form = self._search_regex( - r'(?s)(]+(?:id|name)="Form-login".+?)', login_webpage, 'login form') - form_data = self._hidden_inputs(login_form) + + def extract_form_url_and_data(wp, default_form_url, form_spec_re=''): + form, form_elem = re.search( + r'(?s)((]+?%s[^>]*?>).+?)' % form_spec_re, wp).groups() + form_data = self._hidden_inputs(form) + form_url = extract_attributes(form_elem).get('action') or default_form_url + return form_url, form_data + + post_url, form_data = extract_form_url_and_data( + login_webpage, + 'https://services.radio-canada.ca/auth/oauth/v2/authorize/login', + r'(?:id|name)="Form-login"') form_data.update({ 'login-email': email, 'login-password': password, }) - post_url = extract_attributes(login_form).get('action') or authorize_url - _, urlh = self._download_webpage_handle( + consent_webpage = self._download_webpage( post_url, None, 'Logging in', data=urlencode_postdata(form_data)) + post_url, form_data = extract_form_url_and_data( + consent_webpage, + 'https://services.radio-canada.ca/auth/oauth/v2/authorize/consent') + _, urlh = self._download_webpage_handle( + post_url, None, 'Following Redirection', + data=urlencode_postdata(form_data)) self._access_token = self._search_regex( r'access_token=([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', urlh.geturl(), 'access token') @@ -78,8 +96,10 @@ class TouTvIE(InfoExtractor): def _real_extract(self, url): path = self._match_id(url) metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path) + # IsDrm does not necessarily mean the video is DRM protected (see + # https://github.com/rg3/youtube-dl/issues/13994). if metadata.get('IsDrm'): - raise ExtractorError('This video is DRM protected.', expected=True) + self.report_warning('This video is probably DRM protected.', path) video_id = metadata['IdMedia'] details = metadata['Details'] title = details['OriginalTitle'] diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 1853a1104..368c45729 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -31,6 +31,12 @@ class Tube8IE(KeezMoviesIE): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\']((?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)', + webpage) + def _real_extract(self, url): webpage, info = self._extract_info(url) diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py index c44018aec..36f6c1673 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -13,11 +13,11 @@ from ..utils import ( class TubiTvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?tubitv\.com/(?:video|movies|tv-shows)/(?P[0-9]+)' _LOGIN_URL = 'http://tubitv.com/login' _NETRC_MACHINE = 'tubitv' _GEO_COUNTRIES = ['US'] - _TEST = { + _TESTS = [{ 'url': 'http://tubitv.com/video/283829/the_comedian_at_the_friday', 'md5': '43ac06be9326f41912dc64ccf7a80320', 'info_dict': { @@ -27,7 +27,13 @@ class TubiTvIE(InfoExtractor): 'description': 'A stand up comedian is forced to look at the decisions in his life while on a one week trip to the west coast.', 'uploader_id': 'bc168bee0d18dd1cb3b86c68706ab434', }, - } + }, { + 'url': 'http://tubitv.com/tv-shows/321886/s01_e01_on_nom_stories', + 'only_matching': True, + }, { + 'url': 'http://tubitv.com/movies/383676/tracker', + 'only_matching': True, + }] def _login(self): (username, password) = self._get_login_info() diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index efeb677ee..e73b64aeb 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -18,9 +18,32 @@ from ..utils import ( class TurnerBaseIE(AdobePassIE): + _AKAMAI_SPE_TOKEN_CACHE = {} + def _extract_timestamp(self, video_data): return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) + def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data): + secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' + token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) + if not token: + query = { + 'path': secure_path, + 'videoId': content_id, + } + if ap_data.get('auth_required'): + query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) + auth = self._download_xml( + tokenizer_src, content_id, query=query) + error_msg = xpath_text(auth, 'error/msg') + if error_msg: + raise ExtractorError(error_msg, expected=True) + token = xpath_text(auth, 'token') + if not token: + return video_url + self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token + return video_url + '?hdnea=' + token + def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): video_data = self._download_xml(data_src, video_id) video_id = video_data.attrib['id'] @@ -33,7 +56,6 @@ class TurnerBaseIE(AdobePassIE): # rtmp_src = splited_rtmp_src[1] # aifp = xpath_text(video_data, 'akamai/aifp', default='') - tokens = {} urls = [] formats = [] rex = re.compile( @@ -67,26 +89,10 @@ class TurnerBaseIE(AdobePassIE): secure_path_data = path_data.get('secure') if not secure_path_data: continue - video_url = secure_path_data['media_src'] + video_url - secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' - token = tokens.get(secure_path) - if not token: - query = { - 'path': secure_path, - 'videoId': content_id, - } - if ap_data.get('auth_required'): - query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], video_id, ap_data['site_name'], ap_data['site_name']) - auth = self._download_xml( - secure_path_data['tokenizer_src'], video_id, query=query) - error_msg = xpath_text(auth, 'error/msg') - if error_msg: - raise ExtractorError(error_msg, expected=True) - token = xpath_text(auth, 'token') - if not token: - continue - tokens[secure_path] = token - video_url = video_url + '?hdnea=' + token + video_url = self._add_akamai_spe_token( + secure_path_data['tokenizer_src'], + secure_path_data['media_src'] + video_url, + content_id, ap_data) elif not re.match('https?://', video_url): base_path_data = path_data.get(ext, path_data.get('default', {})) media_src = base_path_data.get('media_src') diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index 822372ea1..362318b24 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -1,9 +1,10 @@ from __future__ import unicode_literals -import base64 - from .common import InfoExtractor -from ..compat import compat_parse_qs +from ..compat import ( + compat_b64decode, + compat_parse_qs, +) class TutvIE(InfoExtractor): @@ -26,7 +27,7 @@ class TutvIE(InfoExtractor): data_content = self._download_webpage( 'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') - video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8') + video_url = compat_b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8') return { 'id': internal_id, diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 7aeb2c620..cfcce020a 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -18,7 +18,7 @@ class TV4IE(InfoExtractor): tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: - (?:program|barn)/(?:[^\?]+)\?video_id=| + (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)| iframe/video/| film/| sport/| @@ -63,6 +63,10 @@ class TV4IE(InfoExtractor): 'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412', 'only_matching': True, }, + { + 'url': 'http://www.tv4play.se/program/farang/3922081', + 'only_matching': True, + } ] def _real_extract(self, url): diff --git a/youtube_dl/extractor/tva.py b/youtube_dl/extractor/tva.py index 3ced098f9..0b863df2f 100644 --- a/youtube_dl/extractor/tva.py +++ b/youtube_dl/extractor/tva.py @@ -3,52 +3,52 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - int_or_none, - parse_iso8601, + float_or_none, smuggle_url, ) class TVAIE(InfoExtractor): - _VALID_URL = r'https?://videos\.tva\.ca/episode/(?P\d+)' + _VALID_URL = r'https?://videos\.tva\.ca/details/_(?P\d+)' _TEST = { - 'url': 'http://videos.tva.ca/episode/85538', + 'url': 'https://videos.tva.ca/details/_5596811470001', 'info_dict': { - 'id': '85538', + 'id': '5596811470001', 'ext': 'mp4', - 'title': 'Épisode du 25 janvier 2017', - 'description': 'md5:e9e7fb5532ab37984d2dc87229cadf98', - 'upload_date': '20170126', - 'timestamp': 1485442329, + 'title': 'Un extrait de l\'épisode du dimanche 8 octobre 2017 !', + 'uploader_id': '5481942443001', + 'upload_date': '20171003', + 'timestamp': 1507064617, }, 'params': { # m3u8 download 'skip_download': True, } } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5481942443001/default_default/index.html?videoId=%s' def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( - "https://d18jmrhziuoi7p.cloudfront.net/isl/api/v1/dataservice/Items('%s')" % video_id, - video_id, query={ - '$expand': 'Metadata,CustomId', - '$select': 'Metadata,Id,Title,ShortDescription,LongDescription,CreatedDate,CustomId,AverageUserRating,Categories,ShowName', - '$format': 'json', + 'https://videos.tva.ca/proxy/item/_' + video_id, video_id, headers={ + 'Accept': 'application/json', + }, query={ + 'appId': '5955fc5f23eec60006c951f1', }) - metadata = video_data.get('Metadata', {}) + + def get_attribute(key): + for attribute in video_data.get('attributes', []): + if attribute.get('key') == key: + return attribute.get('value') + return None return { '_type': 'url_transparent', 'id': video_id, - 'title': video_data['Title'], - 'url': smuggle_url('ooyala:' + video_data['CustomId'], {'supportedformats': 'm3u8,hds'}), - 'description': video_data.get('LongDescription') or video_data.get('ShortDescription'), - 'series': video_data.get('ShowName'), - 'episode': metadata.get('EpisodeTitle'), - 'episode_number': int_or_none(metadata.get('EpisodeNumber')), - 'categories': video_data.get('Categories'), - 'average_rating': video_data.get('AverageUserRating'), - 'timestamp': parse_iso8601(video_data.get('CreatedDate')), - 'ie_key': 'Ooyala', + 'title': get_attribute('title'), + 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['CA']}), + 'description': get_attribute('description'), + 'thumbnail': get_attribute('image-background') or get_attribute('image-landscape'), + 'duration': float_or_none(get_attribute('video-duration'), 1000), + 'ie_key': 'BrightcoveNew', } diff --git a/youtube_dl/extractor/tvn24.py b/youtube_dl/extractor/tvn24.py index 12ed6039c..6590e1fd0 100644 --- a/youtube_dl/extractor/tvn24.py +++ b/youtube_dl/extractor/tvn24.py @@ -9,7 +9,7 @@ from ..utils import ( class TVN24IE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:[^/]+)\.)?tvn24(?:bis)?\.pl/(?:[^/]+/)*(?P[^/]+)\.html' + _VALID_URL = r'https?://(?:(?:[^/]+)\.)?tvn24(?:bis)?\.pl/(?:[^/]+/)*(?P[^/]+)' _TESTS = [{ 'url': 'http://www.tvn24.pl/wiadomosci-z-kraju,3/oredzie-artura-andrusa,702428.html', 'md5': 'fbdec753d7bc29d96036808275f2130c', @@ -18,7 +18,7 @@ class TVN24IE(InfoExtractor): 'ext': 'mp4', 'title': '"Święta mają być wesołe, dlatego, ludziska, wszyscy pod jemiołę"', 'description': 'Wyjątkowe orędzie Artura Andrusa, jednego z gości "Szkła kontaktowego".', - 'thumbnail': 're:http://.*[.]jpeg', + 'thumbnail': 're:https?://.*[.]jpeg', } }, { 'url': 'http://fakty.tvn24.pl/ogladaj-online,60/53-konferencja-bezpieczenstwa-w-monachium,716431.html', @@ -29,6 +29,9 @@ class TVN24IE(InfoExtractor): }, { 'url': 'http://tvn24bis.pl/poranek,146,m/gen-koziej-w-tvn24-bis-wracamy-do-czasow-zimnej-wojny,715660.html', 'only_matching': True, + }, { + 'url': 'https://www.tvn24.pl/magazyn-tvn24/angie-w-jednej-czwartej-polka-od-szarej-myszki-do-cesarzowej-europy,119,2158', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py new file mode 100644 index 000000000..808571ece --- /dev/null +++ b/youtube_dl/extractor/tvnow.py @@ -0,0 +1,279 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + parse_duration, + try_get, + update_url_query, +) + + +class TVNowBaseIE(InfoExtractor): + _VIDEO_FIELDS = ( + 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', + 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', + 'manifest.dashclear', 'format.title', 'format.defaultImage169Format', + 'format.defaultImage169Logo') + + def _call_api(self, path, video_id, query): + return self._download_json( + 'https://api.tvnow.de/v3/' + path, + video_id, query=query) + + def _extract_video(self, info, display_id): + video_id = compat_str(info['id']) + title = info['title'] + + mpd_url = info['manifest']['dashclear'] + if not mpd_url: + if info.get('isDrm'): + raise ExtractorError( + 'Video %s is DRM protected' % video_id, expected=True) + if info.get('geoblocked'): + raise ExtractorError( + 'Video %s is not available from your location due to geo restriction' % video_id, + expected=True) + if not info.get('free', True): + raise ExtractorError( + 'Video %s is not available for free' % video_id, expected=True) + + mpd_url = update_url_query(mpd_url, {'filter': ''}) + formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False) + formats.extend(self._extract_ism_formats( + mpd_url.replace('dash.', 'hss.').replace('/.mpd', '/Manifest'), + video_id, ism_id='mss', fatal=False)) + formats.extend(self._extract_m3u8_formats( + mpd_url.replace('dash.', 'hls.').replace('/.mpd', '/.m3u8'), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + description = info.get('articleLong') or info.get('articleShort') + timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') + duration = parse_duration(info.get('duration')) + + f = info.get('format', {}) + + thumbnails = [{ + 'url': 'https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % video_id, + }] + thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + if thumbnail: + thumbnails.append({ + 'url': thumbnail, + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnails': thumbnails, + 'timestamp': timestamp, + 'duration': duration, + 'series': f.get('title'), + 'season_number': int_or_none(info.get('season')), + 'episode_number': int_or_none(info.get('episode')), + 'episode': title, + 'formats': formats, + } + + +class TVNowIE(TVNowBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ + (?P[^/]+)/ + (?!(?:list|jahr)(?:/|$))(?P[^/?\#&]+) + ''' + + _TESTS = [{ + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player', + 'info_dict': { + 'id': '331082', + 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3', + 'ext': 'mp4', + 'title': 'Der neue Porsche 911 GT 3', + 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1495994400, + 'upload_date': '20170528', + 'duration': 5283, + 'series': 'GRIP - Das Motormagazin', + 'season_number': 14, + 'episode_number': 405, + 'episode': 'Der neue Porsche 911 GT 3', + }, + }, { + # rtl2 + 'url': 'https://www.tvnow.de/rtl2/armes-deutschland/episode-0008/player', + 'only_matching': True, + }, { + # rtlnitro + 'url': 'https://www.tvnow.de/nitro/alarm-fuer-cobra-11-die-autobahnpolizei/auf-eigene-faust-pilot/player', + 'only_matching': True, + }, { + # superrtl + 'url': 'https://www.tvnow.de/superrtl/die-lustigsten-schlamassel-der-welt/u-a-ketchup-effekt/player', + 'only_matching': True, + }, { + # ntv + 'url': 'https://www.tvnow.de/ntv/startup-news/goetter-in-weiss/player', + 'only_matching': True, + }, { + # vox + 'url': 'https://www.tvnow.de/vox/auto-mobil/neues-vom-automobilmarkt-2017-11-19-17-00-00/player', + 'only_matching': True, + }, { + # rtlplus + 'url': 'https://www.tvnow.de/rtlplus/op-ruft-dr-bruckner/die-vernaehte-frau/player', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = '%s/%s' % re.match(self._VALID_URL, url).groups() + + info = self._call_api( + 'movies/' + display_id, display_id, query={ + 'fields': ','.join(self._VIDEO_FIELDS), + }) + + return self._extract_video(info, display_id) + + +class TVNowListBaseIE(TVNowBaseIE): + _SHOW_VALID_URL = r'''(?x) + (?P + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/ + (?P[^/]+) + ) + ''' + + def _extract_list_info(self, display_id, show_id): + fields = list(self._SHOW_FIELDS) + fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) + fields.extend( + 'formatTabs.formatTabPages.container.movies.%s' % field + for field in self._VIDEO_FIELDS) + return self._call_api( + 'formats/seo', display_id, query={ + 'fields': ','.join(fields), + 'name': show_id + '.php' + }) + + +class TVNowListIE(TVNowListBaseIE): + _VALID_URL = r'%s/(?:list|jahr)/(?P[^?\#&]+)' % TVNowListBaseIE._SHOW_VALID_URL + + _SHOW_FIELDS = ('title', ) + _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) + _VIDEO_FIELDS = ('id', 'headline', 'seoUrl', ) + + _TESTS = [{ + 'url': 'https://www.tvnow.de/rtl/30-minuten-deutschland/list/aktuell', + 'info_dict': { + 'id': '28296', + 'title': '30 Minuten Deutschland - Aktuell', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://www.tvnow.de/vox/ab-ins-beet/list/staffel-14', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/jahr/2018/3', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False if TVNowIE.suitable(url) + else super(TVNowListIE, cls).suitable(url)) + + def _real_extract(self, url): + base_url, show_id, season_id = re.match(self._VALID_URL, url).groups() + + list_info = self._extract_list_info(season_id, show_id) + + season = next( + season for season in list_info['formatTabs']['items'] + if season.get('seoheadline') == season_id) + + title = list_info.get('title') + headline = season.get('headline') + if title and headline: + title = '%s - %s' % (title, headline) + else: + title = headline or title + + entries = [] + for container in season['formatTabPages']['items']: + items = try_get( + container, lambda x: x['container']['movies']['items'], + list) or [] + for info in items: + seo_url = info.get('seoUrl') + if not seo_url: + continue + video_id = info.get('id') + entries.append(self.url_result( + '%s/%s/player' % (base_url, seo_url), TVNowIE.ie_key(), + compat_str(video_id) if video_id else None)) + + return self.playlist_result( + entries, compat_str(season.get('id') or season_id), title) + + +class TVNowShowIE(TVNowListBaseIE): + _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL + + _SHOW_FIELDS = ('id', 'title', ) + _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) + _VIDEO_FIELDS = () + + _TESTS = [{ + 'url': 'https://www.tvnow.at/vox/ab-ins-beet', + 'info_dict': { + 'id': 'ab-ins-beet', + 'title': 'Ab ins Beet!', + }, + 'playlist_mincount': 7, + }, { + 'url': 'https://www.tvnow.at/vox/ab-ins-beet/list', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/jahr/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False if TVNowIE.suitable(url) or TVNowListIE.suitable(url) + else super(TVNowShowIE, cls).suitable(url)) + + def _real_extract(self, url): + base_url, show_id = re.match(self._VALID_URL, url).groups() + + list_info = self._extract_list_info(show_id, show_id) + + entries = [] + for season_info in list_info['formatTabs']['items']: + season_url = season_info.get('seoheadline') + if not season_url: + continue + season_id = season_info.get('id') + entries.append(self.url_result( + '%s/list/%s' % (base_url, season_url), TVNowListIE.ie_key(), + compat_str(season_id) if season_id else None, + season_info.get('headline'))) + + return self.playlist_result(entries, show_id, list_info.get('title')) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index c5b3288ad..3954f0b93 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -15,16 +15,16 @@ from ..utils import ( class TVPIE(InfoExtractor): IE_NAME = 'tvp' IE_DESC = 'Telewizja Polska' - _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:(?!\d+/)[^/]+/)*(?P\d+)' + _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P\d+)' _TESTS = [{ - 'url': 'http://vod.tvp.pl/194536/i-seria-odc-13', + 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', 'md5': '8aa518c15e5cc32dfe8db400dc921fbb', 'info_dict': { 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, I seria – odc. 13', - 'description': 'md5:76649d2014f65c99477be17f23a4dead', + 'description': 'md5:381afa5bca72655fe94b05cfe82bf53d', }, }, { 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', @@ -37,12 +37,13 @@ class TVPIE(InfoExtractor): }, }, { # page id is not the same as video id(#7799) - 'url': 'http://vod.tvp.pl/22704887/08122015-1500', - 'md5': 'cf6a4705dfd1489aef8deb168d6ba742', + 'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930', + 'md5': '84cd3c8aec4840046e5ab712416b73d0', 'info_dict': { - 'id': '22680786', + 'id': '33908820', 'ext': 'mp4', - 'title': 'Wiadomości, 08.12.2015, 15:00', + 'title': 'Wiadomości, 28.09.2017, 19:30', + 'description': 'Wydanie główne codziennego serwisu informacyjnego.' }, }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 99ff82a5d..84597b55e 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -15,7 +15,9 @@ from ..utils import ( int_or_none, parse_iso8601, qualities, + smuggle_url, try_get, + unsmuggle_url, update_url_query, ) @@ -224,6 +226,9 @@ class TVPlayIE(InfoExtractor): ] def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + video_id = self._match_id(url) geo_country = self._search_regex( r'https?://[^/]+\.([a-z]{2})', url, @@ -268,6 +273,8 @@ class TVPlayIE(InfoExtractor): 'ext': ext, } if video_url.startswith('rtmp'): + if smuggled_data.get('skip_rtmp'): + continue m = re.search( r'^(?Prtmp://[^/]+/(?P[^/]+))/(?P.+)$', video_url) if not m: @@ -426,4 +433,13 @@ class ViafreeIE(InfoExtractor): r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](\d{6,})', webpage, 'video id') - return self.url_result('mtg:%s' % video_id, TVPlayIE.ie_key()) + return self.url_result( + smuggle_url( + 'mtg:%s' % video_id, + { + 'geo_countries': [ + compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]], + # rtmp host mtgfs.fplive.net for viafree is unresolvable + 'skip_rtmp': True, + }), + ie=TVPlayIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 7af11659f..4b3b3e705 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -14,7 +14,7 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?P(?:www\.)?24video\.(?:net|me|xxx|sex|tube))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' + _VALID_URL = r'https?://(?P(?:www\.)?24video\.(?:net|me|xxx|sexy?|tube|adult))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' _TESTS = [{ 'url': 'http://www.24video.net/video/view/1044982', @@ -60,8 +60,8 @@ class TwentyFourVideoIE(InfoExtractor): duration = int_or_none(self._og_search_property( 'duration', webpage, 'duration', fatal=False)) timestamp = parse_iso8601(self._search_regex( - r'

', @@ -96,6 +106,83 @@ class XHamsterIE(InfoExtractor): if error: raise ExtractorError(error, expected=True) + age_limit = self._rta_search(webpage) + + def get_height(s): + return int_or_none(self._search_regex( + r'^(\d+)[pP]', s, 'height', default=None)) + + initials = self._parse_json( + self._search_regex( + r'window\.initials\s*=\s*({.+?})\s*;\s*\n', webpage, 'initials', + default='{}'), + video_id, fatal=False) + if initials: + video = initials['videoModel'] + title = video['title'] + formats = [] + for format_id, formats_dict in video['sources'].items(): + if not isinstance(formats_dict, dict): + continue + for quality, format_item in formats_dict.items(): + if format_id == 'download': + # Download link takes some time to be generated, + # skipping for now + continue + if not isinstance(format_item, dict): + continue + format_url = format_item.get('link') + filesize = int_or_none( + format_item.get('size'), invscale=1000000) + else: + format_url = format_item + filesize = None + if not isinstance(format_url, compat_str): + continue + formats.append({ + 'format_id': '%s-%s' % (format_id, quality), + 'url': format_url, + 'ext': determine_ext(format_url, 'mp4'), + 'height': get_height(quality), + 'filesize': filesize, + }) + self._sort_formats(formats) + + categories_list = video.get('categories') + if isinstance(categories_list, list): + categories = [] + for c in categories_list: + if not isinstance(c, dict): + continue + c_name = c.get('name') + if isinstance(c_name, compat_str): + categories.append(c_name) + else: + categories = None + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': video.get('description'), + 'timestamp': int_or_none(video.get('created')), + 'uploader': try_get( + video, lambda x: x['author']['name'], compat_str), + 'thumbnail': video.get('thumbURL'), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(try_get( + video, lambda x: x['rating']['likes'], int)), + 'dislike_count': int_or_none(try_get( + video, lambda x: x['rating']['dislikes'], int)), + 'comment_count': int_or_none(video.get('views')), + 'age_limit': age_limit, + 'categories': categories, + 'formats': formats, + } + + # Old layout fallback + title = self._html_search_regex( [r']*>([^<]+)', r']+itemprop=".*?caption.*?"[^>]+content="(.+?)"', @@ -119,8 +206,7 @@ class XHamsterIE(InfoExtractor): formats.append({ 'format_id': format_id, 'url': format_url, - 'height': int_or_none(self._search_regex( - r'^(\d+)[pP]', format_id, 'height', default=None)) + 'height': get_height(format_id), }) video_url = self._search_regex( @@ -148,8 +234,8 @@ class XHamsterIE(InfoExtractor): webpage, 'uploader', default='anonymous') thumbnail = self._search_regex( - [r'''thumb\s*:\s*(?P["'])(?P.+?)(?P=q)''', - r''']+poster=(?P["'])(?P.+?)(?P=q)[^>]*>'''], + [r'''["']thumbUrl["']\s*:\s*(?P["'])(?P.+?)(?P=q)''', + r''']+"poster"=(?P["'])(?P.+?)(?P=q)[^>]*>'''], webpage, 'thumbnail', fatal=False, group='thumbnail') duration = parse_duration(self._search_regex( @@ -167,8 +253,6 @@ class XHamsterIE(InfoExtractor): mobj = re.search(r'Comments \((?P\d+)\)', webpage) comment_count = mobj.group('commentcount') if mobj else 0 - age_limit = self._rta_search(webpage) - categories_html = self._search_regex( r'(?s)Categories:.+?)', webpage, 'categories', default=None) @@ -195,15 +279,16 @@ class XHamsterIE(InfoExtractor): class XHamsterEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P\d+)' + _VALID_URL = r'https?://(?:.+?\.)?xhamster\.com/xembed\.php\?video=(?P\d+)' _TEST = { 'url': 'http://xhamster.com/xembed.php?video=3328539', 'info_dict': { 'id': '3328539', 'ext': 'mp4', 'title': 'Pen Masturbation', + 'timestamp': 1406581861, 'upload_date': '20140728', - 'uploader_id': 'anonymous', + 'uploader': 'ManyakisArt', 'duration': 5, 'age_limit': 18, } @@ -221,7 +306,7 @@ class XHamsterEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_url = self._search_regex( - r'href="(https?://xhamster\.com/movies/%s/[^"]*\.html[^"]*)"' % video_id, + r'href="(https?://xhamster\.com/(?:movies/{0}/[^"]*\.html|videos/[^/]*-{0})[^"]*)"'.format(video_id), webpage, 'xhamster url', default=None) if not video_url: diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index d017e03de..8333fb534 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -9,8 +9,8 @@ from ..utils import int_or_none class XiamiBaseIE(InfoExtractor): _API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id' - def _download_webpage(self, *args, **kwargs): - webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs) + def _download_webpage_handle(self, *args, **kwargs): + webpage = super(XiamiBaseIE, self)._download_webpage_handle(*args, **kwargs) if '>Xiami is currently not available in your country.<' in webpage: self.raise_geo_restricted('Xiami is currently not available in your country') return webpage @@ -40,9 +40,12 @@ class XiamiBaseIE(InfoExtractor): 'subtitles': subtitles, } - def _extract_tracks(self, item_id, typ=None): + def _extract_tracks(self, item_id, referer, typ=None): playlist = self._download_json( - '%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''), item_id) + '%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''), + item_id, headers={ + 'Referer': referer, + }) return [ self._extract_track(track, item_id) for track in playlist['data']['trackList']] @@ -135,13 +138,13 @@ class XiamiSongIE(XiamiBaseIE): }] def _real_extract(self, url): - return self._extract_tracks(self._match_id(url))[0] + return self._extract_tracks(self._match_id(url), url)[0] class XiamiPlaylistBaseIE(XiamiBaseIE): def _real_extract(self, url): item_id = self._match_id(url) - return self.playlist_result(self._extract_tracks(item_id, self._TYPE), item_id) + return self.playlist_result(self._extract_tracks(item_id, url, self._TYPE), item_id) class XiamiAlbumIE(XiamiPlaylistBaseIE): diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py new file mode 100644 index 000000000..a912e54b8 --- /dev/null +++ b/youtube_dl/extractor/ximalaya.py @@ -0,0 +1,233 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor + + +class XimalayaBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['CN'] + + +class XimalayaIE(XimalayaBaseIE): + IE_NAME = 'ximalaya' + IE_DESC = '喜马拉雅FM' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P[0-9]+)/sound/(?P[0-9]+)' + _USER_URL_FORMAT = '%s://www.ximalaya.com/zhubo/%i/' + _TESTS = [ + { + 'url': 'http://www.ximalaya.com/61425525/sound/47740352/', + 'info_dict': { + 'id': '47740352', + 'ext': 'm4a', + 'uploader': '小彬彬爱听书', + 'uploader_id': 61425525, + 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/', + 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白', + 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['renwen', '人文'], + 'duration': 93, + 'view_count': int, + 'like_count': int, + } + }, + { + 'url': 'http://m.ximalaya.com/61425525/sound/47740352/', + 'info_dict': { + 'id': '47740352', + 'ext': 'm4a', + 'uploader': '小彬彬爱听书', + 'uploader_id': 61425525, + 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/', + 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白', + 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['renwen', '人文'], + 'duration': 93, + 'view_count': int, + 'like_count': int, + } + }, + { + 'url': 'https://www.ximalaya.com/11045267/sound/15705996/', + 'info_dict': { + 'id': '15705996', + 'ext': 'm4a', + 'uploader': '李延隆老师', + 'uploader_id': 11045267, + 'uploader_url': 'https://www.ximalaya.com/zhubo/11045267/', + 'title': 'Lesson 1 Excuse me!', + 'description': "contains:Listen to the tape then answer\xa0this question. Whose handbag is it?\n" + "听录音,然后回答问题,这是谁的手袋?", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['train', '外语'], + 'duration': 40, + 'view_count': int, + 'like_count': int, + } + }, + ] + + def _real_extract(self, url): + + is_m = 'm.ximalaya' in url + scheme = 'https' if url.startswith('https') else 'http' + + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id, + note='Download sound page for %s' % audio_id, + errnote='Unable to get sound page') + + audio_info_file = '%s://m.ximalaya.com/tracks/%s.json' % (scheme, audio_id) + audio_info = self._download_json(audio_info_file, audio_id, + 'Downloading info json %s' % audio_info_file, + 'Unable to download info file') + + formats = [] + for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64')): + if audio_info.get(k): + formats.append({ + 'format_id': bps, + 'url': audio_info[k], + }) + + thumbnails = [] + for k in audio_info.keys(): + # cover pics kyes like: cover_url', 'cover_url_142' + if k.startswith('cover_url'): + thumbnail = {'name': k, 'url': audio_info[k]} + if k == 'cover_url_142': + thumbnail['width'] = 180 + thumbnail['height'] = 180 + thumbnails.append(thumbnail) + + audio_uploader_id = audio_info.get('uid') + + if is_m: + audio_description = self._html_search_regex(r'(?s)]+>(.+?)', + webpage, 'audio_description', fatal=False) + else: + audio_description = self._html_search_regex(r'(?s)]*>(.+?)', + webpage, 'audio_description', fatal=False) + + if not audio_description: + audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id) + audio_description = self._download_webpage(audio_description_file, audio_id, + note='Downloading description file %s' % audio_description_file, + errnote='Unable to download descrip file', + fatal=False) + audio_description = audio_description.strip() if audio_description else None + + return { + 'id': audio_id, + 'uploader': audio_info.get('nickname'), + 'uploader_id': audio_uploader_id, + 'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id) if audio_uploader_id else None, + 'title': audio_info['title'], + 'thumbnails': thumbnails, + 'description': audio_description, + 'categories': list(filter(None, (audio_info.get('category_name'), audio_info.get('category_title')))), + 'duration': audio_info.get('duration'), + 'view_count': audio_info.get('play_count'), + 'like_count': audio_info.get('favorites_count'), + 'formats': formats, + } + + +class XimalayaAlbumIE(XimalayaBaseIE): + IE_NAME = 'ximalaya:album' + IE_DESC = '喜马拉雅FM 专辑' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P[0-9]+)/album/(?P[0-9]+)' + _TEMPLATE_URL = '%s://www.ximalaya.com/%s/album/%s/' + _BASE_URL_TEMPL = '%s://www.ximalaya.com%s' + _LIST_VIDEO_RE = r']+?href="(?P/%s/sound/(?P\d+)/?)"[^>]+?title="(?P[^>]+)">' + _TESTS = [{ + 'url': 'http://www.ximalaya.com/61425525/album/5534601/', + 'info_dict': { + 'title': '唐诗三百首(含赏析)', + 'id': '5534601', + }, + 'playlist_count': 312, + }, { + 'url': 'http://m.ximalaya.com/61425525/album/5534601', + 'info_dict': { + 'title': '唐诗三百首(含赏析)', + 'id': '5534601', + }, + 'playlist_count': 312, + }, + ] + + def _real_extract(self, url): + self.scheme = scheme = 'https' if url.startswith('https') else 'http' + + mobj = re.match(self._VALID_URL, url) + uid, playlist_id = mobj.group('uid'), mobj.group('id') + + webpage = self._download_webpage(self._TEMPLATE_URL % (scheme, uid, playlist_id), playlist_id, + note='Download album page for %s' % playlist_id, + errnote='Unable to get album info') + + title = self._html_search_regex(r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>', + webpage, 'title', fatal=False) + + return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title) + + def _entries(self, page, playlist_id, uid): + html = page + for page_num in itertools.count(1): + for entry in self._process_page(html, uid): + yield entry + + next_url = self._search_regex(r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3', + html, 'list_next_url', default=None, group='more') + if not next_url: + break + + next_full_url = self._BASE_URL_TEMPL % (self.scheme, next_url) + html = self._download_webpage(next_full_url, playlist_id) + + def _process_page(self, html, uid): + find_from = html.index('album_soundlist') + for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]): + yield self.url_result(self._BASE_URL_TEMPL % (self.scheme, mobj.group('url')), + XimalayaIE.ie_key(), + mobj.group('id'), + mobj.group('title')) diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index e0a6255dc..ac1ccc404 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -1,19 +1,29 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..utils import ( + determine_ext, + int_or_none, + NO_DEFAULT, + str_to_int, +) class XNXXIE(InfoExtractor): _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/' _TESTS = [{ 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video', - 'md5': 'ef7ecee5af78f8b03dca2cf31341d3a0', + 'md5': '7583e96c15c0f21e9da3453d9920fbba', 'info_dict': { 'id': '55awb78', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Skyrim Test Video', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 469, + 'view_count': int, 'age_limit': 18, }, }, { @@ -26,23 +36,49 @@ class XNXXIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'flv_url=(.*?)&', - webpage, 'video URL') - video_url = compat_urllib_parse_unquote(video_url) + def get(meta, default=NO_DEFAULT, fatal=True): + return self._search_regex( + r'set%s\s*\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % meta, + webpage, meta, default=default, fatal=fatal, group='value') - video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XNXX.COM', - webpage, 'title') + title = self._og_search_title( + webpage, default=None) or get('VideoTitle') - video_thumbnail = self._search_regex(r'url_bigthumb=(.*?)&', - webpage, 'thumbnail', fatal=False) + formats = [] + for mobj in re.finditer( + r'setVideo(?:Url(?P<id>Low|High)|HLS)\s*\(\s*(?P<q>["\'])(?P<url>(?:https?:)?//.+?)(?P=q)', webpage): + format_url = mobj.group('url') + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + preference=1, m3u8_id='hls', fatal=False)) + else: + format_id = mobj.group('id') + if format_id: + format_id = format_id.lower() + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'quality': -1 if format_id == 'low' else 0, + }) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage, default=None) or get( + 'ThumbUrl', fatal=False) or get('ThumbUrl169', fatal=False) + duration = int_or_none(self._og_search_property('duration', webpage)) + view_count = str_to_int(self._search_regex( + r'id=["\']nb-views-number[^>]+>([\d,.]+)', webpage, 'view count', + default=None)) return { 'id': video_id, - 'url': video_url, - 'title': video_title, - 'ext': 'flv', - 'thumbnail': video_thumbnail, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, 'age_limit': 18, + 'formats': formats, } diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index bea9b87ad..c6c0b3291 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -18,7 +18,7 @@ class XTubeIE(InfoExtractor): _VALID_URL = r'''(?x) (?: xtube:| - https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-) + https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?:embedded/)?(?P<display_id>[^/]+)-) ) (?P<id>[^/?&#]+) ''' @@ -64,6 +64,9 @@ class XTubeIE(InfoExtractor): }, { 'url': 'xtube:kVTUy_G222_', 'only_matching': True, + }, { + 'url': 'https://www.xtube.com/video-watch/embedded/milf-tara-and-teen-shared-and-cum-covered-extreme-bukkake-32203482?embedsize=big', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index eca603028..efee95651 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -14,8 +14,16 @@ from ..utils import ( class XVideosIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?xvideos\.com/video(?P<id>[0-9]+)(?:.*)' - _TEST = { + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?xvideos\.com/video| + flashservice\.xvideos\.com/embedframe/| + static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video= + ) + (?P<id>[0-9]+) + ''' + _TESTS = [{ 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl', 'md5': '14cea69fcb84db54293b1e971466c2e1', 'info_dict': { @@ -25,21 +33,35 @@ class XVideosIE(InfoExtractor): 'duration': 108, 'age_limit': 18, } - } + }, { + 'url': 'https://flashservice.xvideos.com/embedframe/4588838', + 'only_matching': True, + }, { + 'url': 'http://static-hw.xvideos.com/swf/xv-player.swf?id_video=4588838', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + webpage = self._download_webpage( + 'http://www.xvideos.com/video%s/' % video_id, video_id) mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) if mobj: raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True) - video_title = self._html_search_regex( - r'<title>(.*?)\s+-\s+XVID', webpage, 'title') - video_thumbnail = self._search_regex( - r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) - video_duration = int_or_none(self._og_search_property( + title = self._html_search_regex( + (r'<title>(?P<title>.+?)\s+-\s+XVID', + r'setVideoTitle\s*\(\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), + webpage, 'title', default=None, + group='title') or self._og_search_title(webpage) + + thumbnail = self._search_regex( + (r'setThumbUrl\(\s*(["\'])(?P<thumbnail>(?:(?!\1).)+)\1', + r'url_bigthumb=(?P<thumbnail>.+?)&'), + webpage, 'thumbnail', fatal=False, group='thumbnail') + duration = int_or_none(self._og_search_property( 'duration', webpage, default=None)) or parse_duration( self._search_regex( r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)', @@ -74,8 +96,8 @@ class XVideosIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'title': video_title, - 'duration': video_duration, - 'thumbnail': video_thumbnail, + 'title': title, + 'duration': duration, + 'thumbnail': thumbnail, 'age_limit': 18, } diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 38f82bf44..552013a74 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -12,11 +12,13 @@ from ..compat import ( ) from ..utils import ( clean_html, - unescapeHTML, + determine_ext, ExtractorError, + extract_attributes, int_or_none, mimetype2ext, - determine_ext, + smuggle_url, + unescapeHTML, ) from .brightcove import ( @@ -28,7 +30,7 @@ from .nbc import NBCSportsVPlayerIE class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' - _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+)?-(?P<id>[0-9]+)(?:-[a-z]+)?(?:\.html)?)' + _VALID_URL = r'(?P<host>https?://(?:(?P<country>[a-zA-Z]{2})\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?:(?P<display_id>.+)?-)?(?P<id>[0-9]+)(?:-[a-z]+)?(?:\.html)?' _TESTS = [ { 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', @@ -50,6 +52,7 @@ class YahooIE(InfoExtractor): 'description': 'md5:66b627ab0a282b26352136ca96ce73c1', 'duration': 151, }, + 'skip': 'HTTP Error 404', }, { 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', @@ -142,7 +145,7 @@ class YahooIE(InfoExtractor): 'skip': 'Domain name in.lifestyle.yahoo.com gone', }, { 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', - 'md5': '2a9752f74cb898af5d1083ea9f661b58', + 'md5': '989396ae73d20c6f057746fb226aa215', 'info_dict': { 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', 'ext': 'mp4', @@ -227,13 +230,33 @@ class YahooIE(InfoExtractor): 'skip_download': True, }, }, + { + # custom brightcove + 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37083565/clown-entertainers-say-it-is-hurting-their-business/', + 'info_dict': { + 'id': '5575377707001', + 'ext': 'mp4', + 'title': "Clown entertainers say 'It' is hurting their business", + 'description': 'Stephen King s horror film has much to answer for. Jelby and Mr Loopy the Clowns join us.', + 'timestamp': 1505341164, + 'upload_date': '20170913', + 'uploader_id': '2376984109001', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # custom brightcove, geo-restricted to Australia, bypassable + 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37263964/sunrise-episode-wed-27-sep/', + 'only_matching': True, + } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') or self._match_id(url) page_id = mobj.group('id') - url = mobj.group('url') + display_id = mobj.group('display_id') or page_id host = mobj.group('host') webpage, urlh = self._download_webpage_handle(url, display_id) if 'err=404' in urlh.geturl(): @@ -257,10 +280,31 @@ class YahooIE(InfoExtractor): if bc_url: return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) + def brightcove_url_result(bc_url): + return self.url_result( + smuggle_url(bc_url, {'geo_countries': [mobj.group('country')]}), + BrightcoveNewIE.ie_key()) + # Look for Brightcove New Studio embeds bc_url = BrightcoveNewIE._extract_url(self, webpage) if bc_url: - return self.url_result(bc_url, BrightcoveNewIE.ie_key()) + return brightcove_url_result(bc_url) + + brightcove_iframe = self._search_regex( + r'(<iframe[^>]+data-video-id=["\']\d+[^>]+>)', webpage, + 'brightcove iframe', default=None) + if brightcove_iframe: + attr = extract_attributes(brightcove_iframe) + src = attr.get('src') + if src: + parsed_src = compat_urlparse.urlparse(src) + qs = compat_urlparse.parse_qs(parsed_src.query) + account_id = qs.get('accountId', ['2376984109001'])[0] + brightcove_id = attr.get('data-video-id') or qs.get('videoId', [None])[0] + if account_id and brightcove_id: + return brightcove_url_result( + 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + % (account_id, brightcove_id)) # Query result is often embedded in webpage as JSON. Sometimes explicit requests # to video API results in a failure with geo restriction reason therefore using diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index eb1062142..009203851 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -34,8 +34,8 @@ class YandexMusicBaseIE(InfoExtractor): 'youtube-dl with --cookies', expected=True) - def _download_webpage(self, *args, **kwargs): - webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) + def _download_webpage_handle(self, *args, **kwargs): + webpage = super(YandexMusicBaseIE, self)._download_webpage_handle(*args, **kwargs) if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: self._raise_captcha() return webpage @@ -57,14 +57,14 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'info_dict': { 'id': '4878838', 'ext': 'mp3', - 'title': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio - Gypsy Eyes 1', + 'title': 'Carlo Ambrosio, Carlo Ambrosio & Fabio Di Bari - Gypsy Eyes 1', 'filesize': 4628061, 'duration': 193.04, 'track': 'Gypsy Eyes 1', 'album': 'Gypsy Soul', 'album_artist': 'Carlo Ambrosio', - 'artist': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio', - 'release_year': '2009', + 'artist': 'Carlo Ambrosio, Carlo Ambrosio & Fabio Di Bari', + 'release_year': 2009, }, 'skip': 'Travis CI servers blocked by YandexMusic', } @@ -120,7 +120,7 @@ class YandexMusicTrackIE(YandexMusicBaseIE): track_info.update({ 'album': album.get('title'), 'album_artist': extract_artist(album.get('artists')), - 'release_year': compat_str(year) if year else None, + 'release_year': int_or_none(year), }) track_artist = extract_artist(track.get('artists')) diff --git a/youtube_dl/extractor/yapfiles.py b/youtube_dl/extractor/yapfiles.py new file mode 100644 index 000000000..7fafbf596 --- /dev/null +++ b/youtube_dl/extractor/yapfiles.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + qualities, + unescapeHTML, +) + + +class YapFilesIE(InfoExtractor): + _YAPFILES_URL = r'//(?:(?:www|api)\.)?yapfiles\.ru/get_player/*\?.*?\bv=(?P<id>\w+)' + _VALID_URL = r'https?:%s' % _YAPFILES_URL + _TESTS = [{ + # with hd + 'url': 'http://www.yapfiles.ru/get_player/?v=vMDE1NjcyNDUt0413', + 'md5': '2db19e2bfa2450568868548a1aa1956c', + 'info_dict': { + 'id': 'vMDE1NjcyNDUt0413', + 'ext': 'mp4', + 'title': 'Самый худший пароль WIFI', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 72, + }, + }, { + # without hd + 'url': 'https://api.yapfiles.ru/get_player/?uid=video_player_1872528&plroll=1&adv=1&v=vMDE4NzI1Mjgt690b', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [unescapeHTML(mobj.group('url')) for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.*?)\1' + % YapFilesIE._YAPFILES_URL, webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id, fatal=False) + + player_url = None + query = {} + if webpage: + player_url = self._search_regex( + r'player\.init\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'player url', default=None, group='url') + + if not player_url: + player_url = 'http://api.yapfiles.ru/load/%s/' % video_id + query = { + 'md5': 'ded5f369be61b8ae5f88e2eeb2f3caff', + 'type': 'json', + 'ref': url, + } + + player = self._download_json( + player_url, video_id, query=query)['player'] + + playlist_url = player['playlist'] + title = player['title'] + thumbnail = player.get('poster') + + if title == 'Ролик удален' or 'deleted.jpg' in (thumbnail or ''): + raise ExtractorError( + 'Video %s has been removed' % video_id, expected=True) + + playlist = self._download_json( + playlist_url, video_id)['player']['main'] + + hd_height = int_or_none(player.get('hd')) + + QUALITIES = ('sd', 'hd') + quality_key = qualities(QUALITIES) + formats = [] + for format_id in QUALITIES: + is_hd = format_id == 'hd' + format_url = playlist.get( + 'file%s' % ('_hd' if is_hd else '')) + if not format_url or not isinstance(format_url, compat_str): + continue + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'quality': quality_key(format_id), + 'height': hd_height if is_hd else None, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': int_or_none(player.get('length')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 0c4bc2eda..2f5a7b023 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0402' if 'tudou.com' in url else '0401', + 'ccode': '0590', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, @@ -240,14 +240,24 @@ class YoukuShowIE(InfoExtractor): }, { # Ongoing playlist. The initial page is the last one 'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html', - 'only_matchine': True, + 'only_matching': True, + }, { + # No data-id value. + 'url': 'http://list.youku.com/show/id_zefbfbd61237fefbfbdef.html', + 'only_matching': True, + }, { + # Wrong number of reload_id. + 'url': 'http://list.youku.com/show/id_z20eb4acaf5c211e3b2ad.html', + 'only_matching': True, }] def _extract_entries(self, playlist_data_url, show_id, note, query): query['callback'] = 'cb' playlist_data = self._download_json( playlist_data_url, show_id, query=query, note=note, - transform_source=lambda s: js_to_json(strip_jsonp(s)))['html'] + transform_source=lambda s: js_to_json(strip_jsonp(s))).get('html') + if playlist_data is None: + return [None, None] drama_list = (get_element_by_class('p-drama-grid', playlist_data) or get_element_by_class('p-drama-half-row', playlist_data)) if drama_list is None: @@ -276,9 +286,9 @@ class YoukuShowIE(InfoExtractor): r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id') # The first reload_id has the same items as first_page reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page) + entries.extend(initial_entries) for idx, reload_id in enumerate(reload_ids): if reload_id == first_page_reload_id: - entries.extend(initial_entries) continue _, new_entries = self._extract_entries( 'http://list.youku.com/show/episode', show_id, @@ -287,8 +297,8 @@ class YoukuShowIE(InfoExtractor): 'id': page_config['showid'], 'stage': reload_id, }) - entries.extend(new_entries) - + if new_entries is not None: + entries.extend(new_entries) desc = self._html_search_meta('description', webpage, fatal=False) playlist_title = desc.split(',')[0] if desc else None detail_li = get_element_by_class('p-intro', webpage) diff --git a/youtube_dl/extractor/younow.py b/youtube_dl/extractor/younow.py new file mode 100644 index 000000000..04dbc87fc --- /dev/null +++ b/youtube_dl/extractor/younow.py @@ -0,0 +1,202 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + try_get, +) + +CDN_API_BASE = 'https://cdn.younow.com/php/api' +MOMENT_URL_FORMAT = '%s/moment/fetch/id=%%s' % CDN_API_BASE + + +class YouNowLiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.younow.com/AmandaPadeezy', + 'info_dict': { + 'id': 'AmandaPadeezy', + 'ext': 'mp4', + 'is_live': True, + 'title': 'March 26, 2017', + 'thumbnail': r're:^https?://.*\.jpg$', + 'tags': ['girls'], + 'categories': ['girls'], + 'uploader': 'AmandaPadeezy', + 'uploader_id': '6716501', + 'uploader_url': 'https://www.younow.com/AmandaPadeezy', + 'creator': 'AmandaPadeezy', + }, + 'skip': True, + } + + @classmethod + def suitable(cls, url): + return (False + if YouNowChannelIE.suitable(url) or YouNowMomentIE.suitable(url) + else super(YouNowLiveIE, cls).suitable(url)) + + def _real_extract(self, url): + username = self._match_id(url) + + data = self._download_json( + 'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' + % username, username) + + if data.get('errorCode') != 0: + raise ExtractorError(data['errorMsg'], expected=True) + + uploader = try_get( + data, lambda x: x['user']['profileUrlString'], + compat_str) or username + + return { + 'id': uploader, + 'is_live': True, + 'title': self._live_title(uploader), + 'thumbnail': data.get('awsUrl'), + 'tags': data.get('tags'), + 'categories': data.get('tags'), + 'uploader': uploader, + 'uploader_id': data.get('userId'), + 'uploader_url': 'https://www.younow.com/%s' % username, + 'creator': uploader, + 'view_count': int_or_none(data.get('viewers')), + 'like_count': int_or_none(data.get('likes')), + 'formats': [{ + 'url': '%s/broadcast/videoPath/hls=1/broadcastId=%s/channelId=%s' + % (CDN_API_BASE, data['broadcastId'], data['userId']), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + } + + +def _extract_moment(item, fatal=True): + moment_id = item.get('momentId') + if not moment_id: + if not fatal: + return + raise ExtractorError('Unable to extract moment id') + + moment_id = compat_str(moment_id) + + title = item.get('text') + if not title: + title = 'YouNow %s' % ( + item.get('momentType') or item.get('titleType') or 'moment') + + uploader = try_get(item, lambda x: x['owner']['name'], compat_str) + uploader_id = try_get(item, lambda x: x['owner']['userId']) + uploader_url = 'https://www.younow.com/%s' % uploader if uploader else None + + entry = { + 'extractor_key': 'YouNowMoment', + 'id': moment_id, + 'title': title, + 'view_count': int_or_none(item.get('views')), + 'like_count': int_or_none(item.get('likes')), + 'timestamp': int_or_none(item.get('created')), + 'creator': uploader, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + 'formats': [{ + 'url': 'https://hls.younow.com/momentsplaylists/live/%s/%s.m3u8' + % (moment_id, moment_id), + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }], + } + + return entry + + +class YouNowChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/]+)/channel' + _TEST = { + 'url': 'https://www.younow.com/its_Kateee_/channel', + 'info_dict': { + 'id': '14629760', + 'title': 'its_Kateee_ moments' + }, + 'playlist_mincount': 8, + } + + def _entries(self, username, channel_id): + created_before = 0 + for page_num in itertools.count(1): + if created_before is None: + break + info = self._download_json( + '%s/moment/profile/channelId=%s/createdBefore=%d/records=20' + % (CDN_API_BASE, channel_id, created_before), username, + note='Downloading moments page %d' % page_num) + items = info.get('items') + if not items or not isinstance(items, list): + break + for item in items: + if not isinstance(item, dict): + continue + item_type = item.get('type') + if item_type == 'moment': + entry = _extract_moment(item, fatal=False) + if entry: + yield entry + elif item_type == 'collection': + moments = item.get('momentsIds') + if isinstance(moments, list): + for moment_id in moments: + m = self._download_json( + MOMENT_URL_FORMAT % moment_id, username, + note='Downloading %s moment JSON' % moment_id, + fatal=False) + if m and isinstance(m, dict) and m.get('item'): + entry = _extract_moment(m['item']) + if entry: + yield entry + created_before = int_or_none(item.get('created')) + + def _real_extract(self, url): + username = self._match_id(url) + channel_id = compat_str(self._download_json( + 'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' + % username, username, note='Downloading user information')['userId']) + return self.playlist_result( + self._entries(username, channel_id), channel_id, + '%s moments' % username) + + +class YouNowMomentIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/[^/]+/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.younow.com/GABO.../20712117/36319236/3b316doc/m', + 'md5': 'a30c70eadb9fb39a1aa3c8c0d22a0807', + 'info_dict': { + 'id': '20712117', + 'ext': 'mp4', + 'title': 'YouNow capture', + 'view_count': int, + 'like_count': int, + 'timestamp': 1490432040, + 'upload_date': '20170325', + 'uploader': 'GABO...', + 'uploader_id': 35917228, + }, + } + + @classmethod + def suitable(cls, url): + return (False + if YouNowChannelIE.suitable(url) + else super(YouNowMomentIE, cls).suitable(url)) + + def _real_extract(self, url): + video_id = self._match_id(url) + item = self._download_json(MOMENT_URL_FORMAT % video_id, video_id) + return _extract_moment(item['item']) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2e71795e7..04aeb91af 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, + compat_kwargs, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -86,7 +87,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): (username, password) = self._get_login_info() # No authentication to be performed if username is None: - if self._LOGIN_REQUIRED: + if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) return True @@ -245,6 +246,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True + def _download_webpage_handle(self, *args, **kwargs): + kwargs.setdefault('query', {})['disable_polymer'] = 'true' + return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( + *args, **compat_kwargs(kwargs)) + def _real_initialize(self): if self._downloader is None: return @@ -326,6 +332,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| (?:www\.)?deturl\.com/www\.youtube\.com/| (?:www\.)?pwnyoutube\.com/| + (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains @@ -1003,6 +1010,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Skipping DASH manifest', ], }, + { + # The following content has been identified by the YouTube community + # as inappropriate or offensive to some audiences. + 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', + 'info_dict': { + 'id': '6SJNVb0GnPI', + 'ext': 'mp4', + 'title': 'Race Differences in Intelligence', + 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', + 'duration': 965, + 'upload_date': '20140124', + 'uploader': 'New Century Foundation', + 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', + 'license': 'Standard YouTube License', + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { # itag 212 'url': '1t24XAntNCY', @@ -1347,6 +1375,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor): playback_url, video_id, 'Marking watched', 'Unable to mark watched', fatal=False) + @staticmethod + def _extract_urls(webpage): + # Embedded YouTube player + entries = [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer(r'''(?x) + (?: + <iframe[^>]+?src=| + data-video-url=| + <embed[^>]+?src=| + embedSWF\(?:\s*| + <object[^>]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) + \1''', webpage)] + + # lazyYT YouTube embed + entries.extend(list(map( + unescapeHTML, + re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) + + # Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) + entries.extend(m[-1] for m in matches) + + return entries + + @staticmethod + def _extract_url(webpage): + urls = YoutubeIE._extract_urls(webpage) + return urls[0] if urls else None + @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) @@ -1437,9 +1502,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if dash_mpd and dash_mpd[0] not in dash_mpds: dash_mpds.append(dash_mpd[0]) + is_live = None + view_count = None + + def extract_view_count(v_info): + return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + # Get video info embed_webpage = None - is_live = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} @@ -1509,6 +1579,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue get_video_info = compat_parse_qs(video_info_webpage) add_dash_mpd(get_video_info) + if view_count is None: + view_count = extract_view_count(get_video_info) if not video_info: video_info = get_video_info if 'token' in get_video_info: @@ -1524,6 +1596,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'token' not in video_info: video_info = get_video_info break + + def extract_unavailable_message(): + return self._html_search_regex( + r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>', + video_webpage, 'unavailable message', default=None) + if 'token' not in video_info: if 'reason' in video_info: if 'The uploader has not made this video available in your country.' in video_info['reason']: @@ -1532,8 +1610,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): countries = regions_allowed.split(',') if regions_allowed else None self.raise_geo_restricted( msg=video_info['reason'][0], countries=countries) + reason = video_info['reason'][0] + if 'Invalid parameters' in reason: + unavailable_message = extract_unavailable_message() + if unavailable_message: + reason = unavailable_message raise ExtractorError( - 'YouTube said: %s' % video_info['reason'][0], + 'YouTube said: %s' % reason, expected=True, video_id=video_id) else: raise ExtractorError( @@ -1550,6 +1633,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # description description_original = video_description = get_element_by_id("eow-description", video_webpage) if video_description: + + def replace_url(m): + redir_url = compat_urlparse.urljoin(url, m.group(1)) + parsed_redir_url = compat_urllib_parse_urlparse(redir_url) + if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': + qs = compat_parse_qs(parsed_redir_url.query) + q = qs.get('q') + if q and q[0]: + return q[0] + return redir_url + description_original = video_description = re.sub(r'''(?x) <a\s+ (?:[a-zA-Z-]+="[^"]*"\s+)*? @@ -1558,7 +1652,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): class="[^"]*"[^>]*> [^<]+\.{3}\s* </a> - ''', r'\1', video_description) + ''', replace_url, video_description) video_description = clean_html(video_description) else: fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) @@ -1592,10 +1686,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return self.playlist_result(entries, video_id, video_title, video_description) self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - if 'view_count' in video_info: - view_count = int(video_info['view_count'][0]) - else: - view_count = None + if view_count is None: + view_count = extract_view_count(video_info) # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: @@ -1613,7 +1705,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_uploader_id = None video_uploader_url = None mobj = re.search( - r'<link itemprop="url" href="(?P<uploader_url>https?://www.youtube.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', + r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', video_webpage) if mobj is not None: video_uploader_id = mobj.group('uploader_id') @@ -1639,10 +1731,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not upload_date: upload_date = self._search_regex( [r'(?s)id="eow-date.*?>(.*?)</span>', - r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'], + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], video_webpage, 'upload date', default=None) - if upload_date: - upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) video_license = self._html_search_regex( @@ -1731,7 +1821,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': video_info['conn'][0], 'player_url': player_url, }] - elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1: + elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) @@ -1854,6 +1944,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break if codecs: dct.update(parse_codecs(codecs)) + if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': + dct['downloader_options'] = { + # Youtube throttles chunks >~10M + 'http_chunk_size': 10485760, + } formats.append(dct) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] @@ -1874,9 +1969,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' formats.append(a_format) else: - unavailable_message = self._html_search_regex( - r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>', - video_webpage, 'unavailable message', default=None) + unavailable_message = extract_unavailable_message() if unavailable_message: raise ExtractorError(unavailable_message, expected=True) raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') @@ -1972,39 +2065,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } -class YoutubeSharedVideoIE(InfoExtractor): - _VALID_URL = r'(?:https?:)?//(?:www\.)?youtube\.com/shared\?.*\bci=(?P<id>[0-9A-Za-z_-]{11})' - IE_NAME = 'youtube:shared' - - _TEST = { - 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', - 'info_dict': { - 'id': 'uPDB5I9wfp8', - 'ext': 'webm', - 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', - 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', - 'upload_date': '20160219', - 'uploader': 'Pocoyo - Português (BR)', - 'uploader_id': 'PocoyoBrazil', - }, - 'add_ie': ['Youtube'], - 'params': { - # There are already too many Youtube downloads - 'skip_download': True, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - real_video_id = self._html_search_meta( - 'videoId', webpage, 'YouTube video id', fatal=True) - - return self.url_result(real_video_id, YoutubeIE.ie_key()) - - class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: @@ -2028,7 +2088,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): | (%(playlist_id)s) )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' IE_NAME = 'youtube:playlist' _TESTS = [{ @@ -2224,6 +2284,19 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', page, 'title', default=None) + _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' + uploader = self._search_regex( + r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, + page, 'uploader', default=None) + mobj = re.search( + r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, + page) + if mobj: + uploader_id = mobj.group('uploader_id') + uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) + else: + uploader_id = uploader_url = None + has_videos = True if not playlist_title: @@ -2234,8 +2307,15 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): except StopIteration: has_videos = False - return has_videos, self.playlist_result( + playlist = self.playlist_result( self._entries(page, playlist_id), playlist_id, playlist_title) + playlist.update({ + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + }) + + return has_videos, playlist def _check_download_just_video(self, url, playlist_id): # Check if it's a video-specific URL @@ -2371,7 +2451,7 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' IE_NAME = 'youtube:user' @@ -2464,10 +2544,11 @@ class YoutubeLiveIE(YoutubeBaseInfoExtractor): webpage = self._download_webpage(url, channel_id, fatal=False) if webpage: page_type = self._og_search_property( - 'type', webpage, 'page type', default=None) + 'type', webpage, 'page type', default='') video_id = self._html_search_meta( 'videoId', webpage, 'video id', default=None) - if page_type == 'video' and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id): + if page_type.startswith('video') and video_id and re.match( + r'^[0-9A-Za-z_-]{11}$', video_id): return self.url_result(video_id, YoutubeIE.ie_key()) return self.url_result(base_url) @@ -2502,7 +2583,11 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): }] -class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): +class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' + + +class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for # 'python' you get more than 8.000.000 results @@ -2536,8 +2621,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): raise ExtractorError( '[youtube] No video results', expected=True) - new_videos = self._ids_to_results(orderedSet(re.findall( - r'href="/watch\?v=(.{11})', html_content))) + new_videos = list(self._process_page(html_content)) videos += new_videos if not new_videos or len(videos) > limit: break @@ -2560,11 +2644,10 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} -class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): +class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -2616,10 +2699,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _real_extract(self, url): - page = self._download_webpage( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) - + def _entries(self, page): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index ids = [] @@ -2630,12 +2710,15 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): # 'recommended' feed has infinite 'load more' and each new portion spins # the same videos in (sometimes) slightly different order, so we'll check # for unicity and break when portion has no new videos - new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches)) + new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) if not new_ids: break ids.extend(new_ids) + for entry in self._ids_to_results(new_ids): + yield entry + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) if not mobj: break @@ -2647,8 +2730,12 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): content_html = more['content_html'] more_widget_html = more['load_more_widget_html'] + def _real_extract(self, url): + page = self._download_webpage( + 'https://www.youtube.com/feed/%s' % self._FEED_NAME, + self._PLAYLIST_TITLE) return self.playlist_result( - self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE) + self._entries(page), playlist_title=self._PLAYLIST_TITLE) class YoutubeWatchLaterIE(YoutubePlaylistIE): diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py new file mode 100644 index 000000000..773073d85 --- /dev/null +++ b/youtube_dl/extractor/zattoo.py @@ -0,0 +1,270 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from uuid import uuid4 + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + try_get, + urlencode_postdata, +) + + +class ZattooBaseIE(InfoExtractor): + _NETRC_MACHINE = 'zattoo' + _HOST_URL = 'https://zattoo.com' + + _power_guide_hash = None + + def _login(self): + (username, password) = self._get_login_info() + if not username or not password: + self.raise_login_required( + 'A valid %s account is needed to access this media.' + % self._NETRC_MACHINE) + + try: + data = self._download_json( + '%s/zapi/v2/account/login' % self._HOST_URL, None, 'Logging in', + data=urlencode_postdata({ + 'login': username, + 'password': password, + 'remember': 'true', + }), headers={ + 'Referer': '%s/login' % self._HOST_URL, + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + raise ExtractorError( + 'Unable to login: incorrect username and/or password', + expected=True) + raise + + self._power_guide_hash = data['session']['power_guide_hash'] + + def _real_initialize(self): + webpage = self._download_webpage( + self._HOST_URL, None, 'Downloading app token') + app_token = self._html_search_regex( + r'appToken\s*=\s*(["\'])(?P<token>(?:(?!\1).)+?)\1', + webpage, 'app token', group='token') + app_version = self._html_search_regex( + r'<!--\w+-(.+?)-', webpage, 'app version', default='2.8.2') + + # Will setup appropriate cookies + self._request_webpage( + '%s/zapi/v2/session/hello' % self._HOST_URL, None, + 'Opening session', data=urlencode_postdata({ + 'client_app_token': app_token, + 'uuid': compat_str(uuid4()), + 'lang': 'en', + 'app_version': app_version, + 'format': 'json', + })) + + self._login() + + def _extract_cid(self, video_id, channel_name): + channel_groups = self._download_json( + '%s/zapi/v2/cached/channels/%s' % (self._HOST_URL, + self._power_guide_hash), + video_id, 'Downloading channel list', + query={'details': False})['channel_groups'] + channel_list = [] + for chgrp in channel_groups: + channel_list.extend(chgrp['channels']) + try: + return next( + chan['cid'] for chan in channel_list + if chan.get('cid') and ( + chan.get('display_alias') == channel_name or + chan.get('cid') == channel_name)) + except StopIteration: + raise ExtractorError('Could not extract channel id') + + def _extract_cid_and_video_info(self, video_id): + data = self._download_json( + '%s/zapi/program/details' % self._HOST_URL, + video_id, + 'Downloading video information', + query={ + 'program_id': video_id, + 'complete': True + }) + + p = data['program'] + cid = p['cid'] + + info_dict = { + 'id': video_id, + 'title': p.get('title') or p['episode_title'], + 'description': p.get('description'), + 'thumbnail': p.get('image_url'), + 'creator': p.get('channel_name'), + 'episode': p.get('episode_title'), + 'episode_number': int_or_none(p.get('episode_number')), + 'season_number': int_or_none(p.get('season_number')), + 'release_year': int_or_none(p.get('year')), + 'categories': try_get(p, lambda x: x['categories'], list), + } + + return cid, info_dict + + def _extract_formats(self, cid, video_id, record_id=None, is_live=False): + postdata_common = { + 'https_watch_urls': True, + } + + if is_live: + postdata_common.update({'timeshift': 10800}) + url = '%s/zapi/watch/live/%s' % (self._HOST_URL, cid) + elif record_id: + url = '%s/zapi/watch/recording/%s' % (self._HOST_URL, record_id) + else: + url = '%s/zapi/watch/recall/%s/%s' % (self._HOST_URL, cid, video_id) + + formats = [] + for stream_type in ('dash', 'hls', 'hls5', 'hds'): + postdata = postdata_common.copy() + postdata['stream_type'] = stream_type + + data = self._download_json( + url, video_id, 'Downloading %s formats' % stream_type.upper(), + data=urlencode_postdata(postdata), fatal=False) + if not data: + continue + + watch_urls = try_get( + data, lambda x: x['stream']['watch_urls'], list) + if not watch_urls: + continue + + for watch in watch_urls: + if not isinstance(watch, dict): + continue + watch_url = watch.get('url') + if not watch_url or not isinstance(watch_url, compat_str): + continue + format_id_list = [stream_type] + maxrate = watch.get('maxrate') + if maxrate: + format_id_list.append(compat_str(maxrate)) + audio_channel = watch.get('audio_channel') + if audio_channel: + format_id_list.append(compat_str(audio_channel)) + preference = 1 if audio_channel == 'A' else None + format_id = '-'.join(format_id_list) + if stream_type in ('dash', 'dash_widevine', 'dash_playready'): + this_formats = self._extract_mpd_formats( + watch_url, video_id, mpd_id=format_id, fatal=False) + elif stream_type in ('hls', 'hls5', 'hls5_fairplay'): + this_formats = self._extract_m3u8_formats( + watch_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + fatal=False) + elif stream_type == 'hds': + this_formats = self._extract_f4m_formats( + watch_url, video_id, f4m_id=format_id, fatal=False) + elif stream_type == 'smooth_playready': + this_formats = self._extract_ism_formats( + watch_url, video_id, ism_id=format_id, fatal=False) + else: + assert False + for this_format in this_formats: + this_format['preference'] = preference + formats.extend(this_formats) + self._sort_formats(formats) + return formats + + def _extract_video(self, channel_name, video_id, record_id=None, is_live=False): + if is_live: + cid = self._extract_cid(video_id, channel_name) + info_dict = { + 'id': channel_name, + 'title': self._live_title(channel_name), + 'is_live': True, + } + else: + cid, info_dict = self._extract_cid_and_video_info(video_id) + formats = self._extract_formats( + cid, video_id, record_id=record_id, is_live=is_live) + info_dict['formats'] = formats + return info_dict + + +class QuicklineBaseIE(ZattooBaseIE): + _NETRC_MACHINE = 'quickline' + _HOST_URL = 'https://mobiltv.quickline.com' + + +class QuicklineIE(QuicklineBaseIE): + _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' + + _TEST = { + 'url': 'https://mobiltv.quickline.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', + 'only_matching': True, + } + + def _real_extract(self, url): + channel_name, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_video(channel_name, video_id) + + +class QuicklineLiveIE(QuicklineBaseIE): + _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<id>[^/]+)' + + _TEST = { + 'url': 'https://mobiltv.quickline.com/watch/srf1', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if QuicklineIE.suitable(url) else super(QuicklineLiveIE, cls).suitable(url) + + def _real_extract(self, url): + channel_name = video_id = self._match_id(url) + return self._extract_video(channel_name, video_id, is_live=True) + + +class ZattooIE(ZattooBaseIE): + _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' + + # Since regular videos are only available for 7 days and recorded videos + # are only available for a specific user, we cannot have detailed tests. + _TESTS = [{ + 'url': 'https://zattoo.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', + 'only_matching': True, + }, { + 'url': 'https://zattoo.com/watch/srf_zwei/132905652-eishockey-spengler-cup/102791477/1512211800000/1514433500000/92000', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_name, video_id, record_id = re.match(self._VALID_URL, url).groups() + return self._extract_video(channel_name, video_id, record_id) + + +class ZattooLiveIE(ZattooBaseIE): + _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)' + + _TEST = { + 'url': 'https://zattoo.com/watch/srf1', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if ZattooIE.suitable(url) else super(ZattooLiveIE, cls).suitable(url) + + def _real_extract(self, url): + channel_name = video_id = self._match_id(url) + return self._extract_video(channel_name, video_id, is_live=True) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 523bb5c95..bb9020c91 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -42,16 +42,19 @@ class ZDFIE(ZDFBaseIE): _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') _TESTS = [{ - 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', + 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', 'info_dict': { - 'id': 'zdfmediathek-trailer-100', + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', 'ext': 'mp4', - 'title': 'Die neue ZDFmediathek', - 'description': 'md5:3003d36487fb9a5ea2d1ff60beb55e8d', - 'duration': 30, - 'timestamp': 1477627200, - 'upload_date': '20161028', - } + 'title': 'Die Magie der Farben (2/2)', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615, + 'timestamp': 1465021200, + 'upload_date': '20160604', + }, + }, { + 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', + 'only_matching': True, }, { 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', 'only_matching': True, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 38439c971..3e4ac03a2 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -478,6 +478,11 @@ def parseOpts(overrideArguments=None): '--no-resize-buffer', action='store_true', dest='noresizebuffer', default=False, help='Do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.') + downloader.add_option( + '--http-chunk-size', + dest='http_chunk_size', metavar='SIZE', default=None, + help='Size of a chunk for chunk-based HTTP downloading (e.g. 10485760 or 10M) (default is disabled). ' + 'May be useful for bypassing bandwidth throttling imposed by a webserver (experimental)') downloader.add_option( '--test', action='store_true', dest='test', default=False, @@ -671,7 +676,8 @@ def parseOpts(overrideArguments=None): filesystem.add_option( '-a', '--batch-file', dest='batchfile', metavar='FILE', - help='File containing URLs to download (\'-\' for stdin)') + help="File containing URLs to download ('-' for stdin), one URL per line. " + "Lines starting with '#', ';' or ']' are considered as comments and ignored.") filesystem.add_option( '--id', default=False, action='store_true', dest='useid', help='Use only video ID in file name') @@ -847,7 +853,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--convert-subs', '--convert-subtitles', metavar='FORMAT', dest='convertsubtitles', default=None, - help='Convert the subtitles to other format (currently supported: srt|ass|vtt)') + help='Convert the subtitles to other format (currently supported: srt|ass|vtt|lrc)') parser.add_option_group(general) parser.add_option_group(network) diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index e606a58de..56be914b8 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -31,7 +31,8 @@ class EmbedThumbnailPP(FFmpegPostProcessor): temp_filename = prepend_extension(filename, 'temp') if not info.get('thumbnails'): - raise EmbedThumbnailPPError('Thumbnail was not found. Nothing to do.') + self._downloader.to_screen('[embedthumbnail] There aren\'t any thumbnails to embed') + return [], info thumbnail_filename = info['thumbnails'][-1]['filename'] diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 51256a3fb..3ea1afcf3 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -44,7 +44,7 @@ ACODECS = { 'aac': 'aac', 'flac': 'flac', 'm4a': 'aac', - 'opus': 'opus', + 'opus': 'libopus', 'vorbis': 'libvorbis', 'wav': None, } @@ -585,7 +585,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): dfxp_file = old_file srt_file = subtitles_filename(filename, lang, 'srt') - with io.open(dfxp_file, 'rt', encoding='utf-8') as f: + with open(dfxp_file, 'rb') as f: srt_data = dfxp2srt(f.read()) with io.open(srt_file, 'wt', encoding='utf-8') as f: diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index fbdfa02ac..b0aed9ca7 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -42,6 +42,7 @@ class XAttrMetadataPP(PostProcessor): 'user.dublincore.format': 'format', } + num_written = 0 for xattrname, infoname in xattr_mapping.items(): value = info.get(infoname) @@ -52,6 +53,7 @@ class XAttrMetadataPP(PostProcessor): byte_value = value.encode('utf-8') write_xattr(filename, xattrname, byte_value) + num_written += 1 return [], info @@ -62,8 +64,8 @@ class XAttrMetadataPP(PostProcessor): except XAttrMetadataError as e: if e.reason == 'NO_SPACE': self._downloader.report_warning( - 'There\'s no disk space left or disk quota exceeded. ' + - 'Extended attributes are not written.') + 'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. ' + + (('Some ' if num_written else '') + 'extended attributes are not written.').capitalize()) elif e.reason == 'VALUE_TOO_LONG': self._downloader.report_warning( 'Unable to write extended attributes due to too long values.') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c9cbd5842..b460393bf 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ from .compat import ( compat_HTMLParser, compat_basestring, compat_chr, + compat_ctypes_WINFUNCTYPE, compat_etree_fromstring, compat_expanduser, compat_html_entities, @@ -81,7 +82,7 @@ def register_socks_protocols(): compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', @@ -159,6 +160,8 @@ DATE_FORMATS = ( '%Y-%m-%dT%H:%M', '%b %d %Y at %H:%M', '%b %d %Y at %H:%M:%S', + '%B %d %Y at %H:%M', + '%B %d %Y at %H:%M:%S', ) DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) @@ -535,10 +538,22 @@ def sanitize_path(s): return os.path.join(*sanitized_path) -# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of -# unwanted failures due to missing protocol def sanitize_url(url): - return 'http:%s' % url if url.startswith('//') else url + # Prepend protocol-less URLs with `http:` scheme in order to mitigate + # the number of unwanted failures due to missing protocol + if url.startswith('//'): + return 'http:%s' % url + # Fix some common typos seen so far + COMMON_TYPOS = ( + # https://github.com/rg3/youtube-dl/issues/15649 + (r'^httpss://', r'https://'), + # https://bx1.be/lives/direct-tv/ + (r'^rmtp([es]?)://', r'rtmp\1://'), + ) + for mistake, fixup in COMMON_TYPOS: + if re.match(mistake, url): + return re.sub(mistake, fixup, url) + return url def sanitized_Request(url, *args, **kwargs): @@ -596,7 +611,7 @@ def unescapeHTML(s): assert type(s) == compat_str return re.sub( - r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) + r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) def get_subprocess_encoding(): @@ -863,8 +878,8 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): # expected HTTP responses to meet HTTP/1.0 or later (see also # https://github.com/rg3/youtube-dl/issues/6727) if sys.version_info < (3, 0): - kwargs[b'strict'] = True - hc = http_class(*args, **kwargs) + kwargs['strict'] = True + hc = http_class(*args, **compat_kwargs(kwargs)) source_address = ydl_handler._params.get('source_address') if source_address is not None: sa = (source_address, 0) @@ -1196,6 +1211,11 @@ def unified_timestamp(date_str, day_first=True): if m: date_str = date_str[:-len(m.group('tz'))] + # Python only supports microseconds, so remove nanoseconds + m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str) + if m: + date_str = m.group(1) + for expression in date_formats(day_first): try: dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) @@ -1328,24 +1348,24 @@ def _windows_write_string(s, out): if fileno not in WIN_OUTPUT_IDS: return False - GetStdHandle = ctypes.WINFUNCTYPE( + GetStdHandle = compat_ctypes_WINFUNCTYPE( ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)( - (b'GetStdHandle', ctypes.windll.kernel32)) + ('GetStdHandle', ctypes.windll.kernel32)) h = GetStdHandle(WIN_OUTPUT_IDS[fileno]) - WriteConsoleW = ctypes.WINFUNCTYPE( + WriteConsoleW = compat_ctypes_WINFUNCTYPE( ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR, ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD), - ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32)) + ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32)) written = ctypes.wintypes.DWORD(0) - GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32)) + GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32)) FILE_TYPE_CHAR = 0x0002 FILE_TYPE_REMOTE = 0x8000 - GetConsoleMode = ctypes.WINFUNCTYPE( + GetConsoleMode = compat_ctypes_WINFUNCTYPE( ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.POINTER(ctypes.wintypes.DWORD))( - (b'GetConsoleMode', ctypes.windll.kernel32)) + ('GetConsoleMode', ctypes.windll.kernel32)) INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value def not_a_console(handle): @@ -1674,6 +1694,28 @@ def parse_count(s): return lookup_unit_table(_UNIT_TABLE, s) +def parse_resolution(s): + if s is None: + return {} + + mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s) + if mobj: + return { + 'width': int(mobj.group('w')), + 'height': int(mobj.group('h')), + } + + mobj = re.search(r'\b(\d+)[pPiI]\b', s) + if mobj: + return {'height': int(mobj.group(1))} + + mobj = re.search(r'\b([48])[kK]\b', s) + if mobj: + return {'height': int(mobj.group(1)) * 540} + + return {} + + def month_by_name(name, lang='en'): """ Return the number of a month by (locale-independently) English name """ @@ -1815,6 +1857,10 @@ def float_or_none(v, scale=1, invscale=1, default=None): return default +def bool_or_none(v, default=None): + return v if isinstance(v, bool) else default + + def strip_or_none(v): return None if v is None else v.strip() @@ -1831,10 +1877,20 @@ def parse_duration(s): days, hours, mins, secs, ms = m.groups() else: m = re.match( - r'''(?ix)(?:P?T)? + r'''(?ix)(?:P? + (?: + [0-9]+\s*y(?:ears?)?\s* + )? + (?: + [0-9]+\s*m(?:onths?)?\s* + )? + (?: + [0-9]+\s*w(?:eeks?)?\s* + )? (?: (?P<days>[0-9]+)\s*d(?:ays?)?\s* )? + T)? (?: (?P<hours>[0-9]+)\s*h(?:ours?)?\s* )? @@ -1929,7 +1985,7 @@ class PagedList(object): class OnDemandPagedList(PagedList): - def __init__(self, pagefunc, pagesize, use_cache=False): + def __init__(self, pagefunc, pagesize, use_cache=True): self._pagefunc = pagefunc self._pagesize = pagesize self._use_cache = use_cache @@ -2169,6 +2225,20 @@ def try_get(src, getter, expected_type=None): return v +def merge_dicts(*dicts): + merged = {} + for a_dict in dicts: + for k, v in a_dict.items(): + if v is None: + continue + if (k not in merged or + (isinstance(v, compat_str) and v and + isinstance(merged[k], compat_str) and + not merged[k])): + merged[k] = v + return merged + + def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) @@ -2250,7 +2320,7 @@ def js_to_json(code): "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| {comment}|,(?={skip}[\]}}])| - [a-zA-Z_][.a-zA-Z_0-9]*| + (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| [0-9]+(?={skip}:) '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) @@ -2336,6 +2406,7 @@ def mimetype2ext(mt): 'ttml+xml': 'ttml', 'x-flv': 'flv', 'x-mp4-fragmented': 'mp4', + 'x-ms-sami': 'sami', 'x-ms-wmv': 'wmv', 'mpegurl': 'm3u8', 'x-mpegurl': 'm3u8', @@ -2358,7 +2429,7 @@ def parse_codecs(codecs_str): vcodec, acodec = None, None for full_codec in splited_codecs: codec = full_codec.split('.')[0] - if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'): + if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'): if not vcodec: vcodec = full_codec elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): @@ -2517,8 +2588,8 @@ def _match_one(filter_part, dct): return op(actual_value, comparison_value) UNARY_OPERATORS = { - '': lambda v: v is not None, - '!': lambda v: v is None, + '': lambda v: (v is True) if isinstance(v, bool) else (v is not None), + '!': lambda v: (v is False) if isinstance(v, bool) else (v is None), } operator_rex = re.compile(r'''(?x)\s* (?P<op>%s)\s*(?P<key>[a-z_]+) @@ -2568,14 +2639,18 @@ def srt_subtitles_timecode(seconds): def dfxp2srt(dfxp_data): + ''' + @param dfxp_data A bytes-like object containing DFXP data + @returns A unicode object containing converted SRT data + ''' LEGACY_NAMESPACES = ( - ('http://www.w3.org/ns/ttml', [ - 'http://www.w3.org/2004/11/ttaf1', - 'http://www.w3.org/2006/04/ttaf1', - 'http://www.w3.org/2006/10/ttaf1', + (b'http://www.w3.org/ns/ttml', [ + b'http://www.w3.org/2004/11/ttaf1', + b'http://www.w3.org/2006/04/ttaf1', + b'http://www.w3.org/2006/10/ttaf1', ]), - ('http://www.w3.org/ns/ttml#styling', [ - 'http://www.w3.org/ns/ttml#style', + (b'http://www.w3.org/ns/ttml#styling', [ + b'http://www.w3.org/ns/ttml#style', ]), ) @@ -2670,7 +2745,7 @@ def dfxp2srt(dfxp_data): for ns in v: dfxp_data = dfxp_data.replace(ns, k) - dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) + dfxp = compat_etree_fromstring(dfxp_data) out = [] paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 022172375..04896efc8 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.08.09' +__version__ = '2018.05.01'