# youtube-dl/youtube_dl/extractor/bandcamp.py

from __future__ import unicode_literals

import random
import re
import time

from ..compat import compat_str, compat_urlparse
from ..utils import (KNOWN_EXTENSIONS, ExtractorError, float_or_none,
                     int_or_none, parse_filesize, str_or_none, try_get,
                     unescapeHTML, unified_strdate, unified_timestamp,
                     update_url_query, url_or_none)
from .common import InfoExtractor


class BandcampIE(InfoExtractor):
    """Extractor for a single track page (``*.bandcamp.com/track/<slug>``).

    Streaming formats are read from the HTML-escaped ``trackinfo`` JSON
    embedded in the track page.  When the page also advertises a
    ``freeDownloadPage``, the formats listed on that page are added too.
    """

    _VALID_URL = r"https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)"
    _TESTS = [
        {
            "url": "http://youtube-dl.bandcamp.com/track/youtube-dl-test-song",
            "md5": "c557841d5e50261777a6585648adf439",
            "info_dict": {
                "id": "1812978515",
                "ext": "mp3",
                "title": (
                    "youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song"
                    " \"'/\\\u00e4\u21ad"
                ),
                "duration": 9.8485,
            },
            "_skip": "There is a limit of 200 free downloads / month for the test song",
        },
        {
            # free download
            "url": "http://benprunty.bandcamp.com/track/lanius-battle",
            "md5": "853e35bf34aa1d6fe2615ae612564b36",
            "info_dict": {
                "id": "2650410135",
                "ext": "aiff",
                "title": "Ben Prunty - Lanius (Battle)",
                "thumbnail": r"re:^https?://.*\.jpg$",
                "uploader": "Ben Prunty",
                "timestamp": 1396508491,
                "upload_date": "20140403",
                "release_date": "20140403",
                "duration": 260.877,
                "track": "Lanius (Battle)",
                "track_number": 1,
                "track_id": "2650410135",
                "artist": "Ben Prunty",
                "album": "FTL: Advanced Edition Soundtrack",
            },
        },
        {
            # no free download, mp3 128
            "url": "https://relapsealumni.bandcamp.com/track/hail-to-fire",
            "md5": "fec12ff55e804bb7f7ebeb77a800c8b7",
            "info_dict": {
                "id": "2584466013",
                "ext": "mp3",
                "title": "Mastodon - Hail to Fire",
                "thumbnail": r"re:^https?://.*\.jpg$",
                "uploader": "Mastodon",
                "timestamp": 1322005399,
                "upload_date": "20111122",
                "release_date": "20040207",
                "duration": 120.79,
                "track": "Hail to Fire",
                "track_number": 5,
                "track_id": "2584466013",
                "artist": "Mastodon",
                "album": "Call of the Mastodon",
            },
        },
    ]

    @staticmethod
    def _slice_json_object(text):
        """Return the span of *text* from its first ``{`` to its last ``}``.

        If no such span exists, *text* is returned unchanged instead of
        raising ``ValueError`` the way ``str.index`` would.
        NOTE(review): the intent is that the JSON parser then reports the
        malformed payload, so that ``fatal=False`` on the download call can
        take effect - confirm against ``_download_json``.
        """
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            return text
        return text[start:end + 1]

    def _track_stream_formats(self, files):
        """Build format dicts from the ``file`` mapping of the track info.

        *files* maps a format id to a URL; anything that is not a dict
        yields an empty list.  Entries whose URL is rejected by
        ``url_or_none`` are skipped.
        """
        formats = []
        if not isinstance(files, dict):
            return formats
        for format_id, format_url in files.items():
            if not url_or_none(format_url):
                continue
            # Format ids are split on the first hyphen into an extension and
            # a bitrate part; partition() keeps ids without a hyphen from
            # raising ValueError on unpacking.
            ext, _, abr_str = format_id.partition("-")
            formats.append(
                {
                    "format_id": format_id,
                    "url": self._proto_relative_url(format_url, "http:"),
                    "ext": ext,
                    "vcodec": "none",
                    "acodec": ext,
                    "abr": int_or_none(abr_str or None),
                }
            )
        return formats

    def _free_download_formats(self, blob, downloads, track_id):
        """Build format dicts for the entries of the free download page.

        *blob* is the parsed ``data-blob`` of the download page and
        *downloads* the ``downloads`` dict of its first item.  One extra
        JSON request is made per entry to obtain its ``retry_url``; entries
        for which that fails are skipped.
        """
        # Map each download format name to its file extension (no dot).
        ext_by_name = {}
        format_specs = blob.get("download_formats")
        if isinstance(format_specs, list):
            for spec in format_specs:
                if not isinstance(spec, dict):
                    continue
                name, ext = spec.get("name"), spec.get("file_extension")
                if all(isinstance(x, compat_str) for x in (name, ext)):
                    ext_by_name[name] = ext.strip(".")

        formats = []
        for format_id, item in downloads.items():
            if not isinstance(item, dict):
                continue
            format_url = item.get("url")
            if not format_url or not isinstance(format_url, compat_str):
                continue
            # Stat URL generation algorithm is reverse engineered from
            # download_*_bundle_*.js
            stat_url = update_url_query(
                format_url.replace("/download/", "/statdownload/"),
                {".rand": int(time.time() * 1000 * random.random())},
            )
            format_id = item.get("encoding_name") or format_id
            stat = self._download_json(
                stat_url,
                track_id,
                "Downloading %s JSON" % format_id,
                # Only the outermost {...} of the response is parsed.
                transform_source=self._slice_json_object,
                fatal=False,
            )
            if not isinstance(stat, dict):
                continue
            retry_url = url_or_none(stat.get("retry_url"))
            if not retry_url:
                continue
            formats.append(
                {
                    "url": self._proto_relative_url(retry_url, "http:"),
                    "ext": ext_by_name.get(format_id),
                    "format_id": format_id,
                    "format_note": item.get("description"),
                    "filesize": parse_filesize(item.get("size_mb")),
                    "vcodec": "none",
                }
            )
        return formats

    def _real_extract(self, url):
        """Download the track page at *url* and return its info dict."""
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group("title")
        webpage = self._download_webpage(url, title)
        thumbnail = self._html_search_meta("og:image", webpage, default=None)

        track_id = None
        track = None
        track_number = None
        duration = None

        formats = []
        # The track info is embedded with its double quotes HTML-escaped;
        # a missing block falls back to an empty JSON object.
        trackinfo_block = self._search_regex(
            r"trackinfo&quot;:\[\s*({.+?})\s*\],&quot;",
            webpage,
            "track info",
            default="{}",
        )
        track_info = self._parse_json(
            trackinfo_block.replace("&quot;", '"'), title
        )
        if track_info:
            formats.extend(self._track_stream_formats(track_info.get("file")))
            track = track_info.get("title")
            track_id = str_or_none(track_info.get("track_id") or track_info.get("id"))
            track_number = int_or_none(track_info.get("track_num"))
            duration = float_or_none(track_info.get("duration"))

        def extract(key):
            # Look up a string value for *key* in the HTML-escaped JSON.
            return self._search_regex(
                r",&quot;%s&quot;:(&quot;)(?P<value>(?:(?!&quot;).)+)&quot;" % key,
                webpage,
                key,
                default=None,
                group="value",
            )

        artist = extract("artist")
        album = extract("album_title")
        timestamp = unified_timestamp(
            extract("publish_date") or extract("album_publish_date")
        )
        release_date = unified_strdate(extract("album_release_date"))

        download_link = self._search_regex(
            r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
            webpage,
            "download link",
            default=None,
            group="url",
        )
        if download_link:
            track_id = self._search_regex(
                r"(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$",
                webpage,
                "track id",
            )

            download_webpage = self._download_webpage(
                download_link, track_id, "Downloading free downloads page"
            )

            blob = self._parse_json(
                self._search_regex(
                    r'data-blob=(["\'])(?P<blob>{.+?})\1',
                    download_webpage,
                    "blob",
                    group="blob",
                ),
                track_id,
                transform_source=unescapeHTML,
            )

            info = try_get(
                blob,
                (lambda x: x["digital_items"][0], lambda x: x["download_items"][0]),
                dict,
            )
            downloads = info.get("downloads") if info else None
            if isinstance(downloads, dict):
                # The download page only fills in what the track page lacked.
                track = track or info.get("title")
                artist = artist or info.get("artist")
                thumbnail = thumbnail or info.get("thumb_url")
                formats.extend(
                    self._free_download_formats(blob, downloads, track_id)
                )

        self._sort_formats(formats)

        title = "%s - %s" % (artist, track) if artist else track

        if not duration:
            duration = float_or_none(
                self._html_search_meta("duration", webpage, default=None)
            )

        return {
            "id": track_id,
            "title": title,
            "thumbnail": thumbnail,
            "uploader": artist,
            "timestamp": timestamp,
            "release_date": release_date,
            "duration": duration,
            "track": track,
            "track_number": track_number,
            "track_id": track_id,
            "artist": artist,
            "album": album,
            "formats": formats,
        }


class BandcampAlbumIE(InfoExtractor):
    """Extractor for album pages (``bandcamp.com/album/<slug>``).

    Returns a playlist whose entries are handed to ``BandcampIE``.
    """

    IE_NAME = "Bandcamp:album"
    _VALID_URL = r"https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<album_id>[^/?#&]+)"

    _TESTS = [
        {
            "url": "http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1",
            "playlist": [
                {
                    "md5": "39bc1eded3476e927c724321ddf116cf",
                    "info_dict": {"id": "1353101989", "ext": "mp3", "title": "Intro",},
                },
                {
                    "md5": "1a2c32e2691474643e912cc6cd4bffaa",
                    "info_dict": {
                        "id": "38097443",
                        "ext": "mp3",
                        "title": "Kero One - Keep It Alive (Blazo remix)",
                    },
                },
            ],
            "info_dict": {
                "title": "Jazz Format Mixtape vol.1",
                "id": "jazz-format-mixtape-vol-1",
                "uploader_id": "blazo",
            },
            "params": {"playlistend": 2},
            "skip": "Bandcamp imposes download limits.",
        },
        {
            "url": (
                "http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave"
            ),
            "info_dict": {
                "title": "Hierophany of the Open Grave",
                "uploader_id": "nightbringer",
                "id": "hierophany-of-the-open-grave",
            },
            "playlist_mincount": 9,
        },
        {
            # with escaped quote in title
            "url": "https://jstrecords.bandcamp.com/album/entropy-ep",
            "info_dict": {
                "title": '"Entropy" EP',
                "uploader_id": "jstrecords",
                "id": "entropy-ep",
            },
            "playlist_mincount": 3,
        },
        {
            # not all tracks have songs
            "url": "https://insulters.bandcamp.com/album/we-are-the-plague",
            "info_dict": {
                "id": "we-are-the-plague",
                "title": "WE ARE THE PLAGUE",
                "uploader_id": "insulters",
            },
            "playlist_count": 2,
        },
    ]

    @classmethod
    def suitable(cls, url):
        """Decline URLs that the weekly or track extractors accept."""
        if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url):
            return False
        return super(BandcampAlbumIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist of the album's tracks that carry a duration."""
        match = re.match(self._VALID_URL, url)
        uploader_id = match.group("subdomain")
        playlist_id = match.group("album_id") or uploader_id
        page = self._download_webpage(url, playlist_id)

        # Each hit is (element markup, href of the track link inside it).
        track_elements = re.findall(
            r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>',
            page,
        )
        if not track_elements:
            raise ExtractorError("The page doesn't contain any tracks")

        entries = []
        for elem_content, t_path in track_elements:
            # Only tracks with duration info have songs
            if not self._html_search_meta("duration", elem_content, default=None):
                continue
            track_title = self._search_regex(
                r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
                elem_content,
                "track title",
                fatal=False,
            )
            entries.append(
                self.url_result(
                    compat_urlparse.urljoin(url, t_path),
                    ie=BandcampIE.ie_key(),
                    video_title=track_title,
                )
            )

        album_title = self._html_search_regex(
            r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', page, "title", fatal=False
        )
        if album_title:
            # Undo the backslash-escaping of double quotes in the title.
            album_title = album_title.replace(r"\"", '"')

        return {
            "_type": "playlist",
            "uploader_id": uploader_id,
            "id": playlist_id,
            "title": album_title,
            "entries": entries,
        }


class BandcampWeeklyIE(InfoExtractor):
    """Extractor for Bandcamp Weekly shows (``bandcamp.com/?show=<id>``)."""

    IE_NAME = "Bandcamp:weekly"
    _VALID_URL = r"https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)"
    _TESTS = [
        {
            "url": "https://bandcamp.com/?show=224",
            "md5": "b00df799c733cf7e0c567ed187dea0fd",
            "info_dict": {
                "id": "224",
                "ext": "opus",
                "title": "BC Weekly April 4th 2017 - Magic Moments",
                "description": "md5:5d48150916e8e02d030623a48512c874",
                "duration": 5829.77,
                "release_date": "20170404",
                "series": "Bandcamp Weekly",
                "episode": "Magic Moments",
                "episode_number": 208,
                "episode_id": "224",
            },
        },
        {"url": "https://bandcamp.com/?blah/blah@&show=228", "only_matching": True},
    ]

    def _real_extract(self, url):
        """Return the info dict for the show described by the page's blob."""
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)

        raw_blob = self._search_regex(
            r'data-blob=(["\'])(?P<blob>{.+?})\1', page, "blob", group="blob"
        )
        blob = self._parse_json(raw_blob, video_id, transform_source=unescapeHTML)
        show = blob["bcw_show"]

        # This is desired because any invalid show id redirects to `bandcamp.com`
        # which happens to expose the latest Bandcamp Weekly episode.
        show_id = int_or_none(show.get("show_id")) or int_or_none(video_id)

        formats = []
        for format_id, format_url in show["audio_stream"].items():
            if not url_or_none(format_url):
                continue
            # The first known extension contained in the format id, if any.
            matched_ext = next(
                (candidate for candidate in KNOWN_EXTENSIONS if candidate in format_id),
                None,
            )
            formats.append(
                {
                    "format_id": format_id,
                    "url": format_url,
                    "ext": matched_ext,
                    "vcodec": "none",
                }
            )
        self._sort_formats(formats)

        subtitle = show.get("subtitle")
        title = show.get("audio_title") or "Bandcamp Weekly"
        if subtitle:
            title = title + " - %s" % subtitle

        # The episode number comes from the first entry of the show
        # sequence whose id equals the show id.
        episode_number = None
        sequence = blob.get("bcw_seq")
        if isinstance(sequence, list):
            for entry in sequence:
                if not isinstance(entry, dict):
                    continue
                if int_or_none(entry.get("id")) == show_id:
                    episode_number = int_or_none(entry.get("episode_number"))
                    break

        return {
            "id": video_id,
            "title": title,
            "description": show.get("desc") or show.get("short_desc"),
            "duration": float_or_none(show.get("audio_duration")),
            "is_live": False,
            "release_date": unified_strdate(show.get("published_date")),
            "series": "Bandcamp Weekly",
            "episode": subtitle,
            "episode_number": episode_number,
            "episode_id": compat_str(video_id),
            "formats": formats,
        }


class BandcampUserIE(InfoExtractor):
    """Extractor for a user's discography page (``<user>.bandcamp.com``).

    Returns a playlist whose entries are handed to the album or track
    extractor depending on each link's path.
    """

    IE_NAME = "Bandcamp:user"
    _VALID_URL = r"https?://(?:(?P<id>[^.]+)\.)?bandcamp\.com"

    _TESTS = [
        {
            "url": "https://adrianvonziegler.bandcamp.com",
            "info_dict": {
                "id": "adrianvonziegler",
                "title": "Discography of adrianvonziegler",
            },
            "playlist_mincount": 23,
        },
        {
            "url": "http://dotscale.bandcamp.com",
            "info_dict": {"id": "dotscale", "title": "Discography of dotscale",},
            "playlist_count": 1,
        },
        {
            "url": "https://nightcallofficial.bandcamp.com",
            "info_dict": {
                "id": "nightcallofficial",
                "title": "Discography of nightcallofficial",
            },
            "playlist_count": 4,
        },
    ]

    @classmethod
    def suitable(cls, url):
        """Decline URLs accepted by the album, track or weekly extractors."""
        claimed_elsewhere = (
            BandcampAlbumIE.suitable(url)
            or BandcampIE.suitable(url)
            or BandcampWeeklyIE.suitable(url)
        )
        if claimed_elsewhere:
            return False
        return super(BandcampUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist of the albums and tracks linked from the page."""
        uploader = self._match_id(url)
        page = self._download_webpage(url, uploader)
        entries = []

        # Bandcamp User type 1 page: list items carrying an item id and a
        # two-segment link path.
        listed_items = re.findall(
            r'<li data-item-id="([^"]+)[^>]+>\s*<a href="(/[^/]+/[^/"]+)">',
            page,
            re.MULTILINE,
        )

        if listed_items:
            for element_id, element_url in listed_items:
                # The path is "/<kind>/<slug>", so parts 1 and 2 of the
                # split are the kind and the slug.
                path_parts = element_url.split("/")
                is_album = path_parts[1] == "album"
                ie = BandcampAlbumIE.ie_key() if is_album else BandcampIE.ie_key()
                entries.append(
                    self.url_result(
                        compat_urlparse.urljoin(url, element_url),
                        ie=ie,
                        video_id=element_id,
                        video_title=path_parts[2],
                    )
                )
        else:
            # Bandcamp user type 2 page
            title_links = re.findall(
                r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', page
            )
            for element in title_links:
                is_album = bool(re.match("/album/+", element))
                ie = BandcampAlbumIE.ie_key() if is_album else BandcampIE.ie_key()
                entries.append(
                    self.url_result(
                        compat_urlparse.urljoin(url, element),
                        ie=ie,
                        video_title=element,
                    )
                )

        return {
            "_type": "playlist",
            "id": uploader,
            "title": "Discography of %s" % uploader,
            "entries": entries,
        }