Merge 1326a5aa38 into d65d89183f

2020-09-25 17:25:35 +08:00 · 2020-09-25 17:25:35 +08:00 · c855d00d37
parent d65d89183f 1326a5aa38
commit c855d00d37
3 changed files with 99 additions and 24 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1403,8 +1403,49 @@ Line 1
        '''

        self.assertEqual(get_element_by_class('foo', html), 'nice')
+        self.assertEqual(get_element_by_class('foo', html, include_tag=True), '<span class="foo bar">nice</span>')
        self.assertEqual(get_element_by_class('no-such-class', html), None)

+        html = '''
+            <span class="foo bar"/>
+        '''
+
+        self.assertEqual(get_element_by_class('foo', html), None)
+        self.assertEqual(get_element_by_class('foo', html, include_tag=True), '<span class="foo bar"/>')
+
+        html = '''
+            <span class="foo bar"></span>
+        '''
+
+        self.assertEqual(get_element_by_class('foo', html), '')
+        self.assertEqual(get_element_by_class('foo', html, include_tag=True), '<span class="foo bar"></span>')
+
+        html = '''
+            <span class="content-section__wrap bar">nice</span>
+        '''
+
+        self.assertEqual(get_element_by_class('content-section__wrap', html), 'nice')
+        self.assertEqual(get_element_by_class('content-section__wrap', html, include_tag=True), '<span class="content-section__wrap bar">nice</span>')
+
+        html = '''
+            <span class="-test-hyphen">nice</span>
+        '''
+
+        self.assertEqual(get_element_by_class('-test-hyphen', html), 'nice')
+
+        html = '''
+            <span class="_test_underscore">nice</span>
+        '''
+
+        self.assertEqual(get_element_by_class('_test_underscore', html), 'nice')
+
+        html = '''
+            <span class="ä-umlaut ↑-unicode">nice</span>
+        '''
+
+        self.assertEqual(get_element_by_class('ä-umlaut', html), 'nice')
+        self.assertEqual(get_element_by_class('↑-unicode', html), 'nice')
+
    def test_get_element_by_attribute(self):
        html = '''
            <span class="foo bar">nice</span>
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -2,8 +2,10 @@ from __future__ import unicode_literals

 from .common import InfoExtractor
 from ..utils import (
-    unified_strdate,
    clean_html,
+    extract_attributes,
+    get_element_by_class,
+    unified_strdate,
 )


@@ -40,19 +42,23 @@ class ArchiveOrgIE(InfoExtractor):
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://archive.org/embed/' + video_id, video_id)
-        jwplayer_playlist = self._parse_json(self._search_regex(
-            r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)",
-            webpage, 'jwplayer playlist'), video_id)
+        input_element_with_playlist = get_element_by_class(
+            'js-play8-playlist', webpage, include_tag=True)
+        jwplayer_playlist = self._parse_json(extract_attributes(
+            input_element_with_playlist)['value'], video_id)
        info = self._parse_jwplayer_data(
            {'playlist': jwplayer_playlist}, video_id, base_url=url)

        def get_optional(metadata, field):
            return metadata.get(field, [None])[0]

-        metadata = self._download_json(
+        json_metadata = self._download_json(
            'http://archive.org/details/' + video_id, video_id, query={
                'output': 'json',
-            })['metadata']
+            }, fatal=False)
+        metadata = (json_metadata.get('metadata', {})
+                    if isinstance(json_metadata, dict)
+                    else {})
        info.update({
            'title': get_optional(metadata, 'title') or info.get('title'),
            'description': clean_html(get_optional(metadata, 'description')),
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1934,32 +1934,55 @@ def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    return n.attrib[key]


-def get_element_by_id(id, html):
-    """Return the content of the tag with the specified ID in the passed HTML document"""
-    return get_element_by_attribute('id', id, html)
+def get_element_by_id(id, html, include_tag=False):
+    """
+    Return the content of the tag with the specified ID in the passed HTML document.
+
+    The whole element, including its tag, is returned when `include_tag` is `True`.
+    """
+    return get_element_by_attribute('id', id, html, include_tag=include_tag)


-def get_element_by_class(class_name, html):
-    """Return the content of the first tag with the specified class in the passed HTML document"""
-    retval = get_elements_by_class(class_name, html)
+def get_element_by_class(class_name, html, include_tag=False):
+    """
+    Return the content of the first tag with the specified class in the passed HTML document.
+
+    The whole element, including its tag, is returned when `include_tag` is `True`.
+    """
+    retval = get_elements_by_class(class_name, html, include_tag)
    return retval[0] if retval else None


-def get_element_by_attribute(attribute, value, html, escape_value=True):
-    retval = get_elements_by_attribute(attribute, value, html, escape_value)
+def get_element_by_attribute(attribute, value, html, escape_value=True,
+                             include_tag=False):
+    """
+    Return the content of the first tag with the specified attribute in the passed HTML document.
+
+    The whole element, including its tag, is returned when `include_tag` is `True`.
+    """
+    retval = get_elements_by_attribute(attribute, value, html, escape_value,
+                                       include_tag)
    return retval[0] if retval else None


-def get_elements_by_class(class_name, html):
-    """Return the content of all tags with the specified class in the passed HTML document as a list"""
+def get_elements_by_class(class_name, html, include_tag=False):
+    """
+    Return the content of all tags with the specified class in the passed HTML document as a list.
+
+    The whole elements, including their tags, are returned when `include_tag` is `True`.
+    """
    return get_elements_by_attribute(
-        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
-        html, escape_value=False)
+        'class', r'[^\'"]*(?<![\w-])%s(?![\w-])[^\'"]*' % re.escape(class_name),
+        html, escape_value=False, include_tag=include_tag)


-def get_elements_by_attribute(attribute, value, html, escape_value=True):
-    """Return the content of the tag with the specified attribute in the passed HTML document"""
+def get_elements_by_attribute(attribute, value, html, escape_value=True,
+                              include_tag=False):
+    """
+    Return the content of all tags with the specified attribute in the passed HTML document.

    The whole elements, including their tags, are returned when `include_tag` is `True`.
+    """
    value = re.escape(value) if escape_value else value

    retlist = []
@@ -1968,11 +1991,13 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True):
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
-        \s*>
+        \s*(?:\/\s*>|>
        (?P<content>.*?)
-        </\1>
+        </\1>)
    ''' % (re.escape(attribute), value), html):
-        res = m.group('content')
+        res = m.group(0) if include_tag else m.group('content')
+        if res is None:
+            continue

        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]
@@ -1989,7 +2014,10 @@ class HTMLAttributeParser(compat_HTMLParser):
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
-        self.attrs = dict(attrs)
+        # Make sure we're looking at the first attributes. Later ones are from
+        # embedded elements.
+        if not self.attrs:
+            self.attrs = dict(attrs)


 def extract_attributes(html_element):