This commit is contained in:
TinyToweringTree 2020-09-25 17:25:35 +08:00 committed by GitHub
commit c855d00d37
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 99 additions and 24 deletions

View File

@ -1403,8 +1403,49 @@ Line 1
'''
self.assertEqual(get_element_by_class('foo', html), 'nice')
self.assertEqual(get_element_by_class('foo', html, include_tag=True), '<span class="foo bar">nice</span>')
self.assertEqual(get_element_by_class('no-such-class', html), None)
html = '''
<span class="foo bar"/>
'''
self.assertEqual(get_element_by_class('foo', html), None)
self.assertEqual(get_element_by_class('foo', html, include_tag=True), '<span class="foo bar"/>')
html = '''
<span class="foo bar"></span>
'''
self.assertEqual(get_element_by_class('foo', html), '')
self.assertEqual(get_element_by_class('foo', html, include_tag=True), '<span class="foo bar"></span>')
html = '''
<span class="content-section__wrap bar">nice</span>
'''
self.assertEqual(get_element_by_class('content-section__wrap', html), 'nice')
self.assertEqual(get_element_by_class('content-section__wrap', html, include_tag=True), '<span class="content-section__wrap bar">nice</span>')
html = '''
<span class="-test-hyphen">nice</span>
'''
self.assertEqual(get_element_by_class('-test-hyphen', html), 'nice')
html = '''
<span class="_test_underscore">nice</span>
'''
self.assertEqual(get_element_by_class('_test_underscore', html), 'nice')
html = '''
<span class="ä-umlaut ↑-unicode">nice</span>
'''
self.assertEqual(get_element_by_class('ä-umlaut', html), 'nice')
self.assertEqual(get_element_by_class('↑-unicode', html), 'nice')
def test_get_element_by_attribute(self):
html = '''
<span class="foo bar">nice</span>

View File

@ -2,8 +2,10 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
unified_strdate,
clean_html,
extract_attributes,
get_element_by_class,
unified_strdate,
)
@ -40,19 +42,23 @@ class ArchiveOrgIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(
'http://archive.org/embed/' + video_id, video_id)
jwplayer_playlist = self._parse_json(self._search_regex(
r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)",
webpage, 'jwplayer playlist'), video_id)
input_element_with_playlist = get_element_by_class(
'js-play8-playlist', webpage, include_tag=True)
jwplayer_playlist = self._parse_json(extract_attributes(
input_element_with_playlist)['value'], video_id)
info = self._parse_jwplayer_data(
{'playlist': jwplayer_playlist}, video_id, base_url=url)
def get_optional(metadata, field):
return metadata.get(field, [None])[0]
metadata = self._download_json(
json_metadata = self._download_json(
'http://archive.org/details/' + video_id, video_id, query={
'output': 'json',
})['metadata']
}, fatal=False)
metadata = (json_metadata.get('metadata', {})
if isinstance(json_metadata, dict)
else {})
info.update({
'title': get_optional(metadata, 'title') or info.get('title'),
'description': clean_html(get_optional(metadata, 'description')),

View File

@ -1934,32 +1934,55 @@ def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
return n.attrib[key]
def get_element_by_id(id, html, include_tag=False):
    """
    Return the content of the tag with the specified ID in the passed HTML document.

    The whole element, including its tag, is returned when `include_tag` is `True`.
    `None` is returned when no element is found.
    """
    # `include_tag` has to be passed by keyword: the fourth positional
    # parameter of get_element_by_attribute() is `escape_value`, so passing it
    # positionally would switch off escaping of the ID and drop the flag.
    return get_element_by_attribute('id', id, html, include_tag=include_tag)
def get_element_by_class(class_name, html, include_tag=False):
    """
    Return the content of the first tag with the specified class in the passed HTML document.

    When `include_tag` is `True`, the whole element (its tag included) is
    returned instead of only its content.
    `None` is returned when no element is found.
    """
    matches = get_elements_by_class(class_name, html, include_tag)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True,
                             include_tag=False):
    """
    Return the content of the first tag with the specified attribute in the passed HTML document.

    `escape_value` and `include_tag` are handed on unchanged to
    get_elements_by_attribute(). When `include_tag` is `True`, the whole
    element (its tag included) is returned instead of only its content.
    `None` is returned when no element is found.
    """
    matches = get_elements_by_attribute(
        attribute, value, html,
        escape_value=escape_value, include_tag=include_tag)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html, include_tag=False):
    """
    Return the content of all tags with the specified class in the passed HTML document as a list.

    The whole elements, including their tags, are returned when `include_tag` is `True`.
    """
    # A class attribute may list several names. The lookarounds only accept
    # `class_name` when it is neither preceded nor followed by a word
    # character or a hyphen, i.e. when it is a complete name of its own.
    class_value_re = r'[^\'"]*(?<![\w-])%s(?![\w-])[^\'"]*' % re.escape(class_name)
    # The value is already a regular expression, so it must not be escaped again.
    return get_elements_by_attribute(
        'class', class_value_re, html,
        escape_value=False, include_tag=include_tag)
def get_elements_by_attribute(attribute, value, html, escape_value=True):
"""Return the content of the tag with the specified attribute in the passed HTML document"""
def get_elements_by_attribute(attribute, value, html, escape_value=True,
include_tag=False):
"""
Return the content of all tags with the specified attribute in the passed HTML document.
The whole elements, including their tags, are returned when `include_flag` is `True`.
"""
value = re.escape(value) if escape_value else value
retlist = []
@ -1968,11 +1991,13 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True):
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
\s+%s=['"]?%s['"]?
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
\s*>
\s*(?:\/\s*>|>
(?P<content>.*?)
</\1>
</\1>)
''' % (re.escape(attribute), value), html):
res = m.group('content')
res = m.group(0) if include_tag else m.group('content')
if res is None:
continue
if res.startswith('"') or res.startswith("'"):
res = res[1:-1]
@ -1989,7 +2014,10 @@ class HTMLAttributeParser(compat_HTMLParser):
compat_HTMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
    """
    Store the attributes of the first start tag seen by this parser.

    `attrs` is the sequence of (name, value) pairs supplied by the HTML
    parser; it is stored as a dict in `self.attrs`. `tag` is not used.
    Start tags after the first one are ignored.
    """
    # Make sure we're looking at the first attributes. Later ones are from
    # embedded elements. A dedicated flag is used instead of the truthiness
    # of `self.attrs`, so that a first tag without any attributes is not
    # overwritten by the attributes of an element nested inside it.
    if getattr(self, '_first_tag_seen', False):
        return
    self._first_tag_seen = True
    self.attrs = dict(attrs)
def extract_attributes(html_element):