Merge 1326a5aa38
into d65d89183f
This commit is contained in:
commit
c855d00d37
|
@ -1403,8 +1403,49 @@ Line 1
|
|||
'''
|
||||
|
||||
self.assertEqual(get_element_by_class('foo', html), 'nice')
|
||||
self.assertEqual(get_element_by_class('foo', html, include_tag=True), '<span class="foo bar">nice</span>')
|
||||
self.assertEqual(get_element_by_class('no-such-class', html), None)
|
||||
|
||||
html = '''
|
||||
<span class="foo bar"/>
|
||||
'''
|
||||
|
||||
self.assertEqual(get_element_by_class('foo', html), None)
|
||||
self.assertEqual(get_element_by_class('foo', html, include_tag=True), '<span class="foo bar"/>')
|
||||
|
||||
html = '''
|
||||
<span class="foo bar"></span>
|
||||
'''
|
||||
|
||||
self.assertEqual(get_element_by_class('foo', html), '')
|
||||
self.assertEqual(get_element_by_class('foo', html, include_tag=True), '<span class="foo bar"></span>')
|
||||
|
||||
html = '''
|
||||
<span class="content-section__wrap bar">nice</span>
|
||||
'''
|
||||
|
||||
self.assertEqual(get_element_by_class('content-section__wrap', html), 'nice')
|
||||
self.assertEqual(get_element_by_class('content-section__wrap', html, include_tag=True), '<span class="content-section__wrap bar">nice</span>')
|
||||
|
||||
html = '''
|
||||
<span class="-test-hyphen">nice</span>
|
||||
'''
|
||||
|
||||
self.assertEqual(get_element_by_class('-test-hyphen', html), 'nice')
|
||||
|
||||
html = '''
|
||||
<span class="_test_underscore">nice</span>
|
||||
'''
|
||||
|
||||
self.assertEqual(get_element_by_class('_test_underscore', html), 'nice')
|
||||
|
||||
html = '''
|
||||
<span class="ä-umlaut ↑-unicode">nice</span>
|
||||
'''
|
||||
|
||||
self.assertEqual(get_element_by_class('ä-umlaut', html), 'nice')
|
||||
self.assertEqual(get_element_by_class('↑-unicode', html), 'nice')
|
||||
|
||||
def test_get_element_by_attribute(self):
|
||||
html = '''
|
||||
<span class="foo bar">nice</span>
|
||||
|
|
|
@ -2,8 +2,10 @@ from __future__ import unicode_literals
|
|||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
unified_strdate,
|
||||
clean_html,
|
||||
extract_attributes,
|
||||
get_element_by_class,
|
||||
unified_strdate,
|
||||
)
|
||||
|
||||
|
||||
|
@ -40,19 +42,23 @@ class ArchiveOrgIE(InfoExtractor):
|
|||
video_id = self._match_id(url)
|
||||
webpage = self._download_webpage(
|
||||
'http://archive.org/embed/' + video_id, video_id)
|
||||
jwplayer_playlist = self._parse_json(self._search_regex(
|
||||
r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)",
|
||||
webpage, 'jwplayer playlist'), video_id)
|
||||
input_element_with_playlist = get_element_by_class(
|
||||
'js-play8-playlist', webpage, include_tag=True)
|
||||
jwplayer_playlist = self._parse_json(extract_attributes(
|
||||
input_element_with_playlist)['value'], video_id)
|
||||
info = self._parse_jwplayer_data(
|
||||
{'playlist': jwplayer_playlist}, video_id, base_url=url)
|
||||
|
||||
def get_optional(metadata, field):
|
||||
return metadata.get(field, [None])[0]
|
||||
|
||||
metadata = self._download_json(
|
||||
json_metadata = self._download_json(
|
||||
'http://archive.org/details/' + video_id, video_id, query={
|
||||
'output': 'json',
|
||||
})['metadata']
|
||||
}, fatal=False)
|
||||
metadata = (json_metadata.get('metadata', {})
|
||||
if isinstance(json_metadata, dict)
|
||||
else {})
|
||||
info.update({
|
||||
'title': get_optional(metadata, 'title') or info.get('title'),
|
||||
'description': clean_html(get_optional(metadata, 'description')),
|
||||
|
|
|
@ -1934,32 +1934,55 @@ def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
|
|||
return n.attrib[key]
|
||||
|
||||
|
||||
def get_element_by_id(id, html):
|
||||
"""Return the content of the tag with the specified ID in the passed HTML document"""
|
||||
return get_element_by_attribute('id', id, html)
|
||||
def get_element_by_id(id, html, include_tag=False):
|
||||
"""
|
||||
Return the content of the tag with the specified ID in the passed HTML document.
|
||||
|
||||
The whole element, including its tag, is returned when `include_tag` is `True`.
|
||||
"""
|
||||
return get_element_by_attribute('id', id, html, include_tag)
|
||||
|
||||
|
||||
def get_element_by_class(class_name, html):
|
||||
"""Return the content of the first tag with the specified class in the passed HTML document"""
|
||||
retval = get_elements_by_class(class_name, html)
|
||||
def get_element_by_class(class_name, html, include_tag=False):
|
||||
"""
|
||||
Return the content of the first tag with the specified class in the passed HTML document.
|
||||
|
||||
The whole element, including its tag, is returned when `include_tag` is `True`.
|
||||
"""
|
||||
retval = get_elements_by_class(class_name, html, include_tag)
|
||||
return retval[0] if retval else None
|
||||
|
||||
|
||||
def get_element_by_attribute(attribute, value, html, escape_value=True):
|
||||
retval = get_elements_by_attribute(attribute, value, html, escape_value)
|
||||
def get_element_by_attribute(attribute, value, html, escape_value=True,
|
||||
include_tag=False):
|
||||
"""
|
||||
Return the content of the first tag with the specified attribute in the passed HTML document.
|
||||
|
||||
The whole element, including its tag, is returned when `include_tag` is `True`.
|
||||
"""
|
||||
retval = get_elements_by_attribute(attribute, value, html, escape_value,
|
||||
include_tag)
|
||||
return retval[0] if retval else None
|
||||
|
||||
|
||||
def get_elements_by_class(class_name, html):
|
||||
"""Return the content of all tags with the specified class in the passed HTML document as a list"""
|
||||
def get_elements_by_class(class_name, html, include_tag=False):
|
||||
"""
|
||||
Return the content of all tags with the specified class in the passed HTML document as a list.
|
||||
|
||||
The whole elements, including their tags, are returned when `include_tag` is `True`.
|
||||
"""
|
||||
return get_elements_by_attribute(
|
||||
'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
|
||||
html, escape_value=False)
|
||||
'class', r'[^\'"]*(?<![\w-])%s(?![\w-])[^\'"]*' % re.escape(class_name),
|
||||
html, escape_value=False, include_tag=include_tag)
|
||||
|
||||
|
||||
def get_elements_by_attribute(attribute, value, html, escape_value=True):
|
||||
"""Return the content of the tag with the specified attribute in the passed HTML document"""
|
||||
def get_elements_by_attribute(attribute, value, html, escape_value=True,
|
||||
include_tag=False):
|
||||
"""
|
||||
Return the content of all tags with the specified attribute in the passed HTML document.
|
||||
|
||||
The whole elements, including their tags, are returned when `include_tag` is `True`.
|
||||
"""
|
||||
value = re.escape(value) if escape_value else value
|
||||
|
||||
retlist = []
|
||||
|
@ -1968,11 +1991,13 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True):
|
|||
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
|
||||
\s+%s=['"]?%s['"]?
|
||||
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
|
||||
\s*>
|
||||
\s*(?:\/\s*>|>
|
||||
(?P<content>.*?)
|
||||
</\1>
|
||||
</\1>)
|
||||
''' % (re.escape(attribute), value), html):
|
||||
res = m.group('content')
|
||||
res = m.group(0) if include_tag else m.group('content')
|
||||
if res is None:
|
||||
continue
|
||||
|
||||
if res.startswith('"') or res.startswith("'"):
|
||||
res = res[1:-1]
|
||||
|
@ -1989,7 +2014,10 @@ class HTMLAttributeParser(compat_HTMLParser):
|
|||
compat_HTMLParser.__init__(self)
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
self.attrs = dict(attrs)
|
||||
# Make sure we're looking at the first attributes. Later ones are from
|
||||
# embedded elements.
|
||||
if not self.attrs:
|
||||
self.attrs = dict(attrs)
|
||||
|
||||
|
||||
def extract_attributes(html_element):
|
||||
|
|
Loading…
Reference in New Issue