From 1ac941fd4ca253836dda886185541ec45c5a82b9 Mon Sep 17 00:00:00 2001 From: Hanif Birgani Date: Sun, 15 Mar 2020 15:38:43 +0330 Subject: [PATCH 1/4] Fix Aparat.com extractor Fix issues: #23348 #24252 #22285 --- youtube_dl/extractor/aparat.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 883dcee7a..6c83c8d28 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -48,14 +48,12 @@ class AparatIE(InfoExtractor): options = self._parse_json( self._search_regex( - r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1\s*\)', + r'options\s*=\s*(?P<value>.*}}})\s*;', webpage, 'options', group='value'), video_id) - player = options['plugins']['sabaPlayerPlugin'] - formats = [] - for sources in player['multiSRC']: + for sources in options['multiSRC']: for item in sources: if not isinstance(item, dict): continue @@ -85,11 +83,11 @@ class AparatIE(InfoExtractor): info = self._search_json_ld(webpage, video_id, default={}) if not info.get('title'): - info['title'] = player['title'] + info['title'] = options['title'] return merge_dicts(info, { 'id': video_id, 'thumbnail': url_or_none(options.get('poster')), - 'duration': int_or_none(player.get('duration')), + 'duration': int_or_none(options.get('duration')), 'formats': formats, }) From 8b348cf9ff8a1c4f3ef1ef4e31d0cf38404eea41 Mon Sep 17 00:00:00 2001 From: Hanif Birgani Date: Mon, 16 Mar 2020 11:33:18 +0330 Subject: [PATCH 2/4] Remove info extraction from json_ld Aparat uses an invalid ld+json format on some pages, which causes JSON parser errors for some URLs, so it is better to get the title and description from the Open Graph (og) meta tags instead of ld+json --- youtube_dl/extractor/aparat.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 6c83c8d28..dbaae7f71 100644 --- a/youtube_dl/extractor/aparat.py +++ 
b/youtube_dl/extractor/aparat.py @@ -20,11 +20,7 @@ class AparatIE(InfoExtractor): 'id': 'wP8On', 'ext': 'mp4', 'title': 'تیم گلکسی 11 - زومیت', - 'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028', - 'duration': 231, - 'timestamp': 1387394859, - 'upload_date': '20131218', - 'view_count': int, + 'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028' }, }, { # multiple formats @@ -80,14 +76,14 @@ class AparatIE(InfoExtractor): self._sort_formats( formats, field_preference=('height', 'width', 'tbr', 'format_id')) - info = self._search_json_ld(webpage, video_id, default={}) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) - if not info.get('title'): - info['title'] = options['title'] - - return merge_dicts(info, { + return { + 'title': title, + 'description': description, 'id': video_id, 'thumbnail': url_or_none(options.get('poster')), 'duration': int_or_none(options.get('duration')), 'formats': formats, - }) + } From 3d5ae9e0d3af6da4902027ce9eeb1f59eb99fa81 Mon Sep 17 00:00:00 2001 From: Hanif Birgani Date: Mon, 16 Mar 2020 13:39:51 +0330 Subject: [PATCH 3/4] Remove unused import to pass flake8 --- youtube_dl/extractor/aparat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index dbaae7f71..8b47c3e58 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, - merge_dicts, mimetype2ext, url_or_none, ) From 47c3ebde43c4832b73a2e4dcb3cd79b168c05522 Mon Sep 17 00:00:00 2001 From: Hanif Birgani Date: Sun, 22 Mar 2020 00:07:23 +0430 Subject: [PATCH 4/4] Remove one time variables and make them inline --- youtube_dl/extractor/aparat.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 8b47c3e58..940caf5f7 100644 --- 
a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -75,12 +75,9 @@ class AparatIE(InfoExtractor): self._sort_formats( formats, field_preference=('height', 'width', 'tbr', 'format_id')) - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - return { - 'title': title, - 'description': description, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), 'id': video_id, 'thumbnail': url_or_none(options.get('poster')), 'duration': int_or_none(options.get('duration')),