From 3c12a027d48a2d6d1162ab515df0308237aef881 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Thu, 13 Aug 2015 23:25:47 +0600
Subject: [PATCH] [indavideo] Split in two extractors, extract all formats and fix timestamp

---
Reviewer notes (text between the "---" marker and the diffstat is not part
of the commit message):

* This patch had been flattened onto a few physical lines. Line breaks and
  Python indentation are rebuilt here; the hunk line counts are unchanged.
* Restored the `<id>` group name in both _VALID_URL patterns and the leading
  `<link[^>` of the new embed-URL regex; they had been stripped as if they
  were HTML tags. Both extractors call `self._match_id(url)`, which
  presumably reads a group named `id` -- TODO confirm against common.py.
* IndavideoEmbedIE: `if video:` was always true at that point (video['id']
  has already been read), so a missing `video_file` appended None to
  video_urls. The check now tests `video_file`.
* IndavideoEmbedIE: the regex-captured height is now passed through
  int_or_none instead of being stored as a string.
* NOT recovered: the e-mail address of the From: header and the pattern in
  the removed `_html_search_regex(r'', ...)` line were lost in extraction.
  Restore both from the original commit before applying, otherwise the
  last hunk section will not match the pre-image.
* The `index` blob ids below describe the original post-image, not this
  edited one.

 youtube_dl/extractor/__init__.py  |   5 +-
 youtube_dl/extractor/indavideo.py | 178 +++++++++++++++++++-----------
 2 files changed, 118 insertions(+), 65 deletions(-)

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 3bcfa93bb..83d21bd15 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -242,7 +242,10 @@ from .imdb import (
 )
 from .imgur import ImgurIE
 from .ina import InaIE
-from .indavideo import IndavideoIE
+from .indavideo import (
+    IndavideoIE,
+    IndavideoEmbedIE,
+)
 from .infoq import InfoQIE
 from .instagram import InstagramIE, InstagramUserIE
 from .internetvideoarchive import InternetVideoArchiveIE
diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py
index 2a2cf2bd3..b75715244 100644
--- a/youtube_dl/extractor/indavideo.py
+++ b/youtube_dl/extractor/indavideo.py
@@ -3,77 +3,127 @@ from __future__ import unicode_literals
 
 from .. import utils
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_age_limit,
+    parse_iso8601,
+)
+
+
+class IndavideoEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)'
+    _TESTS = [{
+        'url': 'http://indavideo.hu/player/video/1bdc3c6d80/',
+        'md5': 'f79b009c66194acacd40712a6778acfa',
+        'info_dict': {
+            'id': '1837039',
+            'ext': 'mp4',
+            'title': 'Cicatánc',
+            'description': '',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'cukiajanlo',
+            'uploader_id': '83729',
+            'timestamp': 1439193826,
+            'upload_date': '20150810',
+            'duration': 72,
+            'age_limit': 0,
+            'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'],
+        },
+    }, {
+        'url': 'http://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1',
+        'only_matching': True,
+    }, {
+        'url': 'http://assets.indavideo.hu/swf/player.swf?v=fe25e500&vID=1bdc3c6d80&autostart=1&hide=1&i=1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id,
+            video_id)['data']
+
+        video_id = video['id']
+        title = video['title']
+
+        video_urls = video.get('video_files', [])
+        video_file = video.get('video_file')
+        if video_file:  # skip a missing single-file URL instead of appending None
+            video_urls.append(video_file)
+        video_urls = list(set(video_urls))
+
+        video_prefix = video_urls[0].rsplit('/', 1)[0]
+
+        for flv_file in video.get('flv_files', []):
+            flv_url = '%s/%s' % (video_prefix, flv_file)
+            if flv_url not in video_urls:
+                video_urls.append(flv_url)
+
+        formats = [{
+            'url': video_url,
+            'height': int_or_none(self._search_regex(r'\.(\d{3,4})\.mp4$', video_url, 'height', default=None)),
+        } for video_url in video_urls]
+        self._sort_formats(formats)
+
+        timestamp = video.get('date')
+        if timestamp:
+            # upload date is in CEST
+            timestamp = parse_iso8601(timestamp + ' +0200', ' ')
+
+        thumbnails = [{
+            'url': self._proto_relative_url(thumbnail)
+        } for thumbnail in video.get('thumbnails', [])]
+
+        tags = [tag['title'] for tag in video.get('tags', [])]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnails': thumbnails,
+            'uploader': video.get('user_name'),
+            'uploader_id': video.get('user_id'),
+            'timestamp': timestamp,
+            'duration': int_or_none(video.get('length')),
+            'age_limit': parse_age_limit(video.get('age_limit')),
+            'tags': tags,
+            'formats': formats,
+        }
 
 
 class IndavideoIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?indavideo\.hu/video/(?P<id>.+)'
-    _TESTS = [
-        {
-            'url': 'http://indavideo.hu/video/Cicatanc',
-            'md5': 'c8a507a1c7410685f83a06eaeeaafeab',
-            'info_dict': {
-                'id': '1837039',
-                'title': 'Cicatánc',
-                'ext': 'mp4',
-                'display_id': 'Cicatanc',
-                'thumbnail': 're:^https?://.*\.jpg$',
-                'description': '',
-                'uploader': 'cukiajanlo',
-                'uploader_id': '83729',
-                'duration': 72,
-                'age_limit': 0,
-                'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom']
-            },
+    _VALID_URL = r'https?://(?:www\.)?indavideo\.hu/video/(?P<id>[^/#?]+)'
+    _TEST = {
+        'url': 'http://indavideo.hu/video/Vicces_cica_1',
+        'md5': '8c82244ba85d2a2310275b318eb51eac',
+        'info_dict': {
+            'id': '1335611',
+            'display_id': 'Vicces_cica_1',
+            'ext': 'mp4',
+            'title': 'Vicces cica',
+            'description': 'Játszik a tablettel. :D',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'Jet_Pack',
+            'uploader_id': '491217',
+            'timestamp': 1390821212,
+            'upload_date': '20140127',
+            'duration': 7,
+            'age_limit': 0,
+            'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'],
         },
-        {
-            'url': 'http://indavideo.hu/video/Vicces_cica_1',
-            'md5': '8c82244ba85d2a2310275b318eb51eac',
-            'info_dict': {
-                'id': '1335611',
-                'title': 'Vicces cica',
-                'ext': 'mp4',
-                'display_id': 'Vicces_cica_1',
-                'thumbnail': 're:^https?://.*\.jpg$',
-                'description': 'Játszik a tablettel. :D',
-                'uploader': 'Jet_Pack',
-                'uploader_id': '491217',
-                'duration': 7,
-                'age_limit': 0,
-                'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'],
-            },
-        },
-    ]
+    }
 
     def _real_extract(self, url):
-        video_disp_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_disp_id)
+        display_id = self._match_id(url)
 
-        embed_url = self._html_search_regex(r'', webpage, 'embed_url')
-        video_hash = embed_url.split('/')[-1]
-
-        payload = self._download_json('http://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/' + video_hash, video_disp_id)
-        video_info = payload['data']
-
-        thumbnails = video_info.get('thumbnails')
-        if thumbnails:
-            thumbnails = [{'url': self._proto_relative_url(x)} for x in thumbnails]
-
-        tags = video_info.get('tags')
-        if tags:
-            tags = [x['title'] for x in tags]
+        webpage = self._download_webpage(url, display_id)
+        embed_url = self._search_regex(
+            r'<link[^>]+rel="video_src"[^>]+href="(.+?)"', webpage, 'embed url')
 
         return {
-            'id': video_info.get('id'),
-            'title': video_info['title'],
-            'url': video_info['video_file'],
-            'ext': 'mp4',
-            'display_id': video_disp_id,
-            'thumbnails': thumbnails,
-            'description': video_info.get('description'),
-            'uploader': video_info.get('user_name'),
-            # TODO: upload date (it's in CET/CEST)
-            'uploader_id': video_info.get('user_id'),
-            'duration': utils.int_or_none(video_info.get('length')),
-            'age_limit': utils.int_or_none(video_info.get('age_limit')),
-            'tags': tags,
+            '_type': 'url_transparent',
+            'ie_key': 'IndavideoEmbed',
+            'url': embed_url,
+            'display_id': display_id,
         }