youtube-dl/youtube_dl/YoutubeDL.py

2420 lines
108 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python
2016-10-02 11:39:18 +00:00
# coding: utf-8
2014-01-05 00:52:03 +00:00
from __future__ import absolute_import, unicode_literals
import collections
import contextlib
import copy
2014-03-13 14:30:25 +00:00
import datetime
import errno
import fileinput
import io
import itertools
2013-11-20 05:18:24 +00:00
import json
import locale
import operator
import os
import platform
import re
import shutil
import subprocess
import socket
import sys
import time
import tokenize
import traceback
import random
2017-07-15 00:02:14 +00:00
from string import ascii_letters
from .compat import (
compat_basestring,
compat_cookiejar,
compat_get_terminal_size,
2013-11-17 15:47:52 +00:00
compat_http_client,
compat_kwargs,
compat_numeric_types,
compat_os_name,
2013-11-17 15:47:52 +00:00
compat_str,
compat_tokenize_tokenize,
2013-11-17 15:47:52 +00:00
compat_urllib_error,
compat_urllib_request,
2015-10-17 15:16:40 +00:00
compat_urllib_request_DataHandler,
)
from .utils import (
2016-03-26 13:40:33 +00:00
age_restricted,
args_to_str,
2013-11-17 15:47:52 +00:00
ContentTooShortError,
date_from_str,
DateRange,
DEFAULT_OUTTMPL,
2013-11-17 15:47:52 +00:00
determine_ext,
determine_protocol,
2013-11-17 15:47:52 +00:00
DownloadError,
encode_compat_str,
2013-11-17 15:47:52 +00:00
encodeFilename,
error_to_compat_str,
expand_path,
2013-11-17 15:47:52 +00:00
ExtractorError,
format_bytes,
2013-12-16 03:15:10 +00:00
formatSeconds,
GeoRestrictedError,
int_or_none,
ISO3166Utils,
2013-11-17 15:47:52 +00:00
locked_file,
make_HTTPS_handler,
2013-11-17 15:47:52 +00:00
MaxDownloadsReached,
orderedSet,
PagedList,
parse_filesize,
PerRequestProxyHandler,
platform_name,
2016-03-26 13:40:33 +00:00
PostProcessingError,
2013-11-17 15:47:52 +00:00
preferredencoding,
2016-03-26 13:40:33 +00:00
prepend_extension,
register_socks_protocols,
2015-01-25 01:38:47 +00:00
render_table,
2016-03-26 13:40:33 +00:00
replace_extension,
2013-11-17 15:47:52 +00:00
SameFileError,
sanitize_filename,
2015-03-08 14:57:30 +00:00
sanitize_path,
sanitize_url,
sanitized_Request,
std_headers,
str_or_none,
2013-11-17 15:47:52 +00:00
subtitles_filename,
UnavailableVideoError,
url_basename,
version_tuple,
2013-11-17 15:47:52 +00:00
write_json_file,
write_string,
YoutubeDLCookieJar,
YoutubeDLCookieProcessor,
YoutubeDLHandler,
YoutubeDLRedirectHandler,
2013-11-17 15:47:52 +00:00
)
from .cache import Cache
from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
2017-09-23 17:08:27 +00:00
from .extractor.openload import PhantomJSwrapper
from .downloader import get_suitable_downloader
from .downloader.rtmp import rtmpdump_version
from .postprocessor import (
FFmpegFixupM3u8PP,
FFmpegFixupM4aPP,
FFmpegFixupStretchedPP,
FFmpegMergerPP,
FFmpegPostProcessor,
get_postprocessor,
)
from .version import __version__
if compat_os_name == 'nt':
import ctypes
class YoutubeDL(object):
"""YoutubeDL class.
YoutubeDL objects are the ones responsible for downloading the
actual video file and writing it to disk if the user has requested
it, among some other tasks. In most cases there should be one per
program. As, given a video URL, the downloader doesn't know how to
extract all the needed information, task that InfoExtractors do, it
has to pass the URL to one of them.
For this, YoutubeDL objects have a method that allows
InfoExtractors to be registered in a given order. When it is passed
a URL, the YoutubeDL object hands it to the first InfoExtractor it
finds that reports being able to handle it. The InfoExtractor extracts
all the information about the video or videos the URL refers to, and
YoutubeDL processes the extracted information, possibly using a File
Downloader to download the video.
YoutubeDL objects accept a lot of parameters. In order not to saturate
the object constructor with arguments, it receives a dictionary of
options instead. These options are available through the params
attribute for the InfoExtractors to use. The YoutubeDL also
registers itself as the downloader in charge for the InfoExtractors
that are added to it, so this is a "mutual registration".
Available options:
username: Username for authentication purposes.
password: Password for authentication purposes.
videopassword: Password for accessing a video.
ap_mso: Adobe Pass multiple-system operator identifier.
ap_username: Multiple-system operator account username.
ap_password: Multiple-system operator account password.
usenetrc: Use netrc for authentication instead.
verbose: Print additional info to stdout.
quiet: Do not print messages to stdout.
2014-03-25 23:43:46 +00:00
no_warnings: Do not print out anything for warnings.
forceurl: Force printing final URL.
forcetitle: Force printing title.
forceid: Force printing ID.
forcethumbnail: Force printing thumbnail URL.
forcedescription: Force printing description.
forcefilename: Force printing final filename.
2013-12-16 03:15:10 +00:00
forceduration: Force printing duration.
2013-11-20 05:18:24 +00:00
forcejson: Force printing info_dict as JSON.
2014-10-24 22:30:57 +00:00
dump_single_json: Force printing the info_dict of the whole playlist
(or video) as a single JSON line.
simulate: Do not download the video files.
format: Video format code. See options.py for more information.
outtmpl: Template for output names.
restrictfilenames: Do not allow "&" and spaces in file names
ignoreerrors: Do not stop on download errors.
force_generic_extractor: Force downloader to use the generic extractor
nooverwrites: Prevent overwriting files.
playliststart: Playlist item to start at.
playlistend: Playlist item to end at.
playlist_items: Specific indices of playlist to download.
playlistreverse: Download playlist items in reverse order.
playlistrandom: Download playlist items in random order.
matchtitle: Download only matching titles.
rejecttitle: Reject downloads for matching titles.
2013-11-24 05:08:11 +00:00
logger: Log messages to a logging.Logger instance.
logtostderr: Log messages to stderr instead of stdout.
writedescription: Write the video description to a .description file
writeinfojson: Write the video metadata to a .info.json file
writeannotations: Write the video annotations to a .annotations.xml file
writethumbnail: Write the thumbnail image to a file
write_all_thumbnails: Write all thumbnail formats to files
writesubtitles: Write the video subtitles to a file
writeautomaticsub: Write the automatically generated subtitles to a file
allsubtitles: Downloads all the subtitles of the video
(requires writesubtitles or writeautomaticsub)
listsubtitles: Lists all available subtitles for the video
subtitlesformat: The format code for subtitles
subtitleslangs: List of languages of the subtitles to download
keepvideo: Keep the video file after post-processing
daterange: A DateRange object, download only if the upload_date is in the range.
skip_download: Skip the actual download of the video file
2013-09-22 09:09:25 +00:00
cachedir: Location of the cache files in the filesystem.
False to disable filesystem cache.
noplaylist: Download single video instead of a playlist if in doubt.
age_limit: An integer representing the user's age in years.
Unsuitable videos for the given age are skipped.
min_views: An integer representing the minimum view count the video
must have in order to not be skipped.
Videos without view count information are always
downloaded. None for no limit.
max_views: An integer representing the maximum view count.
Videos that are more popular than that are not
downloaded.
Videos without view count information are always
downloaded. None for no limit.
download_archive: File name of a file where all downloads are recorded.
Videos already present in the file are not downloaded
again.
cookiefile: File name where cookies should be read from and dumped to.
2013-11-24 14:03:25 +00:00
nocheckcertificate:Do not verify SSL certificates
prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
At the moment, this is only supported by YouTube.
2013-11-24 14:03:25 +00:00
proxy: URL of the proxy server to use
geo_verification_proxy: URL of the proxy to use for IP address verification
on geo-restricted sites.
socket_timeout: Time to wait for unresponsive hosts, in seconds
bidi_workaround: Work around buggy terminals without bidirectional text
support, using fribidi
2013-12-29 14:28:32 +00:00
debug_printtraffic:Print out sent and received HTTP traffic
include_ads: Download ads as well
default_search: Prepend this string if an input url is not valid.
'auto' for elaborate guessing
encoding: Use this encoding instead of the system-specified.
extract_flat: Do not resolve URLs, return the immediate result.
Pass in 'in_playlist' to only show this behavior for
playlist items.
postprocessors: A list of dictionaries, each with an entry
* key: The name of the postprocessor. See
youtube_dl/postprocessor/__init__.py for a list.
as well as any further keyword arguments for the
postprocessor.
progress_hooks: A list of functions that get called on download
progress, with a dictionary with the entries
* status: One of "downloading", "error", or "finished".
2015-01-25 05:15:51 +00:00
Check this first and ignore unknown values.
If status is one of "downloading", or "finished", the
2015-01-25 05:15:51 +00:00
following properties may also be present:
* filename: The final filename (always present)
* tmpfilename: The filename we're currently writing to
* downloaded_bytes: Bytes on disk
* total_bytes: Size of the whole file, None if unknown
* total_bytes_estimate: Guess of the eventual file size,
None if unavailable.
* elapsed: The number of seconds since download started.
* eta: The estimated time in seconds, None if unknown
* speed: The download speed in bytes/second, None if
unknown
* fragment_index: The counter of the currently
downloaded video fragment.
* fragment_count: The number of fragments (= individual
files that will be merged)
Progress hooks are guaranteed to be called at least once
(with status "finished") if the download is successful.
merge_output_format: Extension to use when merging formats.
fixup: Automatically correct known faults of the file.
One of:
- "never": do nothing
- "warn": only emit a warning
- "detect_or_warn": check whether we can do anything
about it, warn otherwise (default)
source_address: Client-side IP address to bind to.
call_home: Boolean, true iff we are allowed to contact the
youtube-dl servers for debugging.
2016-08-08 20:46:52 +00:00
sleep_interval: Number of seconds to sleep before each download when
used alone or a lower bound of a range for randomized
sleep before each download (minimum possible number
of seconds to sleep) when used along with
max_sleep_interval.
max_sleep_interval:Upper bound of a range for randomized sleep before each
download (maximum possible number of seconds to sleep).
Must only be used along with sleep_interval.
Actual sleep time will be a random float from range
[sleep_interval; max_sleep_interval].
2015-01-25 01:38:47 +00:00
listformats: Print an overview of available video formats and exit.
list_thumbnails: Print a table of all thumbnails and exit.
match_filter: A function that gets called with the info_dict of
every video.
If it returns a message, the video is ignored.
If it returns None, the video is downloaded.
match_filter_func in utils.py is one example for this.
2015-02-10 03:22:10 +00:00
no_color: Do not emit color codes in output.
2017-02-18 18:53:41 +00:00
geo_bypass: Bypass geographic restriction via faking X-Forwarded-For
HTTP header
2017-02-18 18:53:41 +00:00
geo_bypass_country:
Two-letter ISO 3166-2 country code that will be used for
explicit geographic restriction bypassing via faking
X-Forwarded-For HTTP header
geo_bypass_ip_block:
IP range in CIDR notation that will be used similarly to
geo_bypass_country
The following options determine which downloader is picked:
external_downloader: Executable of the external downloader to call.
None or unset for standard (built-in) downloader.
2016-04-21 17:02:17 +00:00
hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
if True, otherwise use ffmpeg/avconv if False, otherwise
use downloader suggested by extractor if None.
The following parameters are not used by YoutubeDL itself, they are used by
the downloader (see youtube_dl/downloader/common.py):
nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
noresizebuffer, retries, continuedl, noprogress, consoletitle,
2018-02-03 19:53:50 +00:00
xattr_set_filesize, external_downloader_args, hls_use_mpegts,
http_chunk_size.
The following options are used by the post processors:
prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available,
otherwise prefer ffmpeg.
ffmpeg_location: Location of the ffmpeg/avconv binary; either the path
to the binary or its containing directory.
postprocessor_args: A list of additional command-line arguments for the
postprocessor.
2017-09-30 15:56:40 +00:00
The following options are used by the Youtube extractor:
youtube_include_dash_manifest: If True (default), DASH manifests and related
data will be downloaded and processed by extractor.
You can reduce network I/O by disabling it if you don't
care about DASH.
"""
# Names of fields that are treated as numeric.
# NOTE(review): the code that consults this set is outside this chunk --
# confirm how it is used before changing the membership.
_NUMERIC_FIELDS = set((
'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
'timestamp', 'upload_year', 'upload_month', 'upload_day',
'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
'average_rating', 'comment_count', 'age_limit',
'start_time', 'end_time',
'chapter_number', 'season_number', 'episode_number',
'track_number', 'disc_number', 'release_year',
'playlist_index',
))
# Class-level placeholders.  __init__ rebinds every one of these on the
# instance, so the mutable lists below are not shared between instances
# that were created through __init__.
params = None
_ies = []
_pps = []
_download_retcode = None
_num_downloads = None
_screen_file = None
def __init__(self, params=None, auto_init=True):
    """Set up a YoutubeDL instance from an options dictionary.

    params is the dictionary of options described in the class
    docstring; None is treated as an empty dictionary.
    When auto_init is true, print_debug_header() and
    add_default_info_extractors() are called as part of construction.
    """
    if params is None:
        params = {}

    # Per-instance state; these shadow the class-level placeholders.
    self._ies = []
    self._ies_instances = {}
    self._pps = []
    self._progress_hooks = []
    self._download_retcode = 0
    self._num_downloads = 0
    # The flag is used as an index: falsy -> stdout, truthy -> stderr.
    self._screen_file = (sys.stdout, sys.stderr)[params.get('logtostderr', False)]
    self._err_file = sys.stderr

    # Start from the built-in defaults, then let the caller override them.
    self.params = {
        'nocheckcertificate': False,
    }
    self.params.update(params)
    self.cache = Cache(self)

    def warn_if_deprecated(param, option, suggestion):
        # Emit a warning when a deprecated option is set and tell the
        # caller whether that was the case.
        if self.params.get(param) is None:
            return False
        self.report_warning(
            '%s is deprecated. Use %s instead.' % (option, suggestion))
        return True

    cn_proxy_given = warn_if_deprecated(
        'cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy')
    if cn_proxy_given and self.params.get('geo_verification_proxy') is None:
        # Carry the deprecated value over to its replacement option.
        self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

    for deprecated_args in (
            ('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits'),
            ('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"'),
            ('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')):
        warn_if_deprecated(*deprecated_args)

    if params.get('bidi_workaround', False):
        try:
            import pty
            master, slave = pty.openpty()
            width = compat_get_terminal_size().columns
            width_args = [] if width is None else ['-w', str(width)]
            # The helper process reads from a pipe and writes to the pty.
            sp_kwargs = dict(
                stdin=subprocess.PIPE,
                stdout=slave,
                stderr=self._err_file)
            try:
                self._output_process = subprocess.Popen(
                    ['bidiv'] + width_args, **sp_kwargs)
            except OSError:
                # bidiv could not be started; try fribidi instead.
                self._output_process = subprocess.Popen(
                    ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
            self._output_channel = os.fdopen(master, 'rb')
        except OSError as ose:
            if ose.errno != errno.ENOENT:
                raise
            self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')

    ascii_only_filesystem = (
        sys.platform != 'win32'
        and sys.getfilesystemencoding() in ('ascii', 'ANSI_X3.4-1968')
        and not params.get('restrictfilenames', False))
    if ascii_only_filesystem:
        # Unicode filesystem API will throw errors (#1474, #13027)
        self.report_warning(
            'Assuming --restrict-filenames since file system encoding '
            'cannot encode all characters. '
            'Set the LC_ALL environment variable to fix this.')
        self.params['restrictfilenames'] = True

    if isinstance(params.get('outtmpl'), bytes):
        self.report_warning(
            'Parameter outtmpl is bytes, but should be a unicode string. '
            'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')

    self._setup_opener()

    if auto_init:
        self.print_debug_header()
        self.add_default_info_extractors()

    for pp_spec in self.params.get('postprocessors', []):
        pp_class = get_postprocessor(pp_spec['key'])
        # Everything except 'key' is forwarded to the constructor.
        pp_kwargs = dict(pp_spec)
        pp_kwargs.pop('key')
        self.add_post_processor(pp_class(self, **compat_kwargs(pp_kwargs)))

    for hook in self.params.get('progress_hooks', []):
        self.add_progress_hook(hook)

    register_socks_protocols()
def warn_if_short_id(self, argv):
    """Warn when an argument looks like a YouTube ID starting with a dash.

    Such arguments (a dash followed by ten ID characters) are moved
    behind a "--" separator in the command line suggested by the warning.
    """
    dashed_ids = []
    remaining = []
    for arg in argv:
        # short YouTube ID starting with dash?
        if re.match(r'^-[0-9A-Za-z_-]{10}$', arg):
            dashed_ids.append(arg)
        else:
            remaining.append(arg)
    if not dashed_ids:
        return
    correct_argv = ['youtube-dl'] + remaining + ['--'] + dashed_ids
    self.report_warning(
        'Long argument string detected. '
        'Use -- to separate parameters and URLs, like this:\n%s\n' %
        args_to_str(correct_argv))
def add_info_extractor(self, ie):
    """Append an InfoExtractor (a class or an instance) to the list.

    Instances are additionally indexed by their ie_key() and told that
    this object is their downloader; classes are only appended.
    """
    self._ies.append(ie)
    if isinstance(ie, type):
        return
    self._ies_instances[ie.ie_key()] = ie
    ie.set_downloader(self)
def get_info_extractor(self, ie_key):
    """
    Return an IE instance for ie_key.

    An instance already stored in _ies_instances is reused; otherwise a
    new one is created, registered through add_info_extractor() and
    returned.
    """
    known = self._ies_instances.get(ie_key)
    if known is not None:
        return known
    created = get_info_extractor(ie_key)()
    self.add_info_extractor(created)
    return created
def add_default_info_extractors(self):
    """
    Register every class yielded by gen_extractor_classes(), in order,
    at the end of the extractor list.
    """
    for ie_class in gen_extractor_classes():
        self.add_info_extractor(ie_class)
def add_post_processor(self, pp):
    """Append pp to the post-processing chain and make this object its downloader."""
    chain = self._pps
    chain.append(pp)
    pp.set_downloader(self)
def add_progress_hook(self, ph):
    """Register ph as a progress hook (currently only for the file downloader)."""
    hooks = self._progress_hooks
    hooks.append(ph)
2013-12-09 17:29:07 +00:00
def _bidi_workaround(self, message):
if not hasattr(self, '_output_channel'):
2013-12-09 17:29:07 +00:00
return message
assert hasattr(self, '_output_process')
assert isinstance(message, compat_str)
2014-01-05 00:52:03 +00:00
line_count = message.count('\n') + 1
self._output_process.stdin.write((message + '\n').encode('utf-8'))
self._output_process.stdin.flush()
2014-01-05 00:52:03 +00:00
res = ''.join(self._output_channel.readline().decode('utf-8')
2014-11-23 20:39:15 +00:00
for _ in range(line_count))
2014-01-05 00:52:03 +00:00
return res[:-len('\n')]
2013-12-09 17:29:07 +00:00
def to_screen(self, message, skip_eol=False):
    """Forward message to to_stdout() with the quiet check enabled."""
    result = self.to_stdout(message, skip_eol, check_quiet=True)
    return result
2014-04-07 17:57:42 +00:00
def _write_string(self, s, out=None):
    """Write s to out through write_string(), passing the 'encoding' option."""
    chosen_encoding = self.params.get('encoding')
    write_string(s, out=out, encoding=chosen_encoding)
2014-04-07 17:57:42 +00:00
def to_stdout(self, message, skip_eol=False, check_quiet=False):
    """Print message to the screen file, or hand it to the logger.

    If a 'logger' option is set, the message goes to its debug() method
    and nothing is printed.  Otherwise the message is printed unless
    check_quiet is true and the 'quiet' option is set.  A newline is
    appended unless skip_eol is true.
    """
    logger = self.params.get('logger')
    if logger:
        logger.debug(message)
        return
    if check_quiet and self.params.get('quiet', False):
        return
    message = self._bidi_workaround(message)
    # skip_eol is used as an index: False -> newline, True -> nothing.
    line_end = ('\n', '')[skip_eol]
    self._write_string(message + line_end, self._screen_file)
def to_stderr(self, message):
    """Print message to the error file, or hand it to the logger's error()."""
    assert isinstance(message, compat_str)
    logger = self.params.get('logger')
    if logger:
        logger.error(message)
        return
    message = self._bidi_workaround(message)
    line = message + '\n'
    self._write_string(line, self._err_file)
2013-11-17 10:39:52 +00:00
def to_console_title(self, message):
    """Set the console title to message when the 'consoletitle' option is on.

    On Windows ('nt') the title is set through kernel32 if a console
    window exists; elsewhere an escape sequence is written to the screen
    file when TERM is present in the environment.
    """
    if not self.params.get('consoletitle', False):
        return
    if compat_os_name != 'nt':
        if 'TERM' in os.environ:
            self._write_string('\033]0;%s\007' % message, self._screen_file)
        return
    kernel32 = ctypes.windll.kernel32
    if kernel32.GetConsoleWindow():
        # c_wchar_p() might not be necessary if `message` is
        # already of type unicode()
        kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
2013-11-17 10:39:52 +00:00
def save_console_title(self):
    """Write the "save title" escape sequence to the screen file.

    Nothing happens unless 'consoletitle' is set, 'simulate' is not set,
    the OS is not 'nt' and TERM is present in the environment.
    """
    if not self.params.get('consoletitle', False) or self.params.get('simulate', False):
        return
    if compat_os_name == 'nt' or 'TERM' not in os.environ:
        return
    # Save the title on stack
    self._write_string('\033[22;0t', self._screen_file)
def restore_console_title(self):
    """Write the "restore title" escape sequence to the screen file.

    Nothing happens unless 'consoletitle' is set, 'simulate' is not set,
    the OS is not 'nt' and TERM is present in the environment.
    """
    if not self.params.get('consoletitle', False) or self.params.get('simulate', False):
        return
    if compat_os_name == 'nt' or 'TERM' not in os.environ:
        return
    # Restore the title from stack
    self._write_string('\033[23;0t', self._screen_file)
def __enter__(self):
self.save_console_title()
return self
def __exit__(self, *args):
self.restore_console_title()
2014-01-25 11:02:43 +00:00
if self.params.get('cookiefile') is not None:
self.cookiejar.save(ignore_discard=True, ignore_expires=True)
def trouble(self, message=None, tb=None):
    """Decide what to do when a download problem appears.

    The message, if any, is printed to stderr.  In verbose mode a
    traceback is printed as well: tb if given, otherwise one built from
    the exception being handled or, failing that, from the current stack.
    Unless the 'ignoreerrors' option is set, DownloadError is raised;
    otherwise the return code is set to 1.
    """
    if message is not None:
        self.to_stderr(message)

    if self.params.get('verbose'):
        if tb is None:
            active = sys.exc_info()
            if active[0]:  # if .trouble has been called from an except block
                tb = ''
                # An exception may carry the exc_info of the one it wraps.
                if hasattr(active[1], 'exc_info') and active[1].exc_info[0]:
                    tb += ''.join(traceback.format_exception(*active[1].exc_info))
                tb += encode_compat_str(traceback.format_exc())
            else:
                tb = ''.join(traceback.format_list(traceback.extract_stack()))
        self.to_stderr(tb)

    if self.params.get('ignoreerrors', False):
        self._download_retcode = 1
        return

    exc_info = sys.exc_info()
    if exc_info[0] and hasattr(exc_info[1], 'exc_info') and exc_info[1].exc_info[0]:
        # Prefer the wrapped exception's information when it is present.
        exc_info = exc_info[1].exc_info
    raise DownloadError(message, exc_info)
def report_warning(self, message):
    '''
    Emit a warning: through the logger's warning() if a 'logger' option
    is set, otherwise to stderr prefixed with 'WARNING:' (unless the
    'no_warnings' option is set).
    The prefix is colored when stderr is a tty, 'no_color' is not set
    and the OS is not 'nt'.
    '''
    logger = self.params.get('logger')
    if logger is not None:
        logger.warning(message)
        return
    if self.params.get('no_warnings'):
        return
    use_color = (
        not self.params.get('no_color')
        and self._err_file.isatty()
        and compat_os_name != 'nt')
    prefix = '\033[0;33mWARNING:\033[0m' if use_color else 'WARNING:'
    self.to_stderr('%s %s' % (prefix, message))
def report_error(self, message, tb=None):
'''
Do the same as trouble, but prefixes the message with 'ERROR:', colored
in red if stderr is a tty file.
'''
if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
2014-01-05 00:52:03 +00:00
_msg_header = '\033[0;31mERROR:\033[0m'
else:
2014-01-05 00:52:03 +00:00
_msg_header = 'ERROR:'