#!/usr/bin/env python3
"""
usage: youtubedl.py [-h] [--max-downloads MAX_DOWNLOADS]
[--max-filesize MAX_FILESIZE] [--ignore-livestreams]
[--download-dir DOWNLOAD_DIR] [--block BLOCK]
[--timeout TIMEOUT]
files
Download videos in Twitter JSON data.
positional arguments:
files json files to parse
optional arguments:
-h, --help show this help message and exit
--max-downloads MAX_DOWNLOADS
max downloads per URL
--max-filesize MAX_FILESIZE
max filesize to download (bytes)
--ignore-livestreams ignore livestreams which may never end
--download-dir DOWNLOAD_DIR
directory to download to
--block BLOCK hostnames to block (repeatable)
--timeout TIMEOUT timeout download after n seconds
"""
import os
import sys
import json
import time
import argparse
import logging
import fileinput
import youtube_dl
import multiprocessing as mp
from urllib.parse import urlparse
from datetime import datetime, timedelta
from youtube_dl.utils import match_filter_func

parser = argparse.ArgumentParser(description='Download videos in Twitter JSON data.')
parser.add_argument(
    '--max-downloads',
    type=int,
    help='max downloads per URL')
parser.add_argument(
    '--max-filesize',
    type=int,
    help='max filesize to download (bytes)')
parser.add_argument(
    '--ignore-livestreams',
    action='store_true',
    default=False,
    help='ignore livestreams which may never end')
parser.add_argument(
    '--download-dir',
    type=str,
    help='directory to download to',
    default='youtubedl')
parser.add_argument(
    '--block',
    action='append',
    help='hostnames to block (repeatable)')
parser.add_argument(
    '--timeout',
    type=int,
    default=0,
    help='timeout download after n seconds')
parser.add_argument('files', nargs='+', help='json files to parse')


def main():
    args = parser.parse_args()

    # make download directory
    download_dir = args.download_dir
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    # setup logger
    log_file = "{}/youtubedl.log".format(download_dir)
    logging.basicConfig(filename=log_file, level=logging.INFO)
    log = logging.getLogger()

    # setup youtube_dl config
    ydl_opts = {
        "format": "best",
        "logger": log,
        "restrictfilenames": True,
        "ignoreerrors": True,
        "nooverwrites": True,
        "writedescription": True,
        "writeinfojson": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "outtmpl": "{}/%(extractor)s/%(id)s/%(title)s.%(ext)s".format(download_dir),
        "download_archive": "{}/archive.txt".format(download_dir)
    }
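
    # note: the download_archive file records the IDs of completed downloads,
    # so re-running the script over the same tweets skips finished videos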
    if args.ignore_livestreams:
        # the YoutubeDL option key is match_filter; "!is_live" skips live videos
        ydl_opts["match_filter"] = match_filter_func("!is_live")
    if args.max_downloads:
        ydl_opts['max_downloads'] = args.max_downloads
    if args.max_filesize:
        ydl_opts['max_filesize'] = args.max_filesize

    # keep track of domains to block
    blocklist = []
    if args.block:
        blocklist = args.block

    # read in existing mapping file to know which urls we can ignore
    seen = set()
    mapping_file = os.path.join(download_dir, 'mapping.tsv')
    if os.path.isfile(mapping_file):
        for line in open(mapping_file):
            url, path = line.rstrip('\n').split('\t')
            log.info('found %s in %s', url, mapping_file)
            seen.add(url)

    # loop through the tweets, which arrive as newline-delimited JSON
    results = open(mapping_file, 'a')
    for line in fileinput.input(args.files):
        tweet = json.loads(line)
        log.info('analyzing %s', tweet['id_str'])
        for e in tweet['entities']['urls']:
            url = e.get('unshortened_url') or e['expanded_url']

            # see if we can skip this one
            if not url:
                continue
            if url in seen:
                log.info('already processed %s', url)
                continue
            seen.add(url)

            # check for blocks
            uri = urlparse(url)
            if uri.netloc in blocklist:
                log.warning("%s in block list", url)
                continue
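
            # youtube_dl has no per-download timeout of its own, so each
            # download runs in a child process that the parent can terminate
            # if it exceeds --timeout seconds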

            # set up a multiprocessing queue to manage the download with a timeout
            log.info('processing %s', url)
            q = mp.Queue()
            p = mp.Process(target=download, args=(url, q, ydl_opts, log))
            p.start()
            started = datetime.now()

            while True:
                # if we've exceeded the timeout terminate the process
                if args.timeout and datetime.now() - started > timedelta(seconds=args.timeout):
                    log.warning('reached timeout %s', args.timeout)
                    p.terminate()
                    break
                # if the process is done we can stop
                elif not p.is_alive():
                    break
                # otherwise sleep and then check again
                time.sleep(1)

            # if the queue was empty there either wasn't a download or it timed out
            if q.empty():
                filename = ''
            else:
                filename = q.get()
            p.join()

            # write the result to the mapping file
            results.write("{}\t{}\n".format(url, filename))


def download(url, q, ydl_opts, log):
    try:
        ydl = youtube_dl.YoutubeDL(ydl_opts)
        info = ydl.extract_info(url)
        if info:
            filename = ydl.prepare_filename(info)
            log.info('downloaded %s as %s', url, filename)
        else:
            filename = ""
            log.warning("%s doesn't look like a video", url)
        # hand the result back to the parent process via the queue
        q.put(filename)
    except youtube_dl.utils.MaxDownloadsReached:
        log.warning('max downloads per url reached for %s', url)


if __name__ == "__main__":
    main()