#!/usr/bin/env python3
"""
usage: youtubedl.py [-h] [--max-downloads MAX_DOWNLOADS]
                    [--max-filesize MAX_FILESIZE] [--ignore-livestreams]
                    [--download-dir DOWNLOAD_DIR] [--block BLOCK]
                    [--timeout TIMEOUT]
                    files [files ...]

Download videos in Twitter JSON data.

positional arguments:
  files                 json files to parse

optional arguments:
  -h, --help            show this help message and exit
  --max-downloads MAX_DOWNLOADS
                        max downloads per URL
  --max-filesize MAX_FILESIZE
                        max filesize to download (bytes)
  --ignore-livestreams  ignore livestreams which may never end
  --download-dir DOWNLOAD_DIR
                        directory to download to
  --block BLOCK         hostnames to block (repeatable)
  --timeout TIMEOUT     timeout download after n seconds
"""

import os
import sys
import json
import time
import argparse
import logging
import fileinput
import youtube_dl
import multiprocessing as mp

from urllib.parse import urlparse
from datetime import datetime, timedelta
from youtube_dl.utils import match_filter_func

parser = argparse.ArgumentParser(description='Download videos in Twitter JSON data.')
parser.add_argument(
    '--max-downloads', type=int, help='max downloads per URL')
parser.add_argument(
    '--max-filesize', type=int, help='max filesize to download (bytes)')
parser.add_argument(
    '--ignore-livestreams', action='store_true', default=False,
    help='ignore livestreams which may never end')
parser.add_argument(
    '--download-dir', type=str, help='directory to download to',
    default='youtubedl')
parser.add_argument(
    '--block', action='append', help='hostnames to block (repeatable)')
parser.add_argument(
    '--timeout', type=int, default=0, help='timeout download after n seconds')
# BUG FIX: a positional argument with action='append' only ever accepts a
# single value; nargs='+' allows one or more files, matching the usage text.
parser.add_argument('files', nargs='+', help='json files to parse')


def main():
    """Scan tweets in the given JSON files and download linked videos.

    Each URL found in tweet entities is downloaded (in a child process so
    it can be killed on --timeout) and a "url<TAB>filename" row is appended
    to <download-dir>/mapping.tsv. Previously-seen URLs are skipped.
    """
    args = parser.parse_args()

    # make download directory
    download_dir = args.download_dir
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    # set up a logger that writes into the download directory
    log_file = "{}/youtubedl.log".format(download_dir)
    logging.basicConfig(filename=log_file, level=logging.INFO)
    log = logging.getLogger()

    # youtube_dl configuration; files land under <dir>/<extractor>/<id>/
    # and the archive file stops youtube_dl re-downloading completed items
    ydl_opts = {
        "format": "best",
        "logger": log,
        "restrictfilenames": True,
        "ignoreerrors": True,
        "nooverwrites": True,
        "writedescription": True,
        "writeinfojson": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "outtmpl": "{}/%(extractor)s/%(id)s/%(title)s.%(ext)s".format(download_dir),
        "download_archive": "{}/archive.txt".format(download_dir)
    }

    if args.ignore_livestreams:
        # BUG FIX: youtube_dl reads the option "match_filter"; the original
        # "matchfilter" key was silently ignored, so livestreams were never
        # actually filtered out.
        ydl_opts["match_filter"] = match_filter_func("!is_live")
    if args.max_downloads:
        ydl_opts['max_downloads'] = args.max_downloads
    if args.max_filesize:
        ydl_opts['max_filesize'] = args.max_filesize

    # keep track of domains to block
    blocklist = []
    if args.block:
        blocklist = args.block

    # read in existing mapping file to know which urls we can ignore
    seen = set()
    mapping_file = os.path.join(download_dir, 'mapping.tsv')
    if os.path.isfile(mapping_file):
        for line in open(mapping_file):
            # strip the newline so the path column is clean if ever used
            url, path = line.rstrip('\n').split('\t')
            log.info('found %s in %s', url, mapping_file)
            seen.add(url)

    # loop through the tweets
    results = open(mapping_file, 'a')
    for line in fileinput.input(args.files):
        tweet = json.loads(line)
        log.info('analyzing %s', tweet['id_str'])
        for e in tweet['entities']['urls']:
            url = e.get('unshortened_url') or e['expanded_url']

            # see if we can skip this one
            if not url:
                continue
            if url in seen:
                log.info('already processed %s', url)
                continue
            seen.add(url)

            # check for blocks
            uri = urlparse(url)
            if uri.netloc in blocklist:
                # BUG FIX: logging.warn is deprecated and bypassed the file
                # logger configured above; use log.warning instead.
                log.warning("%s in block list", url)
                continue

            # run the download in a child process so it can be terminated
            # if it exceeds --timeout seconds
            log.info('processing %s', url)
            q = mp.Queue()
            p = mp.Process(target=download, args=(url, q, ydl_opts, log))
            p.start()

            started = datetime.now()
            while True:
                # if we've exceeded the timeout terminate the process
                if args.timeout and datetime.now() - started > timedelta(seconds=args.timeout):
                    log.warning('reached timeout %s', args.timeout)
                    p.terminate()
                    break
                # if the process is done we can stop
                elif not p.is_alive():
                    break
                # otherwise sleep and then check again
                time.sleep(1)

            # if the queue was empty there either wasn't a download or it timed out
            if q.empty():
                filename = ''
            else:
                filename = q.get()
            p.join()

            # record the result; flush so the mapping survives a crash
            results.write("{}\t{}\n".format(url, filename))
            results.flush()
def download(url, q, ydl_opts, log):
    """Download *url* with youtube_dl and report the result via *q*.

    Runs in a child process started by main(). The resulting local
    filename (or "" when the URL doesn't look like a video) is put on the
    multiprocessing queue *q* for the parent to read; *ydl_opts* is the
    YoutubeDL configuration and *log* the shared logger.
    """
    try:
        ydl = youtube_dl.YoutubeDL(ydl_opts)
        info = ydl.extract_info(url)
        if info:
            filename = ydl.prepare_filename(info)
            log.info('downloaded %s as %s', url, filename)
        else:
            filename = ""
            log.warning("%s doesn't look like a video", url)
        # BUG FIX: the parent reads the filename from the queue, but the
        # original never put it there — so mapping.tsv always recorded an
        # empty path even for successful downloads.
        q.put(filename)
    except youtube_dl.utils.MaxDownloadsReached:
        # BUG FIX: the original referenced args.max_downloads here, but
        # `args` is local to main() (and not inherited by this child
        # process), which raised NameError instead of logging.
        log.warning('max downloads reached for %s', url)


if __name__ == "__main__":
    main()