#!/usr/bin/env python3
"""
usage: youtubedl.py [-h] [--max-downloads MAX_DOWNLOADS]
                    [--max-filesize MAX_FILESIZE] [--ignore-livestreams]
                    [--download-dir DOWNLOAD_DIR] [--block BLOCK]
                    [--timeout TIMEOUT]
                    files [files ...]

Download videos in Twitter JSON data.

positional arguments:
  files                 json files to parse

optional arguments:
  -h, --help            show this help message and exit
  --max-downloads MAX_DOWNLOADS
                        max downloads per URL
  --max-filesize MAX_FILESIZE
                        max filesize to download (bytes)
  --ignore-livestreams  ignore livestreams which may never end
  --download-dir DOWNLOAD_DIR
                        directory to download to
  --block BLOCK         hostnames to block (repeatable)
  --timeout TIMEOUT     timeout download after n seconds
"""

import os
import sys
import json
import time
import argparse
import logging
import fileinput
import youtube_dl
import multiprocessing as mp

from urllib.parse import urlparse
from datetime import datetime, timedelta
from youtube_dl.utils import match_filter_func

parser = argparse.ArgumentParser(description='Download videos in Twitter JSON data.')
parser.add_argument(
    '--max-downloads', type=int, help='max downloads per URL')
parser.add_argument(
    '--max-filesize', type=int, help='max filesize to download (bytes)')
parser.add_argument(
    '--ignore-livestreams', action='store_true', default=False,
    help='ignore livestreams which may never end')
parser.add_argument(
    '--download-dir', type=str, help='directory to download to',
    default='youtubedl')
parser.add_argument(
    '--block', action='append', help='hostnames to block (repeatable)')
parser.add_argument(
    '--timeout', type=int, default=0, help='timeout download after n seconds')
# BUG FIX: a positional argument with action='append' only ever accepts a
# single value; nargs='+' allows one or more files, matching the usage text.
parser.add_argument('files', nargs='+', help='json files to parse')


def main():
    """Scan tweets in the given JSON files and download linked videos.

    Each URL found in tweet entities is downloaded (in a child process so
    it can be killed on --timeout) and a "url<TAB>filename" row is appended
    to <download-dir>/mapping.tsv. Previously-seen URLs are skipped.
    """
    args = parser.parse_args()

    # make download directory
    download_dir = args.download_dir
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    # set up a logger that writes into the download directory
    log_file = "{}/youtubedl.log".format(download_dir)
    logging.basicConfig(filename=log_file, level=logging.INFO)
    log = logging.getLogger()

    # youtube_dl configuration; files land under <dir>/<extractor>/<id>/
    # and the archive file stops youtube_dl re-downloading completed items
    ydl_opts = {
        "format": "best",
        "logger": log,
        "restrictfilenames": True,
        "ignoreerrors": True,
        "nooverwrites": True,
        "writedescription": True,
        "writeinfojson": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "outtmpl": "{}/%(extractor)s/%(id)s/%(title)s.%(ext)s".format(download_dir),
        "download_archive": "{}/archive.txt".format(download_dir)
    }

    if args.ignore_livestreams:
        # BUG FIX: youtube_dl reads the option "match_filter"; the original
        # "matchfilter" key was silently ignored, so livestreams were never
        # actually filtered out.
        ydl_opts["match_filter"] = match_filter_func("!is_live")
    if args.max_downloads:
        ydl_opts['max_downloads'] = args.max_downloads
    if args.max_filesize:
        ydl_opts['max_filesize'] = args.max_filesize

    # keep track of domains to block
    blocklist = []
    if args.block:
        blocklist = args.block

    # read in existing mapping file to know which urls we can ignore
    seen = set()
    mapping_file = os.path.join(download_dir, 'mapping.tsv')
    if os.path.isfile(mapping_file):
        for line in open(mapping_file):
            # strip the newline so the path column is clean if ever used
            url, path = line.rstrip('\n').split('\t')
            log.info('found %s in %s', url, mapping_file)
            seen.add(url)

    # loop through the tweets
    results = open(mapping_file, 'a')
    for line in fileinput.input(args.files):
        tweet = json.loads(line)
        log.info('analyzing %s', tweet['id_str'])
        for e in tweet['entities']['urls']:
            url = e.get('unshortened_url') or e['expanded_url']

            # see if we can skip this one
            if not url:
                continue
            if url in seen:
                log.info('already processed %s', url)
                continue
            seen.add(url)

            # check for blocks
            uri = urlparse(url)
            if uri.netloc in blocklist:
                # BUG FIX: logging.warn is deprecated and bypassed the file
                # logger configured above; use log.warning instead.
                log.warning("%s in block list", url)
                continue

            # run the download in a child process so it can be terminated
            # if it exceeds --timeout seconds
            log.info('processing %s', url)
            q = mp.Queue()
            p = mp.Process(target=download, args=(url, q, ydl_opts, log))
            p.start()

            started = datetime.now()
            while True:
                # if we've exceeded the timeout terminate the process
                if args.timeout and datetime.now() - started > timedelta(seconds=args.timeout):
                    log.warning('reached timeout %s', args.timeout)
                    p.terminate()
                    break
                # if the process is done we can stop
                elif not p.is_alive():
                    break
                # otherwise sleep and then check again
                time.sleep(1)

            # if the queue was empty there either wasn't a download or it timed out
            if q.empty():
                filename = ''
            else:
                filename = q.get()
            p.join()

            # record the result; flush so the mapping survives a crash
            results.write("{}\t{}\n".format(url, filename))
            results.flush()
def download(url, q, ydl_opts, log):
    """Download *url* with youtube_dl and report the result via *q*.

    Runs in a child process started by main(). The resulting local
    filename (or "" when the URL doesn't look like a video) is put on the
    multiprocessing queue *q* for the parent to read; *ydl_opts* is the
    YoutubeDL configuration and *log* the shared logger.
    """
    try:
        ydl = youtube_dl.YoutubeDL(ydl_opts)
        info = ydl.extract_info(url)
        if info:
            filename = ydl.prepare_filename(info)
            log.info('downloaded %s as %s', url, filename)
        else:
            filename = ""
            log.warning("%s doesn't look like a video", url)
        # BUG FIX: the parent reads the filename from the queue, but the
        # original never put it there — so mapping.tsv always recorded an
        # empty path even for successful downloads.
        q.put(filename)
    except youtube_dl.utils.MaxDownloadsReached:
        # BUG FIX: the original referenced args.max_downloads here, but
        # `args` is local to main() (and not inherited by this child
        # process), which raised NameError instead of logging.
        log.warning('max downloads reached for %s', url)


if __name__ == "__main__":
    main()