#!/usr/bin/env python

# works2sources.py - given a directory of OpenAlex works files, output a set of OpenAlex source files
#
# Each works file is a JSON record; the identifier of its primary location's
# source is extracted, the corresponding source record is requested from the
# OpenAlex API, and the response is cached as <identifier>.json in the output
# directory. Sources already cached are never requested again.

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# January 24, 2024 - first investigations


# configure
ROOT      = 'https://api.openalex.org/sources/'
EXTENSION = '.json'
PATTERN   = '*.json'
PREFIX    = 'https://openalex.org/'
TIMEOUT   = 30  # seconds to wait on any single API request before giving up

# require
from pathlib import Path
import requests
import sys
from json import loads

# get input
if len( sys.argv ) != 3 : sys.exit( 'Usage: ' + sys.argv[ 0 ] + " <works> <sources>" )
works   = sys.argv[ 1 ]
sources = sys.argv[ 2 ]

# make sane; create the output directory (and any missing parents) as needed
works   = Path( works )
sources = Path( sources )
sources.mkdir( parents=True, exist_ok=True )

# process each work; create a set of openalex source files
errors   = []     # works files lacking a usable primary source identifier
failures = set()  # source identifiers that could not be downloaded
for index, work in enumerate( works.glob( PATTERN ) ) :

	# open the given work
	sys.stderr.write( 'Processing item #' + str( index + 1 ) + ' ('+ str( work ) + ')\r' )
	with open( work, encoding='utf-8' ) as handle : record = loads( handle.read() )

	# get the primary location; a key may be missing (KeyError), or any
	# level of the structure may be null or of an unexpected type
	# (TypeError / AttributeError)
	try : source = record[ 'primary_location' ][ 'source' ][ 'id' ].replace( PREFIX, '' )
	except ( KeyError, TypeError, AttributeError ) :

		# bummer; probably source not found
		errors.append( str( work ) )
		continue

	# check to see if we have already been here; many works share a source,
	# so test the cache before spending a request on the API
	file = sources/( source + EXTENSION )
	if file.exists() : continue

	# build a request, get the response, and check it; a network problem
	# with one source ought not abort the whole batch
	try : response = requests.get( ROOT + source, timeout=TIMEOUT )
	except requests.RequestException :
		failures.add( source )
		continue

	if not response.ok :
		failures.add( source )
		continue

	# do the work; cache
	with open( file, 'w', encoding='utf-8' ) as handle : handle.write( response.text )

# output any errors and done
sys.stderr.write( '\nNumber of errors ("source not found"): ' + str( len( errors ) ) + '. ' )
sys.stderr.write( 'Files: ' + '; '.join( errors ) + '\n\n' )
if failures :
	sys.stderr.write( 'Number of sources not downloaded: ' + str( len( failures ) ) + '. ' )
	sys.stderr.write( 'Sources: ' + '; '.join( sorted( failures ) ) + '\n\n' )
sys.exit()