#!/usr/bin/env python

# works2matrix.py - given a directory of works and sources, output a matrix of bibliometrics

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# January 24, 2024 - first investigations


# configure
PATTERN = '*.json'
WORK    = 'https://doi.org/'
SOURCE  = 'https://openalex.org/'
COLUMNS = [ 'doi', 'title', 'cited', 'journal', 'hindex', 'i10index', 'abstract' ]

# require
from json import loads
from pathlib import Path
import sys
import pandas

# requests is not referenced anywhere in this script; the import is kept for
# compatibility, but its absence must not stop an otherwise working run
try :
    import requests
except ImportError :
    requests = None


def _read( path ) :
    """Return the JSON document stored in the file at path.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.
    """

    with open( path, encoding='utf-8' ) as handle :
        return loads( handle.read() )


def _index_sources( sources ) :
    """Build a lookup table of impact factors for every source file.

    sources is a Path to a directory; each file in it matching PATTERN must
    hold a JSON object with an 'id' and a 'summary_stats' object carrying
    'h_index' and 'i10_index'. Returns a dictionary keyed on the source id
    (with the SOURCE prefix removed) whose values are dictionaries of the
    form { 'hindex': str, 'i10index': str }.
    """

    index = {}

    # sorted so repeated runs over the same directory behave identically
    for source in sorted( sources.glob( PATTERN ) ) :

        # parse
        record     = _read( source )
        identifier = str( record[ 'id' ].replace( SOURCE, '' ) )
        statistics = record[ 'summary_stats' ]

        # update
        index[ identifier ] = { 'hindex'   : str( statistics[ 'h_index' ] ),
                                'i10index' : str( statistics[ 'i10_index' ] ) }

    return index


def _work_to_row( record, index ) :
    """Convert one parsed work into a list ordered like COLUMNS.

    record is the parsed JSON object of a work and index is the table built
    by _index_sources. Values which cannot be determined are returned as
    None, which end up as empty cells in the CSV output.
    """

    # initialize; every value starts empty so nothing leaks in from a previous work
    journal  = None
    source   = None
    hindex   = None
    i10index = None
    abstract = None

    # parse the easy stuff; a null doi or title is passed through as None
    doi = record.get( 'doi' )
    if doi : doi = doi.replace( WORK, '' )

    title = record.get( 'title' )
    if title : title = title.title()

    citedby = str( record[ 'cited_by_count' ] )

    # NOTE: this joins the keys of the inverted index, so each distinct word
    # appears once, in key order; it is not the running text of the abstract
    inverted = record.get( 'abstract_inverted_index' )
    if inverted : abstract = ' '.join( inverted )

    # try to parse the harder stuff
    try :
        journal = record[ 'primary_location' ][ 'source' ][ 'display_name' ]
        source  = record[ 'primary_location' ][ 'source' ][ 'id' ].replace( SOURCE, '' )

    # source not found; a missing key or a null along the path lands here
    except ( KeyError, TypeError, AttributeError ) :
        pass

    # look up impact factors; a source absent from the index leaves them empty
    if source :
        factors = index.get( source )
        if factors :
            hindex   = factors[ 'hindex' ]
            i10index = factors[ 'i10index' ]

    return [ doi, title, citedby, journal, hindex, i10index, abstract ]


def main( argv=None ) :
    """Read works and sources, and write a CSV matrix to standard output.

    argv is the complete argument vector (program name, works directory,
    sources directory) and defaults to sys.argv. Exits with a usage message
    when it does not hold exactly three items. Progress for each work is
    written to standard error.
    """

    # get input
    argv = sys.argv if argv is None else argv
    if len( argv ) != 3 : sys.exit( 'Usage: ' + argv[ 0 ] + " <works> <sources>" )
    works   = Path( argv[ 1 ] )
    sources = Path( argv[ 2 ] )

    # process each source; create an index of sources
    index = _index_sources( sources )

    # get and process each work; create a matrix (list) of records
    matrix = []
    for work in sorted( works.glob( PATTERN ) ) :

        # parse
        row = _work_to_row( _read( work ), index )
        doi, title, citedby, journal, hindex, i10index, _ = row

        # debug; str() because any of these may be None
        sys.stderr.write( ' doi: ' + str( doi ) + '\n' )
        sys.stderr.write( ' title: ' + str( title ) + '\n' )
        sys.stderr.write( ' cited by: ' + citedby + '\n' )
        sys.stderr.write( ' journal: ' + str( journal ) + '\n' )
        sys.stderr.write( ' hindex: ' + str( hindex ) + '\n' )
        sys.stderr.write( ' i10index: ' + str( i10index ) + '\n' )
        sys.stderr.write( '\n' )

        # update
        matrix.append( row )

    # create a dataframe and output
    frame = pandas.DataFrame( matrix, columns=COLUMNS )
    print( frame.to_csv( index=False ) )


# do the work, and done
if __name__ == '__main__' :
    main()
    sys.exit()