# __init__.py - a Web front-end to the Distant Reader

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# November  9, 2022 - first cut; "Thanks Don!"
# November 15, 2022 - began adding the index to Arxiv
# December 11, 2022 - added CORD
# February 22, 2023 - added arbitrary study carrel
# March    26, 2023 - modified output to include author, title, date, and url columns
# June      9, 2023 - added submission of zip file, but nothing is done with the input sans saving it
# June     12, 2023 - saved job and zip file
# July     20, 2023 - started authenticating and saving patron records; again, "Thanks Don!"

# cool echo function
# return render_template( 'echo.htm', result=token )


# configure
TEMPLATE = "SELECT * FROM indx WHERE indx MATCH '##QUERY##' ORDER BY RANK LIMIT 999;"
ARXIV = '/shared/sandbox/arxiv/etc/arxiv.db'
REDIRECT = 'http://localhost:8080/login_callback'
AUTHORIZE = 'https://sandbox.orcid.org/oauth/authorize'
ACCESSTOKEN = 'https://sandbox.orcid.org/oauth/token'
DATABASE = './app/etc/patrons.db'

# require
from authlib.integrations.flask_client import OAuth
from flask import Flask, render_template, request, Response, url_for, session, redirect, flash
from flask_login import LoginManager, UserMixin, login_user, login_required, current_user, logout_user
from internetarchive.search import Search
from internetarchive.session import ArchiveSession
from pathlib import Path
from werkzeug.utils import secure_filename
import contextlib
import datetime
import itertools
import os
import pandas as pd
import random
import rdr
import re
import secrets
import sqlite3
import string

# initialize
# NOTE(review): the secret key and the ORCID client credentials below are
# hard-coded in source; they ought to be moved to deployment configuration
# and rotated. The values are left untouched so existing deployments keep working.
app = Flask(__name__)
app.config['SECRET_KEY'] = "foobar"
app.config['ORCID_CLIENT_ID'] = "APP-2G0DPHEC44CY775V"
app.config['ORCID_CLIENT_SECRET'] = "70690c3a-dcff-407b-b06f-0770b1be5055"
oauth = OAuth(app)
oauth.register(name='orcid', authorize_url=AUTHORIZE, access_token_url=ACCESSTOKEN)
login_manager = LoginManager()
login_manager.init_app(app)
login_manager.login_view = 'login'


### functions

# make sure this patron exists; tricky
@login_manager.user_loader
def load_user(user_id):
    """Return the Patron stored under row id *user_id*, or None.

    None is returned when *user_id* is None or empty, or when no such
    row exists in the patrons table.
    """
    if user_id is None or user_id == '':
        return None
    return Patron.FromID(user_id)


# validate new user data
def create_patron():
    """Build a Patron from the submitted 'username' and 'email' form fields.

    Returns a new (unsaved) Patron when the input validates. Otherwise the
    rendered 'login-new-orcid.htm' form is returned instead, so callers must
    check the type of the result before treating it as a Patron.
    """

    # initialize
    username = request.form.get('username', '')
    email = request.form.get('email', '')

    # validate input; ought to use cool flash feature
    if username == '' or email == '' or not re.fullmatch(r'[A-Za-z0-9_-]+', username):
        return render_template('login-new-orcid.htm')

    # initialize a patron
    return Patron(username=username, email=email)


def _field(result, key, default):
    """Return result[key], or *default* when the key is absent."""
    try:
        return result[key]
    except KeyError:
        return default


def _joined(value):
    """Collapse a list of values into one '; '-delimited string; pass anything else through."""
    if isinstance(value, list):
        return '; '.join(value)
    return value


# query the Internet Archive
def searchIA(query):
    """Search the Internet Archive for readable texts matching *query*.

    Returns a pandas DataFrame with the columns identifier, author, title,
    date, subjects, description, details, and download, holding at most
    MAXIMUM rows. The date column is reduced to its leading four digits
    when it starts with four digits; '0000' marks a missing date.
    """

    # configure
    QUERY = '(##QUERY##) AND mediatype:(texts) AND lending___status:"is_readable"'
    FIELDS = ['creator', 'title', 'date', 'description', 'subject']
    DETAILS = 'https://archive.org/details/##IDENTIFIER##'
    DOWNLOAD = 'https://archive.org/download/##IDENTIFIER##/##IDENTIFIER##.pdf'
    MAXIMUM = 128
    COLUMNS = ['identifier', 'author', 'title', 'date', 'subjects', 'description', 'details', 'download']

    # initialize; search
    query = QUERY.replace('##QUERY##', query)
    results = Search(ArchiveSession(), query, fields=FIELDS)
    records = []

    # loop through each search result; limit results because this is not
    # intended to be comprehensive; people are expected to be somewhat smart
    # about their query
    for result in itertools.islice(results, MAXIMUM):

        # parse; everything but the identifier is optional
        identifier = result['identifier']
        title = _field(result, 'title', '')
        creator = _joined(_field(result, 'creator', ''))
        date = _field(result, 'date', '0000')
        subjects = _joined(_field(result, 'subject', ''))
        description = _joined(_field(result, 'description', ''))

        # normalize
        year = re.sub(r'^(\d\d\d\d).*', r'\1', date)
        details = DETAILS.replace('##IDENTIFIER##', identifier)
        download = DOWNLOAD.replace('##IDENTIFIER##', identifier)
        description = description.replace('\n', ' ')

        # create a record and update the list of records
        records.append([identifier, creator, title, year, subjects, description, details, download])

    # create a dataframe; done
    return pd.DataFrame(records, columns=COLUMNS)


# query an SQLite database
def searchDB(database, query):
    """Run the TEMPLATE MATCH query against the SQLite file *database*.

    *query* is passed as a bound parameter rather than spliced into the SQL,
    so quotes in the user's input cannot alter the statement. Returns the
    matching rows as a pandas DataFrame.
    """

    # swap the quoted placeholder for a bind marker
    sql = TEMPLATE.replace("'##QUERY##'", '?')

    # search, and make sure the connection gets closed
    with contextlib.closing(sqlite3.connect(database)) as connection:
        return pd.read_sql_query(sql, connection, params=(query,))


### classes

class Patron(UserMixin):
    """A registered user, persisted as one row of the patrons table in DATABASE."""

    # columns read by every lookup, in the order Patron.__init__ expects them
    _COLUMNS = 'rowid, username, name, email, date, orcid, email_verify_date'

    def __init__(self, id_=None, username='', name='', email='', date="", orcid='', email_verify_date=''):
        """Initialize a patron; *id_* is the row id, or None when not yet saved."""
        self.id = id_
        self.username = username
        self.name = name
        self.email = email
        self.create_date = date
        self.orcid = orcid
        self.email_verify_date = email_verify_date

    def save(self):
        """Insert or update this patron's row and commit the change.

        A patron without an id is inserted and receives the new row's id;
        a patron with an id replaces the row stored under that id. An empty
        create_date is set to today's date first. Returns the cursor of the
        executed statement.
        """

        # store the date as ISO text; this is what sqlite3's (deprecated)
        # default date adapter would have written anyway
        if self.create_date == '':
            self.create_date = datetime.date.today().isoformat()
        values = (self.username, self.name, self.email, self.create_date, self.orcid, self.email_verify_date)

        database = sqlite3.connect(DATABASE)
        try:
            if self.id is None:
                cursor = database.execute(
                    """INSERT INTO patrons (username, name, email, date, orcid, email_verify_date) VALUES (?, ?, ?, ?, ?, ?)""",
                    values,
                )
            else:
                cursor = database.execute(
                    """INSERT OR REPLACE INTO patrons (rowid, username, name, email, date, orcid, email_verify_date) VALUES (?, ?, ?, ?, ?, ?, ?)""",
                    (self.id,) + values,
                )
            database.commit()
        except sqlite3.Error:
            database.close()
            raise

        # only trust the new row id once the insert has been committed
        if self.id is None:
            self.id = cursor.lastrowid

        # the connection is deliberately left to the returned cursor, which
        # keeps it alive for as long as a caller holds on to the cursor
        return cursor

    @staticmethod
    def _lookup(column, value):
        """Return the first patron whose *column* equals *value*, or None.

        *column* is interpolated into the SQL and must therefore be a
        trusted, hard-coded column name; *value* is bound as a parameter.
        """
        sql = f"SELECT {Patron._COLUMNS} FROM patrons WHERE {column} = ?"
        with contextlib.closing(sqlite3.connect(DATABASE)) as database:
            record = database.execute(sql, (value,)).fetchone()
        if record is None:
            return None
        return Patron(*record)

    @staticmethod
    def FromID(id_):
        """Return the patron stored under row id *id_*, or None."""
        return Patron._lookup('rowid', id_)

    @staticmethod
    def FromUsername(username):
        """Return the patron with the given *username*, or None."""
        return Patron._lookup('username', username)

    @staticmethod
    def FromORCID(orcid):
        """Return the patron with the given *orcid*, or None."""
        return Patron._lookup('orcid', orcid)


### routes

# home page
@app.route('/')
def home():
    """Render the home page."""
    return render_template('index.htm', error=None)


# login/authenticate
@app.route('/login', methods=['GET', 'POST'])
def login():
    """Show the login page on GET; on POST redirect to ORCID for authentication."""
    if request.method == 'GET':
        return render_template('login.htm')
    return oauth.orcid.authorize_redirect(REDIRECT, scope='/authenticate')


# authorize
@app.route("/login_callback", methods=["GET", "POST"])
def login_callback():
    """Finish the ORCID handshake.

    A known patron is logged in and sent home. Otherwise the 'orcid' and
    'name' values of the token are stashed in the session and the visitor
    is redirected to the new-patron form.
    """

    # initialize
    token = oauth.orcid.authorize_access_token()
    name = token['name']
    orcid = token['orcid']

    # find and process the given patron
    patron = Patron.FromORCID(orcid)
    if patron is not None:
        login_user(patron)
        return redirect('/')

    # create a new patron
    session['orcid'] = orcid
    session['name'] = name
    return redirect(url_for('new_patron'))


# new patron
@app.route('/login/new-patron', methods=['GET', 'POST'])
def new_patron():
    """Show the new-patron form on GET; on POST create, save, and log in the patron."""

    # get input
    if request.method == 'GET':
        return render_template('login-new-orcid.htm')

    # the ORCID identifier is put in the session by login_callback; without
    # it there is nothing to attach the new patron to, so start over
    if 'orcid' not in session:
        return redirect(url_for('login'))

    # read input; invalid input yields the rendered form, not a patron, and
    # the session is left intact so the visitor can try again
    patron = create_patron()
    if not isinstance(patron, Patron):
        return patron

    # finish creating the patron
    patron.name = request.form.get('name', session.pop('name', ''))
    patron.orcid = session.pop('orcid')
    patron.save()

    # log them in and redirect them home
    login_user(patron)
    return redirect('/')


# display a patron's profile
@app.route('/profile')
@login_required
def profile():
    """Render the logged-in patron's profile page."""
    return render_template('profile.htm')


# logout
@app.route('/logout')
@login_required
def logout():
    """Log the patron out and send them home."""
    logout_user()
    return redirect('/')


# arxiv
@app.route('/arxiv')
def arxiv():
    """Render the arXiv search form."""
    return render_template('arxiv-index.htm', error=None)


# ia
@app.route('/ia')
def ia():
    """Render the Internet Archive search form."""
    return render_template('ia-index.htm', error=None)


# full text search
@app.route('/search')
def search():
    """Search the index named by the 'index' argument and return the hits.

    Reads the query-string arguments 'query' (default 'love'), 'format'
    (default 'csv'), and 'index' (default 'ia'). The result is returned as
    JSON, as an HTML page, or -- for any other format -- as CSV. An index
    other than 'arxiv' or 'ia' yields a 400 response.
    """

    # get input
    query = request.args.get('query', 'love')
    output = request.args.get('format', 'csv')
    index = request.args.get('index', 'ia')

    # search, count the hits, and enhance
    if index == 'arxiv':
        results = searchDB(ARXIV, query)
        html = 'arxiv-results.htm'
        hits = len(results)
        results.drop(columns=['doi', 'submitter', 'license', 'landingPage', 'comments', 'report', 'abstract'], inplace=True)
        results.rename(columns={'pdf': 'url', 'id': 'identifier'}, inplace=True)
    elif index == 'ia':
        results = searchIA(query)
        html = 'ia-results.htm'
        hits = len(results)
        results.rename(columns={'download': 'url'}, inplace=True)
        results.drop(columns=['description'], inplace=True)
    else:
        # there is nothing to search; say so instead of failing later
        return Response("unknown index; expected 'arxiv' or 'ia'", status=400, mimetype='text/plain')

    # branch according to format
    if output == 'json':
        return Response(results.to_json(orient='records'), mimetype='application/json')
    if output == 'html':
        table = results.to_html(classes=['display', 'compact'], border=0, table_id='results', index=False)
        return render_template(html, hits=hits, query=query, result=table)
    return Response(results.to_csv(index=False), mimetype='text/csv')


# build a study carrel
@app.route("/build", methods=['GET', 'POST'])
@login_required
def build():
    """Queue a study carrel build from the submitted form.

    Expects the form fields 'carrel', 'patron', and 'email' plus an uploaded
    file named 'zip'. When anything is missing or empty, or a field contains
    a tab or line break, the form is rendered again. Otherwise a one-line,
    tab-delimited job file is written to JOBS, the upload is saved to ZIPS,
    both under a freshly generated key, and a confirmation page is returned.
    """

    # configure
    INDEX = 'build-index.htm'
    RESULTS = 'build-results.htm'
    ZIPS = '/shared/reader-queue/zips'
    JOBS = '/shared/reader-queue/jobs'
    RANGE = 12
    TSV = '.tsv'
    ZIP = '.zip'
    PERMISSION = 0o775

    # get the input
    try:
        carrel = request.form['carrel']
        patron = request.form['patron']
        email = request.form['email']
        archive = request.files['zip']
    except KeyError:
        return render_template(INDEX, error=None)

    # make sure we have input for everything
    if not carrel or not patron or not email or not archive:
        return render_template(INDEX, error=None)

    # the job is a single tab-delimited line; a tab or line break inside a
    # value would corrupt the record
    if any(character in value for value in (carrel, patron, email) for character in '\t\r\n'):
        return render_template(INDEX, error=None)

    # update the jobs and cache the zip file; kinda tricky
    key = ''.join(secrets.choice(string.ascii_lowercase) for _ in range(RANGE))
    job = '\t'.join([patron, email, carrel, key])
    job_file = Path(JOBS) / (key + TSV)
    zip_file = Path(ZIPS) / (key + ZIP)
    with open(job_file, 'w') as handle:
        handle.write(job)
    archive.save(zip_file)

    # make the files more readable
    os.chmod(job_file, PERMISSION)
    os.chmod(zip_file, PERMISSION)

    # a cool script, via cron -- /shared/reader-queue/bin/queue2submit.sh -- will now initialize the heavy lifting

    # return confirmation
    return render_template(RESULTS, patron=patron, carrel=carrel, key=key)