# __init__.py - a Web front-end to the Distant Reader indexes

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# November   9, 2022 - first cut; "Thanks Don!"
# November  15, 2022 - began adding the index to Arxiv
# December  11, 2022 - added CORD
# February  22, 2023 - added arbitrary study carrel
# March     26, 2023 - modified output to include author, title, date, and url columns
# June       9, 2023 - added submission of zip file, but nothing is done with the input sans saving it
# June      12, 2023 - saved job and zip file
# July      20, 2023 - started authenticating and saving patron records; again, "Thanks Don!"


# configure
TEMPLATE = "SELECT * FROM indx WHERE indx MATCH '##QUERY##' ORDER BY RANK LIMIT 999;"
ARXIV = '/shared/sandbox/arxiv/etc/arxiv.db'
CORD = '/shared/sandbox/cord/etc/cord.db'
CARRELS = '/shared/reader-catalog/etc/catalog.db'
GUTENBERG = '/shared/sandbox/gutenberg/etc/gutenberg.db'
JOURNALS = '/shared/sandbox/ojs-toolbox/etc/ojs-journals.db'
PAMPHLETS = '/shared/sandbox/pamphets/etc/pamphlets.db'
LIBRARY = '/shared/www/html/stacks/carrels'
HTTPROOT = 'https://distantreader.org/stacks/carrels'

# require
from authlib.integrations.flask_client import OAuth
from flask import Flask, render_template, request, Response, url_for, session, redirect, flash
from flask_login import LoginManager, UserMixin, login_user, login_required, current_user, logout_user
from internetarchive.search import Search
from internetarchive.session import ArchiveSession
from pathlib import Path
from werkzeug.utils import secure_filename
import datetime
import os
import pandas as pd
import random
import rdr
import re
import sqlite3
import string

# initialize
app = Flask(__name__)


### functions

# join multi-valued metadata into a single delimited string
def _joined(value):
    """Return *value* joined with '; ' when it is a list, otherwise unchanged."""
    return '; '.join(value) if isinstance(value, list) else value


# query the Internet Archive
def searchIA(query):
    """Search the Internet Archive for readable texts matching *query*.

    Args:
        query: the patron's search expression; it is embedded verbatim in
            a larger Internet Archive query restricting results to texts
            whose lending status is "is_readable".

    Returns:
        A pandas DataFrame with the columns identifier, author, title,
        date, subjects, description, details, and download; at most
        MAXIMUM rows. This is not intended to be comprehensive; people
        are expected to be somewhat smart about their query.
    """

    # configure
    QUERY = '(##QUERY##) AND mediatype:(texts) AND lending___status:"is_readable"'
    FIELDS = ['creator', 'title', 'date', 'description', 'subject']
    DETAILS = 'https://archive.org/details/##IDENTIFIER##'
    DOWNLOAD = 'https://archive.org/download/##IDENTIFIER##/##IDENTIFIER##.pdf'
    MAXIMUM = 999
    COLUMNS = ['identifier', 'author', 'title', 'date', 'subjects', 'description', 'details', 'download']

    # initialize; search
    results = Search(ArchiveSession(), QUERY.replace('##QUERY##', query), fields=FIELDS)
    records = []

    # loop through each search result
    for result in results:

        # parse; every field except the identifier is optional. The original
        # looked up the tuple key ('creator', ''), which never exists, so the
        # author column was always empty; a defaulted lookup is what was meant.
        identifier = result['identifier']
        title = result.get('title', '')
        creator = result.get('creator', '')
        date = result.get('date', '0000')
        subjects = result.get('subject', '')
        description = result.get('description', '')

        # normalize; reduce the date to its leading four-digit year, and
        # flatten multi-valued fields
        year = re.sub(r'^(\d\d\d\d).*', r'\1', date)
        details = DETAILS.replace('##IDENTIFIER##', identifier)
        download = DOWNLOAD.replace('##IDENTIFIER##', identifier)
        subjects = _joined(subjects)
        creator = _joined(creator)
        description = _joined(description).replace('\n', ' ')

        # create a record and update the list of records
        records.append([identifier, creator, title, year, subjects, description, details, download])

        # limit results; counting the records (rather than comparing a
        # zero-based index) stops at exactly MAXIMUM rows, not MAXIMUM + 1
        if len(records) >= MAXIMUM:
            break

    # create a dataframe; done
    return pd.DataFrame(records, columns=COLUMNS)


# query an SQLite database
def searchDB(database, query):
    """Run a full text search against an SQLite index and return a DataFrame.

    Args:
        database: path (string or path-like) of the SQLite file to query.
        query: the patron's full text (MATCH) expression.

    Returns:
        A pandas DataFrame of the rows produced by TEMPLATE.
    """

    # create sql; the query comes from the Web, so bind it as a parameter
    # instead of splicing it into the statement, where a stray apostrophe
    # would break (or rewrite) the SQL
    sql = TEMPLATE.replace("'##QUERY##'", '?')

    # initialize, search, and always release the connection
    connection = sqlite3.connect(database)
    try:
        return pd.read_sql_query(sql, connection, params=(query,))
    finally:
        connection.close()


### routes

# home page
@app.route('/')
def home():
    """Render the site's home page."""
    return render_template('index.htm', error=None)


# arxiv
@app.route('/arxiv')
def arxiv():
    """Render the arXiv search form."""
    return render_template('arxiv-index.htm', error=None)


# ia
@app.route('/ia')
def ia():
    """Render the Internet Archive search form."""
    return render_template('ia-index.htm', error=None)


# all carrels
@app.route('/carrels')
def carrels():
    """Render the search form for the catalog of study carrels."""
    return render_template('carrels-index.htm', error=None)


# gutenberg
@app.route('/gutenberg')
def gutenberg():
    """Render the Project Gutenberg search form."""
    return render_template('gutenberg-index.htm', error=None)


# journals
@app.route('/journals')
def journals():
    """Render the journals search form."""
    return render_template('journals-index.htm', error=None)


# cord
@app.route('/cord')
def cord():
    """Render the CORD search form."""
    return render_template('cord-index.htm', error=None)


# catholic pamphlets
@app.route('/pamphlets')
def pamphlets():
    """Render the pamphlets search form."""
    return render_template('pamphlets-index.htm', error=None)


# serialize search results in the requested format
def _respond(results, output, html, **context):
    """Return *results* as JSON, an HTML page, or (by default) CSV.

    Args:
        results: the pandas DataFrame to send.
        output: 'json', 'html', or anything else for CSV.
        html: name of the template used when output is 'html'.
        **context: additional values handed to the template.
    """
    if output == 'json':
        return Response(results.to_json(orient='records'), mimetype='application/json')
    if output == 'html':
        table = results.to_html(classes=['display', 'compact'], border=0, table_id='results', index=False)
        return render_template(html, result=table, **context)
    return Response(results.to_csv(index=False), mimetype='text/csv')


# a study carrel
@app.route('/carrel')
def carrel():
    """Search the full text index of a single study carrel.

    Reads the request arguments carrel (required), query (default
    'love'), and format (default 'csv'). Without a carrel the search
    form is rendered. Responds with 400 when the carrel name is not a
    plain directory name and 404 when the carrel has no index.
    """

    # configure
    html = 'carrel-results.htm'

    # check for null input
    name = request.args.get('carrel')
    if not name:
        return render_template('carrel-index.htm', error=None)

    # get input
    query = request.args.get('query', 'love')
    output = request.args.get('format', 'csv')

    # the name becomes part of a file system path; refuse anything that
    # is not a single path component so a request cannot escape LIBRARY
    if Path(name).name != name or name == '..':
        return Response('Invalid carrel name', status=400, mimetype='text/plain')

    # configure; sqlite would fail on (or create) a missing file, so check first
    index = Path(LIBRARY)/name/(rdr.ETC)/(rdr.DATABASE)
    if not index.is_file():
        return Response('Unknown carrel', status=404, mimetype='text/plain')

    # do the work; search
    results = searchDB(index, query)

    # get the number of hits
    hits = len(results)

    # turn cached file names into absolute URLs
    root = HTTPROOT + '/' + name + '/' + (rdr.CACHE) + '/'
    results['cache'] = root + results['cache']
    results = results.rename(columns={'id': 'identifier', 'cache': 'url'})

    # drop the full text columns; too big
    results = results.drop(columns=['fulltext', 'txt'])

    # output, accordingly
    return _respond(results, output, html, hits=hits, query=query, carrel=name)


# full text search
@app.route('/search')
def search():
    """Search one of the configured indexes.

    Reads the request arguments query (default 'love'), format (default
    'csv'), and index (default 'gutenberg'). Responds with 400 when the
    index is not one of the known names; previously that case crashed
    with an unbound variable.
    """

    # configure; SQLite-backed indexes and their database files
    DATABASES = {
        'gutenberg': GUTENBERG,
        'arxiv': ARXIV,
        'cord': CORD,
        'pamphlets': PAMPHLETS,
        'carrels': CARRELS,
        'journals': JOURNALS,
    }

    # get input
    query = request.args.get('query', 'love')
    output = request.args.get('format', 'csv')
    index = request.args.get('index', 'gutenberg')

    # search the requested index
    if index in DATABASES:
        results = searchDB(DATABASES[index], query)
    elif index == 'ia':
        results = searchIA(query)
    else:
        return Response('Unknown index', status=400, mimetype='text/plain')

    # every index has a results template named after it
    html = index + '-results.htm'

    # get the number of hits
    hits = len(results)

    # enhance; trim and rename columns so each index presents a similar
    # table. Each step builds a new frame rather than renaming a slice in place.
    if index == 'ia':
        results = results.rename(columns={'download': 'url'}).drop(columns=['description'])

    elif index == 'journals':
        results = results.drop(columns=['abstract'])

    elif index == 'gutenberg':
        results = results[['gid', 'author', 'title', 'subject', 'classification', 'url']]
        results = results.rename(columns={'gid': 'identifier', 'subject': 'subjects'})

    elif index == 'pamphlets':
        results = results[['system', 'author', 'title', 'publisher', 'year', 'subjects', 'url']]
        results = results.rename(columns={'system': 'identifier', 'year': 'date'})

    elif index == 'carrels':
        results = results[['identifier', 'titles', 'date_created', 'keywords', 'types', 'sources', 'items', 'words', 'flesch', 'processes', 'browse', 'read', 'download']]
        results = results.rename(columns={'date_created': 'date', 'titles': 'title', 'types': 'type', 'sources': 'source', 'processes': 'process', 'download': 'url'})

    elif index == 'arxiv':
        results = results.drop(columns=['doi', 'submitter', 'license', 'landingPage', 'comments', 'report', 'abstract'])
        results = results.rename(columns={'pdf': 'url', 'id': 'identifier'})

    elif index == 'cord':
        results = results.drop(columns=['abstract'])

    # branch according to format
    return _respond(results, output, html, hits=hits, query=query)