# __init__.py - a Web front-end to the Distant Reader indexes

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame; distributed under a GNU Public License

# November  9, 2022 - first cut; "Thanks Don!"
# November 15, 2022 - began adding the index to Arxiv
# December 11, 2022 - added CORD
# February 22, 2023 - added arbitrary study carrel
# March    26, 2023 - modified output to include author, title, date, and url columns
# June      9, 2023 - added submission of zip file, but nothing is done with the input sans saving it
# June     12, 2023 - saved job and zip file
# July     20, 2023 - started authenticating and saving patron records; again, "Thanks Don!"


# configure; TEMPLATE is the full text query run against a database, and ##QUERY## marks where the search terms go
TEMPLATE     = "SELECT * FROM indx WHERE indx MATCH '##QUERY##' ORDER BY RANK LIMIT 999;"

# SQLite databases, one per index offered by the /search route
ARXIV        = '/shared/sandbox/arxiv/etc/arxiv.db'
CORD         = '/shared/sandbox/cord/etc/cord.db'
CARRELS      = '/shared/reader-catalog/etc/catalog.db'
GUTENBERG    = '/shared/sandbox/gutenberg/etc/gutenberg.db'
JOURNALS     = '/shared/sandbox/ojs-toolbox/etc/ojs-journals.db'
PAMPHLETS    = '/shared/sandbox/pamphets/etc/pamphlets.db'

# file system root of the study carrels, and the URL prefix used to link to their content; both are used by the /carrel route
LIBRARY      = '/shared/www/html/stacks/carrels'
HTTPROOT     = 'https://distantreader.org/stacks/carrels'

# require
from   authlib.integrations.flask_client import OAuth
from   flask                             import Flask, render_template, request, Response, url_for, session, redirect, flash
from   flask_login                       import LoginManager, UserMixin, login_user, login_required, current_user, logout_user
from   internetarchive.search            import Search
from   internetarchive.session           import ArchiveSession
from   pathlib                           import Path
from   werkzeug.utils                    import secure_filename
import datetime
import os
import pandas as pd
import random
import rdr
import re
import sqlite3
import string

# initialize the Flask application; the routes below are registered on it
app = Flask( __name__ )

### functions

# query the Internet Archive
def searchIA( query ) :
	"""Search the Internet Archive for readable texts and return the hits as a DataFrame.

	query is an Internet Archive search expression; it is wrapped so only
	items whose media type is "texts" and whose lending status is
	"is_readable" are matched.

	Returns a pandas DataFrame with the columns identifier, author, title,
	date, subjects, description, details, and download. At most MAXIMUM
	rows are returned, and the date column is reduced to a four-digit year
	whenever the date begins with one.
	"""

	# configure
	QUERY    = '(##QUERY##) AND mediatype:(texts) AND lending___status:"is_readable"'
	FIELDS   = [ 'creator', 'title', 'date', 'description', 'subject' ]
	DETAILS  = 'https://archive.org/details/##IDENTIFIER##'
	DOWNLOAD = 'https://archive.org/download/##IDENTIFIER##/##IDENTIFIER##.pdf'
	MAXIMUM  = 999
	COLUMNS  = [ 'identifier', 'author', 'title', 'date', 'subjects', 'description', 'details', 'download' ]

	# multi-valued fields arrive as lists; collapse them into a single delimited string
	def flatten( value ) : return '; '.join( value ) if isinstance( value, list ) else value

	# initialize; search
	query   = QUERY.replace( '##QUERY##', query )
	results = Search( ArchiveSession(), query, fields=FIELDS )
	records = []

	# loop through each search result
	for result in results :

		# the identifier is what every link is built from
		identifier = result[ 'identifier' ]

		# any of the requested fields may be missing from a given item, so fall back to a default
		title       = result.get( 'title', '' )
		creator     = flatten( result.get( 'creator', '' ) )
		date        = result.get( 'date', '0000' )
		subjects    = flatten( result.get( 'subject', '' ) )
		description = flatten( result.get( 'description', '' ) )

		# normalize
		year        = re.sub( r'^(\d\d\d\d).*', r'\1', date )
		details     = DETAILS.replace( '##IDENTIFIER##', identifier )
		download    = DOWNLOAD.replace( '##IDENTIFIER##', identifier )
		description = description.replace( '\n', ' ' )

		# create a record and update the list of records
		records.append( [ identifier, creator, title, year, subjects, description, details, download ] )

		# limit results; this is not intended to be comprehensive; people are expected to be somewhat smart about their query
		if len( records ) >= MAXIMUM : break

	# create a dataframe; done
	return( pd.DataFrame( records, columns=COLUMNS ) )

# query an SQLite database
def searchDB( database, query ) :
	"""Run a full text query against an SQLite database and return the hits as a DataFrame.

	database is the path of the SQLite file to open, and query is the
	full text (MATCH) expression to search for.

	Returns a pandas DataFrame holding the rows selected by TEMPLATE.
	Errors raised while opening or querying the database are passed on
	to the caller.
	"""

	# create sql; the query is bound as a parameter instead of being pasted into the statement, so quotes in the query can neither break nor rewrite the SQL
	sql = TEMPLATE.replace( "'##QUERY##'", '?' )

	# search, and release the connection no matter what happens
	connection = sqlite3.connect( database )
	try     : results = pd.read_sql_query( sql, connection, params=( query, ) )
	finally : connection.close()

	# done
	return( results )

### routes

# home page
@app.route( '/' )
def home() :
	"""Render the 'index.htm' template for the root of the site, with no error to report."""

	page = render_template( 'index.htm', error=None )
	return page

# arxiv
@app.route( '/arxiv' )
def arxiv() :
	"""Render the 'arxiv-index.htm' template for the /arxiv route, with no error to report."""

	page = render_template( 'arxiv-index.htm', error=None )
	return page

# ia
@app.route( '/ia' )
def ia() :
	"""Render the 'ia-index.htm' template for the /ia route, with no error to report."""

	page = render_template( 'ia-index.htm', error=None )
	return page

# all carrels
@app.route( '/carrels' )
def carrels() :
	"""Render the 'carrels-index.htm' template for the /carrels route, with no error to report."""

	page = render_template( 'carrels-index.htm', error=None )
	return page

# gutenberg
@app.route( '/gutenberg' )
def gutenberg() :
	"""Render the 'gutenberg-index.htm' template for the /gutenberg route, with no error to report."""

	page = render_template( 'gutenberg-index.htm', error=None )
	return page

# journals
@app.route( '/journals' )
def journals() :
	"""Render the 'journals-index.htm' template for the /journals route, with no error to report."""

	page = render_template( 'journals-index.htm', error=None )
	return page

# cord
@app.route( '/cord' )
def cord() :
	"""Render the 'cord-index.htm' template for the /cord route, with no error to report."""

	page = render_template( 'cord-index.htm', error=None )
	return page

# catholic pamphlets
@app.route( '/pamphlets' )
def pamphlets() :
	"""Render the 'pamphlets-index.htm' template for the /pamphlets route, with no error to report."""

	page = render_template( 'pamphlets-index.htm', error=None )
	return page

# a study carrel
@app.route( '/carrel' )
def carrel() :
	"""Search the full text index of a single study carrel.

	Reads three query-string arguments: carrel, the name of the study
	carrel to search; query, the full text expression (defaults to
	'love'); and format, one of 'json' or 'html', with anything else
	producing CSV (defaults to 'csv').

	Returns the 'carrel-index.htm' template when no carrel is given, a 400
	response when the carrel name is not a single path component, a 404
	response when the carrel has no index, and otherwise the search
	results in the requested format.
	"""

	# configure
	html = 'carrel-results.htm'

	# check for null input
	name = request.args.get( 'carrel' )
	if not name : return render_template( 'carrel-index.htm', error=None )

	# get the rest of the input
	query  = request.args.get( 'query', 'love' )
	output = request.args.get( 'format', 'csv' )

	# the name becomes part of a file system path and a URL, so it must be a single path component; anything else could reach outside the library
	if name in ( '.', '..' ) or Path( name ).name != name : return Response( 'Invalid carrel name', status=400, mimetype='text/plain' )

	# locate the carrel's index; sqlite3 creates database files that do not exist, so check before connecting
	index = Path( LIBRARY )/name/( rdr.ETC )/( rdr.DATABASE )
	if not index.is_file() : return Response( 'Unknown carrel', status=404, mimetype='text/plain' )

	# do the work; search
	results = searchDB( index, query )

	# get the number of hits
	hits = len( results )

	# prefix the cache column with the carrel's URL so each value becomes a complete link
	root               = HTTPROOT + '/' + name + '/' + ( rdr.CACHE ) + '/'
	results[ 'cache' ] = root + results[ 'cache' ]

	# normalize the column names, and drop the full text columns; too big
	results = results.rename( columns={ 'id':'identifier', 'cache':'url' } )
	results = results.drop( columns=[ 'fulltext', 'txt' ] )

	# output, accordingly
	if   output == 'json' : return Response( results.to_json( orient='records' ), mimetype='application/json' )
	elif output == 'html' : return render_template( html, hits=hits, query=query, carrel=name, result=results.to_html( classes=['display', 'compact'], border=0, table_id='results', index=False ))
	else                  : return Response( results.to_csv( index=False ), mimetype='text/csv' )
	

# full text search
@app.route( '/search' )
def search() :
	"""Search one of the indexes and return the hits in the requested format.

	Reads three query-string arguments: query, the search expression
	(defaults to 'love'); index, one of 'gutenberg', 'arxiv', 'cord',
	'pamphlets', 'carrels', 'journals', or 'ia' (defaults to
	'gutenberg'); and format, one of 'json' or 'html', with anything else
	producing CSV (defaults to 'csv').

	Returns a 400 response when the index is not recognized, and
	otherwise the search results with their columns trimmed and renamed
	for the given index.
	"""

	# configure; the locally indexed collections and their databases
	DATABASES = { 'gutenberg':GUTENBERG, 'arxiv':ARXIV, 'cord':CORD, 'pamphlets':PAMPHLETS, 'carrels':CARRELS, 'journals':JOURNALS }

	# get input
	query  = request.args.get( 'query', 'love' )
	output = request.args.get( 'format', 'csv' )
	index  = request.args.get( 'index', 'gutenberg' )

	# search; the Internet Archive has its own search function, everything else is a local database; an unknown index is the caller's mistake, so say so instead of failing later
	if   index == 'ia'      : results = searchIA( query )
	elif index in DATABASES : results = searchDB( DATABASES[ index ], query )
	else                    : return Response( 'Unknown index', status=400, mimetype='text/plain' )

	# each index has its own results template
	html = index + '-results.htm'

	# get the number of hits
	hits = len( results )

	# enhance; keep the interesting columns and give them common names
	if index == 'ia' :
		results = results.rename( columns={ 'download':'url' } )
		results = results.drop( columns=[ 'description' ] )

	elif index == 'journals' or index == 'cord' :
		results = results.drop( columns=[ 'abstract' ] )

	elif index == 'gutenberg' :
		results = results[ [ 'gid', 'author', 'title', 'subject', 'classification', 'url' ] ]
		results = results.rename( columns={ 'gid':'identifier', 'subject':'subjects' } )

	elif index == 'pamphlets' :
		results = results[ [ 'system', 'author', 'title', 'publisher', 'year', 'subjects', 'url' ] ]
		results = results.rename( columns={ 'system':'identifier', 'year':'date' } )

	elif index == 'carrels' :
		results = results[ [ 'identifier', 'titles', 'date_created', 'keywords', 'types', 'sources', 'items', 'words', 'flesch', 'processes', 'browse', 'read', 'download' ] ]
		results = results.rename( columns={ 'date_created':'date' , 'titles':'title', 'types':'type', 'sources':'source', 'processes':'process', 'download':'url'} )

	elif index == 'arxiv' :
		results = results.drop( columns=[ 'doi', 'submitter', 'license', 'landingPage', 'comments', 'report', 'abstract' ] )
		results = results.rename( columns={ 'pdf':'url', 'id':'identifier' } )

	# branch according to format
	if   output == 'json' : return Response( results.to_json( orient='records' ), mimetype='application/json' )
	elif output == 'html' : return render_template( html, hits=hits, query=query, result=results.to_html( classes=['display', 'compact'], border=0, table_id='results', index=False ))
	else                  : return Response( results.to_csv( index=False ), mimetype='text/csv' )