#!/usr/bin/env python

# ask-me-anything.py - given a question, output list of matching-esque
# questions as well as their answers
#
# see: https://stackoverflow.com/questions/64792776/cosine-similarity-between-string-and-list-of-strings

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# October 23, 2023 - first investigations
# October 25, 2023 - started applying the process to a carrel as well as returning answers


# require
from pathlib import Path
from sys import argv, exit

from pandas import DataFrame, concat, read_csv

# configure
BULLET = ' *'
PROMPT = '\nAsk me anything: '
SALUTATION = '\nOkay, bye bye, and thank you.'
PATTERN = '*.csv'
THRESHOLDSMALL = 0.3
THRESHOLDMEDIUM = 0.5
THRESHOLDLARGE = 0.7

# columns every model file is expected to provide
COLUMNS = ['question', 'answer', 'score']


def scale_threshold(question):
    """Return the similarity threshold to apply to the given question.

    The threshold grows with the number of whitespace-delimited words in
    the question: one or two words (or none) use THRESHOLDSMALL, three or
    four use THRESHOLDMEDIUM, and anything longer uses THRESHOLDLARGE.
    """
    size = len(question.split())
    if size <= 2:
        return THRESHOLDSMALL
    if size <= 4:
        return THRESHOLDMEDIUM
    return THRESHOLDLARGE


def format_result(question, answer, score):
    """Return one line of output for a matching question.

    The score is rounded to two decimal places. An f-string is used rather
    than string concatenation so that answers which read_csv parsed as
    numbers (or NaN) are rendered instead of raising TypeError.
    """
    return f'{BULLET} {question} ({answer} / {round(score, 2)})'


def load_qanda(carrel):
    """Read every file matching PATTERN in the carrel into one DataFrame.

    Rows lacking a question or an answer are dropped because they can be
    neither vectorized nor printed. When the carrel contains no matching
    files an empty DataFrame with the expected columns is returned; the
    caller decides how to report that.
    """
    # sorted so the order of the rows does not depend on directory order
    frames = [read_csv(model) for model in sorted(Path(carrel).glob(PATTERN))]

    # concat refuses an empty list, so handle "no models" explicitly
    if not frames:
        return DataFrame(columns=COLUMNS)

    qanda = concat(frames, ignore_index=True)
    return qanda.dropna(subset=['question', 'answer'])


def rank_matches(question, questions, answers, scores, threshold):
    """Return the known questions most similar to the given question.

    The given question and all known questions are vectorized together with
    TF-IDF, and the cosine similarity between the given question and each
    known question is computed. The result is a DataFrame with the columns
    'distances', 'questions', 'answers', and 'scores', sorted by descending
    similarity and limited to rows whose similarity exceeds the threshold.
    """
    # imported here, not at the top of the file, so the helpers above stay
    # importable without scikit-learn and usage errors are reported quickly
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # row 0 of the model is the given question; the rest are the known ones
    model = TfidfVectorizer().fit_transform([question] + questions)

    # NOTE: despite the name, these are similarities; bigger is better
    distances = cosine_similarity(model[0, :], model[1:, :])

    results = DataFrame({
        'distances': distances[0],
        'questions': questions,
        'answers': answers,
        'scores': scores,
    }).sort_values('distances', ascending=False)

    return results[results['distances'] > threshold]


def main():
    """Prompt for questions until end-of-input and print similar ones."""
    # get input
    if len(argv) != 2:
        exit('Usage: ' + argv[0] + ' <carrel>')
    carrel = argv[1]

    # process each model; get lists of questions, answers, and scores
    qanda = load_qanda(carrel)
    if qanda.empty:
        exit('No questions found in ' + PATTERN + ' files of ' + carrel)
    questions = qanda['question'].astype(str).tolist()
    answers = qanda['answer'].tolist()
    scores = qanda['score'].tolist()

    # repeat forever, almost
    while True:

        # try to get input; end-of-file and control-c are the ways out
        try:
            question = input(PROMPT)
        except (EOFError, KeyboardInterrupt):
            # NOTE: exiting with a string writes it to stderr, status 1
            exit(SALUTATION)

        # scale the threshold, do the work, and output
        threshold = scale_threshold(question)
        matches = rank_matches(question, questions, answers, scores, threshold)
        for match in matches.itertuples(index=False):
            print(format_result(match.questions, match.answers, match.scores))


if __name__ == '__main__':
    main()