#!/usr/bin/env python

# summarize.py - given a file, use an LLM to summarize it
# see: https://medium.com/@ryver.dev/building-a-simple-ai-powered-text-summarizer-with-transformers-in-python-0a31c848e1d2

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# April 28, 2024 - first cut


# configure
MODEL      = 'philschmid/bart-large-cnn-samsum'  # BART checkpoint fine-tuned for summarization
TENSOR     = 'pt'                                # tokenizer returns PyTorch tensors
LENGTH     = 1024                                # maximum number of input tokens fed to the model
TRUNCATION = True                                # truncate input longer than LENGTH instead of erroring
PROMPT     = 'summarize: '                       # instruction prepended to the text before encoding
PENALTY    = 2                                   # length penalty applied during beam search
BEAMS      = 4                                   # number of beams for beam search
EARLY      = True                                # stop beam search once BEAMS candidates are finished
SKIPTOKENS = True                                # drop special tokens (bos/eos/pad) from the decoded summary
MAXIMUM    = 150                                 # maximum length (in tokens) of the generated summary
MINIMUM    = 50                                  # minimum length (in tokens) of the generated summary

# require
from pathlib import Path
from sys import argv, exit

from transformers import BartForConditionalGeneration, BartTokenizer

# get input; the original usage string ended in a bare space where the
# file placeholder belonged, so the hint was invisible to the user
if len( argv ) != 2 : exit( "Usage: " + argv[ 0 ] + " <file>" )
file = argv[ 1 ]

# initialize; from_pretrained downloads/caches the checkpoint on first use
tokenizer = BartTokenizer.from_pretrained( MODEL )
model     = BartForConditionalGeneration.from_pretrained( MODEL )

# read the given file; Path was imported but previously unused, and an
# explicit encoding avoids platform-dependent decoding surprises
text = Path( file ).read_text( encoding='utf-8' )

# do the work: encode the prompt plus text, generate token identifiers
# via beam search, then decode them back into a human-readable summary
inputs      = tokenizer.encode( PROMPT + text, return_tensors=TENSOR, max_length=LENGTH, truncation=TRUNCATION )
identifiers = model.generate( inputs, max_length=MAXIMUM, min_length=MINIMUM, length_penalty=PENALTY, num_beams=BEAMS, early_stopping=EARLY )
summary     = tokenizer.decode( identifiers[ 0 ], skip_special_tokens=SKIPTOKENS )

# output and done
print( summary )
exit()