#!/usr/bin/env python

# ngrams.py - given a couple of configurations, output a frequency list of ngrams

# Eric Lease Morgan <emorgan@nd.edu>
# October 25, 2022


# configure
# FILE - path of the plain-text file to read; a relative path is resolved
#        against the current working directory
# SIZE - number of tokens in each ngram, e.g. 2 for bigrams, 3 for trigrams
FILE = './love-is.txt'
SIZE = 2

# require
from collections import Counter

import nltk

# read the given input; decode it explicitly as UTF-8 so the result does not
# depend on the platform's default encoding
with open( FILE, encoding='utf-8' ) as handle : text = handle.read()

# normalize; tokenize without first splitting the text into sentences, keep
# only the tokens made up entirely of letters, and lower-case them
tokens = nltk.word_tokenize( text, preserve_line=True )
tokens = [ token.lower() for token in tokens if token.isalpha() ]

# count & tabulate each ngram of the configured size; the counter consumes
# the ngrams as they are produced, so no intermediate list is needed
frequencies = Counter( nltk.ngrams( tokens, SIZE ) )

# output each ngram's tokens followed by its count, tab-delimited and most
# frequent first; ngrams with equal counts stay in order of first appearance
for ngram, count in frequencies.most_common() : print( *ngram, count, sep='\t' )

# done; the script simply ends here, which yields a zero exit status