#!/usr/bin/env python

# ngrams.py - given a couple of configurations, output a frequency list of ngrams

# Eric Lease Morgan
# October 25, 2022

# require
from collections import Counter

# configure
FILE = './love-is.txt'
SIZE = 2


def read_ngrams(path, size):
    """Read the file at *path* and return its ngrams of length *size*.

    The text is tokenized with NLTK, tokens that are not purely alphabetic
    are dropped, and the remaining tokens are lower-cased before the ngrams
    are built. The result is a list of tuples, one tuple per ngram, in the
    order the ngrams occur in the text.
    """

    # imported here, not at the top of the file, so the tabulating and
    # formatting helpers below can be used without NLTK being installed
    import nltk

    # read the given input and normalize it
    with open(path) as handle:
        text = handle.read()
    tokens = nltk.word_tokenize(text, preserve_line=True)
    tokens = [token.lower() for token in tokens if token.isalpha()]

    # create a list of ngrams
    return list(nltk.ngrams(tokens, size))


def rank_ngrams(ngrams):
    """Count each ngram and return (ngram, count) pairs, most frequent first.

    Ngrams with equal counts stay in the order they were first seen, which
    is the same order a stable descending sort on the counts would give.
    """

    # count & tabulate each ngram, then sort the result
    return Counter(ngrams).most_common()


def format_row(ngram, count):
    """Return one output line: the ngram's tokens and its count, tab-delimited."""

    return '\t'.join(ngram) + '\t' + str(count)


def main():
    """Print a tab-delimited frequency list of the ngrams found in FILE."""

    # output and done
    for ngram, count in rank_ngrams(read_ngrams(FILE, SIZE)):
        print(format_row(ngram, count))


if __name__ == '__main__':
    main()