#!/usr/bin/env python

# ner-carrels.py - given a number of configurations, output named-entities (human values)
#
# Usage: ner-carrels.py <carrel>
#
# For every file matching PATTERN in the given carrel's text directory, run
# the spaCy pipeline stored at MODEL and print one tab-delimited row per
# named entity to standard output: the file's stem, then the lower-cased
# entity text.

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# February 22, 2024 - first cut; a long time coming


# configure
MODEL   = './model/model-best'                               # spaCy pipeline to load
LIBRARY = '/shared/reader-patron-library/ericleasemorgan'    # root directory of the carrels
PATTERN = '*.txt'                                            # glob selecting the files to process
SIZE    = 2207242                                            # longest text, in characters, the pipeline will accept
                                                             # (presumably the largest file in the library -- TODO confirm)

# require
from spacy import load
from rdr import TXT
from pathlib import Path
from sys import argv, exit
from os import stat    # NOTE(review): not used by the code below; kept in case other tooling relies on it


def main() :
    """Extract named entities from every text file in the carrel named on the command line.

    Reads the carrel name from argv[ 1 ], and writes rows of the form
    "<file stem>\\t<entity>" to standard output.

    Exits with a message on standard error (and a non-zero status) when the
    command line is malformed or when the carrel's text directory does not
    exist.
    """

    # get input; the script takes exactly one argument, the name of a carrel
    if len( argv ) != 2 : exit( "Usage: " + argv[ 0 ] + " <carrel>" )
    carrel = argv[ 1 ]

    # initialize; TXT comes from rdr and is presumably the name of the
    # carrel's plain-text subdirectory -- verify against rdr
    library = Path( LIBRARY )
    corpus  = library/carrel/TXT

    # fail early and loudly; globbing a missing directory yields nothing, so
    # a mistyped carrel name would otherwise load the model and exit silently
    if not corpus.is_dir() : exit( "Error: " + str( corpus ) + " is not a directory" )

    # load the model, and raise the pipeline's length limit so texts of up to
    # SIZE characters are accepted
    nlp            = load( MODEL )
    nlp.max_length = SIZE + 1

    # process each file in the given corpus
    for file in corpus.glob( PATTERN ) :

        # read and model the given file; the encoding is explicit so the
        # result does not depend on the locale of the host
        with open( file, encoding='utf-8' ) as handle : text = handle.read()
        doc = nlp( text )

        # output; runs of whitespace inside an entity are collapsed to single
        # spaces because an embedded tab or newline would break the
        # two-column, tab-delimited rows
        for entity in doc.ents :
            value = ' '.join( entity.text.lower().split() )
            print( '\t'.join( [ file.stem, value ] ) )


# do the work only when run as a script
if __name__ == '__main__' :
    main()

    # done
    exit()