#!/usr/bin/env python

# index.py - given a few configurations, index content against a large language model

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# January 2, 2024 - first cut, but really hacked upon for the past couple of weeks


# configure
CORPUS = 'corpus'    # directory handed to SimpleDirectoryReader as the input
STORAGE = 'index'    # directory where the resulting index is persisted

# require
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from pathlib import Path

# initialize
parser = SimpleNodeParser()
storage = Path(STORAGE)

# make sane; parents=True keeps this working should STORAGE ever be
# configured as a nested path, and exist_ok=True makes re-runs harmless
storage.mkdir(parents=True, exist_ok=True)

# create a list of Llama Index documents, index them, and save; the magic happens here
documents = SimpleDirectoryReader(CORPUS).load_data()
nodes = parser.get_nodes_from_documents(documents)
index = VectorStoreIndex(nodes)
index.storage_context.persist(persist_dir=storage)

# done; raise SystemExit directly rather than calling exit(), because exit()
# is only installed by the site module and is absent under `python -S`
raise SystemExit