|
|
|
# CLI help epilog. Kept in sync with the argparse configuration in the
# __main__ block below (positional names, real default values, --db_type).
USAGE_EXAMPLE = """
Example usage:

To process input *.txt files at input_path and save the vector db output at output_db:

python create_vector_db.py input_path output_db --chunk_size 100 --chunk_overlap 10

Required arguments:

- input_path: Path to the input dir containing the .txt files

- output_db: Path to the output vector db.

Optional arguments:

- --chunk_size: Size of the chunks (default: 1000).

- --chunk_overlap: Overlap between chunks (default: 200).

- --db_type: Type of vectorstore — faiss, chromadb, qdrant, or qdrant-server (default: faiss).
"""
|
|
|
import argparse |
|
import logging |
|
import os |
|
|
|
from langchain.document_loaders import DirectoryLoader |
|
from langchain.embeddings import HuggingFaceInstructEmbeddings |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.vectorstores import FAISS, Chroma, Qdrant |
|
|
|
|
|
logging.basicConfig( |
|
level=logging.INFO, |
|
format="%(asctime)s [%(levelname)s] - %(message)s", |
|
handlers=[ |
|
logging.StreamHandler(), |
|
logging.FileHandler("create_vector_db.log"), |
|
], |
|
) |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
def parse_arguments():
    """Parse the legacy command-line interface for this script.

    NOTE(review): this helper appears to be dead code — the ``__main__``
    block below builds its own parser and never calls this. Kept with its
    original flags for backward compatibility; confirm before removing.

    Returns:
        argparse.Namespace with ``input_path``, ``output_path``,
        ``chunk_size`` and ``chunk_overlap`` attributes.
    """
    parser = argparse.ArgumentParser(description="Process command line arguments.")
    parser.add_argument("-input_path", type=dir_path, help="path to input directory")
    parser.add_argument("--chunk_size", type=int, help="chunk size for splitting")
    parser.add_argument("--chunk_overlap", type=int, help="chunk overlap for splitting")
    # Fixed copy-paste bug: this help text previously said "input directory".
    parser.add_argument("-output_path", type=dir_path, help="path to output directory")

    return parser.parse_args()
|
|
|
|
|
|
|
def dir_path(path):
    """argparse type-checker: return ``path`` unchanged when it names an
    existing directory, else raise ``ArgumentTypeError`` so argparse can
    report a usage error to the user."""
    if not os.path.isdir(path):
        raise argparse.ArgumentTypeError(f"readable_dir:{path} is not a valid path")
    return path
|
|
|
|
|
def main(input_path, output_db, chunk_size, chunk_overlap, db_type):
    """Load .txt files, split them into chunks, embed them, and persist a
    vector store.

    Args:
        input_path: Directory containing the ``*.txt`` files to ingest.
        output_db: Path where the vector store is persisted.
        chunk_size: Character length of each chunk.
        chunk_overlap: Character overlap between consecutive chunks.
        db_type: One of ``"faiss"``, ``"chromadb"``, ``"qdrant"``,
            ``"qdrant-server"``.

    Raises:
        ValueError: If ``db_type`` is not one of the supported backends.
    """
    loader = DirectoryLoader(input_path, glob="*.txt")
    docs = loader.load()
    logger.info("Total %d files loaded", len(docs))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    chunks = text_splitter.split_documents(docs)
    logger.info("Total %d chunks created", len(chunks))

    # BGE models want a query instruction but no document instruction;
    # normalized embeddings allow cosine similarity via inner product.
    encode_kwargs = {"normalize_embeddings": True}
    embedding_model = "BAAI/bge-large-en"
    embeddings = HuggingFaceInstructEmbeddings(
        model_name=embedding_model,
        embed_instruction="",
        query_instruction="Represent this sentence for searching relevant passages: ",
        encode_kwargs=encode_kwargs,
    )
    logger.info(
        "Processing embeddings using %s. This could take time depending on the number of chunks ...",
        embedding_model,
    )

    if db_type == "faiss":
        vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings)
        vectorstore.save_local(output_db)
    elif db_type == "chromadb":
        # NOTE(review): some langchain versions require an explicit
        # vectorstore.persist() to flush to persist_directory — verify
        # against the pinned langchain release.
        vectorstore = Chroma.from_documents(
            documents=chunks, embedding=embeddings, persist_directory=output_db
        )
    elif db_type == "qdrant":
        vectorstore = Qdrant.from_documents(
            documents=chunks,
            embedding=embeddings,
            path=output_db,
            collection_name="test_collection",
        )
    elif db_type == "qdrant-server":
        url = "http://localhost:6333/"
        vectorstore = Qdrant.from_documents(
            documents=chunks,
            embedding=embeddings,
            url=url,
            prefer_grpc=True,
            collection_name="anaconda",
        )
    else:
        # Previously an unknown db_type fell through silently, leaving
        # `vectorstore` unbound and logging a bogus success message.
        raise ValueError(f"Unsupported db_type: {db_type!r}")

    logger.info("Vector store saved to %s", output_db)
|
|
|
|
|
if __name__ == "__main__":
    # RawDescriptionHelpFormatter keeps the pre-formatted usage example
    # intact in --help output (previously USAGE_EXAMPLE was dead code).
    parser = argparse.ArgumentParser(
        description="Process data with optional chunking",
        epilog=USAGE_EXAMPLE,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument("input_path", type=str, help="Path to the input directory")
    parser.add_argument("output_db", type=str, help="Path to the output vectordb")

    parser.add_argument(
        "--chunk_size", type=int, default=1000, help="Chunk size (default: 1000)"
    )
    parser.add_argument(
        "--chunk_overlap", type=int, default=200, help="Chunk overlap (default: 200)"
    )
    parser.add_argument(
        "--db_type",
        type=str,
        default="faiss",
        # Restrict to the backends main() actually implements so typos are
        # rejected up front instead of failing mid-run after embedding.
        choices=["faiss", "chromadb", "qdrant", "qdrant-server"],
        help="Type of vectorstore (default: faiss)",
    )

    args = parser.parse_args()
    main(
        args.input_path,
        args.output_db,
        args.chunk_size,
        args.chunk_overlap,
        args.db_type,
    )
|
|