# gradiopdf / app.py
# Noobian's picture
# Rename index.py to app.py
# 501e7d5
import os
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.llms import HuggingFaceHub
from PyPDF2 import PdfReader
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before
# any client is constructed (presumably API tokens for the Hugging Face /
# OpenAI integrations imported above -- confirm against deployment config).
load_dotenv()
def create_index(file_path: str) -> None:
    """Build and persist a Chroma vector index from the text of one PDF.

    The PDF at ``file_path`` is read page by page, its text is written to
    ``output.txt`` in the current working directory, split into overlapping
    chunks, embedded with the ``hkunlp/instructor-xl`` model, and stored in
    a Chroma database persisted under the ``db`` directory.

    Only the text extracted from ``file_path`` is indexed; other ``.txt``
    files that happen to live in the working directory are ignored.

    Args:
        file_path: Path of the PDF file to index.
    """
    reader = PdfReader(file_path)
    # A page without extractable text (e.g. a scanned image) may produce an
    # empty or missing result; treat it as an empty string instead of failing.
    text = ''.join(page.extract_text() or '' for page in reader.pages)

    # Write and read back with an explicit encoding so non-ASCII characters
    # survive regardless of the platform's default locale.
    text_path = 'output.txt'
    with open(text_path, 'w', encoding='utf-8') as file:
        file.write(text)

    # Load only the file produced above. A recursive '**/*.txt' scan of the
    # working directory would also pull in unrelated files such as
    # requirements.txt and pollute the index.
    documents = TextLoader(text_path, encoding='utf-8').load()

    text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=1024,
        chunk_overlap=128,
    )
    texts = text_splitter.split_documents(documents)

    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    persist_directory = 'db'
    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    # Flush the collection to disk so it can be reopened by a later process.
    vectordb.persist()
# Build the index for the bundled sample document when this module runs.
create_index(file_path='sample.pdf')