vectorDB-app / app.py
florentgbelidji's picture
update app.py
4adc61e
import gradio as gr
import pandas as pd
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Persistent storage is mounted to /data.
# NOTE(review): the path below is relative ("./data/..."), so it resolves
# against the current working directory rather than the absolute /data
# mount mentioned above -- confirm this is the intended location.
DB_URL = "./data/lancedb"
# Name of the LanceDB table that the functions in this file create and query.
TABLE_NAME = "pdf_table"
# Define the embedding function used by the table schema: take the "colbert"
# entry from LanceDB's embedding registry and create an instance of it for
# the "colbert-ir/colbertv2.0" model.
model = get_registry().get("colbert").create(name="colbert-ir/colbertv2.0")
class TextModel(LanceModel):
    """Row schema for the PDF chunk table: a text column plus its embedding.

    Both fields are declared through the module-level embedding function
    ``model``, which ties the vector column to the text column.
    """

    # Chunk text; marked as the source field of the embedding function.
    text: str = model.SourceField()
    # Embedding column, sized with ``model.ndims()`` and marked as the
    # embedding function's vector field.
    vector: Vector(model.ndims()) = model.VectorField()
# Insert the text chunks into the vector database (LanceDB).
def lanceDBConnection(df):
    """Recreate the chunk table in LanceDB and load ``df`` into it.

    The table named ``TABLE_NAME`` is created with ``mode="overwrite"`` and
    the ``TextModel`` schema, then ``df`` is added to it.

    Args:
        df: Rows to insert; handed unchanged to ``table.add``.

    Returns:
        The newly created table, after the rows have been added.
    """
    # Connect and (re)create the table in one step; "overwrite" is the mode
    # requested from LanceDB for a table name that may already be in use.
    chunk_table = lancedb.connect(DB_URL).create_table(
        TABLE_NAME,
        schema=TextModel,
        mode="overwrite",
    )
    chunk_table.add(df)
    return chunk_table
def _resolve_upload_path(file):
    """Return the filesystem path of an uploaded file.

    Accepts a path (``str`` or ``os.PathLike``), a dict carrying the path
    under ``"name"`` (or ``"path"``), or an object exposing it as ``.name``.

    Raises:
        ValueError: If no path can be determined (e.g. ``file`` is ``None``).
    """
    import os  # local import keeps this block self-contained

    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    if isinstance(file, dict):
        # A dict has no ``.name`` attribute; the path must be read by key.
        path = file.get("name") or file.get("path")
    else:
        # File-like upload objects expose their location as ``.name``.
        path = getattr(file, "name", None)
    if not path:
        raise ValueError("no PDF file was provided")
    return path


def get_pdf(file):
    """Load an uploaded PDF, split it into chunks and store them in LanceDB.

    Args:
        file: The upload as handed over by the UI: a path, a dict holding
            the path, or a file-like object with a ``.name`` attribute.

    Returns:
        A status string: a success message with the number of stored chunks,
        or ``"An error occurred:<details>"`` when the file could not be
        resolved or loaded.

    Errors raised while splitting or inserting are not caught here and
    propagate to the caller.
    """
    try:
        file_path = _resolve_upload_path(file)
        # Load the PDF using PyPDFLoader
        documents = PyPDFLoader(file_path).load()
    except Exception as e:  # UI boundary: surface the failure as status text
        return f"An error occurred:{e}"

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=50)
    docs_sp = text_splitter.split_documents(documents)
    texts = [chunk.page_content for chunk in docs_sp]
    df = pd.DataFrame({"text": texts})
    lanceDBConnection(df)
    return f"PDF uploaded successfully. Total number of documents: {len(df)}"
def get_nearest_neighbours(query):
    """Return the text of the stored chunks closest to ``query``.

    Searches the ``TABLE_NAME`` table with a limit of three results and
    returns their ``"text"`` values. The list always has exactly three
    entries: when the search yields fewer hits, it is padded with empty
    strings, because the question interface in this file maps the return
    value onto three text outputs.

    Args:
        query: The search text passed to ``table.search``.

    Returns:
        A list of three strings (chunk texts, then ``""`` padding if needed).
    """
    top_k = 3  # must match the number of output components in the UI
    table = lancedb.connect(DB_URL).open_table(TABLE_NAME)
    hits = table.search(query).limit(top_k).to_list()
    context = [hit["text"] for hit in hits]
    # Pad so there is always one value per output component.
    context.extend([""] * (top_k - len(context)))
    return context
# Tab 1: upload a PDF; get_pdf reports the outcome in a status box.
pdf_interface = gr.Interface(
    fn=get_pdf,
    inputs=[gr.File(label="Upload the PDF", file_types=[".pdf"])],
    outputs=[gr.Textbox(label="Status", lines=4)],
)

# Tab 2: enter a question; the result is shown in three text outputs.
question_interface = gr.Interface(
    fn=get_nearest_neighbours,
    inputs=[gr.Textbox(label="Enter your question")],
    outputs=["text"] * 3,
)

# Combine both interfaces into a single tabbed app and start it.
demo = gr.TabbedInterface(
    interface_list=[pdf_interface, question_interface],
    tab_names=["Upload Pdfs", "Get relevant chunks"],
    title="Save PDF chunks into LanceDB on persistent storage",
)
demo.launch(share=True)