# neuroRAG / app.py
import os

import gradio as gr
from pymongo.mongo_client import MongoClient

from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
###### load LLM
model_url = "https://huggingface.co/georgesung/llama3_8b_chat_uncensored/resolve/main/llama3_8b_chat_uncensored_q4_0.gguf"
llm = LlamaCPP(
    # You can pass the URL of a GGUF model to download it automatically
    model_url=model_url,
    # optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.01,
    max_new_tokens=1024,
    # keep the context window below the model's maximum to leave some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set n_gpu_layers to at least 1 to use the GPU
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)
# Load the embedding model (sentence-transformers, via Hugging Face)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

# Global defaults picked up by the index and query engines built below
Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = SentenceSplitter(chunk_size=1024)
Settings.num_output = 256
Settings.context_window = 3900
# Load vector database
MONGO_URI = "mongodb+srv://groverorgrf:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"
os.environ["MONGODB_URI"] = MONGO_URI
DB_NAME = "neuroRAG"
COLLECTION_NAME = "neuro_books"
# Connect to your Atlas deployment
mongo_client = MongoClient(MONGO_URI)
collection = mongo_client[DB_NAME][COLLECTION_NAME]
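# One-time Atlas Vector Search index setup (a hedged sketch, not part of the
# original app): retrieval below assumes a vector index named "default" already
# exists on this collection. The field name "embedding" and the 384 dimensions
# (BAAI/bge-small-en) are assumptions; adjust them to match how the documents
# were ingested. The env-flag guard (CREATE_ATLAS_INDEX) is hypothetical and
# keeps this from running on every startup.
if os.getenv("CREATE_ATLAS_INDEX"):
    from pymongo.operations import SearchIndexModel

    collection.create_search_index(
        model=SearchIndexModel(
            definition={
                "fields": [
                    {
                        "type": "vector",
                        "path": "embedding",   # field where the vectors are stored (assumed)
                        "numDimensions": 384,  # bge-small-en embedding size (assumed)
                        "similarity": "cosine",
                    }
                ]
            },
            name="default",  # must match vector_index_name below
            type="vectorSearch",
        )
    )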
#
vector_store = MongoDBAtlasVectorSearch(mongo_client, db_name=DB_NAME, collection_name=COLLECTION_NAME, vector_index_name="default")
# Rebuild the index on top of the existing vector store; this assumes the
# documents were already embedded and inserted into the collection elsewhere
index = VectorStoreIndex.from_vector_store(vector_store)
########### FOR CHAT
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    top_k,
):
    # Collect the system prompt and prior turns as OpenAI-style message dicts.
    # Note: the query engine below is stateless and only answers the latest
    # message, so this list is not used yet (see the sketch after this
    # function for one way to wire it in).
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Build a retrieval query engine over the Atlas vector store
    query_engine = index.as_query_engine(similarity_top_k=int(top_k))

    # Answer the latest user message with retrieved context
    response = query_engine.query(message)
    return str(response)
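# A hedged sketch (not part of the original app) of how the collected history
# and system_message could actually be used: llama_index's "context" chat mode
# retrieves per turn and keeps the conversation. The system_prompt and
# similarity_top_k keyword forwarding are assumptions about the installed
# llama_index version; this helper is not wired into the UI, but it could be
# passed to gr.ChatInterface in place of respond.
def respond_with_history(message, history, system_message, top_k):
    from llama_index.core.llms import ChatMessage

    chat_engine = index.as_chat_engine(
        chat_mode="context",
        system_prompt=system_message,
        similarity_top_k=int(top_k),
    )
    chat_history = []
    for user_msg, assistant_msg in history:
        if user_msg:
            chat_history.append(ChatMessage(role="user", content=user_msg))
        if assistant_msg:
            chat_history.append(ChatMessage(role="assistant", content=assistant_msg))
    return str(chat_engine.chat(message, chat_history=chat_history))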
#
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="What is your question?", label="System message"),
        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="top-k"),
    ],
)
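# Gradio passes the additional_inputs values, in order, as the extra
# system_message and top_k arguments of respond() after (message, history).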
if __name__ == "__main__":
    demo.launch()