import os

import gradio as gr
from llama_index.core import VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel

###### load LLM
model_url = "https://huggingface.co/georgesung/llama3_8b_chat_uncensored/resolve/main/llama3_8b_chat_uncensored_q4_0.gguf"

llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.01,
    max_new_tokens=256,  # could be larger but requires more time
    # llama3 supports a longer context window, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)

# load embedding model (sentence transformers)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = SentenceSplitter(chunk_size=1024)
Settings.num_output = 256
Settings.context_window = 3900

# Load vector database
MONGO_URI = os.getenv("MONGO_URI")
os.environ["MONGODB_URI"] = MONGO_URI
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

# Connect to your Atlas deployment
mongo_client = MongoClient(MONGO_URI)
collection = mongo_client[DB_NAME][COLLECTION_NAME]

vector_store = MongoDBAtlasVectorSearch(
    mongo_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    vector_index_name="default",
)

# Recover the index from the existing vector store
index = VectorStoreIndex.from_vector_store(vector_store)


########### FOR CHAT
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    top_k,
):
    # Collect the system message and prior turns in OpenAI-style format.
    # Note: the query engine below does not consume this history; each
    # question is answered independently against the index.
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    # build the query engine
    query_engine = index.as_query_engine(similarity_top_k=top_k)

    query_str = message
    response = query_engine.query(query_str)

    return str(response)


"""
For information on how to customize the ChatInterface, peruse the gradio docs:
https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="What is your question?", label="System message"),
        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="top-k"),
    ],
)

if __name__ == "__main__":
    demo.launch()
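

# ---------------------------------------------------------------------------
# One-time setup sketch (not called by the app above). SearchIndexModel is
# imported but never used; this shows how the "default" vector search index
# could be created with it. Assumptions not confirmed by the original code:
# the default "embedding" field path written by MongoDBAtlasVectorSearch, the
# 384-dimensional output of BAAI/bge-small-en, and cosine similarity.
def create_vector_search_index():
    search_index_model = SearchIndexModel(
        definition={
            "fields": [
                {
                    "type": "vector",
                    "path": "embedding",  # assumed default field path
                    "numDimensions": 384,  # bge-small-en embedding size
                    "similarity": "cosine",  # assumed similarity metric
                }
            ]
        },
        name="default",  # must match vector_index_name above
        type="vectorSearch",
    )
    collection.create_search_index(model=search_index_model)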
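

# ---------------------------------------------------------------------------
# Hedged alternative (illustrative, not wired into the UI): respond() collects
# the chat history, but the query engine ignores it. A llama-index chat engine
# in "context" mode could use that history instead. ChatMessage and
# as_chat_engine are standard llama-index APIs; respond_with_history is a
# hypothetical helper, not part of the original app.
from llama_index.core.llms import ChatMessage


def respond_with_history(message, history, system_message, top_k):
    # Retrieval-augmented chat engine that accepts a system prompt and history.
    chat_engine = index.as_chat_engine(
        chat_mode="context",
        similarity_top_k=top_k,
        system_prompt=system_message,
    )
    # Convert Gradio's (user, assistant) tuples into llama-index ChatMessages.
    chat_history = []
    for user_msg, assistant_msg in history:
        if user_msg:
            chat_history.append(ChatMessage(role="user", content=user_msg))
        if assistant_msg:
            chat_history.append(ChatMessage(role="assistant", content=assistant_msg))
    response = chat_engine.chat(message, chat_history=chat_history)
    return str(response)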