import os

import gradio as gr
from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel

from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
###### load LLM
model_url = "https://huggingface.co/georgesung/llama3_8b_chat_uncensored/resolve/main/llama3_8b_chat_uncensored_q4_0.gguf"

llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.01,
    max_new_tokens=1024,
    # Llama 3 8B has an 8192-token context window, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)
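
# Optional sanity check (an assumption, not part of the original app): a one-off
# completion to confirm the GGUF model loaded before wiring up the rest of the
# stack. llm.complete() is the standard llama_index LLM call.
# print(llm.complete("Say hello in one short sentence.").text)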
# load embedding model
# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")
Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = SentenceSplitter(chunk_size=1024)
Settings.num_output = 256
Settings.context_window = 3900

# Load vector database

# NOTE: prefer reading the connection string from the environment rather than
# hardcoding credentials in source
MONGO_URI = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://groverorgrf:[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
)
DB_NAME = "neuroRAG"
COLLECTION_NAME = "neuro_books"
# Connect to your Atlas deployment
mongo_client = MongoClient(MONGO_URI)
collection = mongo_client[DB_NAME][COLLECTION_NAME]
#
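# One-time setup sketch (an assumption, not part of the running app): if the Atlas
# vector search index named "default" does not exist yet, it can be created with
# the SearchIndexModel imported above. The "embedding" path follows LlamaIndex's
# MongoDBAtlasVectorSearch default field name, and 384 dimensions matches the
# BAAI/bge-small-en embedding model configured above.
#
# search_index = SearchIndexModel(
#     definition={
#         "fields": [
#             {
#                 "type": "vector",
#                 "path": "embedding",
#                 "numDimensions": 384,
#                 "similarity": "cosine",
#             }
#         ]
#     },
#     name="default",
#     type="vectorSearch",
# )
# collection.create_search_index(model=search_index)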

vector_store = MongoDBAtlasVectorSearch(
    mongo_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    vector_index_name="default",
)

# Recover the index from the existing vector store (no re-embedding needed)
index = VectorStoreIndex.from_vector_store(vector_store)
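
# Ingestion sketch (an assumption; the collection is presumed to be pre-populated):
# the documents were likely embedded and stored once with something like the
# snippet below, which is also why StorageContext is imported above. The "./data"
# path is hypothetical.
#
# from llama_index.core import SimpleDirectoryReader
# documents = SimpleDirectoryReader("./data").load_data()
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)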


########### FOR CHAT
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    top_k,
):
    # Gradio passes in the running chat history; the retrieval query engine below
    # is stateless, so the rebuilt message list is not fed into the query yet.
    messages = [{"role": "system", "content": system_message}]

    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # build the query engine with the user-selected number of retrieved chunks
    query_engine = index.as_query_engine(similarity_top_k=int(top_k))

    response = query_engine.query(message)
    # Gradio expects a string, not a llama_index Response object
    return str(response)
#
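# Possible extension (not in the original): the llama_index Response object also
# exposes the retrieved chunks, so citations could be surfaced in the chat, e.g.
# inside respond():
# for node in response.source_nodes:
#     print(node.score, node.node.get_content()[:100])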
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="What is your question?", label="System message"),
        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="top-k"),
    ],
)


if __name__ == "__main__":
    demo.launch()