Spaces:
Sleeping
Sleeping
from llama_index.core import ( | |
VectorStoreIndex | |
) | |
from llama_index.core import Settings | |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
from llama_index.vector_stores.qdrant import QdrantVectorStore | |
from qdrant_client import QdrantClient | |
from typing import Any, List, Tuple | |
import torch | |
from transformers import AutoTokenizer, AutoModelForMaskedLM | |
import streamlit as st | |
from llama_index.llms.huggingface import ( | |
HuggingFaceInferenceAPI | |
) | |
import os | |
# Credentials and endpoints are read from the environment; each value is None
# when the corresponding variable is not set.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
Q_END_POINT = os.environ.get("Q_END_POINT")
Q_API_KEY = os.environ.get("Q_API_KEY")

# SPLADE sparse encoders used for hybrid search. Setup follows
# https://docs.llamaindex.ai/en/stable/examples/vector_stores/qdrant_hybrid.html
device = "cuda:0" if torch.cuda.is_available() else "cpu"


@st.cache_resource(show_spinner=False)
def _load_splade(model_name: str):
    """Load one SPLADE tokenizer/model pair and place the model on `device`.

    Streamlit re-executes this script on every user interaction; caching the
    loaded objects avoids re-reading the checkpoints on each rerun.

    Args:
        model_name: Hugging Face model id of the SPLADE checkpoint.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
    return tokenizer, model


doc_tokenizer, doc_model = _load_splade(
    "naver/efficient-splade-VI-BT-large-doc"
)
query_tokenizer, query_model = _load_splade(
    "naver/efficient-splade-VI-BT-large-query"
)
def sparse_doc_vectors(
    texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
    """
    Compute sparse vectors for a batch of document texts.

    The masked-LM logits are passed through ReLU and log(1 + x), padded
    positions are zeroed with the attention mask, and the maximum over the
    sequence dimension (dim=1) is taken for each text.

    Args:
        texts: Document strings to encode.

    Returns:
        A tuple ``(indices, vecs)`` with one entry per input text:
        ``indices[i]`` holds the positions of the non-zero weights and
        ``vecs[i]`` the weights at those positions. Both are empty lists
        when ``texts`` is empty.
    """
    if not texts:
        return [], []

    tokens = doc_tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt"
    )
    # Inputs must be on the same device as the model; a hard-coded ordinal
    # breaks on single-GPU hosts and mismatches the model's placement.
    tokens = tokens.to(doc_model.device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = doc_model(**tokens)

    logits, attention_mask = output.logits, tokens.attention_mask
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    tvecs, _ = torch.max(weighted_log, dim=1)

    # extract the vectors that are non-zero and their indices
    indices = []
    vecs = []
    for row in tvecs:
        nonzero = row.nonzero(as_tuple=True)[0]
        indices.append(nonzero.tolist())
        vecs.append(row[nonzero].tolist())

    return indices, vecs
def sparse_query_vectors(
    texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
    """
    Compute sparse vectors for a batch of query texts.

    The masked-LM logits are passed through ReLU and log(1 + x), padded
    positions are zeroed with the attention mask, and the maximum over the
    sequence dimension (dim=1) is taken for each text.

    Args:
        texts: Query strings to encode.

    Returns:
        A tuple ``(indices, vecs)`` with one entry per input text:
        ``indices[i]`` holds the positions of the non-zero weights and
        ``vecs[i]`` the weights at those positions. Both are empty lists
        when ``texts`` is empty.
    """
    if not texts:
        return [], []

    # TODO: compute sparse vectors in batches if max length is exceeded
    tokens = query_tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt"
    )
    # Inputs must be on the same device as the model; a hard-coded ordinal
    # breaks on single-GPU hosts and mismatches the model's placement.
    tokens = tokens.to(query_model.device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = query_model(**tokens)

    logits, attention_mask = output.logits, tokens.attention_mask
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    tvecs, _ = torch.max(weighted_log, dim=1)

    # extract the vectors that are non-zero and their indices
    indices = []
    vecs = []
    for row in tvecs:
        nonzero = row.nonzero(as_tuple=True)[0]
        indices.append(nonzero.tolist())
        vecs.append(row[nonzero].tolist())

    return indices, vecs
st.header("Chat with the Bhagavad Gita docs π¬ π")

# Seed the chat history with a greeting the first time a session runs the
# script; later reruns keep whatever is already stored.
if "messages" not in st.session_state:
    st.session_state["messages"] = [
        {"role": "assistant", "content": "Ask me a question about Gita!"},
    ]
# Connect to the Qdrant instance named by Q_END_POINT (remote service, not a
# local on-disk store); the collection is expected to exist already.
client = QdrantClient(
    Q_END_POINT,
    api_key=Q_API_KEY,
)

# create our vector store with hybrid indexing enabled
# batch_size controls how many nodes are encoded with sparse vectors at once
vector_store = QdrantVectorStore(
    "bhagavad_gita",
    client=client,
    enable_hybrid=True,
    batch_size=20,
    force_disable_check_same_thread=True,
    sparse_doc_fn=sparse_doc_vectors,
    sparse_query_fn=sparse_query_vectors,
)

llm = HuggingFaceInferenceAPI(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    token=HUGGINGFACEHUB_API_TOKEN,
    context_window=8096,
)
Settings.llm = llm
# The attribute is `tokenizer`; a misspelled name only creates an unused
# attribute and leaves token counting on the library default.
Settings.tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct"
)

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5", device="cpu")
Settings.embed_model = embed_model

# Build the index on top of the existing vector store (no re-ingestion here).
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, embed_model=embed_model
)

from llama_index.core.memory import ChatMemoryBuffer

# Streamlit reruns this script on every interaction, so the conversation
# memory is kept in session state; otherwise each question would be condensed
# against an empty history.
if "chat_memory" not in st.session_state:
    st.session_state["chat_memory"] = ChatMemoryBuffer.from_defaults(
        token_limit=1500
    )
memory = st.session_state["chat_memory"]

chat_engine = index.as_chat_engine(
    chat_mode="condense_question",
    verbose=True,
    memory=memory,
    sparse_top_k=10,
    vector_store_query_mode="hybrid",
    similarity_top_k=3,
)
# Read the user's question (None when nothing was submitted on this run) and
# record it in the chat history.
prompt = st.chat_input("Your question")
if prompt:
    st.session_state.messages.append({"role": "user", "content": prompt})

# Replay the whole conversation so far.
for entry in st.session_state.messages:
    with st.chat_message(entry["role"]):
        st.write(entry["content"])

# A trailing non-assistant message means a question is still unanswered:
# query the chat engine, show the answer, and store it in the history.
if st.session_state.messages[-1]["role"] != "assistant":
    with st.chat_message("assistant"), st.spinner("Thinking..."):
        response = chat_engine.chat(prompt)
        st.write(response.response)
        st.session_state.messages.append(
            {"role": "assistant", "content": response.response}
        )