huggingface-datasets-semantic-search / ragatouille_search.py
davanstrien's picture
davanstrien HF staff
add rag search
35db041
from pathlib import Path
from typing import Any, Dict, List
import gradio as gr
from huggingface_hub import snapshot_download
from ragatouille import RAGPretrainedModel
from toolz import unique
# Top-level variables
INDEX_PATH = Path(".ragatouille/colbert/indexes/my_index_with_ids_and_metadata/")
REPO_ID = "davanstrien/search-index"
INITIAL_QUERY = "hello world"
DEFAULT_K = 10
def initialize_index():
INDEX_PATH.mkdir(parents=True, exist_ok=True)
snapshot_download(REPO_ID, repo_type="dataset", local_dir=INDEX_PATH)
rag = RAGPretrainedModel.from_index(INDEX_PATH)
# Warm up index
rag.search(INITIAL_QUERY)
return rag
def format_results_as_markdown(results: List[Dict[str, Any]]) -> str:
markdown = ""
for result in results:
content = result["content"]
score = result["score"]
rank = result["rank"]
document_id = result["document_id"]
passage_id = result["passage_id"]
link = f"https://huggingface.co/datasets/{document_id}"
markdown += f"### Result {rank}\n"
markdown += f"**Score:** {score}\n\n"
markdown += f"**Document ID:** [{document_id}]({link})\n\n"
markdown += f"**Passage ID:** {passage_id}\n\n"
# Limit initial content display to 1000 characters
preview = f"{content[:1000]}..." if len(content) > 1000 else content
markdown += f"{preview}\n\n"
# Add expandable section for full content if it's longer than 1000 characters
if len(content) > 1000:
markdown += "<details>\n"
markdown += "<summary>Click to expand full content</summary>\n\n"
markdown += f"{content}\n\n"
markdown += "</details>\n\n"
markdown += "---\n\n"
return markdown
def search_with_ragatouille(query, k=DEFAULT_K, make_unique=False):
results = RAG.search(query, k=k)
if make_unique:
results = make_results_unique(results)
return format_results_as_markdown(results)
def make_results_unique(results: List[Dict[str, Any]]):
unique_results = unique(results, lambda x: x["document_id"])
return list(unique_results)
def create_ragatouille_interface():
with gr.Blocks() as ragatouille_demo:
gr.Markdown("### RAGatouille Dataset Search")
gr.Markdown(
"""This interface allows you to search inside dataset cards on the Hub using the [answerai-colbert-small-v1](https://huggingface.co/answerdotai/answerai-colbert-small-v1) ColBERT model via [RAGatouille](https://github.com/AnswerDotAI/RAGatouille). Please be aware that this is an early prototype and may not work as expected!
## Notes:
**Not all datasets are indexed yet!**
For a dataset to be indexed:
- It must have a dataset card on the Hub. You can find documentation on how to write a good dataset card [here](https://huggingface.co/docs/hub/datasets-cards).
- The dataset must have at least 1 like and 1 download
- The card must be a minimum length (to weed out low quality cards)
**At the moment the index is refreshed when I decide to do it, so it may not be up to date.** If there is sufficient interest I will implement a daily refresh (give this repo a like if you'd like this feature!)
Feel free to open a discussion to give feedback or request features &#129303;
"""
)
with gr.Column():
query = gr.Textbox(label="Search query", placeholder="medieval handwriting")
with gr.Row():
k = gr.Slider(1, 100, value=DEFAULT_K, step=1, label="Number of Results")
make_unique = gr.Checkbox(False, label="Show each dataset only once?")
search_button = gr.Button("Search")
search_button.click(
search_with_ragatouille,
inputs=[query, k, make_unique],
outputs=gr.Markdown(label="Results"),
)
return ragatouille_demo
# Initialize RAG globally
RAG = initialize_index()
def main():
demo = create_ragatouille_interface()
demo.launch()
if __name__ == "__main__":
main()