nsarrazin committed
Commit 8d0f9c9 • 1 Parent(s): e712779

feat(rag): working rag tool with sources

Files changed (5):
  1. .gitattributes +1 -0
  2. .python-version +1 -0
  3. README.md +2 -2
  4. app.py +132 -0
  5. requirements.txt +4 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.python-version ADDED
@@ -0,0 +1 @@
+ rag-tool-template
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Rag Tool Template
3
  emoji: πŸ“Š
4
  colorFrom: indigo
5
  colorTo: blue
@@ -9,4 +9,4 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Rag Conmmunity Tool Template
3
  emoji: πŸ“Š
4
  colorFrom: indigo
5
  colorTo: blue
 
9
  pinned: false
10
  ---
11
 
12
+ Clone this space, add your documents to the `sources` folder and use your space directly from HuggingChat!
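
For those who prefer to script that setup, here is a minimal sketch using the `huggingface_hub` library; the source Space id `nsarrazin/rag-tool-template` and the local file name are illustrative assumptions, not confirmed by the commit:

# Sketch: duplicate the Space and add a document to its sources folder.
# Assumes you are authenticated (e.g. via `huggingface-cli login`) and
# that "nsarrazin/rag-tool-template" is the Space to clone (hypothetical id).
from huggingface_hub import duplicate_space, upload_file

repo = duplicate_space("nsarrazin/rag-tool-template")

upload_file(
    path_or_fileobj="my_document.pdf",       # hypothetical local file
    path_in_repo="sources/my_document.pdf",  # indexed when the Space starts
    repo_id=repo.repo_id,
    repo_type="space",
)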
app.py ADDED
@@ -0,0 +1,130 @@
+ import gradio as gr
+ import spaces
+ import os
+ import glob
+ from pypdf import PdfReader
+ from sentence_transformers import SentenceTransformer
+
+ model_name = os.environ.get("MODEL", "Snowflake/snowflake-arctic-embed-m")
+ chunk_size = int(os.environ.get("CHUNK_SIZE", 128))
+ default_max_characters = int(os.environ.get("DEFAULT_MAX_CHARACTERS", 258))
+
+ model = SentenceTransformer(model_name)
+ # model.to(device="cuda")
+
+ @spaces.GPU
+ def embed(queries, chunks) -> dict[str, list[tuple[int, float]]]:
+     query_embeddings = model.encode(queries, prompt_name="query")
+     document_embeddings = model.encode(chunks)
+
+     # Dot-product similarity between every query and every chunk
+     scores = query_embeddings @ document_embeddings.T
+     results = {}
+     for query, query_scores in zip(queries, scores):
+         chunk_idxs = list(range(len(chunks)))
+         # Get a structure like {query: [(chunk_idx, score), (chunk_idx, score), ...]}
+         results[query] = list(zip(chunk_idxs, query_scores))
+
+     return results
+
+
+ def extract_text_from_pdf(reader):
+     full_text = ""
+     for idx, page in enumerate(reader.pages):
+         text = page.extract_text()
+         if len(text) > 0:
+             full_text += f"---- Page {idx} ----\n" + text + "\n\n"
+
+     return full_text.strip()
+
+ def convert(filename) -> str:
+     plain_text_filetypes = [
+         ".txt",
+         ".csv",
+         ".tsv",
+         ".md",
+         ".yaml",
+         ".toml",
+         ".json",
+         ".json5",
+         ".jsonc",
+     ]
+     # Already a plain-text file that wouldn't benefit from pandoc, so return the content as-is
+     if any(filename.endswith(ft) for ft in plain_text_filetypes):
+         with open(filename, "r") as f:
+             return f.read()
+
+     if filename.endswith(".pdf"):
+         return extract_text_from_pdf(PdfReader(filename))
+
+     raise ValueError(f"Unsupported file type: {filename}")
+
+
+ def chunk_to_length(text, max_length=512):
+     # Split the text into fixed-length character chunks
+     chunks = []
+     while len(text) > max_length:
+         chunks.append(text[:max_length])
+         text = text[max_length:]
+     chunks.append(text)
+     return chunks
+
+ @spaces.GPU
+ def predict(query, max_characters) -> dict[str, list[str]]:
+     # Embed the query
+     query_embedding = model.encode(query, prompt_name="query")
+
+     # Collect all chunks and their similarities across all documents
+     all_chunks = []
+
+     # Iterate through all documents
+     for filename, doc in docs.items():
+         # Calculate the dot product between query and document embeddings
+         similarities = doc["embeddings"] @ query_embedding.T
+
+         # Add chunks and similarities to the all_chunks list
+         all_chunks.extend([(filename, chunk, sim) for chunk, sim in zip(doc["chunks"], similarities)])
+
+     # Sort all chunks by similarity, most similar first
+     all_chunks.sort(key=lambda x: x[2], reverse=True)
+
+     # Group the relevant chunks by source document
+     relevant_chunks = {}
+
+     # Add the most relevant chunks until max_characters is reached
+     total_chars = 0
+     for filename, chunk, _ in all_chunks:
+         if total_chars + len(chunk) <= max_characters:
+             if filename not in relevant_chunks:
+                 relevant_chunks[filename] = []
+             relevant_chunks[filename].append(chunk)
+             total_chars += len(chunk)
+         else:
+             break
+
+     return relevant_chunks
+
+
+ # Index every file in the sources folder at startup
+ docs = {}
+
+ for filename in glob.glob("sources/*"):
+     converted_doc = convert(filename)
+
+     chunks = chunk_to_length(converted_doc, chunk_size)
+     embeddings = model.encode(chunks)
+
+     docs[filename] = {
+         "chunks": chunks,
+         "embeddings": embeddings,
+     }
+
+
+ gr.Interface(
+     predict,
+     inputs=[
+         gr.Textbox(label="Query asked about the documents"),
+         gr.Number(label="Max output characters", value=default_max_characters),
+     ],
+     outputs=[gr.JSON(label="Relevant chunks")],
+ ).launch()
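
Once the Space is running, the `predict` endpoint can also be queried programmatically. A minimal sketch with `gradio_client` follows; the Space id is a hypothetical placeholder, so substitute your own duplicate:

# Sketch: query the running Space's predict endpoint.
from gradio_client import Client

client = Client("your-username/rag-tool-template")  # hypothetical Space id

# Arguments mirror the gr.Interface inputs above: the query textbox and the
# max-characters number field; the result is {filename: [chunk, ...]}.
relevant_chunks = client.predict(
    "What do the documents say about pricing?",
    258,
    api_name="/predict",
)
print(relevant_chunks)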
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ pypdf==4.2.0
+ sentence-transformers==3.0.0
+ gradio
+ spaces