Ritvik19 committed
Commit
c323312
1 Parent(s): 7793370

Upload 8 files

Files changed (6)
  1. app.py +119 -83
  2. chain_of_density.py +42 -0
  3. chat_chains.py +33 -27
  4. command_center.py +6 -0
  5. custom_exceptions.py +6 -0
  6. process_documents.py +20 -9
app.py CHANGED
@@ -8,32 +8,20 @@ import json
 from langchain.callbacks import get_openai_callback
 from langchain_openai import ChatOpenAI
 import base64
-from chat_chains import rag_chain, parse_model_response
-from langchain_core.messages import AIMessage, HumanMessage
-from autoqa_chains import auto_qa_chain, followup_qa_chain, auto_qa_output_parser
+from chat_chains import (
+    parse_model_response,
+    qa_chain,
+    format_docs,
+    parse_context_and_question,
+    ai_response_format,
+)
+from autoqa_chains import auto_qa_chain, auto_qa_output_parser
+from chain_of_density import chain_of_density_chain
+from custom_exceptions import InvalidArgumentError, InvalidCommandError
 
 st.set_page_config(layout="wide")
 os.environ["OPENAI_API_KEY"] = "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"
 
-format_citations = lambda citations: "\n\n".join(
-    [f"{citation['quote']} ... [{citation['source_id']}]" for citation in citations]
-)
-
-
-def session_state_2_llm_chat_history(session_state):
-    chat_history = []
-    for ss in session_state:
-        if not ss[0].startswith("/"):
-            chat_history.append(HumanMessage(content=ss[0]))
-            chat_history.append(AIMessage(content=ss[1]))
-    return chat_history
-
-
-ai_message_format = lambda message, references: (
-    f"{message}\n\n---\n\n{format_citations(references)}"
-    if references != ""
-    else message
-)
 
 welcome_message = """
 Hi I'm Agent Zeta, your AI assistant, dedicated to making your journey through machine learning research papers as insightful and interactive as possible. Whether you're diving into the latest studies or brushing up on foundational papers, I'm here to help navigate, discuss, and analyze content with you.
@@ -42,17 +30,20 @@ Here's a quick guide to getting started with me:
 
 | Command | Description |
 |---------|-------------|
-| `/upload` <list of urls> | Upload and process documents for our conversation. |
-| `/index` | View an index of processed documents to easily navigate your research. |
-| `/cost` | Calculate the cost of our conversation, ensuring transparency in resource usage. |
-| `/download` | Download conversation data for your records or further analysis. |
-| `/auto` <document id> | Automatically generate questions and answers for a document. |
+| `/add-papers <list of urls>` | Upload and process documents for our conversation. |
+| `/library` | View an index of processed documents to easily navigate your research. |
+| `/session-expense` | Calculate the cost of our conversation, ensuring transparency in resource usage. |
+| `/export` | Download conversation data for your records or further analysis. |
+| `/auto-insight <document id>` | Automatically generate questions and answers for a document. |
+| `/deep-dive [<list of document ids>] <query>` | Query the AI with a specific document context. |
+| `/condense-summary <document id>` | Generate increasingly concise, entity-dense summaries of a document. |
+
 
 <br>
 
 Feel free to use these commands to enhance your research experience. Let's embark on this exciting journey of discovery together!
 
-Use `/man` at any point of time to view this guide again.
+Use `/help-me` at any point of time to view this guide again.
 """
 
@@ -64,28 +55,26 @@ def process_documents_wrapper(inputs):
         [snip.metadata["chunk_id"], snip.metadata["header"]] for snip in snippets
     ]
     response = f"Uploaded and processed documents {inputs}"
-    st.session_state.messages.append((f"/upload {inputs}", response, ""))
+    st.session_state.messages.append((f"/add-papers {inputs}", response, "identity"))
     st.session_state.documents = documents
-    return response
+    return (response, "identity")
 
 
 def index_documents_wrapper(inputs=None):
-    response = pd.DataFrame(
-        st.session_state.index, columns=["id", "reference"]
-    ).to_markdown()
-    st.session_state.messages.append(("/index", response, ""))
-    return response
+    response = pd.DataFrame(st.session_state.index, columns=["id", "reference"])
+    st.session_state.messages.append(("/library", response, "dataframe"))
+    return (response, "dataframe")
 
 
 def calculate_cost_wrapper(inputs=None):
     try:
         stats_df = pd.DataFrame(st.session_state.costing)
         stats_df.loc["total"] = stats_df.sum()
-        response = stats_df.to_markdown()
+        response = stats_df
     except ValueError:
         response = "No cost incurred yet"
-    st.session_state.messages.append(("/cost", response, ""))
-    return response
+    st.session_state.messages.append(("/session-expense", response, "dataframe"))
+    return (response, "dataframe")
 
 
 def download_conversation_wrapper(inputs=None):
@@ -100,7 +89,7 @@ def download_conversation_wrapper(inputs=None):
                 st.session_state.index if "index" in st.session_state else []
             ),
             "conversation": [
-                {"human": message[0], "ai": message[1], "references": message[2]}
+                {"human": message[0], "ai": jsonify_functions[message[2]](message[1])}
                 for message in st.session_state.messages
            ],
            "costing": (
@@ -117,25 +106,22 @@ def download_conversation_wrapper(inputs=None):
         }
     )
     conversation_data = base64.b64encode(conversation_data.encode()).decode()
-    st.session_state.messages.append(("/download", "Conversation data downloaded", ""))
-    return f'<a href="data:text/csv;base64,{conversation_data}" download="conversation_data.json">Download Conversation</a>'
+    st.session_state.messages.append(
+        ("/export", "Conversation data downloaded", "identity")
+    )
+    return (
+        f'<a href="data:text/csv;base64,{conversation_data}" download="conversation_data.json">Download Conversation</a>',
+        "identity",
+    )
 
 
-def query_llm_wrapper(inputs):
-    retriever = st.session_state.retriever
-    qa_chain = rag_chain(
-        retriever, ChatOpenAI(model="gpt-4-0125-preview", temperature=0)
-    )
-    relevant_docs = retriever.get_relevant_documents(inputs)
+def query_llm(inputs, relevant_docs):
     with get_openai_callback() as cb:
-        response = qa_chain.invoke(
-            {
-                "question": inputs,
-                "chat_history": session_state_2_llm_chat_history(
-                    st.session_state.messages
-                ),
-            }
-        ).content
+        response = (
+            qa_chain(ChatOpenAI(model="gpt-4-0125-preview", temperature=0))
+            .invoke({"context": format_docs(relevant_docs), "question": inputs})
+            .content
+        )
         stats = cb
     response = parse_model_response(response)
     answer = response["answer"]
@@ -147,7 +133,6 @@ def query_llm_wrapper(inputs):
                 f"[{ref}]"
                 for ref in sorted(
                     [ref.metadata["chunk_id"] for ref in relevant_docs],
-                    key=lambda x: int(x.split("_")[1]),
                 )
             ]
         ),
@@ -155,7 +140,41 @@ def query_llm_wrapper(inputs):
         }
     )
 
-    st.session_state.messages.append((inputs, answer, citations))
+    st.session_state.messages.append(
+        (inputs, {"answer": answer, "citations": citations}, "reponse_with_citations")
+    )
+    st.session_state.costing.append(
+        {
+            "prompt tokens": stats.prompt_tokens,
+            "completion tokens": stats.completion_tokens,
+            "cost": stats.total_cost,
+        }
+    )
+    return ({"answer": answer, "citations": citations}, "reponse_with_citations")
+
+
+def rag_llm_wrapper(inputs):
+    retriever = st.session_state.retriever
+    relevant_docs = retriever.get_relevant_documents(inputs)
+    return query_llm(inputs, relevant_docs)
+
+
+def query_llm_wrapper(inputs):
+    context, question = parse_context_and_question(inputs)
+    relevant_docs = [st.session_state.documents[c] for c in context]
+    print(context, question)
+    return query_llm(question, relevant_docs)
+
+
+def chain_of_density_wrapper(inputs):
+    if inputs == "":
+        raise InvalidArgumentError("Please provide a document id")
+    document = st.session_state.documents[inputs].page_content
+    llm = ChatOpenAI(model="gpt-4-turbo-preview", temperature=0)
+    with get_openai_callback() as cb:
+        summary = chain_of_density_chain(llm).invoke({"paper": document})
+        stats = cb
+    st.session_state.messages.append(("/condense-summary", summary, "identity"))
     st.session_state.costing.append(
         {
             "prompt tokens": stats.prompt_tokens,
@@ -163,11 +182,13 @@ def query_llm_wrapper(inputs):
             "cost": stats.total_cost,
         }
     )
-    return answer, citations
+    return (summary, "identity")
 
 
 def auto_qa_chain_wrapper(inputs):
-    document = st.session_state.documents[inputs]
+    if inputs == "":
+        raise InvalidArgumentError("Please provide a document id")
+    document = st.session_state.documents[inputs].page_content
     llm = ChatOpenAI(model="gpt-4-turbo-preview", temperature=0)
     auto_qa_conversation = []
     with get_openai_callback() as cb:
@@ -176,15 +197,15 @@ def auto_qa_chain_wrapper(inputs):
             "questions"
         ]
         auto_qa_conversation = [
-            (f'/auto {qa["question"]}', qa["answer"], "")
+            (f'/auto {qa["question"]}', qa["answer"], "identity")
             for qa in auto_qa_response_parsed
        ]
        stats = cb
     st.session_state.messages.append(
-        (f"/auto {inputs}", "Auto Convervation Generated", "")
+        (f"/auto-insight {inputs}", "Auto Convervation Generated", "identity")
     )
     for qa in auto_qa_conversation:
-        st.session_state.messages.append((qa[0], qa[1], ""))
+        st.session_state.messages.append((qa[0], qa[1], "identity"))
 
     st.session_state.costing.append(
         {
@@ -193,12 +214,16 @@ def auto_qa_chain_wrapper(inputs):
             "cost": stats.total_cost,
         }
     )
-    return "\n\n".join(
-        f"Q: {qa['question']}\n\nA: {qa['answer']}" for qa in auto_qa_response_parsed
+    return (
+        "\n\n".join(
+            f"Q: {qa['question']}\n\nA: {qa['answer']}"
+            for qa in auto_qa_response_parsed
+        ),
+        "identity",
     )
 
 
-def boot(command_center):
+def boot(command_center, formating_functions):
     st.write("# Agent Zeta")
     if "costing" not in st.session_state:
         st.session_state.costing = []
@@ -208,34 +233,45 @@ def boot(command_center):
     for message in st.session_state.messages:
         st.chat_message("human").write(message[0])
         st.chat_message("ai").write(
-            ai_message_format(message[1], message[2]), unsafe_allow_html=True
+            formating_functions[message[2]](message[1]), unsafe_allow_html=True
        )
     if query := st.chat_input():
-        st.chat_message("human").write(query)
-        response = command_center.execute_command(query)
-        if response is None:
-            pass
-        elif type(response) == tuple:
-            result, references = response
+        try:
+            st.chat_message("human").write(query)
+            response, format_fn_name = command_center.execute_command(query)
            st.chat_message("ai").write(
-                ai_message_format(result, references), unsafe_allow_html=True
+                formating_functions[format_fn_name](response), unsafe_allow_html=True
            )
-        else:
-            st.chat_message("ai").write(response, unsafe_allow_html=True)
+        except (InvalidArgumentError, InvalidCommandError) as e:
+            st.error(e)
 
 
 if __name__ == "__main__":
     all_commands = [
-        ("/upload", list, process_documents_wrapper),
-        ("/index", None, index_documents_wrapper),
-        ("/cost", None, calculate_cost_wrapper),
-        ("/download", None, download_conversation_wrapper),
-        ("/man", None, lambda x: welcome_message),
-        ("/auto", int, auto_qa_chain_wrapper),
+        ("/add-papers", list, process_documents_wrapper),
+        ("/library", None, index_documents_wrapper),
+        ("/session-expense", None, calculate_cost_wrapper),
+        ("/export", None, download_conversation_wrapper),
+        ("/help-me", None, lambda x: (welcome_message, "identity")),
+        ("/auto-insight", str, auto_qa_chain_wrapper),
+        ("/deep-dive", str, query_llm_wrapper),
+        ("/condense-summary", str, chain_of_density_wrapper),
     ]
     command_center = CommandCenter(
        default_input_type=str,
-        default_function=query_llm_wrapper,
+        default_function=rag_llm_wrapper,
        all_commands=all_commands,
    )
-    boot(command_center)
+    formating_functions = {
+        "identity": lambda x: x,
+        "dataframe": lambda x: x,
+        "reponse_with_citations": lambda x: ai_response_format(
+            x["answer"], x["citations"]
+        ),
+    }
+    jsonify_functions = {
+        "identity": lambda x: x,
+        "dataframe": lambda x: x.to_dict(orient="records"),
+        "reponse_with_citations": lambda x: x,
+    }
+    boot(command_center, formating_functions)
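
The pattern to note in app.py: every command wrapper now returns a (payload, format_name) tuple, and boot resolves format_name through the formating_functions table before writing to the chat, while download_conversation_wrapper serializes the same payloads through the parallel jsonify_functions table. A minimal runnable sketch of this dispatch pattern; echo_wrapper and render are illustrative stand-ins, not names from the commit:

    # Minimal sketch of the (payload, format_name) dispatch introduced in app.py.
    # echo_wrapper and render are illustrative stand-ins, not code from the commit.

    formatting_functions = {
        "identity": lambda x: x,  # plain strings pass through unchanged
        "response_with_citations": lambda x: f"{x['answer']}\n\n---\n\n{x['citations']}",
    }


    def echo_wrapper(inputs):
        # every wrapper returns a (payload, format_name) tuple
        payload = {"answer": f"echo: {inputs}", "citations": "[0_00]"}
        return (payload, "response_with_citations")


    def render(wrapper, inputs):
        payload, format_name = wrapper(inputs)
        return formatting_functions[format_name](payload)


    print(render(echo_wrapper, "what is attention?"))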
chain_of_density.py ADDED
@@ -0,0 +1,42 @@
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.prompts import PromptTemplate
+
+chain_of_density_prompt_template = """
+Research Paper: {paper}
+
+You will generate increasingly concise, entity-dense summaries of the above research paper.
+
+Repeat the following 2 steps 10 times.
+
+Step 1. Identify 1-3 informative Entities ('; ' delimited) from the research paper that are missing from the previously generated summary. These entities should be key components such as research questions, methodologies, findings, theoretical contributions, or implications.
+Step 2. Write a new, denser summary of identical length which covers every entity and detail from the previous summary plus the Missing Entities.
+
+A Missing Entity is:
+- Relevant: critical to understanding the paper’s contribution.
+- Specific: descriptive yet concise (5 words or fewer).
+- Novel: not included in the previous summary.
+- Faithful: accurately represented in the research paper.
+- Anywhere: can be found anywhere in the research paper.
+
+Guidelines:
+- The first summary should be long (4-5 sentences, ~100 words) yet focus on general information about the research paper, including its broad topic and objectives, without going into detail.
+- Avoid using verbose language and fillers (e.g., 'This research paper discusses') to reach the word count.
+- Strive for efficiency in word use: rewrite the previous summary to improve readability and make space for additional entities.
+- Employ strategies such as fusion (combining entities), compression (shortening descriptions), and removal of uninformative phrases to make space for new entities.
+- The summaries should evolve to be highly dense and concise yet remain self-contained, meaning they can be understood without reading the full paper.
+- Missing entities should be integrated seamlessly into the new summary.
+- Never omit entities from previous summaries. If space is a challenge, incorporate fewer new entities but maintain the same word count.
+
+Remember, use the exact same number of words for each summary.
+
+The JSON output should be a list (length 10) of dictionaries. Each dictionary must have two keys: 'missing_entities', listing the 1-3 entities added in each round; and 'denser_summary', presenting the new summary that integrates these entities without increasing the length.
+"""
+
+chain_of_density_output_parser = JsonOutputParser()
+chain_of_density_prompt = PromptTemplate(
+    template=chain_of_density_prompt_template,
+    input_variables=["paper"],
+)
+chain_of_density_chain = (
+    lambda model: chain_of_density_prompt | model | chain_of_density_output_parser
+)
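
Since chain_of_density_chain pipes the prompt through the model into a JsonOutputParser, invoking it should yield the parsed list of ten rounds. A usage sketch under that assumption; it requires a configured OPENAI_API_KEY and a model that actually returns the requested JSON, and the paper text is a placeholder:

    # Usage sketch for chain_of_density_chain; assumes OPENAI_API_KEY is set
    # and that the model returns the JSON list the prompt asks for.
    from langchain_openai import ChatOpenAI

    from chain_of_density import chain_of_density_chain

    llm = ChatOpenAI(model="gpt-4-turbo-preview", temperature=0)
    rounds = chain_of_density_chain(llm).invoke({"paper": "<full paper text here>"})

    # Each round adds 1-3 missing entities; the last summary is the densest.
    for r in rounds:
        print(r["missing_entities"])
    print(rounds[-1]["denser_summary"])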
chat_chains.py CHANGED
@@ -1,22 +1,8 @@
-from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
-from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 import xml.etree.ElementTree as ET
 import re
 
-contextualize_q_system_prompt = """Given a chat history and the latest user question \
-which might reference context in the chat history, formulate a standalone question \
-which can be understood without the chat history. Do NOT answer the question, \
-just reformulate it if needed and otherwise return it as is."""
-contextualize_q_prompt = ChatPromptTemplate.from_messages(
-    [
-        ("system", contextualize_q_system_prompt),
-        MessagesPlaceholder(variable_name="chat_history"),
-        ("human", "{question}"),
-    ]
-)
-contextualize_q_chain = lambda llm: contextualize_q_prompt | llm | StrOutputParser()
-
 qa_system_prompt = """As Zeta, your mission is to assist users in navigating the vast sea of machine learning research with ease and insight. When responding to inquiries, adhere to the following guidelines to ensure the utmost accuracy and utility:
 
 Contextual Understanding: When presented with a question, apply your understanding of machine learning concepts to interpret the context provided accurately. Utilize this context to guide your search for answers within the specified research papers.
@@ -46,7 +32,7 @@ By following these guidelines, you ensure that users receive valuable, accurate,
 qa_prompt = ChatPromptTemplate.from_messages(
     [
         ("system", qa_system_prompt),
-        MessagesPlaceholder(variable_name="chat_history"),
+        # MessagesPlaceholder(variable_name="chat_history"),
         ("human", "{question}"),
     ]
 )
@@ -54,21 +40,19 @@ qa_prompt = ChatPromptTemplate.from_messages(
 
 def format_docs(docs):
     return "\n\n".join(
-        f"{doc.metadata['chunk_id']}: {doc.page_content}" for doc in docs
+        f"{doc.metadata['chunk_id']}: {doc.page_content}" if type(doc) != str else doc
+        for doc in docs
     )
 
 
-def contextualized_question(input: dict):
-    if input.get("chat_history"):
-        return contextualize_q_chain
-    else:
-        return input["question"]
-
-
 rag_chain = lambda retriever, llm: (
-    RunnablePassthrough.assign(
-        context=contextualized_question | retriever | format_docs
-    )
+    {"context": retriever | format_docs, "question": RunnablePassthrough()}
+    | qa_prompt
+    | llm
+)
+
+qa_chain = lambda llm: (
+    {"context": RunnablePassthrough(), "question": RunnablePassthrough()}
     | qa_prompt
     | llm
 )
@@ -105,3 +89,25 @@ def parse_model_response(input_string):
     parsed_data["answer"] = "".join(outside_text_parts)
 
     return parsed_data
+
+
+def parse_context_and_question(inputs):
+    pattern = r"\[(.*?)\]"
+    match = re.search(pattern, inputs)
+    if match:
+        context = match.group(1)
+        context = [c.strip() for c in context.split()]
+        question = inputs[: match.start()] + inputs[match.end() :]
+        return context, question
+    else:
+        return "", inputs
+
+
+format_citations = lambda citations: "\n\n".join(
+    [f"{citation['quote']} ... [{citation['source_id']}]" for citation in citations]
+)
+ai_response_format = lambda message, references: (
+    f"{message}\n\n---\n\n{format_citations(references)}"
+    if references != ""
+    else message
+)
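
parse_context_and_question is what lets /deep-dive accept `[<list of document ids>] <query>`: the first bracketed, whitespace-separated group becomes the list of document ids, and the rest of the string is the question. A quick behavior check; the expected outputs are traced by hand from the regex above, not captured from a run:

    # Behavior sketch for parse_context_and_question.
    from chat_chains import parse_context_and_question

    print(parse_context_and_question("[0 1_02] how is attention computed?"))
    # (['0', '1_02'], ' how is attention computed?')

    print(parse_context_and_question("no brackets here"))
    # ('', 'no brackets here')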
command_center.py CHANGED
@@ -1,3 +1,6 @@
+from custom_exceptions import InvalidCommandError
+
+
 class CommandCenter:
     def __init__(self, default_input_type, default_function=None, all_commands=None):
         self.commands = {}
@@ -20,6 +23,9 @@ class CommandCenter:
         command = inputs[0]
         argument = inputs[1:]
 
+        if command not in self.commands:
+            raise InvalidCommandError("Invalid command")
+
         # type casting the arguments
         if self.commands[command]["input_type"] == str:
             argument = " ".join(argument)
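
With the membership guard in place, an unknown slash command should surface as an InvalidCommandError, which app.py now catches and shows via st.error, instead of a raw KeyError on self.commands. A small sketch of the expected behavior; the registered /help-me handler is illustrative, and execute_command's full body is not shown in this diff:

    # Sketch of the new guard; the /help-me handler here is illustrative.
    from command_center import CommandCenter
    from custom_exceptions import InvalidCommandError

    cc = CommandCenter(
        default_input_type=str,
        all_commands=[("/help-me", None, lambda x: ("the guide", "identity"))],
    )

    try:
        cc.execute_command("/no-such-command")
    except InvalidCommandError as e:
        print(e)  # Invalid command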
custom_exceptions.py ADDED
@@ -0,0 +1,6 @@
+class InvalidCommandError(Exception):
+    pass
+
+
+class InvalidArgumentError(Exception):
+    pass
process_documents.py CHANGED
@@ -10,14 +10,25 @@ deep_strip = lambda text: re.sub(r"\s+", " ", text or "").strip()
 
 def process_documents(urls):
     snippets = []
-    documents = []
+    documents = {}
     for source_id, url in enumerate(urls):
-        if url.endswith(".pdf"):
-            snippets.extend(process_pdf(url, source_id))
-            documents.append("\n".join([snip.page_content for snip in snippets]))
-        else:
-            snippets.extend(process_web(url, source_id))
-            documents.append("\n".join([snip.page_content for snip in snippets]))
+        snippet = (
+            process_pdf(url, source_id)
+            if url.endswith(".pdf")
+            else process_web(url, source_id)
+        )
+        snippets.extend(snippet)
+        documents[str(source_id)] = Document(
+            page_content="\n".join([snip.page_content for snip in snippet]),
+            metadata={
+                "source_url": url,
+                "source_type": "pdf" if url.endswith(".pdf") else "web",
+                "source_id": source_id,
+                "chunk_id": source_id,
+            },
+        )
+        for snip in snippet:
+            documents[snip.metadata["chunk_id"]] = snip
     return snippets, documents
 
 
@@ -30,7 +41,7 @@ def process_web(url, source_id):
             "header": data.metadata["title"],
             "source_url": url,
             "source_type": "web",
-            "chunk_id": f"{source_id}_0",
+            "chunk_id": source_id,
             "source_id": source_id,
         },
     )
@@ -54,7 +65,7 @@ def process_pdf(url, source_id):
             "header": " ".join(snip[1]["header_text"].split()[:10]),
             "source_url": url,
             "source_type": "pdf",
-            "chunk_id": f"{source_id}_{i}",
+            "chunk_id": f"{source_id}_{i:02d}",
             "source_id": source_id,
         },
    )
  )