Spaces:

chansung
/

paper_qa

Running on CPU Upgrade

App Files Files Community

chansung committed on Mar 6

Commit

928f123

•

1 Parent(s): 5b0b914

update

Browse files

Files changed (12) hide show

.gitignore +1 -0
README.md +5 -6
app.py +449 -99
constants/prompts.toml +17 -0
date_iterator.sh +27 -0
gen/gemini.py +142 -0
gen/utils.py +37 -0
outputs.json +0 -0
paper/download.py +102 -0
paper/parser.py +57 -0
requirements.txt +9 -0
utils.py +28 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

README.md CHANGED Viewed

@@ -1,14 +1,13 @@
 ---
-title: Paper Q&A
-emoji: 🤓📃
-colorFrom: red
-colorTo: yellow
 sdk: gradio
-sdk_version: 4.19.2
 app_file: app.py
 pinned: false
 license: mit
-short_description: Explore papers with auto generated Q&As!
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Test Paperqa
+emoji: 🔥
+colorFrom: indigo
+colorTo: pink
 sdk: gradio
+sdk_version: 4.20.0
 app_file: app.py
 pinned: false
 license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,12 +1,34 @@
-import gradio as gr
 import copy
 import datasets
 STYLE = """
-.main {
-  width: 90% !important;
-  margin: 0 auto; /* Center the container */
 }
 .small-font{
@@ -16,7 +38,7 @@ STYLE = """
 .small-font:hover {
   font-size: 20px !important;
   transition: font-size 0.3s ease-out;
-  transition-delay: 0.8s;
 }
 .group {
@@ -50,22 +72,207 @@ STYLE = """
     border-radius: 0px;
 }
-#search_input > label > span {
     display: none;
 }
-#exp-type > span {
     display: none;
 }
 """
 dataset_repo_id = "chansung/auto-paper-qa2"
 ds = datasets.load_dataset(dataset_repo_id)
 title2qna = {}
 date2qna = {}
 longest_qans = 0
 def count_nans(row):
     count = 0
@@ -119,33 +326,33 @@ def set_paper(date, paper_title):
     return (
         gr.Markdown(f"# {selected_paper['title']}"), gr.Markdown(selected_paper["summary"]),
-        gr.Markdown(f"## 🙋 {selected_paper['0_question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['0_answers:expert']}"),
-        gr.Markdown(f"## 🙋🙋 {selected_paper['0_additional_depth_q:follow up question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_additional_depth_q:answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['0_additional_depth_q:answers:expert']}"),
-        gr.Markdown(f"## 🙋🙋 {selected_paper['0_additional_breath_q:follow up question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_additional_breath_q:answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['0_additional_breath_q:answers:expert']}"),
-        gr.Markdown(f"## 🙋 {selected_paper['1_question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['1_answers:expert']}"),
-        gr.Markdown(f"## 🙋🙋 {selected_paper['1_additional_depth_q:follow up question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_additional_depth_q:answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['1_additional_depth_q:answers:expert']}"),
-        gr.Markdown(f"## 🙋🙋 {selected_paper['1_additional_breath_q:follow up question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_additional_breath_q:answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['1_additional_breath_q:answers:expert']}"),
-        gr.Markdown(f"## 🙋 {selected_paper['2_question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['2_answers:expert']}"),
-        gr.Markdown(f"## 🙋🙋 {selected_paper['2_additional_depth_q:follow up question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_additional_depth_q:answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['2_additional_depth_q:answers:expert']}"),
-        gr.Markdown(f"## 🙋🙋 {selected_paper['2_additional_breath_q:follow up question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_additional_breath_q:answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['2_additional_breath_q:answers:expert']}"),
     )
@@ -196,7 +403,7 @@ function search(searchIn, maxResults = 3) {{
         let titles = {list(titles)};
         for (const title of titles) {{ // Assuming 'titles' is an array defined elsewhere
-            if (results.length > 3) {{
                 break;
             }} else {{
                 if (title.toLowerCase().includes(searchIn.toLowerCase())) {{ // JavaScript's equivalent to Python's 'in'
@@ -206,7 +413,7 @@ function search(searchIn, maxResults = 3) {{
         }}
         // Handle UI elements (Explanation below)
-        const resultElements = [1, 2, 3].map(index => {{
             return results[index - 1] || '';
         }});
@@ -228,13 +435,74 @@ function search(searchIn, maxResults = 3) {{
             document.getElementById('search_r3').style.display = 'block';
         }}
         return resultElements;
     }} else {{
         document.getElementById('search_r1').style.display = 'none';
         document.getElementById('search_r2').style.display = 'none';
         document.getElementById('search_r3').style.display = 'none';
-        return ['', '', '']
     }}
 }}
 """
@@ -251,7 +519,7 @@ def set_papers(date, title):
         gr.Textbox("")
     )
-with gr.Blocks(css=STYLE) as demo:
     gr.Markdown("# Let's explore papers with auto generated Q&As")
     with gr.Column(elem_classes=["group"]):
@@ -272,108 +540,164 @@ with gr.Blocks(css=STYLE) as demo:
             )
         with gr.Column(elem_classes=["no-gap"]):
-            search_in = gr.Textbox("", placeholder="Enter keywords to search...", elem_id="search_input")
             search_r1 = gr.Button(visible=False, elem_id="search_r1", elem_classes=["no-radius"])
             search_r2 = gr.Button(visible=False, elem_id="search_r2", elem_classes=["no-radius"])
             search_r3 = gr.Button(visible=False, elem_id="search_r3", elem_classes=["no-radius"])
-    title = gr.Markdown(f"# {selected_paper['title']}")
-    summary = gr.Markdown(f"{selected_paper['summary']}", elem_classes=["small-font"])
-    with gr.Row():
-        with gr.Column(scale=7):
-            gr.Markdown("## Auto generated Questions & Answers")
-        exp_type = gr.Radio(choices=["ELI5", "Technical"], value="ELI5", elem_id="exp-type", scale=3)
-    # 1
-    with gr.Column(elem_classes=["group"], visible=True) as q_0:
-        basic_q_0 = gr.Markdown(f"## 🙋 {selected_paper['0_question']}")
-        basic_q_eli5_0 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_answers:eli5']}", elem_classes=["small-font"])
-        basic_q_expert_0 = gr.Markdown(f"↪ **(Technical)** {selected_paper['0_answers:expert']}", visible=False, elem_classes=["small-font"])
-        with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_0_0:
-            depth_q_0 = gr.Markdown(f"## 🙋🙋 {selected_paper['0_additional_depth_q:follow up question']}")
-            depth_q_eli5_0 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
-            depth_q_expert_0 = gr.Markdown(f"↪ **(Technical)** {selected_paper['0_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])
-        with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_0_1:
-            breath_q_0 = gr.Markdown(f"## 🙋🙋 {selected_paper['0_additional_breath_q:follow up question']}")
-            breath_q_eli5_0 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
-            breath_q_expert_0 = gr.Markdown(f"↪ **(Technical)** {selected_paper['0_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])
-    # 2
-    with gr.Column(elem_classes=["group"], visible=True) as q_1:
-        basic_q_1 = gr.Markdown(f"## 🙋 {selected_paper['1_question']}")
-        basic_q_eli5_1 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_answers:eli5']}", elem_classes=["small-font"])
-        basic_q_expert_1 = gr.Markdown(f"↪ **(Technical)** {selected_paper['1_answers:expert']}", visible=False, elem_classes=["small-font"])
-        with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_1_0:
-            depth_q_1 = gr.Markdown(f"## 🙋🙋 {selected_paper['1_additional_depth_q:follow up question']}")
-            depth_q_eli5_1 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
-            depth_q_expert_1 = gr.Markdown(f"↪ **(Technical)** {selected_paper['1_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])
-        with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_1_1:
-            breath_q_1 = gr.Markdown(f"## 🙋🙋 {selected_paper['1_additional_breath_q:follow up question']}")
-            breath_q_eli5_1 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
-            breath_q_expert_1 = gr.Markdown(f"↪ **(Technical)** {selected_paper['1_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])
-    # 3
-    with gr.Column(elem_classes=["group"], visible=True) as q_2:
-        basic_q_2 = gr.Markdown(f"## 🙋 {selected_paper['2_question']}")
-        basic_q_eli5_2 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_answers:eli5']}", elem_classes=["small-font"])
-        basic_q_expert_2 = gr.Markdown(f"↪ **(Technical)** {selected_paper['2_answers:expert']}", visible=False, elem_classes=["small-font"])
-        with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_2_0:
-            depth_q_2 = gr.Markdown(f"## 🙋🙋 {selected_paper['2_additional_depth_q:follow up question']}")
-            depth_q_eli5_2 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
-            depth_q_expert_2 = gr.Markdown(f"↪ **(Technical)** {selected_paper['2_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])
-        with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_2_1:
-            breath_q_2 = gr.Markdown(f"## 🙋🙋 {selected_paper['2_additional_breath_q:follow up question']}")
-            breath_q_eli5_2 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
-            breath_q_expert_2 = gr.Markdown(f"↪ **(Technical)** {selected_paper['2_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])
     gr.Markdown("The target papers are collected from [Hugging Face 🤗 Daily Papers](https://huggingface.co/papers) on a daily basis. "
                 "The entire data is generated by [Google's Gemini 1.0](https://deepmind.google/technologies/gemini/) Pro. "
                 "If you are curious how it is done, visit the [Auto Paper Q&A Generation project repository](https://github.com/deep-diver/auto-paper-analysis) "
                 "Also, the generated dataset is hosted on Hugging Face 🤗 Dataset repository as well([Link](https://huggingface.co/datasets/chansung/auto-paper-qa2)). ")
-    search_r1.click(
-        set_date,
-        search_r1,
-        date_dd
-    ).then(
         set_papers,
         inputs=[date_dd, search_r1],
         outputs=[papers_dd, search_in]
     )
-    search_r2.click(
-        set_date,
-        search_r2,
-        date_dd
-    ).then(
         set_papers,
         inputs=[date_dd, search_r2],
         outputs=[papers_dd, search_in]
     )
-    search_r3.click(
-        set_date,
-        search_r3,
-        date_dd
-    ).then(
         set_papers,
         inputs=[date_dd, search_r3],
         outputs=[papers_dd, search_in]
     )
-    date_dd.input(
-        get_papers,
-        date_dd,
-        papers_dd
-    ).then(
         set_paper,
         [date_dd, papers_dd],
         [
@@ -413,7 +737,10 @@ with gr.Blocks(css=STYLE) as demo:
     search_in.change(
         inputs=[search_in],
-        outputs=[search_r1, search_r2, search_r3],
         js=UPDATE_SEARCH_RESULTS,
         fn=None
     )
@@ -428,4 +755,27 @@ with gr.Blocks(css=STYLE) as demo:
         ]
     )
-demo.launch(share=True)

+import os
+import re
 import copy
 import datasets
+import pandas as pd
+import gradio as gr
+from datetime import datetime, timedelta
+from datasets import Dataset
+from huggingface_hub import HfApi
+from huggingface_hub import create_repo
+from huggingface_hub.utils import HfHubHTTPError
+from paper.download import (
+    download_pdf_from_arxiv,
+    get_papers_from_hf_daily_papers,
+    get_papers_from_arxiv_ids
+)
+from paper.parser import extract_text_and_figures
+from gen.gemini import get_basic_qa, get_deep_qa
+import utils
+from apscheduler.schedulers.background import BackgroundScheduler
 STYLE = """
+@media only screen and (max-width: 700px) {
+    .main {
+    width: 80% !important;
+    margin: 0 auto; /* Center the container */
+    }
 }
 .small-font{
 .small-font:hover {
   font-size: 20px !important;
   transition: font-size 0.3s ease-out;
+  transition-delay: 1.5s;
 }
 .group {
     border-radius: 0px;
 }
+.textbox-no-label > label > span {
+    display: none;
+}
+.exp-type > span {
     display: none;
 }
+.conv-type > span {
     display: none;
 }
+.conv-type .wrap:nth-child(3) {
+    width: 167px;
+    margin: auto;
+}
+button {
+    font-size: 10pt !important;
+}
+h3 {
+    font-size: 13pt !important;
+}
 """
+# Credentials come from the environment; os.getenv returns None when the
+# variable is not set.
+gemini_api_key = os.getenv("GEMINI_API_KEY")
+hf_token = os.getenv("HF_TOKEN")
+# Hub dataset repo that the UI reads its papers from.
 dataset_repo_id = "chansung/auto-paper-qa2"
+# Hub dataset repo that stores the arXiv IDs requested through the UI.
+request_arxiv_repo_id="chansung/requested-arxiv-ids-3"
 ds = datasets.load_dataset(dataset_repo_id)
+request_ds = datasets.load_dataset(request_arxiv_repo_id)
+# Each row of the 'train' split carries a list of IDs; flatten them into one
+# list so they can be shown as a single-column table (initial value of the
+# `arxiv_queue` Dataframe).
+requested_arxiv_ids = []
+for request_d in request_ds['train']:
+    arxiv_ids = request_d['Requested arXiv IDs']
+    requested_arxiv_ids = requested_arxiv_ids + arxiv_ids
+requested_arxiv_ids_df = pd.DataFrame({'Requested arXiv IDs': requested_arxiv_ids})
 title2qna = {}
 date2qna = {}
 longest_qans = 0
+def filter_function(example, ids):
+    """Drop the given arXiv IDs from one request-queue row.
+
+    Args:
+        example: Row mapping that has a 'Requested arXiv IDs' list.
+        ids: arXiv IDs to remove from that list.
+
+    Returns:
+        The same ``example`` mapping, with its 'Requested arXiv IDs' entry
+        replaced by a new list that contains none of ``ids``.
+    """
+    to_drop = set(ids)
+    # Build a new list instead of calling list.remove(): remove() deletes only
+    # the first occurrence and would mutate the caller's list in place.
+    example['Requested arXiv IDs'] = [
+        iid for iid in example['Requested arXiv IDs'] if iid not in to_drop
+    ]
+    return example
+def process_arxiv_ids(gemini_api, hf_repo_id, req_hf_repo_id, hf_token, how_many=10):
+    """Generate Q&As for requested arXiv papers and publish them to the Hub.
+
+    Loads the request queue from ``req_hf_repo_id`` and takes up to
+    ``how_many`` requested IDs. For each paper it downloads the PDF, extracts
+    the text, generates the basic and follow-up Q&As, pushes the result to
+    ``hf_repo_id`` and removes the ID from the request queue. When at least
+    one ID was queued, the ``chansung/paper_qa`` Space is restarted at the end.
+
+    A paper that raises during processing is logged and skipped (best effort);
+    its ID stays in the request queue.
+
+    Args:
+        gemini_api: Gemini API key forwarded to the Q&A generators.
+        hf_repo_id: Hub dataset repo that receives the generated Q&As.
+        req_hf_repo_id: Hub dataset repo holding the requested arXiv IDs.
+        hf_token: Hugging Face token used for the pushes and the restart.
+        how_many: Maximum number of arXiv IDs handled in one call.
+
+    Returns:
+        None.
+    """
+    # Keep the working queue as a plain ``Dataset`` in its own variable. The
+    # loop below re-assigns it after every paper; re-binding the loaded
+    # ``DatasetDict`` instead would make the ``['train']`` lookup fail from
+    # the second paper onwards.
+    pending = datasets.load_dataset(req_hf_repo_id)['train']
+
+    arxiv_ids = []
+    for d in pending:
+        req_arxiv_ids = d['Requested arXiv IDs']
+        # Rows whose first entry is the literal "top" are not paper requests.
+        if len(req_arxiv_ids) > 0 and req_arxiv_ids[0] != "top":
+            arxiv_ids.extend(req_arxiv_ids)
+    arxiv_ids = arxiv_ids[:how_many]
+    if not arxiv_ids:
+        return
+
+    print(f"1. Get metadata for the papers [{arxiv_ids}]")
+    papers = get_papers_from_arxiv_ids(arxiv_ids)
+    print("...DONE")
+    print("2. Generating QAs for the paper")
+    for paper in papers:
+        try:
+            title = paper['title']
+            target_date = paper['target_date']
+            abstract = paper['paper']['summary']
+            arxiv_id = paper['paper']['id']
+            authors = paper['paper']['authors']
+            print(f"...PROCESSING ON[{arxiv_id}, {title}]")
+            print("......Downloading the paper PDF")
+            filename = download_pdf_from_arxiv(arxiv_id)
+            print("......DONE")
+            print("......Extracting text and figures")
+            texts, _figures = extract_text_and_figures(filename)
+            text = ' '.join(texts)
+            print("......DONE")
+            print("......Generating the seed(basic) QAs")
+            # NOTE: `trucate` is the keyword spelling used at the call site;
+            # it has to match the signature in gen.gemini.
+            qnas = get_basic_qa(text, gemini_api_key=gemini_api, trucate=30000)
+            qnas['title'] = title
+            qnas['abstract'] = abstract
+            qnas['authors'] = ','.join(authors)
+            qnas['arxiv_id'] = arxiv_id
+            qnas['target_date'] = target_date
+            qnas['full_text'] = text
+            print("......DONE")
+            print("......Generating the follow-up QAs")
+            qnas = get_deep_qa(text, qnas, gemini_api_key=gemini_api, trucate=30000)
+            del qnas["qna"]
+            print("......DONE")
+            print(f"......Exporting to HF Dataset repo at [{hf_repo_id}]")
+            utils.push_to_hf_hub(qnas, hf_repo_id, hf_token)
+            print("......DONE")
+            print(f"......Updating request arXiv HF Dataset repo at [{req_hf_repo_id}]")
+            # Remove the processed ID from every row, then drop emptied rows.
+            pending = pending.map(
+                lambda example: filter_function(example, [arxiv_id])
+            ).filter(
+                lambda example: len(example['Requested arXiv IDs']) > 0
+            )
+            pending.push_to_hub(req_hf_repo_id, token=hf_token)
+            print("......DONE")
+        except Exception as e:
+            # Best effort: one failing paper must not stop the others.
+            print(f".......failed due to exception {e}")
+            continue
+    HfApi(token=hf_token).restart_space(
+        repo_id="chansung/paper_qa", token=hf_token
+    )
+def push_to_hf_hub(
+    df, repo_id, token, append=True
+):
+    """Push a DataFrame to a Hub dataset repo, optionally appending to it.
+
+    Args:
+        df: pandas DataFrame to upload.
+        repo_id: Target Hub dataset repo.
+        token: Hugging Face token used to create the repo and to push.
+        append: When True and the repo already exists, the new rows are
+            concatenated after the existing 'train' split instead of
+            replacing it.
+
+    Returns:
+        None.
+    """
+    new_ds = Dataset.from_pandas(df)
+    try:
+        # Use the caller's repo/token, not the module-level request-queue
+        # globals, so the function works for any target repo.
+        create_repo(repo_id, repo_type="dataset", token=token)
+        exist = False
+    except HfHubHTTPError:
+        # NOTE(review): any Hub HTTP error is treated as "repo already
+        # exists"; auth or network errors end up here as well - confirm this
+        # is intended.
+        exist = True
+    if exist and append:
+        existing_ds = datasets.load_dataset(repo_id)
+        new_ds = datasets.concatenate_datasets([existing_ds['train'], new_ds])
+    new_ds.push_to_hub(repo_id, token=token)
+def _filter_duplicate_arxiv_ids(arxiv_ids_to_be_added):
+    """Return the requested IDs that are neither queued nor already processed.
+
+    An ID counts as known when it appears in the request-queue dataset
+    (``request_arxiv_repo_id``) or in the ``arxiv_id`` column of the Q&A
+    dataset (``dataset_repo_id``). Both datasets are re-loaded on every call.
+
+    Args:
+        arxiv_ids_to_be_added: Iterable of candidate arXiv ID strings.
+
+    Returns:
+        List of the unknown IDs, de-duplicated, in the order they were given.
+    """
+    # Use the module-level repo ids rather than repeating the literals, so
+    # the lookup always targets the repos the rest of the app reads/writes.
+    ds1 = datasets.load_dataset(request_arxiv_repo_id)
+    ds2 = datasets.load_dataset(dataset_repo_id)
+    known_arxiv_ids = set()
+    for d in ds1['train']:
+        known_arxiv_ids.update(d['Requested arXiv IDs'])
+    for d in ds2['train']:
+        known_arxiv_ids.add(d['arxiv_id'])
+    # dict.fromkeys de-duplicates while keeping the request order, which a
+    # plain set difference would not.
+    return [
+        arxiv_id
+        for arxiv_id in dict.fromkeys(arxiv_ids_to_be_added)
+        if arxiv_id not in known_arxiv_ids
+    ]
+def _is_arxiv_id_valid(arxiv_id):
+    """Return True when ``arxiv_id`` looks like ``NNNN.NNNNN``.
+
+    Only the bare five-digit form is accepted: four ASCII digits, a dot, and
+    five ASCII digits. Version suffixes (``v2``) and four-digit sequence
+    numbers are rejected.
+    """
+    # fullmatch instead of match + "$": "$" also matches just before a
+    # trailing newline, so "2401.00001\n" used to pass. re.ASCII keeps \d
+    # from matching non-ASCII digits such as full-width numerals.
+    pattern = r"\d{4}\.\d{5}"
+    return re.fullmatch(pattern, arxiv_id, flags=re.ASCII) is not None
+def _get_valid_arxiv_ids(arxiv_ids_str):
+    """Split a comma separated string into valid and invalid arXiv IDs.
+
+    Each token is stripped of surrounding whitespace and checked with
+    ``_is_arxiv_id_valid``. Blank tokens are ignored.
+
+    Args:
+        arxiv_ids_str: Comma separated arXiv IDs as typed by the user.
+
+    Returns:
+        Tuple ``(valid_arxiv_ids, invalid_arxiv_ids)`` of string lists, each
+        in input order.
+    """
+    valid_arxiv_ids = []
+    invalid_arxiv_ids = []
+    for arxiv_id in arxiv_ids_str.split(","):
+        arxiv_id = arxiv_id.strip()
+        if not arxiv_id:
+            # "".split(",") yields [""], and doubled or trailing commas yield
+            # blank tokens too; those are not IDs the user entered, so do not
+            # report them as invalid.
+            continue
+        if _is_arxiv_id_valid(arxiv_id):
+            valid_arxiv_ids.append(arxiv_id)
+        else:
+            invalid_arxiv_ids.append(arxiv_id)
+    return valid_arxiv_ids, invalid_arxiv_ids
+def add_arxiv_ids_to_queue(queue, arxiv_ids_str):
+    """Validate user-entered arXiv IDs and append the new ones to the queue.
+
+    Invalid IDs trigger a warning. Valid IDs that are already queued or
+    already processed are dropped. The remaining IDs are appended to the
+    displayed queue and pushed to the request-queue dataset on the Hub.
+
+    Args:
+        queue: pandas DataFrame currently shown in the request table
+            (single 'Requested arXiv IDs' column).
+        arxiv_ids_str: Comma separated arXiv IDs typed by the user.
+
+    Returns:
+        The queue DataFrame, with the newly accepted IDs appended when any.
+    """
+    valid_arxiv_ids, invalid_arxiv_ids = _get_valid_arxiv_ids(arxiv_ids_str)
+    if len(invalid_arxiv_ids) > 0:
+        gr.Warning(f"found invalid arXiv ids as in {invalid_arxiv_ids}")
+    if not valid_arxiv_ids:
+        gr.Warning("No valid arXiv IDs found...")
+        return queue
+
+    new_arxiv_ids = _filter_duplicate_arxiv_ids(valid_arxiv_ids)
+    if not new_arxiv_ids:
+        gr.Warning("All requested arXiv IDs are already processed or being processed...")
+        return queue
+
+    # Format the message from the flat list so the user sees the IDs, not
+    # the nested one-element lists used for storage.
+    gr.Warning(f"Processing on [{new_arxiv_ids}]. Other requested arXiv IDs not found on this list should be already processed or being processed...")
+
+    # The displayed table holds one plain ID string per row (it is built from
+    # the flattened request lists at start-up), so append plain strings here.
+    shown_rows = pd.DataFrame({'Requested arXiv IDs': new_arxiv_ids})
+    # ignore_index renumbers the rows; the previous reset_index() call
+    # discarded its result and therefore had no effect.
+    queue = pd.concat([queue, shown_rows], ignore_index=True)
+
+    # The Hub dataset stores each request as a one-element list per row.
+    stored_rows = pd.DataFrame(
+        {'Requested arXiv IDs': [[arxiv_id] for arxiv_id in new_arxiv_ids]}
+    )
+    push_to_hf_hub(stored_rows, request_arxiv_repo_id, hf_token)
+    return queue
 def count_nans(row):
     count = 0
     return (
         gr.Markdown(f"# {selected_paper['title']}"), gr.Markdown(selected_paper["summary"]),
+        gr.Markdown(f"### 🙋 {selected_paper['0_question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['0_answers:expert']}"),
+        gr.Markdown(f"### 🙋🙋 {selected_paper['0_additional_depth_q:follow up question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_additional_depth_q:answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['0_additional_depth_q:answers:expert']}"),
+        gr.Markdown(f"### 🙋🙋 {selected_paper['0_additional_breath_q:follow up question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_additional_breath_q:answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['0_additional_breath_q:answers:expert']}"),
+        gr.Markdown(f"### 🙋 {selected_paper['1_question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['1_answers:expert']}"),
+        gr.Markdown(f"### 🙋🙋 {selected_paper['1_additional_depth_q:follow up question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_additional_depth_q:answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['1_additional_depth_q:answers:expert']}"),
+        gr.Markdown(f"### 🙋🙋 {selected_paper['1_additional_breath_q:follow up question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_additional_breath_q:answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['1_additional_breath_q:answers:expert']}"),
+        gr.Markdown(f"### 🙋 {selected_paper['2_question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['2_answers:expert']}"),
+        gr.Markdown(f"### 🙋🙋 {selected_paper['2_additional_depth_q:follow up question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_additional_depth_q:answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['2_additional_depth_q:answers:expert']}"),
+        gr.Markdown(f"### 🙋🙋 {selected_paper['2_additional_breath_q:follow up question']}"),
         gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_additional_breath_q:answers:eli5']}"),
         gr.Markdown(f"↪ **(Technical)** {selected_paper['2_additional_breath_q:answers:expert']}"),
     )
         let titles = {list(titles)};
         for (const title of titles) {{ // Assuming 'titles' is an array defined elsewhere
+            if (results.length > 10) {{
                 break;
             }} else {{
                 if (title.toLowerCase().includes(searchIn.toLowerCase())) {{ // JavaScript's equivalent to Python's 'in'
         }}
         // Handle UI elements (Explanation below)
+        const resultElements = [1,2,3,4,5,6,7,8,9,10].map(index => {{
             return results[index - 1] || '';
         }});
             document.getElementById('search_r3').style.display = 'block';
         }}
+        if (resultElements[3] == '') {{
+            document.getElementById('search_r4').style.display = 'none';
+        }} else {{
+            document.getElementById('search_r4').style.display = 'block';
+        }}
+        if (resultElements[4] == '') {{
+            document.getElementById('search_r5').style.display = 'none';
+        }} else {{
+            document.getElementById('search_r5').style.display = 'block';
+        }}
+        if (resultElements[5] == '') {{
+            document.getElementById('search_r6').style.display = 'none';
+        }} else {{
+            document.getElementById('search_r6').style.display = 'block';
+        }}
+        if (resultElements[6] == '') {{
+            document.getElementById('search_r7').style.display = 'none';
+        }} else {{
+            document.getElementById('search_r7').style.display = 'block';
+        }}
+        if (resultElements[7] == '') {{
+            document.getElementById('search_r8').style.display = 'none';
+        }} else {{
+            document.getElementById('search_r8').style.display = 'block';
+        }}
+        if (resultElements[8] == '') {{
+            document.getElementById('search_r9').style.display = 'none';
+        }} else {{
+            document.getElementById('search_r9').style.display = 'block';
+        }}
+        if (resultElements[9] == '') {{
+            document.getElementById('search_r10').style.display = 'none';
+        }} else {{
+            document.getElementById('search_r10').style.display = 'block';
+        }}
         return resultElements;
     }} else {{
         document.getElementById('search_r1').style.display = 'none';
         document.getElementById('search_r2').style.display = 'none';
         document.getElementById('search_r3').style.display = 'none';
+        document.getElementById('search_r4').style.display = 'none';
+        document.getElementById('search_r5').style.display = 'none';
+        document.getElementById('search_r6').style.display = 'none';
+        document.getElementById('search_r7').style.display = 'none';
+        document.getElementById('search_r8').style.display = 'none';
+        document.getElementById('search_r9').style.display = 'none';
+        document.getElementById('search_r10').style.display = 'none';
+        return ['', '', '', '', '', '', '', '', '', '']
+    }}
+}}
+"""
+# JavaScript snippet that toggles between the two content panes: for 'Q&As'
+# it shows the element with id 'qna_block' and hides 'chat_block', otherwise
+# the reverse. Braces are doubled because the literal is an f-string.
+# NOTE(review): the place where this snippet is wired to a component is not
+# visible here - presumably a `js=` handler on the conversation-type radio;
+# confirm. The JS function name is spelled `chage_if_type` in the snippet.
+UPDATE_IF_TYPE = f"""
+function chage_if_type(if_type) {{
+    if (if_type == 'Q&As') {{
+        document.getElementById('chat_block').style.display = 'none';
+        document.getElementById('qna_block').style.display = 'block';
+    }} else {{
+        document.getElementById('chat_block').style.display = 'block';
+        document.getElementById('qna_block').style.display = 'none';
     }}
 }}
 """
         gr.Textbox("")
     )
+with gr.Blocks(css=STYLE, theme=gr.themes.Soft()) as demo:
     gr.Markdown("# Let's explore papers with auto generated Q&As")
     with gr.Column(elem_classes=["group"]):
             )
         with gr.Column(elem_classes=["no-gap"]):
+            search_in = gr.Textbox("", placeholder="Enter keywords to search...", elem_classes=["textbox-no-label"])
             search_r1 = gr.Button(visible=False, elem_id="search_r1", elem_classes=["no-radius"])
             search_r2 = gr.Button(visible=False, elem_id="search_r2", elem_classes=["no-radius"])
             search_r3 = gr.Button(visible=False, elem_id="search_r3", elem_classes=["no-radius"])
+            search_r4 = gr.Button(visible=False, elem_id="search_r4", elem_classes=["no-radius"])
+            search_r5 = gr.Button(visible=False, elem_id="search_r5", elem_classes=["no-radius"])
+            search_r6 = gr.Button(visible=False, elem_id="search_r6", elem_classes=["no-radius"])
+            search_r7 = gr.Button(visible=False, elem_id="search_r7", elem_classes=["no-radius"])
+            search_r8 = gr.Button(visible=False, elem_id="search_r8", elem_classes=["no-radius"])
+            search_r9 = gr.Button(visible=False, elem_id="search_r9", elem_classes=["no-radius"])
+            search_r10 = gr.Button(visible=False, elem_id="search_r10", elem_classes=["no-radius"])
+        conv_type = gr.Radio(choices=["Q&As", "Chat"], value="Q&As", interactive=True, visible=False, elem_classes=["conv-type"])
+    with gr.Column(scale=7):
+        title = gr.Markdown(f"# {selected_paper['title']}")
+        summary = gr.Markdown(f"{selected_paper['summary']}", elem_classes=["small-font"])
+        with gr.Column(elem_id="chat_block", visible=False):
+            gr.Chatbot([("hello", "world"), ("how", "are you?")])
+        with gr.Column(elem_id="qna_block", visible=True):
+            with gr.Row():
+                with gr.Column(scale=7):
+                    gr.Markdown("## Auto generated Questions & Answers")
+                exp_type = gr.Radio(choices=["ELI5", "Technical"], value="ELI5", elem_classes=["exp-type"], scale=3)
+            # 1
+            with gr.Column(elem_classes=["group"], visible=True) as q_0:
+                basic_q_0 = gr.Markdown(f"### 🙋 {selected_paper['0_question']}")
+                basic_q_eli5_0 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_answers:eli5']}", elem_classes=["small-font"])
+                basic_q_expert_0 = gr.Markdown(f"↪ **(Technical)** {selected_paper['0_answers:expert']}", visible=False, elem_classes=["small-font"])
+                with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_0_0:
+                    depth_q_0 = gr.Markdown(f"### 🙋🙋 {selected_paper['0_additional_depth_q:follow up question']}")
+                    depth_q_eli5_0 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
+                    depth_q_expert_0 = gr.Markdown(f"↪ **(Technical)** {selected_paper['0_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])
+                with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_0_1:
+                    breath_q_0 = gr.Markdown(f"### 🙋🙋 {selected_paper['0_additional_breath_q:follow up question']}")
+                    breath_q_eli5_0 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
+                    breath_q_expert_0 = gr.Markdown(f"↪ **(Technical)** {selected_paper['0_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])
+            # 2
+            with gr.Column(elem_classes=["group"], visible=True) as q_1:
+                basic_q_1 = gr.Markdown(f"### 🙋 {selected_paper['1_question']}")
+                basic_q_eli5_1 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_answers:eli5']}", elem_classes=["small-font"])
+                basic_q_expert_1 = gr.Markdown(f"↪ **(Technical)** {selected_paper['1_answers:expert']}", visible=False, elem_classes=["small-font"])
+                with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_1_0:
+                    depth_q_1 = gr.Markdown(f"### 🙋🙋 {selected_paper['1_additional_depth_q:follow up question']}")
+                    depth_q_eli5_1 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
+                    depth_q_expert_1 = gr.Markdown(f"↪ **(Technical)** {selected_paper['1_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])
+                with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_1_1:
+                    breath_q_1 = gr.Markdown(f"### 🙋🙋 {selected_paper['1_additional_breath_q:follow up question']}")
+                    breath_q_eli5_1 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
+                    breath_q_expert_1 = gr.Markdown(f"↪ **(Technical)** {selected_paper['1_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])
+            # 3
+            with gr.Column(elem_classes=["group"], visible=True) as q_2:
+                basic_q_2 = gr.Markdown(f"### 🙋 {selected_paper['2_question']}")
+                basic_q_eli5_2 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_answers:eli5']}", elem_classes=["small-font"])
+                basic_q_expert_2 = gr.Markdown(f"↪ **(Technical)** {selected_paper['2_answers:expert']}", visible=False, elem_classes=["small-font"])
+                with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_2_0:
+                    depth_q_2 = gr.Markdown(f"### 🙋🙋 {selected_paper['2_additional_depth_q:follow up question']}")
+                    depth_q_eli5_2 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
+                    depth_q_expert_2 = gr.Markdown(f"↪ **(Technical)** {selected_paper['2_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])
+                with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_2_1:
+                    breath_q_2 = gr.Markdown(f"### 🙋🙋 {selected_paper['2_additional_breath_q:follow up question']}")
+                    breath_q_eli5_2 = gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
+                    breath_q_expert_2 = gr.Markdown(f"↪ **(Technical)** {selected_paper['2_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])
+        gr.Markdown("## Request any arXiv ids")
+        arxiv_queue = gr.Dataframe(
+            headers=["Requested arXiv IDs"], col_count=(1, "fixed"),
+            value=requested_arxiv_ids_df,
+            datatype=["str"],
+            interactive=False
+        )
+        arxiv_id_enter = gr.Textbox(placeholder="Enter comma separated arXiv IDs...", elem_classes=["textbox-no-label"])
+        arxiv_id_enter.submit(
+            add_arxiv_ids_to_queue,
+            [arxiv_queue, arxiv_id_enter],
+            arxiv_queue
+        )
     gr.Markdown("The target papers are collected from [Hugging Face 🤗 Daily Papers](https://huggingface.co/papers) on a daily basis. "
                 "The entire data is generated by [Google's Gemini 1.0](https://deepmind.google/technologies/gemini/) Pro. "
                 "If you are curious how it is done, visit the [Auto Paper Q&A Generation project repository](https://github.com/deep-diver/auto-paper-analysis) "
                 "Also, the generated dataset is hosted on Hugging Face 🤗 Dataset repository as well([Link](https://huggingface.co/datasets/chansung/auto-paper-qa2)). ")
+    search_r1.click(set_date, search_r1, date_dd).then(
         set_papers,
         inputs=[date_dd, search_r1],
         outputs=[papers_dd, search_in]
     )
+    search_r2.click(set_date, search_r2, date_dd).then(
         set_papers,
         inputs=[date_dd, search_r2],
         outputs=[papers_dd, search_in]
     )
+    search_r3.click(set_date, search_r3, date_dd).then(
         set_papers,
         inputs=[date_dd, search_r3],
         outputs=[papers_dd, search_in]
     )
+    search_r4.click(set_date, search_r4, date_dd).then(
+        set_papers,
+        inputs=[date_dd, search_r4],
+        outputs=[papers_dd, search_in]
+    )
+    search_r5.click(set_date, search_r5, date_dd).then(
+        set_papers,
+        inputs=[date_dd, search_r5],
+        outputs=[papers_dd, search_in]
+    )
+    search_r6.click(set_date, search_r6, date_dd).then(
+        set_papers,
+        inputs=[date_dd, search_r6],
+        outputs=[papers_dd, search_in]
+    )
+    search_r7.click(set_date, search_r7, date_dd).then(
+        set_papers,
+        inputs=[date_dd, search_r7],
+        outputs=[papers_dd, search_in]
+    )
+    search_r8.click(set_date, search_r8, date_dd).then(
+        set_papers,
+        inputs=[date_dd, search_r8],
+        outputs=[papers_dd, search_in]
+    )
+    search_r9.click(set_date, search_r9, date_dd).then(
+        set_papers,
+        inputs=[date_dd, search_r9],
+        outputs=[papers_dd, search_in]
+    )
+    search_r10.click(set_date, search_r10, date_dd).then(
+        set_papers,
+        inputs=[date_dd, search_r10],
+        outputs=[papers_dd, search_in]
+    )
+    date_dd.input(get_papers, date_dd, papers_dd).then(
         set_paper,
         [date_dd, papers_dd],
         [
     search_in.change(
         inputs=[search_in],
+        outputs=[
+            search_r1, search_r2, search_r3, search_r4, search_r5,
+            search_r6, search_r7, search_r8, search_r9, search_r10
+        ],
         js=UPDATE_SEARCH_RESULTS,
         fn=None
     )
         ]
     )
+    conv_type.select(
+        inputs=[conv_type],
+        js=UPDATE_IF_TYPE,
+        outputs=None,
+        fn=None
+    )
# Schedule process_arxiv_ids to run every 3600 seconds in the background,
# with the first run one minute after this module is executed.
start_date = datetime.now() + timedelta(minutes=1)

scheduler = BackgroundScheduler()
scheduler.add_job(
    process_arxiv_ids,
    args=[gemini_api_key, dataset_repo_id, request_arxiv_repo_id, hf_token],
    trigger='interval',
    seconds=3600,
    start_date=start_date,
)
scheduler.start()

# Start the Gradio app once the scheduler is running.
demo.launch(debug=True, share=True)

constants/prompts.toml ADDED Viewed

	@@ -0,0 +1,17 @@

+[basic_qa]
+prompt = """
+come up with the 6 questions and answers that could be commonly asked by people about the following paper.
+There should be two types of answers included, one for expert and the other for ELI5.
+Your response should be recorded in a JSON format as ```json{"title": text, "summary": text, "qna": [{"question": text, "answers": {"eli5": text, "expert": text}}, ...]}```
+"""
+[deep_qa]
+prompt = """
+Paper title: $title
+Previous question: $previous_question
+The answer on the previous question: $previous_answer
+Based on the previous question and answer above, and based on the paper content below, suggest follow-up question and answers in $tone manner.
+There should be two types of answers included, one for expert and the other for ELI5.
+Your response should be recorded in a JSON format as ```json{"follow up question": text, "answers": {"eli5": text, "expert": text}}```
+"""

date_iterator.sh ADDED Viewed

	@@ -0,0 +1,27 @@

#!/bin/bash
# Run app.py once per calendar day over an inclusive date range.
#
# Usage: date_iterator.sh START_DATE END_DATE HF_REPO_ID
#   START_DATE, END_DATE  inclusive bounds, format YYYY-MM-DD
#   HF_REPO_ID            forwarded to app.py as --hf-repo-id
# GEMINI_API_KEY and HF_ACCESS_TOKEN are read from the environment.

if [[ $# -ne 3 ]]; then
  echo "Usage: $0 START_DATE END_DATE HF_REPO_ID" >&2
  exit 1
fi

start_date=$1
end_date=$2
hf_repo_id=$3

# Convert dates to/from seconds since epoch (for easier calculations).
# Everything is done at midnight UTC so that adding 86400 seconds always
# lands on the next calendar day, regardless of local DST changes.
# GNU date accepts --version; BSD/macOS date does not, which tells them apart.
if date --version >/dev/null 2>&1; then
  to_seconds() { date -u -d "$1" "+%s"; }
  to_date() { date -u -d "@$1" "+%Y-%m-%d"; }
else
  to_seconds() { date -j -u -f "%Y-%m-%d %H:%M:%S" "$1 00:00:00" "+%s"; }
  to_date() { date -j -u -r "$1" "+%Y-%m-%d"; }
fi

start_seconds=$(to_seconds "$start_date") || exit 1
end_seconds=$(to_seconds "$end_date") || exit 1

# Iterate through dates
current_seconds=$start_seconds
while [[ $current_seconds -le $end_seconds ]]; do
  current_date=$(to_date "$current_seconds")

  echo "Running program for date: $current_date"
  python app.py --target-date "$current_date" \
    --gemini-api "$GEMINI_API_KEY" \
    --hf-token "$HF_ACCESS_TOKEN" \
    --hf-repo-id "$hf_repo_id" \
    --hf-daily-papers

  current_seconds=$((current_seconds + 86400))  # Add 1 day (86400 seconds)
done

gen/gemini.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import ast
+import copy
+import toml
+from string import Template
+from pathlib import Path
+from flatdict import FlatDict
+import google.generativeai as genai
+from gen.utils import parse_first_json_snippet
def determine_model_name(given_image=None):
    """Pick the Gemini model name for a request.

    Returns "gemini-pro-vision" when an image is supplied and the
    text-only "gemini-pro" otherwise. Only ``None`` counts as "no image".
    """
    has_image = given_image is not None
    return "gemini-pro-vision" if has_image else "gemini-pro"
def construct_image_part(given_image, mime_type="image/jpeg"):
    """Wrap image data in the dict form appended to the Gemini prompt parts.

    Args:
        given_image: The image payload; stored unchanged under "data".
        mime_type: MIME type declared for the payload. Defaults to
            "image/jpeg", the value that used to be hard-coded, so existing
            callers are unaffected. Pass e.g. "image/png" for PNG bytes.

    Returns:
        A dict with the keys "mime_type" and "data".
    """
    return {
        "mime_type": mime_type,
        "data": given_image,
    }
def call_gemini(prompt="", API_KEY=None, given_text=None, given_image=None, generation_config=None, safety_settings=None):
    """Send a prompt (plus optional text and image) to Gemini and return the reply text.

    Args:
        prompt: Instruction text placed at the start of the request.
        API_KEY: Key passed to ``genai.configure``.
        given_text: Optional extra text appended after the prompt, below a
            separator line.
        given_image: Optional image data. When given, the vision model is
            selected and the image is added as a second prompt part.
        generation_config: Generation settings; a default dict is used
            when None.
        safety_settings: Safety settings; when None, every listed harm
            category is set to "BLOCK_NONE".

    Returns:
        ``response.text`` of the generated content.
    """
    genai.configure(api_key=API_KEY)

    if generation_config is None:
        generation_config = {
            "temperature": 0.8,
            "top_p": 1,
            "top_k": 32,
            "max_output_tokens": 4096,
        }

    if safety_settings is None:
        harm_categories = (
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        )
        safety_settings = [
            {"category": category, "threshold": "BLOCK_NONE"}
            for category in harm_categories
        ]

    model = genai.GenerativeModel(
        model_name=determine_model_name(given_image),
        generation_config=generation_config,
        safety_settings=safety_settings,
    )

    # Build the prompt exactly once. The previous code started from
    # `prompt` and then appended an f-string that began with `prompt`
    # again, so the instructions were sent twice whenever given_text was set.
    user_prompt = prompt
    if given_text is not None:
        user_prompt = (
            f"{prompt}\n"
            "------------------------------------------------\n"
            f"{given_text}\n"
        )

    prompt_parts = [user_prompt]
    if given_image is not None:
        prompt_parts.append(construct_image_part(given_image))

    response = model.generate_content(prompt_parts)
    return response.text
def try_out(prompt, given_text, gemini_api_key, given_image=None, retry_num=5):
    """Call Gemini and parse the first JSON snippet of the reply, with retries.

    Args:
        prompt: Instruction text forwarded to ``call_gemini``.
        given_text: Extra text forwarded to ``call_gemini``.
        gemini_api_key: API key forwarded to ``call_gemini``.
        given_image: Optional image forwarded to ``call_gemini``.
        retry_num: Maximum number of attempts.

    Returns:
        The parsed JSON value, or None when no attempt produced one.
    """
    # Every attempt counts against the budget. Previously only attempts that
    # raised were counted, so a reply that parsed to None looped forever.
    for _ in range(retry_num):
        try:
            qna = call_gemini(
                prompt=prompt,
                given_text=given_text,
                given_image=given_image,
                API_KEY=gemini_api_key
            )
            qna_json = parse_first_json_snippet(qna)
        except Exception as e:
            # Best-effort boundary: any API or parsing failure just
            # triggers another attempt.
            print(f"......retry {e}")
            continue

        if qna_json is not None:
            return qna_json

    return None
def get_basic_qa(text, gemini_api_key, trucate=7000):
    """Ask Gemini for the basic Q&A set of a paper.

    Loads the ``basic_qa`` prompt from ``constants/prompts.toml`` (resolved
    against the current working directory) and sends it together with the
    first ``trucate`` characters of ``text`` through ``try_out``, whose
    result is returned as-is.
    """
    prompt_file = Path('.') / 'constants' / 'prompts.toml'
    basic_prompt = toml.load(prompt_file)['basic_qa']['prompt']
    paper_excerpt = text[:trucate]
    return try_out(basic_prompt, paper_excerpt, gemini_api_key=gemini_api_key)
def _is_complete_follow_up(response):
    """Return True when response holds a follow-up question with both answers."""
    if not isinstance(response, dict):
        return False
    answers = response.get('answers')
    return (
        'follow up question' in response
        and isinstance(answers, dict)
        and 'eli5' in answers
        and 'expert' in answers
    )


def _ask_follow_up(prompt, text, gemini_api_key, max_attempts=None):
    """Call try_out until it yields a complete follow-up; None if attempts run out."""
    attempt = 0
    while max_attempts is None or attempt < max_attempts:
        response = try_out(prompt, text, gemini_api_key=gemini_api_key)
        if _is_complete_follow_up(response):
            return response
        attempt += 1
    return None


def get_deep_qa(text, basic_qa, gemini_api_key, trucate=7000, max_attempts=None):
    """Extend each basic Q&A with an in-depth and a broad follow-up Q&A.

    For every entry of ``basic_qa['qna']`` two follow-ups are requested from
    Gemini. The entry, including the follow-ups, is then flattened with
    FlatDict, each key is prefixed with ``"<index>_"`` and the result is
    merged into ``basic_qa``.

    Args:
        text: Paper text; only the first ``trucate`` characters are sent.
        basic_qa: Dict with at least ``title`` and ``qna``. It is updated in
            place and also returned.
        gemini_api_key: API key forwarded to ``try_out``.
        trucate: Number of leading characters of ``text`` to send.
        max_attempts: Upper bound on ``try_out`` calls per follow-up. None
            (the default) keeps retrying until a complete response arrives,
            as before. When the bound is hit, that follow-up is left out.

    Returns:
        The updated ``basic_qa``.
    """
    prompts = toml.load(Path('.') / 'constants' / 'prompts.toml')
    follow_up_template = Template(prompts['deep_qa']['prompt'])
    title = basic_qa['title']
    excerpt = text[:trucate]

    for idx, qna in enumerate(copy.deepcopy(basic_qa['qna'])):
        shared_fields = {
            'title': title,
            'previous_question': qna['question'],
            'previous_answer': qna['answers']['expert'],
        }
        depth_search_prompt = follow_up_template.substitute(tone="in-depth", **shared_fields)
        breath_search_prompt = follow_up_template.substitute(tone="broad", **shared_fields)

        # try_out returns None once its own retries are exhausted. The old
        # inline `in` checks raised TypeError on that None; the helper
        # treats it as "incomplete, ask again".
        depth_search_response = _ask_follow_up(
            depth_search_prompt, excerpt, gemini_api_key, max_attempts
        )
        breath_search_response = _ask_follow_up(
            breath_search_prompt, excerpt, gemini_api_key, max_attempts
        )

        if depth_search_response is not None:
            qna['additional_depth_q'] = depth_search_response
        if breath_search_response is not None:
            qna['additional_breath_q'] = breath_search_response

        # Read the flattened keys directly instead of round-tripping the
        # FlatDict through str() and ast.literal_eval().
        flat_qna = FlatDict(qna)
        basic_qa.update({f'{idx}_{k}': flat_qna[k] for k in flat_qna.keys()})

    return basic_qa

gen/utils.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import json
def find_json_snippet(raw_snippet):
    """Parse the JSON object embedded in a string.

    The candidate snippet runs from the first '{' to the last '}' of
    ``raw_snippet``. It is parsed with ``strict=False`` so control
    characters inside JSON strings are accepted.

    Args:
        raw_snippet: Text that contains a JSON object.

    Returns:
        The parsed JSON value.

    Raises:
        ValueError: If no '{' ... '}' span exists or the span is not valid JSON.
    """
    json_start_index = raw_snippet.find('{')
    json_end_index = raw_snippet.rfind('}')

    # Also covers the case where the only '}' sits before the first '{'.
    if json_start_index < 0 or json_end_index < json_start_index:
        raise ValueError('......No JSON code snippet found in string.')

    json_snippet = raw_snippet[json_start_index:json_end_index + 1]
    try:
        return json.loads(json_snippet, strict=False)
    except (ValueError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError subclass; chain it so the
        # underlying parse error stays visible in tracebacks.
        raise ValueError('......failed to parse string into JSON format') from e
def parse_first_json_snippet(snippet):
    """Parse JSON out of a string, or out of the first parsable item of a list.

    Args:
        snippet: Either a string or a list of strings.

    Returns:
        The parsed JSON value. For a list input, None when no item could be
        parsed.

    Raises:
        ValueError: For a non-list input that cannot be parsed. The message
            carries the underlying error.
    """
    # Only the failures find_json_snippet produces for bad input are handled:
    # ValueError for missing or invalid JSON, AttributeError/TypeError for
    # non-string values.
    parse_errors = (ValueError, AttributeError, TypeError)

    if isinstance(snippet, list):
        for snippet_piece in snippet:
            try:
                return find_json_snippet(snippet_piece)
            except parse_errors:
                continue
        return None

    try:
        return find_json_snippet(snippet)
    except parse_errors as e:
        raise ValueError(str(e)) from e

outputs.json ADDED Viewed

The diff for this file is too large to render. See raw diff

paper/download.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import re
+import json
+import requests
+import datetime
+from datetime import date
+from datetime import datetime
+import xml.etree.ElementTree as ET
+from requests.exceptions import HTTPError
+def _get_today():
+    return str(date.today())
def _download_pdf_from_arxiv(filename, timeout=30):
    """Download ``https://arxiv.org/pdf/<filename>`` and return the body bytes.

    Args:
        filename: Last path component of the arXiv PDF URL.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the caller indefinitely.

    Returns:
        The response content as bytes.

    Raises:
        Exception: If the response status code is not 200.
    """
    url = f'https://arxiv.org/pdf/{filename}'
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Failed to download pdf for arXiv id {filename}")
    return response.content
def download_pdf_from_arxiv(arxiv_id):
    """Download a paper's PDF and save it as ``<arxiv_id>.pdf``.

    The file is written relative to the current working directory, and only
    after the download has succeeded. Returns the file name that was written.
    """
    target_name = f"{arxiv_id}.pdf"
    payload = _download_pdf_from_arxiv(target_name)

    with open(target_name, "wb") as out_file:
        out_file.write(payload)

    return target_name
def _get_papers_from_hf_daily_papers(target_date, timeout=30):
    """Fetch the raw Hugging Face daily-papers listing for one date.

    Args:
        target_date: Date string placed in the ``date`` query parameter.
            When None, today's date is used.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the caller indefinitely.

    Returns:
        Tuple ``(target_date, response_text)`` with the date actually
        queried and the unparsed response body.

    Raises:
        HTTPError: If the response status code is not 200.
    """
    if target_date is None:
        target_date = _get_today()
        print(f"target_date is not set => scrap today's papers [{target_date}]")

    url = f"https://huggingface.co/api/daily_papers?date={target_date}"
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise HTTPError(f"Error fetching data. Status code: {response.status_code}")
    return target_date, response.text
def get_papers_from_hf_daily_papers(target_date):
    """Return the daily-papers entries for a date, each tagged with that date.

    The raw listing is decoded from JSON and every entry gets a
    ``"target_date"`` key holding the date that was queried. Returns the
    tuple ``(target_date, entries)``.
    """
    resolved_date, raw_body = _get_papers_from_hf_daily_papers(target_date)
    papers = json.loads(raw_body)

    for paper in papers:
        paper["target_date"] = resolved_date

    return resolved_date, papers
def _get_paper_xml_by_arxiv_id(arxiv_id, timeout=30):
    """Query the arXiv export API for one paper id and return the response.

    Args:
        arxiv_id: Identifier inserted into the ``id:`` search query.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the caller indefinitely.

    Returns:
        The ``requests`` response object; the status code is not checked here.
    """
    url = f"http://export.arxiv.org/api/query?search_query=id:{arxiv_id}&start=0&max_results=1"
    return requests.get(url, timeout=timeout)
+def _is_arxiv_id_valid(arxiv_id):
+  pattern = r"^\d{4}\.\d{5}$"
+  return bool(re.match(pattern, arxiv_id))
+def _get_paper_metadata_by_arxiv_id(response):
+    root = ET.fromstring(response.content)
+    # Example: Extracting title, authors, and abstract
+    title = root.find('{http://www.w3.org/2005/Atom}entry/{http://www.w3.org/2005/Atom}title').text
+    authors = [author.find('{http://www.w3.org/2005/Atom}name').text for author in root.findall('{http://www.w3.org/2005/Atom}entry/{http://www.w3.org/2005/Atom}author')]
+    abstract = root.find('{http://www.w3.org/2005/Atom}entry/{http://www.w3.org/2005/Atom}summary').text
+    target_date = root.find('{http://www.w3.org/2005/Atom}entry/{http://www.w3.org/2005/Atom}published').text
+    return title, authors, abstract, target_date
def get_papers_from_arxiv_ids(arxiv_ids):
    """Collect metadata records for a list of arXiv ids.

    Ids that fail validation, and ids whose metadata cannot be fetched or
    parsed, are reported on stdout and skipped.

    Args:
        arxiv_ids: Iterable of arXiv id strings.

    Returns:
        A list of dicts with ``title``, ``target_date`` (``YYYY-MM-DD``) and
        a nested ``paper`` dict holding ``summary``, ``id`` and ``authors``.
    """
    results = []

    for arxiv_id in arxiv_ids:
        print(arxiv_id)

        if not _is_arxiv_id_valid(arxiv_id):
            print(f"......not a valid arXiv ID[{arxiv_id}]")
            continue

        try:
            xml_data = _get_paper_xml_by_arxiv_id(arxiv_id)
            title, authors, abstract, target_date = _get_paper_metadata_by_arxiv_id(xml_data)

            datetime_obj = datetime.strptime(target_date, "%Y-%m-%dT%H:%M:%SZ")
            formatted_date = datetime_obj.strftime("%Y-%m-%d")
        except Exception as e:
            # Best effort per id. Unlike a bare `except`, this lets
            # KeyboardInterrupt/SystemExit through, and the cause is reported.
            print("......something wrong happend when downloading metadata")
            print("......this usually happens when you try out the today's published paper")
            print(f"......{e!r}")
            continue

        results.append(
            {
                "title": title,
                "target_date": formatted_date,
                "paper": {
                    "summary": abstract,
                    "id": arxiv_id,
                    "authors": authors,
                }
            }
        )

    return results

paper/parser.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import os
+import fitz
+import PyPDF2
def extract_text_and_figures(pdf_path):
    """
    Extracts text and figures from a PDF file, then deletes the file.

    Text is collected twice: once per page with PyMuPDF (fitz) and once per
    page with PyPDF2, so the returned list holds both extractions one after
    the other.

    Args:
        pdf_path (str): The path to the PDF file. The file is removed
            before returning.

    Returns:
        tuple: A tuple containing two lists:
            * A list of extracted text blocks.
            * A list of extracted figures (PNG-encoded bytes).
    """
    texts = []
    figures = []

    # Open the PDF using PyMuPDF (fitz) for text and image extraction
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            texts.append(page.get_text("text"))  # Extract text as plain text

            # Process images on the page
            for img in page.get_images():
                xref = img[0]  # Image XREF
                pix = fitz.Pixmap(doc, xref)  # Create Pixmap image

                # Colour components excluding alpha: 4 or more means CMYK,
                # which has to be converted to RGB before PNG encoding. The
                # old `pix.n < 5` test let CMYK pixmaps without an alpha
                # channel (n == 4) skip the conversion.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                figures.append(pix.tobytes("png"))
    finally:
        # Release the document before the file is reopened and removed below.
        doc.close()

    # Extract additional text using PyPDF2 (in case fitz didn't get everything)
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for pdf_page in pdf_reader.pages:
            texts.append(pdf_page.extract_text())

    try:
        os.remove(pdf_path)
    except FileNotFoundError:
        print(f"File '{pdf_path}' not found.")
    except PermissionError:
        print(f"Unable to remove '{pdf_path}'. Check permissions.")

    return texts, figures

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+google-generativeai
+pypdf2
+PyMuPDF
+gradio
+requests
+toml
+datasets
+flatdict
+APScheduler

utils.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import pandas as pd
+import datasets
+from datasets import Dataset
+from huggingface_hub import create_repo
+from huggingface_hub.utils import HfHubHTTPError
def push_to_hf_hub(
    qnas, repo_id, token, append=True
):
    """Push one Q&A record to a Hugging Face dataset repository.

    Args:
        qnas: A single record (mapping). It becomes one row and must contain
            a ``target_date`` field, which is cast to ``timestamp[s]``.
        repo_id: Dataset repository id on the Hub.
        token: Access token used to create, read and push the repository.
        append: When True and the repository already exists, the new row is
            appended to the existing ``train`` split. Otherwise the pushed
            dataset contains only the new row.
    """
    df = pd.DataFrame([qnas])
    ds = Dataset.from_pandas(df)
    ds = ds.cast_column("target_date", datasets.features.Value("timestamp[s]"))

    try:
        create_repo(repo_id, repo_type="dataset", token=token)
        exist = False
    except HfHubHTTPError:
        # Any Hub HTTP error from create_repo is taken to mean the
        # repository already exists.
        exist = True

    if exist and append:
        # Pass the token so that private repositories can be read back too.
        existing_ds = datasets.load_dataset(repo_id, token=token)
        ds = datasets.concatenate_datasets([existing_ds['train'], ds])

    ds.push_to_hub(repo_id, token=token)