Spaces:

KalbeDigitalLab
/

NutriGenMePE

Build error

App Files Files Community

firqaaa committed on Dec 6, 2023

Commit

eb88b82

•

1 Parent(s): 5da956e

Upload 6 files

Browse files

Files changed (6) hide show

Dockerfile +43 -0
app.py +1107 -0
requirements.txt +16 -0
schema.py +87 -0
summ.py +68 -0
utils.py +116 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,43 @@

+FROM python:3.9
+# System dependencies: poppler-utils is needed by pdf2image and tesseract by pytesseract
+# (both imported by app.py). apt-get is used instead of apt because apt's command-line
+# interface is not meant for scripted, non-interactive use.
+RUN apt-get update && \
+    apt-get install -y bash \
+                       poppler-utils \
+                       tesseract-ocr \
+                       libtesseract-dev \
+                       build-essential \
+                       git \
+                       curl \
+                       ca-certificates \
+                       python3 \
+                       python3-pip && \
+    rm -rf /var/lib/apt/lists
+WORKDIR /code
+COPY ./requirements.txt /code/requirements.txt
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory and put the per-user bin directory on PATH
+# before installing, so console scripts installed by pip (e.g. streamlit) are resolvable.
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# Pre-fetch the NLTK "punkt" tokenizer data at build time
+RUN [ "python", "-c", "import nltk; nltk.download('punkt')" ]
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user.
+# This must stay the only copy of the sources: a following plain `COPY . .` (no --chown)
+# would rewrite the same files as root-owned.
+COPY --chown=user . $HOME/app
+CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableXsrfProtection=false"]

app.py ADDED Viewed

	@@ -0,0 +1,1107 @@

+# Author: Firqa Aqila Noor Arasyi
+# Date: 2023-12-04
+import os
+import io
+import json
+import pandas as pd
+import streamlit as st
+from stqdm import stqdm
+from ast import literal_eval
+from tempfile import NamedTemporaryFile
+import PyPDF2
+import pdf2image
+import pytesseract
+from utils import *
+from schema import *
+from summ import get_summ
+from datetime import datetime
+import time
+import base64
+import string
+import random
+import numpy as np
+from langchain.llms import OpenAI
+from langchain.chains import RetrievalQA
+from langchain.vectorstores import Chroma
+from langchain.chat_models import ChatOpenAI
+from langchain.document_loaders import TextLoader
+from chromadb.utils import embedding_functions
+from unstructured.partition.pdf import partition_pdf
+from unstructured.staging.base import elements_to_json
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.chains import create_extraction_chain
+from Bio import Entrez
+nltk.download("punkt")
+os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")
+Entrez.email = os.getenv("ENTREZ_EMAIL")
+Entrez.api_key = os.getenv("ENTREZ_API_KEY")
+fold = -1
+buffer = io.BytesIO()
+st.cache_data()
+def convert_df(df):
+    return df.to_csv().encode("utf-8")
+# Function to create a download link for an Excel file
+# def create_excel_download_link(df, file_name):
+#     output = io.BytesIO()
+#     with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
+#         df.to_excel(writer, sheet_name='Sheet1', index=False)
+#     excel_data = output.getvalue()
+#     st.download_button(label="Download Excel File", data=excel_data, key=file_name, file_name=f"{file_name}.xlsx")
+class Journal:
+    def __init__(self, name, bytes):
+        self.name = name
+        self.bytes = bytes
+    def __repr__(self):
+        return f"Journal(name='{self.name}', bytes='{self.bytes}')"
+llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")  # chat model shared by both extraction chains below
+textex_chain = create_extraction_chain(textex_schema, llm)  # textex_schema is not defined in this file; presumably from `schema` via the star import
+tablex_chain = create_extraction_chain(tablex_schema, llm)  # tablex_schema: same assumption as above
+st.set_page_config(page_title="NutriGenMe Paper Extractor")  # page title shown for the app
+st.title("NutriGenMe - Paper Extraction")
+st.markdown("<div style='text-align: left; color: white; font-size: 16px'>In its latest version, the app is equipped to extract essential information from papers, including tables in both horizontal and vertical orientations, images, and text exclusively.</div><br>", unsafe_allow_html=True)
+uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)  # PDF only, several files allowed
+if uploaded_files:  # show the usage guidelines only once at least one file is uploaded
+    st.warning("""
+            Warning! Prior to proceeding, please take a moment to review the following : \n
+            Certain guidelines apply when utilizing this application, particularly if you intend to extract information from tables, whether they are oriented horizontally or vertically.
+               - If you intend to perform multiple PDF processes using Horizontal Table Extraction, ensure that all your PDF files adhere to a horizontal table format
+               - If you plan to undertake multiple PDF processes with Vertical Table Extraction, ensure that all your PDF files conform to a vertical table format
+            """, icon="⚠️")
+col1, col2, col3 = st.columns(3)  # three side-by-side columns, one per extraction-mode toggle
+if uploaded_files:
+    journals = []
+    strategy = "hi_res"
+    model_name = "yolox"
+    on_h, on_v, on_t = None, None, None
+    parseButtonH, parseButtonV, parseButtonT = None, None, None
+    # if uploaded_files:
+    with col1:
+        if on_v or on_t:
+            on_h = st.toggle("Horizontal Table Extraction", disabled=True)
+        else:
+            on_h = st.toggle("Horizontal Table Extraction")
+            if on_h:
+                chunk_size_h = st.selectbox(
+                    'Tokens amounts per process :',
+                    (16000, 12000, 10000, 8000, 5000), key='table_h'
+                )
+                parseButtonH = st.button("Get Result", key='table_H')
+    with col2:
+        if on_h or on_t:
+            on_v = st.toggle("Vertical Table Extraction", disabled=True)
+        else:
+            on_v = st.toggle("Vertical Table Extraction")
+            if on_v:
+                chunk_size_v = st.selectbox(
+                    'Tokens amounts per process :',
+                    (16000, 12000, 10000, 8000, 5000), key='table_v'
+                )
+                parseButtonV = st.button("Get Result", key='table_V')
+    with col3:
+        if on_h or on_v:
+            on_t = st.toggle("Text Extraction ", disabled=True)
+        else:
+            on_t = st.toggle("Text Extraction ")
+            if on_t:
+                chunk_size_t = st.selectbox(
+                    'Tokens amounts per process :',
+                    (16000, 12000, 10000, 8000, 5000), key='no_table'
+                )
+                parseButtonT = st.button("Get Result", key="no_Table")
+    if on_h:
+        if parseButtonH:
+            with st.status("Extraction in progress ...", expanded=True) as status:
+                st.write("Getting Result ...")
+                csv = pd.DataFrame()
+                for uploaded_file in stqdm(uploaded_files):
+                    with NamedTemporaryFile(dir='.', suffix=".pdf") as pdf:
+                        pdf.write(uploaded_file.getbuffer())
+                        # st.write(pdf.name)
+                        L = []
+                        # Entity Extraction
+                        st.write("☑ Extracting Entities ...")
+                        bytes_data = uploaded_file.read()
+                        journal = Journal(uploaded_file.name, bytes_data)
+                        images = pdf2image.convert_from_bytes(journal.bytes)
+                        extracted_text = ""
+                        for image in images[:-1]:
+                            text = pytesseract.image_to_string(image)
+                            text = clean_text(text)
+                            extracted_text += text + " "
+                        text = replace_quotes(extracted_text)
+                        text_chunk = split_text(text, chunk_size_h)
+                        chunkdf = []
+                        for i, chunk in enumerate(text_chunk):
+                            inp = chunk
+                            df = pd.DataFrame(literal_eval(str(json.dumps(tablex_chain.run(inp)[0])).replace("\'", "\"")), index=[0]).fillna('')
+                            chunkdf.append(df)
+                        concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
+                        st.write("☑ Entities Extraction Done ..")
+                        time.sleep(0.1)
+                        st.write("☑ Generating Summary ...")
+                        summary = get_summ(pdf.name)
+                        st.write("☑ Generating Summary Done ..")
+                        time.sleep(0.1)
+                        st.write("☑ Table Extraction in progress ...")
+                        # Table Extraction
+                        # L = []
+                        output_list = []
+                        elements = partition_pdf(filename=pdf.name, strategy=strategy, infer_table_structure=True, model_name=model_name)
+                        with NamedTemporaryFile(dir=".", suffix=".json") as f:
+                            elements_to_json(elements, filename=f"{f.name.split('/')[-1]}")
+                            json_file_path = os.path.abspath(f.name)  # Get the absolute file path
+                            with open(json_file_path, "r", encoding="utf-8") as jsonfile:
+                                data = json.load(jsonfile)
+                            extracted_elements = []
+                            for entry in data:
+                                if entry["type"] == "Table":
+                                    extracted_elements.append(entry["metadata"]["text_as_html"])
+                            with NamedTemporaryFile(dir='.' , suffix='.txt') as txt_file:
+                                text_file_path = os.path.abspath(txt_file.name)
+                                with open(text_file_path, "w", encoding="utf-8") as txtfile:
+                                    for element in extracted_elements:
+                                        txtfile.write(element + "\n\n")
+                                loader = TextLoader(text_file_path)
+                                documents = loader.load()
+                                # split it into chunks
+                                text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+                                docs = text_splitter.split_documents(documents)
+                                embeddings = OpenAIEmbeddings()
+                                db = Chroma.from_documents(docs, embeddings)
+                                llm_table = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)
+                                qa_chain = RetrievalQA.from_chain_type(llm_table, retriever=db.as_retriever())
+                                # List of questions
+                                questions = [
+                                    """Mention all genes / locus name with respective rsID / SNP and potential diseases in a curly brackets like this:
+                                    Example 1 : {"Genes" : "FTO", "SNPs" : "rs9939609", "Diseases" : "Obesity"}
+                                    """,
+                                    """Mention all genes / locus name with respective potential diseases in a curly brackets like this:
+                                    Example 2 : {"Genes" : "FTO", "SNPs" : "" (if not available), "Diseases" : "Obesity"}
+                                    """,
+                                    """Mention all rsIDs / SNPs / Variant with respective potential diseases / traits in a curly brackets like this:
+                                    Example 3 : {"Genes" : "", "SNPs" : "rs9939609", "Diseases" : "Obesity"}
+                                    """
+                                ]
+                                try:
+                                    for query in questions:
+                                        response = qa_chain({"query" : query})
+                                        output_list.append(response)
+                                except Exception as e:
+                                    pass
+                            db.delete_collection()
+                        # 1
+                        for i in range(len(output_list[0]['result'].split('\n'))):
+                            if output_list[0]['result'].split('\n')[i] != "":
+                                try:
+                                    row = literal_eval(output_list[0]['result'].split('\n')[i])[0]
+                                    row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                # 'Population' : concat['population_race'][0],
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                                # 'Sample Size' : concat['sample_size'][0]
+                                    }}
+                                    if len(row['Genes'].strip().split(',')) > 1:
+                                        for g in row['Genes'].strip().split(','):
+                                            L.append({
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                # 'Population' : concat['population_race'][0],
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                                # 'Sample Size' : concat['sample_size'][0],
+                                                'Genes' : g.strip().upper().replace('Unknown', ''),
+                                                'SNPs' : row['SNPs'].replace('Unknown', ''),
+                                                "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', '').replace('Unknown', '')
+                                            })
+                                    else:
+                                        L.append(row)
+                                except KeyError:
+                                    row = literal_eval(output_list[0]['result'].split('\n')[i])
+                                    row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                "Publisher Name" : concat['publisher_name'][0],
+                                                # 'Population' : concat['population_race'][0],
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                                # 'Sample Size' : concat['sample_size'][0]
+                                            }
+                                        }
+                                    if len(row['Genes'].strip().split(',')) > 1:
+                                        for g in row['Genes'].strip().split(','):
+                                            L.append({
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' :get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                                'Genes' : g.strip().upper().replace('Unknown', ''),
+                                                'SNPs' : row['SNPs'].replace('Unknown', ''),
+                                                "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', '').replace('Unknown', '')
+                                            })
+                                    else:
+                                        L.append(row)
+                                except SyntaxError:
+                                    row = literal_eval(output_list[0]['result'].split('\n')[i])
+                                    row = f"""{row}"""
+                                    row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                                # 'Population' : concat['population_race'][0],
+                                                # 'Sample Size' : concat['sample_size'][0]
+                                            }
+                                    }
+                                    if not row['SNPs'].startswith("rs"):
+                                        row.update({
+                                            'SNPs' : "-"
+                                        })
+                                    else:
+                                        L.append(row)
+                                except ValueError:
+                                    if type(output_list[0]['result'].split('\n')[i]) is dict:
+                                        row = output_list[0]['result'].split('\n')[i]
+                                        row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                            }
+                                        }
+                                        if not row['SNPs'].startswith("rs"):
+                                            row.update({
+                                                'SNPs' : "-"
+                                            })
+                                        else:
+                                            L.append(row)
+                        # 2
+                        for i in range(len(output_list[1]['result'].split('\n'))):
+                            if output_list[1]['result'].split('\n')[i] != "":
+                                try:
+                                    row = literal_eval(output_list[1]['result'].split('\n')[i])[0]
+                                    row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                            }
+                                        }
+                                    if row['SNPs'] != "Not available":
+                                        row.update({
+                                            'SNPs' : "Not available"
+                                    })
+                                    if len(row['Genes'].strip().split(',')) > 1:
+                                        for g in row['Genes'].strip().split(','):
+                                            L.append({
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                                'Genes' : g.strip().upper().replace('Unknown', ''),
+                                                "SNPs" : "Not available",
+                                                "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', '').replace('Unknown', '')
+                                            })
+                                    else:
+                                        L.append(row)
+                                except KeyError:
+                                    row = literal_eval(output_list[1]['result'].split('\n')[i])
+                                    row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' :get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                            }
+                                    }
+                                    if row['SNPs'] != "Not available":
+                                        row.update({
+                                            'SNPs' : "Not available"
+                                    })
+                                    if len(row['Genes'].strip().split(',')) > 1:
+                                        for g in row['Genes'].strip().split(','):
+                                            L.append({
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                                'Genes' : g.strip().upper().replace('Unknown', ''),
+                                                "SNPs" : "Not available",
+                                                "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', '').replace('Unknown', '')
+                                            })
+                                    else:
+                                        L.append(row)
+                                except SyntaxError:
+                                    row = f"""{row}"""
+                                    row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                            }
+                                    }
+                                    if not row['SNPs'].startswith("rs"):
+                                        row.update({
+                                            'SNPs' : "-"
+                                        })
+                                    else:
+                                        L.append(row)
+                                except ValueError:
+                                    if type(output_list[1]['result'].split('\n')[i]) is dict:
+                                        row = output_list[1]['result'].split('\n')[i]
+                                        row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                            }
+                                        }
+                                        if not row['SNPs'].startswith("rs"):
+                                            row.update({
+                                                'SNPs' : "-"
+                                            })
+                                        else:
+                                            L.append(row)
+                        # 3
+                        for i in range(len(output_list[2]['result'].split('\n'))):
+                            if output_list[2]['result'].split('\n')[i] != "":
+                                try:
+                                    row = literal_eval(output_list[2]['result'].split('\n')[i])[0]
+                                    row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                            }
+                                    }
+                                    if not row['SNPs'].startswith("rs"):
+                                        row.update({
+                                            'SNPs' : "-"
+                                        })
+                                    else:
+                                        L.append(row)
+                                except KeyError:
+                                    row = literal_eval(output_list[2]['result'].split('\n')[i])
+                                    row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                            }
+                                    }
+                                    if not row['SNPs'].startswith("rs"):
+                                        row.update({
+                                            'SNPs' : "-"
+                                        })
+                                    else:
+                                        L.append(row)
+                                except SyntaxError:
+                                    row = f"""{row}"""
+                                    row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                            }
+                                    }
+                                    if not row['SNPs'].startswith("rs"):
+                                        row.update({
+                                            'SNPs' : "-"
+                                        })
+                                    else:
+                                        L.append(row)
+                                except ValueError:
+                                    if type(output_list[2]['result'].split('\n')[i]) is dict:
+                                        row = output_list[2]['result'].split('\n')[i]
+                                        row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()),
+                                                'Recommendation' : summary,
+                                            }
+                                        }
+                                        if not row['SNPs'].startswith("rs"):
+                                            row.update({
+                                                'SNPs' : "-"
+                                            })
+                                        else:
+                                            L.append(row)
+                    st.write(output_list[2]['result'].split('\n'))
+                    st.write("☑ Table Extraction Done ...")
+                    status.update(label="Gene and SNPs succesfully collected.")
+                    csv = pd.DataFrame(L).replace('', 'Not available')
+                    csv = pd.DataFrame(L).replace('Unknown', '')
+                    st.dataframe(csv)
+                    generated_key = ''.join(random.choice(string.ascii_letters + string.digits) for i in range(16))
+                    # if st.button("Download Excel File", key=generated_key):
+                    #     excel_link = create_excel_download_link(csv, uploaded_file.name.replace('.pdf', ''))
+                    #     st.markdown(excel_link, unsafe_allow_html=True)
+                    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
+                        # Write each dataframe to a different worksheet
+                        csv.to_excel(writer, sheet_name='Result')
+                        writer.close()
+                    # time_now = datetime.now()
+                    # current_time = time_now.strftime("%H:%M:%S")
+                        csv = convert_df(csv)
+                        st.download_button(
+                            label="Save Result",
+                            data=buffer,
+                            file_name=f'{uploaded_file.name}'.replace('.pdf', '') + '.xlsx',
+                            mime='application/vnd.ms-excel',
+                            key=generated_key
+                        )
+    if on_v:
+        if parseButtonV:
+            with st.status("Extraction in progress ...", expanded=True) as status:
+                st.write("Getting Result ...")
+                csv = pd.DataFrame()
+                for uploaded_file in stqdm(uploaded_files):
+                    L = []
+                    with NamedTemporaryFile(dir='.', suffix=".pdf") as pdf:
+                        pdf.write(uploaded_file.getbuffer())
+                        # Open the PDF file in read-binary mode
+                        with open(pdf.name, 'rb') as pdf_file:
+                            # Create a PDF reader object
+                            pdf_reader = PyPDF2.PdfReader(pdf_file)
+                            # Create a PDF writer object to write the rotated pages to a new PDF
+                            pdf_writer = PyPDF2.PdfWriter()
+                            # Iterate through each page in the original PDF
+                            for page_num in range(len(pdf_reader.pages)):
+                                # Get the page object
+                                page = pdf_reader.pages[page_num]
+                                # Rotate the page 90 degrees clockwise (use -90 for counterclockwise)
+                                page.rotate(90)
+                                # Add the rotated page to the PDF writer
+                                pdf_writer.add_page(page)
+                        with NamedTemporaryFile(dir='.', suffix=".pdf") as rotated_pdf:
+                            pdf_writer.write(rotated_pdf.name)
+                            # Entity Extraction
+                            st.write("☑ Extracting Entities ...")
+                            bytes_data = uploaded_file.read()
+                            journal = Journal(uploaded_file.name, bytes_data)
+                            images = pdf2image.convert_from_bytes(journal.bytes)
+                            extracted_text = ""
+                            for image in images[:-1]:
+                                text = pytesseract.image_to_string(image)
+                                text = clean_text(text)
+                                extracted_text += text + " "
+                            text = replace_quotes(extracted_text)
+                            text_chunk = split_text(text, chunk_size_v)
+                            chunkdf = []
+                            for i, chunk in enumerate(text_chunk):
+                                inp = chunk
+                                df = pd.DataFrame(literal_eval(str(json.dumps(tablex_chain.run(inp)[0])).replace("\'", "\"")), index=[0]).fillna('')
+                                chunkdf.append(df)
+                            concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
+                            st.write("☑ Entities Extraction Done ..")
+                            time.sleep(0.1)
+                            st.write("☑ Generating Summary ...")
+                            summary = get_summ(pdf.name)
+                            st.write("☑ Generating Summary Done ..")
+                            time.sleep(0.1)
+                            st.write("☑ Table Extraction in progress ...")
+                            # Table Extraction
+                            output_list = []
+                            elements = partition_pdf(filename=rotated_pdf.name, strategy=strategy, infer_table_structure=True, model_name=model_name)
+                            with NamedTemporaryFile(dir=".", suffix=".json") as f:
+                                elements_to_json(elements, filename=f"{f.name.split('/')[-1]}")
+                                json_file_path = os.path.abspath(f.name)  # Get the absolute file path
+                                with open(json_file_path, "r", encoding="utf-8") as jsonfile:
+                                    data = json.load(jsonfile)
+                                extracted_elements = []
+                                for entry in data:
+                                    if entry["type"] == "Table":
+                                        extracted_elements.append(entry["metadata"]["text_as_html"])
+                                with NamedTemporaryFile(dir='.' , suffix='.txt') as txt_file:
+                                    text_file_path = os.path.abspath(txt_file.name)
+                                    with open(text_file_path, "w", encoding="utf-8") as txtfile:
+                                        for element in extracted_elements:
+                                            txtfile.write(element + "\n\n")
+                                    loader = TextLoader(text_file_path)
+                                    documents = loader.load()
+                                    # split it into chunks
+                                    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+                                    docs = text_splitter.split_documents(documents)
+                                    embeddings = OpenAIEmbeddings()
+                                    db = Chroma.from_documents(docs, embeddings)
+                                    llm_table = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)
+                                    qa_chain = RetrievalQA.from_chain_type(llm_table, retriever=db.as_retriever())
+                                    # List of questions
+                                    questions = [
+                                        """Mention all genes / locus name with respective rsID / SNP and potential diseases in a curly brackets like this:
+                                        Example 1 : {"Genes" : "FTO", "SNPs" : "rs9939609", "Diseases" : "Obesity"}
+                                        """,
+                                        """Mention all genes / locus name with respective potential diseases in a curly brackets like this:
+                                        Example 2 : {"Genes" : "FTO", "SNPs" : "" (if not available), "Diseases" : "Obesitya"}
+                                        """,
+                                        """Mention all rsIDs / SNPs / Variant with respective potential diseases / traits in a curly brackets like this:
+                                        Example 3 : {"Genes" : "", "SNPs" : "rs9939609", "Diseases" : "Obesity"}
+                                        """
+                                    ]
+                                    try:
+                                        for query in questions:
+                                            response = qa_chain({"query" : query})
+                                            output_list.append(response)
+                                    except Exception as e:
+                                        pass
+                                db.delete_collection()
+                            # 1
+                            for i in range(len(output_list[0]['result'].split('\n'))):
+                                if output_list[0]['result'].split('\n')[i] != "":
+                                    try:
+                                        row = literal_eval(output_list[0]['result'].split('\n')[i])[0]
+                                        row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                'Recommendation' : summary,
+                                        }}
+                                        if len(row['Genes'].strip().split(',')) > 1:
+                                            for g in row['Genes'].strip().split(','):
+                                                L.append({
+                                                    'Genes' : g.strip().upper(),
+                                                    'SNPs' : row['SNPs'],
+                                                    "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', ''),
+                                                    'Title' : concat['title'][0],
+                                                    'Authors' : concat['authors'][0],
+                                                    'Publisher Name' : concat['publisher_name'][0],
+                                                    'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                    'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                    'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Recommendation' : summary,
+                                                })
+                                        else:
+                                            L.append(row)
+                                    except KeyError:
+                                        row = literal_eval(output_list[0]['result'].split('\n')[i])
+                                        row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                'Recommendation' : summary,
+                                        }}
+                                        if len(row['Genes'].strip().split(',')) > 1:
+                                            for g in row['Genes'].strip().split(','):
+                                                L.append({
+                                                    'Genes' : g.strip().upper(),
+                                                    'SNPs' : row['SNPs'],
+                                                    "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', ''),
+                                                    'Title' : concat['title'][0],
+                                                    'Authors' : concat['authors'][0],
+                                                    'Publisher Name' : concat['publisher_name'][0],
+                                                    'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                    'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                    'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Recommendation' : summary,
+                                                })
+                                        else:
+                                            L.append(row)
+                                    except ValueError:
+                                        if type(output_list[0]['result'].split('\n')[i]) is dict:
+                                            row = output_list[0]['result'].split('\n')[i]
+                                            row = {**row, **{
+                                                    'Title' : concat['title'][0],
+                                                    'Authors' : concat['authors'][0],
+                                                    'Publisher Name' : concat['publisher_name'][0],
+                                                    'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                    'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                    'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Recommendation' : summary,
+                                                }
+                                            }
+                                            if not row['SNPs'].startswith("rs"):
+                                                row.update({
+                                                    'SNPs' : "-"
+                                                })
+                                            else:
+                                                L.append(row)
+                                    except SyntaxError:
+                                        row = literal_eval("""{}""".format(output_list[2]['result'].split('\n')[i]))
+                                        row = {**row, **{
+                                                    'Title' : concat['title'][0],
+                                                    'Authors' : concat['authors'][0],
+                                                    'Publisher Name' : concat['publisher_name'][0],
+                                                    'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                    'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                    'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Recommendation' : summary,
+                                                }
+                                        }
+                                        if not row['SNPs'].startswith("rs"):
+                                            row.update({
+                                                'SNPs' : "-"
+                                            })
+                                        else:
+                                            L.append(row)
+                            # 2
+                            for i in range(len(output_list[1]['result'].split('\n'))):
+                                if output_list[1]['result'].split('\n')[i] != "":
+                                    try:
+                                        row = literal_eval(output_list[1]['result'].split('\n')[i])[0]
+                                        row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                'Recommendation' : summary,
+                                        }}
+                                        if row['SNPs'] != "Not available":
+                                            row.update({
+                                                'SNPs' : "Not available"
+                                        })
+                                        if len(row['Genes'].strip().split(',')) > 1:
+                                            for g in row['Genes'].strip().split(','):
+                                                L.append({
+                                                    'Genes' : g.strip().upper(),
+                                                    "SNPs" : "Not available",
+                                                    "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', ''),
+                                                    'Title' : concat['title'][0],
+                                                    'Authors' : concat['authors'][0],
+                                                    'Publisher Name' : concat['publisher_name'][0],
+                                                    'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                    'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                    'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Recommendation' : summary,
+                                                })
+                                        else:
+                                            L.append(row)
+                                    except KeyError:
+                                        row = literal_eval(output_list[1]['result'].split('\n')[i])
+                                        row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                'Recommendation' : summary,
+                                        }}
+                                        if row['SNPs'] != "Not available":
+                                            row.update({
+                                                'SNPs' : "Not available"
+                                        })
+                                        if len(row['Genes'].strip().split(',')) > 1:
+                                            for g in row['Genes'].strip().split(','):
+                                                L.append({
+                                                    'Genes' : g.strip().upper(),
+                                                    "SNPs" : "Not available",
+                                                    "Diseases" : ''.join(list(row['Diseases'].title() if row['Diseases'] not in ['T2D', 'T2DM', 'NAFLD', 'CVD'] else row['Diseases'])).replace('Unknown', ''),
+                                                    'Title' : concat['title'][0],
+                                                    'Authors' : concat['authors'][0],
+                                                    'Publisher Name' : concat['publisher_name'][0],
+                                                    'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                    'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                    'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Recommendation' : summary,
+                                                })
+                                        else:
+                                            L.append(row)
+                                    except ValueError:
+                                        if type(output_list[1]['result'].split('\n')[i]) is dict:
+                                            row = output_list[1]['result'].split('\n')[i]
+                                            row = {**row, **{
+                                                    'Title' : concat['title'][0],
+                                                    'Authors' : concat['authors'][0],
+                                                    'Publisher Name' : concat['publisher_name'][0],
+                                                    'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                    'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                    'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Recommendation' : summary,
+                                                }
+                                            }
+                                            if not row['SNPs'].startswith("rs"):
+                                                row.update({
+                                                    'SNPs' : "-"
+                                                })
+                                            else:
+                                                L.append(row)
+                                    except SyntaxError:
+                                        row = literal_eval("""{}""".format(output_list[2]['result'].split('\n')[i]))
+                                        row = {**row, **{
+                                                    'Title' : concat['title'][0],
+                                                    'Authors' : concat['authors'][0],
+                                                    'Publisher Name' : concat['publisher_name'][0],
+                                                    'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                    'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                    'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Recommendation' : summary,
+                                                }
+                                        }
+                                        if not row['SNPs'].startswith("rs"):
+                                            row.update({
+                                                'SNPs' : "-"
+                                            })
+                                        else:
+                                            L.append(row)
+                        # 3
+                        for i in range(len(output_list[2]['result'].split('\n'))):
+                            if output_list[2]['result'].split('\n')[i] != "":
+                                try:
+                                    row = literal_eval(output_list[2]['result'].split('\n')[i])[0]
+                                    row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                'Recommendation' : summary,
+                                            }
+                                    }
+                                    if not row['SNPs'].startswith("rs"):
+                                        row.update({
+                                            'SNPs' : "-"
+                                        })
+                                    else:
+                                        L.append(row)
+                                except KeyError:
+                                    row = literal_eval(output_list[2]['result'].split('\n')[i])
+                                    row = {**row, **{
+                                                'Title' : concat['title'][0],
+                                                'Authors' : concat['authors'][0],
+                                                'Publisher Name' : concat['publisher_name'][0],
+                                                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                'Recommendation' : summary,
+                                            }
+                                    }
+                                    if not row['SNPs'].startswith("rs"):
+                                        row.update({
+                                            'SNPs' : "-"
+                                        })
+                                    else:
+                                        L.append(row)
+                                except ValueError:
+                                        if type(output_list[2]['result'].split('\n')[i]) is dict:
+                                            row = output_list[2]['result'].split('\n')[i]
+                                            row = {**row, **{
+                                                    'Title' : concat['title'][0],
+                                                    'Authors' : concat['authors'][0],
+                                                    'Publisher Name' : concat['publisher_name'][0],
+                                                    'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                    'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                    'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Recommendation' : summary,
+                                                }
+                                            }
+                                            if not row['SNPs'].startswith("rs"):
+                                                row.update({
+                                                    'SNPs' : "-"
+                                                })
+                                            else:
+                                                L.append(row)
+                                except SyntaxError:
+                                        row = literal_eval("""{}""".format(output_list[2]['result'].split('\n')[i]))
+                                        row = {**row, **{
+                                                    'Title' : concat['title'][0],
+                                                    'Authors' : concat['authors'][0],
+                                                    'Publisher Name' : concat['publisher_name'][0],
+                                                    'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())),
+                                                    'Population' : ' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()),
+                                                    'Study Methodology' : ' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Study Level' : ' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title(),
+                                                    'Recommendation' : summary,
+                                                }
+                                        }
+                                        if not row['SNPs'].startswith("rs"):
+                                            row.update({
+                                                'SNPs' : "-"
+                                            })
+                                        else:
+                                            L.append(row)
+                    st.write("☑ Table Extraction Done")
+                    status.update(label="Gene and SNPs succesfully collected.")
+                    csv = pd.DataFrame(L).replace('', 'Not available')
+                    csv = pd.DataFrame(L).replace('Unknown', '')
+                    st.dataframe(csv)
+                    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
+                        # Write each dataframe to a different worksheet
+                        csv.to_excel(writer, sheet_name='Result')
+                        writer.close()
+                    time_now = datetime.now()
+                    current_time = time_now.strftime("%H:%M:%S")
+                    csv = convert_df(csv)
+                    st.download_button(
+                        label="Save Result",
+                        data=buffer,
+                        file_name=f'{uploaded_file.name}'.replace('.pdf', '') + '.xlsx',
+                        mime='application/vnd.ms-excel'
+                    )
+    if on_t:
+        if parseButtonT:
+            with st.status("Extraction in progress ...", expanded=True) as status:
+                st.write("Getting Result ...")
+                csv = pd.DataFrame()
+                for uploaded_file in stqdm(uploaded_files):
+                    L = []
+                    with NamedTemporaryFile(dir='.', suffix=".pdf") as pdf:
+                        pdf.write(uploaded_file.getbuffer())
+                        # Entity Extraction
+                        st.write("☑ Extracting Entities ...")
+                        bytes_data = uploaded_file.read()
+                        journal = Journal(uploaded_file.name, bytes_data)
+                        images = pdf2image.convert_from_bytes(journal.bytes)
+                        extracted_text = ""
+                        for image in images[:-1]:
+                            text = pytesseract.image_to_string(image)
+                            text = clean_text(text)
+                            extracted_text += text + " "
+                        text = replace_quotes(extracted_text)
+                        text_chunk = split_text(text, chunk_size_t)
+                        chunkdf = []
+                        for i, chunk in enumerate(text_chunk):
+                            inp = chunk
+                            df = pd.DataFrame(literal_eval(str(json.dumps(textex_chain.run(inp)[0])).replace("\'", "\"")), index=[0]).fillna('')
+                            chunkdf.append(df)
+                        concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
+                        st.write("☑ Entities Extraction Done ..")
+                        time.sleep(0.1)
+                        st.write("☑ Generating Summary ...")
+                        concat['SNPs'] = concat['SNPs'].apply(lambda x: x if x.startswith('rs') else '')
+                        for col in list(concat.columns):
+                            concat[col] = concat[col].apply(lambda x: x if x not in ['N/A', 'not mentioned', 'Not mentioned', 'Unknown'] else '')
+                        summary = get_summ(pdf.name)
+                        time.sleep(0.1)
+                        st.write("☑ Generating Summary Done...")
+                        for i in range(len(concat)):
+                            if (len(concat['genes_locus'][i].split(',')) >= 1) and concat['SNPs'][i] == '':
+                                for g in concat['genes_locus'][i].split(','):
+                                    L.append({
+                                        'Title' : concat['title'][0],
+                                        'Author' : concat['authors'][0],
+                                        'Publisher Name' : concat['publisher'][0],
+                                        'Publication Year' : get_valid_year(' '.join(concat['publication_year'].values.tolist())),
+                                        'Genes' : g.upper(),
+                                        'Population' : upper_abbreviation(' '.join(np.unique(concat['population_race'].values.tolist())).title()),
+                                        'Diseases' : upper_abbreviation(' '.join(concat['diseases'].values.tolist()).title()),
+                                        'Sample Size' : sample_size_postproc(upper_abbreviation(' '.join(concat['sample_size'].values.tolist()).title())),
+                                        'SNPs' : concat['SNPs'][i],
+                                        'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).title()),
+                                        'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).title()),
+                                        'Recommendation' : summary,
+                                    })
+                            elif (len(concat['SNPs'][i].split(',')) >= 1):
+                                for s in concat['SNPs'][i].split(','):
+                                    try:
+                                        L.append({
+                                            'Title' : concat['title'][0],
+                                            'Author' : concat['authors'][0],
+                                            'Publisher Name' : concat['publisher'][0],
+                                            'Publication Year' : get_valid_year(' '.join(concat['publication_year'].values.tolist())),
+                                            'Genes' : get_geneName(s.strip()).upper(),
+                                            'Population' : upper_abbreviation(' '.join(np.unique(concat['population_race'].values.tolist())).title()),
+                                            'Diseases' : upper_abbreviation(' '.join(concat['diseases'].values.tolist()).title()),
+                                            'Sample Size' : sample_size_postproc(upper_abbreviation(' '.join(concat['sample_size'].values.tolist()).title())),
+                                            'SNPs' : s,
+                                            'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).title()),
+                                            'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).title()),
+                                            'Recommendation' : summary,
+                                        })
+                                    except Exception as e:
+                                        L.append({
+                                            'Title' : concat['title'][0],
+                                            'Author' : concat['authors'][0],
+                                            'Publisher Name' : concat['publisher'][0],
+                                            'Publication Year' : get_valid_year(' '.join(concat['publication_year'].values.tolist())),
+                                            'Genes' : '',
+                                            'Population' : upper_abbreviation(' '.join(np.unique(concat['population_race'].values.tolist())).title()),
+                                            'Diseases' : upper_abbreviation(' '.join(concat['diseases'].values.tolist()).title()),
+                                            'Sample Size' : sample_size_postproc(upper_abbreviation(' '.join(concat['sample_size'].values.tolist()).title())),
+                                            'SNPs' : s,
+                                            'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).title()),
+                                            'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).title()),
+                                            'Recommendation' : summary,
+                                        })
+                        csv = pd.concat([csv, pd.DataFrame(L)], ignore_index=True)
+                    status.update(label="Gene and SNPs succesfully collected.")
+                    st.dataframe(csv)
+                    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
+                        # Write each dataframe to a different worksheet
+                        csv.to_excel(writer, sheet_name='Result')
+                        writer.close()
+                    time_now = datetime.now()
+                    current_time = time_now.strftime("%H:%M:%S")
+                    csv = convert_df(csv)
+                    st.download_button(
+                        label="Save Result",
+                        data=buffer,
+                        file_name=f'{uploaded_file.name}'.replace('.pdf', '') + '.xlsx',
+                        mime='application/vnd.ms-excel'
+                    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+pikepdf
+stqdm
+pdf2image
+PyPDF2
+pytesseract
+unstructured
+chromadb==0.3.29
+nltk
+pandas
+streamlit
+xlsxwriter
+openai
+biopython
+langchain
+unstructured-pytesseract
+unstructured-inference

schema.py ADDED Viewed

	@@ -0,0 +1,87 @@

+# Extraction schema for the text-based flow. app.py reads the property names
+# below as DataFrame columns (e.g. concat['genes_locus'], concat['publisher'],
+# concat['publication_year']), so the keys must not be renamed. The
+# descriptions are instructions for the extraction model; only their wording
+# (spelling/grammar) is maintained here.
+textex_schema = {
+    "properties" : {
+        "title" : {
+            "type" : "string",
+            "description" : "Title of the given text. Often located in the top of the first page."
+        },
+        "authors" : {
+            "type" : "string",
+            "description" : "Authors / writers of the given text. Some of the names of the people."
+        },
+        "publisher" : {
+            "type" : "string",
+            "description" : "Publisher name of the given text."
+        },
+        "publication_year" : {
+            "type" : "string",
+            "description" : "The year when the given text was published."
+        },
+        "genes_locus" : {
+            "type" : "string",
+            "description" : "The gene or locus names mentioned in the text."
+        },
+        "diseases" : {
+            "type" : "string",
+            "description" : "Diseases / Phenotypes / Traits corresponding to the Gene / Locus / SNP mentioned in the text."
+        },
+        "SNPs" : {
+            "type" : "string",
+            "description" : "SNPs (Single Nucleotide Polymorphism) / rsID mentioned in the text. Usually starts with `rs` followed by some numbers."
+        },
+        "population_race" : {
+            "type" : "string",
+            "description" : "Population / race used by the author in the given text."
+        },
+        "sample_size" : {
+            "type" : "string",
+            "description" : "Sample size of the population used in the research that is mentioned in the paper."
+        },
+        "study_methodology" : {
+            "type" : "string",
+            "description" : "Study methodology mentioned in the text."
+        },
+        "study_level" : {
+            "type" : "string",
+            "description" : "Study level mentioned in the text."
+        }
+    },
+    # Only the title is mandatory; every other field may be absent.
+    "required" : ["title"]
+}
+# Extraction schema for the table-based flow. app.py reads the property names
+# below as DataFrame columns (e.g. concat['publisher_name'],
+# concat['year_of_publication']), so the keys must not be renamed. The
+# descriptions are instructions for the extraction model; only their wording
+# (spelling/grammar) is maintained here.
+tablex_schema = {
+    "properties" : {
+        "title" : {
+            "type" : "string",
+            "description" : "Title of the given text. Often located in the top of the first page. Usually at the top of authors name."
+        },
+        "authors" : {
+            "type" : "string",
+            "description" : "Authors / writers of the given text. Some of the names of the people."
+        },
+        "publisher_name" : {
+            "type" : "string",
+            "description" : "Publisher name of the given text."
+        },
+        "year_of_publication" : {
+            "type" : "string",
+            "description" : "The year when the given text was published."
+        },
+        "population_race" : {
+            "type" : "string",
+            "description" : "Population / race used by the author in the given text."
+        },
+        "sample_size" : {
+            "type" : "string",
+            "description" : "Sample size of the population used in the research that is mentioned in the paper."
+        },
+        "study_methodology" : {
+            "type" : "string",
+            "description" : "Study methodology mentioned in the text."
+        },
+        "study_level" : {
+            "type" : "string",
+            "description" : "Study level mentioned in the text."
+        }
+    },
+    # Only the title is mandatory; every other field may be absent.
+    "required" : ["title"]
+}

summ.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import os
+from langchain.chains.llm import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import PromptTemplate
+from langchain.document_loaders import PDFPlumberLoader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
+from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+# SECURITY: an API key must never be hard-coded in a committed file. Any key
+# that was committed here has to be treated as leaked and revoked. The key is
+# now expected in the OPENAI_API_KEY environment variable (for example a
+# deployment secret), and we fail early with a clear message when it is absent.
+if not os.environ.get('OPENAI_API_KEY'):
+    raise RuntimeError("OPENAI_API_KEY is not set; provide it through the environment before importing summ.")
+# Chat model shared by get_summ below.
+llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")
+def get_summ(path):
+    """Summarise the main themes of the PDF stored at `path`.
+
+    The PDF is loaded with PDFPlumberLoader, split with a tiktoken-based
+    CharacterTextSplitter (chunk_size=12000, no overlap), each piece is sent
+    through the "map" prompt, and the per-piece answers are merged by the
+    "reduce" prompt.
+
+    Args:
+        path: filesystem path of the PDF to summarise.
+
+    Returns:
+        Whatever MapReduceDocumentsChain.run yields for the split documents
+        (presumably the consolidated summary text -- confirm against the
+        installed langchain version).
+    """
+    pages = PDFPlumberLoader(path).load()
+    # Map step: ask the model for the main themes of each piece.
+    map_template = """The following is a set of documents
+    {docs}
+    Based on this list of docs, please identify the main themes
+    Helpful Answer:"""
+    theme_chain = LLMChain(llm=llm, prompt=PromptTemplate.from_template(map_template))
+    # Reduce step: merge the per-piece answers into one summary.
+    reduce_template = """The following is set of summaries:
+    {doc_summaries}
+    Take these and distill it into a final, consolidated summary of the main themes.
+    Helpful Answer:"""
+    merge_chain = LLMChain(llm=llm, prompt=PromptTemplate.from_template(reduce_template))
+    # Stuffs a list of documents into the `doc_summaries` slot of the reduce prompt.
+    stuff_chain = StuffDocumentsChain(
+        llm_chain=merge_chain,
+        document_variable_name="doc_summaries",
+    )
+    # The same stuff chain serves as the final combine step and as the
+    # collapse step; documents are grouped with a 12000-token ceiling.
+    reducer = ReduceDocumentsChain(
+        combine_documents_chain=stuff_chain,
+        collapse_documents_chain=stuff_chain,
+        token_max=12000,
+    )
+    # Map every piece through `theme_chain` (input slot `docs`), then reduce.
+    # Intermediate map outputs are not returned.
+    pipeline = MapReduceDocumentsChain(
+        llm_chain=theme_chain,
+        reduce_documents_chain=reducer,
+        document_variable_name="docs",
+        return_intermediate_steps=False,
+    )
+    splitter = CharacterTextSplitter.from_tiktoken_encoder(
+        chunk_size=12000, chunk_overlap=0
+    )
+    return pipeline.run(splitter.split_documents(pages))

utils.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import os
+import shutil
+import textwrap
+import nltk
+import re
+from Bio import Entrez
+def replace_quotes(text):
+    """Rewrite every double-quoted span in `text` as a single-quoted span.
+
+    A pair of double quotes with quote-free text between them, such as "abc",
+    becomes 'abc'. A double quote without a partner is left untouched.
+
+    Args:
+        text: the string to process.
+
+    Returns:
+        The string with paired double quotes replaced by single quotes.
+    """
+    # The earlier lookaround pattern matched only the quote-free text *between*
+    # two quotes, so its replacement callback never saw a '"' and the function
+    # returned its input unchanged. Matching the quotes themselves fixes that.
+    return re.sub(r'"([^"]*)"', r"'\1'", text)
+def clean_text(text):
+    """Drop short lines and figure captions, then strip punctuation.
+
+    A line survives only if it has more than three space-separated tokens and
+    does not begin with "(a)" or "Figure". From the surviving lines (re-joined
+    with newlines) every character that is neither a word character nor
+    whitespace is removed.
+    """
+    kept_lines = []
+    for line in text.split("\n"):
+        if len(line.split(" ")) <= 3:
+            continue
+        if line.startswith("(a)") or line.startswith("Figure"):
+            continue
+        kept_lines.append(line)
+    return re.sub(r'[^\w\s]', '', "\n".join(kept_lines))
+def truncate_text(text, max_tokens):
+    """Return the first wrapped line of `text`, or "" when wrapping yields nothing.
+
+    `max_tokens` is handed to textwrap as the line width, so the limit is
+    applied by textwrap's width rule rather than by counting tokens.
+    """
+    wrapped_lines = textwrap.TextWrapper(width=max_tokens).wrap(text)
+    return wrapped_lines[0] if wrapped_lines else ""
+def split_text(text, chunk_size):
+    """Split `text` into consecutive slices of at most `chunk_size` items.
+
+    Args:
+        text: the string (or other sliceable sequence) to split.
+        chunk_size: maximum length of each slice; must be positive.
+
+    Returns:
+        A list of slices in order; the last one may be shorter. An empty
+        `text` gives an empty list.
+
+    Raises:
+        ValueError: if `chunk_size` is zero or negative. The previous loop
+            never advanced in that case and appended chunks forever.
+    """
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
+def extract_gene_name(text):
+    """Return the content of the first <NAME>...</NAME> element, or None.
+
+    `text` is a bytes object decoded as UTF-8. Before searching, the literal
+    two-character sequences backslash-n and backslash-t are removed and a
+    backslash-apostrophe is reduced to a plain apostrophe.
+    """
+    decoded = text.decode("utf-8")
+    for escaped, plain in (("\\n", ""), ("\\t", ""), ("\\'", "'")):
+        decoded = decoded.replace(escaped, plain)
+    found = re.search(r"<NAME>(.*?)</NAME>", decoded)
+    if found is None:
+        return None
+    return found.group(1)
+def get_geneName(rsid):
+    """Fetch the `snp` database record for `rsid` and return its gene name.
+
+    Args:
+        rsid: identifier passed as `id` to Entrez.efetch (db="snp").
+
+    Returns:
+        The text of the first <NAME> element in the XML response, or None
+        when the response has no such element (see extract_gene_name).
+    """
+    handle = Entrez.efetch(db="snp", id=rsid, retmode='xml')
+    try:
+        payload = handle.read()
+    finally:
+        # Release the response handle even if reading fails; it was
+        # previously left open after every lookup.
+        handle.close()
+    return extract_gene_name(payload)
+def split_text_into_sentences(text, num_sentences):
+    """Tokenize `text` into sentences and group them `num_sentences` at a time.
+
+    Returns a list of lists; each inner list holds up to `num_sentences`
+    consecutive sentences produced by nltk.sent_tokenize.
+    """
+    all_sentences = nltk.sent_tokenize(text)
+    groups = []
+    for offset in range(0, len(all_sentences), num_sentences):
+        groups.append(all_sentences[offset:offset + num_sentences])
+    return groups
+def flatten_list(nested_list):
+    """Return a flat list of the items in an arbitrarily nested list.
+
+    Only `list` instances are descended into; any other item (tuples
+    included) is kept as a single element.
+    """
+    flat = []
+    for element in nested_list:
+        if not isinstance(element, list):
+            flat.append(element)
+            continue
+        flat += flatten_list(element)
+    return flat
+def move_file(source_path, destination_path):
+    """Move `source_path` into `destination_path`, creating the latter if absent.
+
+    This is best-effort: any exception raised by the move is printed rather
+    than propagated, and a confirmation line is printed on success.
+    """
+    destination_missing = not os.path.exists(destination_path)
+    if destination_missing:
+        os.makedirs(destination_path)
+    try:
+        shutil.move(source_path, destination_path)
+    except Exception as move_error:
+        print(f"Error: {move_error}")
+    else:
+        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
+def upper_abbreviation(text):
+    """Remove "unknown" and upper-case runs of capitalised words.
+
+    First every case-insensitive occurrence of "unknown" is deleted. Then each
+    run of words that start with an ASCII capital letter (optionally dotted,
+    optionally separated by whitespace) is replaced by the same run with its
+    dots removed and all letters upper-cased.
+    """
+    without_unknown = re.compile(r'unknown', re.IGNORECASE).sub('', text)
+    return re.sub(
+        r'\b(?:[A-Z][a-z.]*\.?\s*)+\b',
+        lambda hit: hit.group(0).replace('.', '').upper(),
+        without_unknown,
+    )
+def get_valid_year(input_text):
+    """Return the first stand-alone four-digit number in `input_text`.
+
+    Args:
+        input_text: free text that may mention one or more years.
+
+    Returns:
+        The first four-digit token as a string, or '' when there is none.
+    """
+    # The earlier pattern accepted any four-character word, so input such as
+    # "June 2020" produced "June"; only digit runs can be a year.
+    found = re.search(r'\b\d{4}\b', input_text)
+    return found.group(0) if found else ''
+def sample_size_postproc(text):
+    """Strip mixed digit/letter tokens from a sample-size string.
+
+    Whitespace-separated words in which a digit is followed somewhere by an
+    ASCII letter are dropped, the rest are re-joined with single spaces, and
+    then any letters-followed-by-digits token is deleted from that string (the
+    surrounding spaces are left in place).
+    """
+    kept = []
+    for token in text.split():
+        if re.match(r'.*\d.*[A-Za-z].*$', token) is None:
+            kept.append(token)
+    return re.sub(r'\b[A-Za-z]+\d+\b', '', ' '.join(kept))