File size: 3,441 Bytes
ef4e0b3
 
 
 
 
 
 
 
6a1e667
ef4e0b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
630c3ba
 
 
ef4e0b3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1n1zTe_HIqsQ1JvPcV2S3i8-kjq5V4xJo
"""
#https://huggingface.co/spaces/user2434/SummarizedAbstract
# Import necessary libraries
from functools import lru_cache
from io import BytesIO

import gradio as gr
import PyPDF2
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Function to extract abstract from PDF
def extract_abstract(pdf_path):
    """Extract the text of the abstract section from a PDF file.

    Pages are scanned in order. The abstract begins on the first page whose
    text contains "Abstract" and runs up to, but not including, the first
    later page whose text contains one of the section markers
    ("Introduction", "Background", "1.", "I."). If no later page contains a
    marker (for example a single-page PDF), only the page where "Abstract"
    was found is returned instead of discarding the abstract.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of the abstract page(s), or None when no page
        contains "Abstract".
    """
    end_markers = ("Introduction", "Background", "1.", "I.")
    collected = []  # text of the abstract page and any pages that follow it

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        # Single pass: every page is text-extracted at most once.
        for page in reader.pages:
            # Treat a missing text result as empty so the substring checks
            # below cannot fail on None.
            page_text = page.extract_text() or ""

            if not collected:
                # Still looking for the page on which the abstract starts.
                if "Abstract" in page_text:
                    collected.append(page_text)
                continue

            # The abstract has started: stop at the first page that opens
            # the next section.
            if any(marker in page_text for marker in end_markers):
                return ''.join(collected)
            collected.append(page_text)

    # No end marker on any later page: keep just the abstract's own page
    # rather than returning nothing (or the entire remaining document).
    return collected[0] if collected else None

# Function to summarize abstract using a pre-trained model
@lru_cache(maxsize=1)
def _load_summarizer():
    """Load the summarization tokenizer and model once and reuse them."""
    model_name = "pszemraj/led-base-book-summary"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model


def summarize_abstract(text):
    """Summarize ``text`` into a single sentence with a pre-trained model.

    The tokenizer and model are loaded on the first call and cached, so later
    calls do not pay the loading cost again.

    Args:
        text: The text to summarize. Input is truncated to 1000 tokens.

    Returns:
        The generated summary cut after its first '.', or the whole generated
        summary when it contains no '.'. Returns "" when ``text`` is None,
        empty, or whitespace only. Generation uses ``do_sample=True``, so the
        wording may differ between calls.
    """
    # Nothing to summarize: skip the (expensive) model entirely.
    if not text or not text.strip():
        return ""

    tokenizer, model = _load_summarizer()
    inputs = tokenizer(text, max_length=1000, return_tensors="pt", truncation=True)
    summary_ids = model.generate(
        inputs['input_ids'],
        max_length=40,
        min_length=20,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=2.0,
        num_beams=3,
        do_sample=True,
        early_stopping=False
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Keep only the text up to and including the first period.
    first_sentence, period, _ = summary.partition('.')
    if period:
        summary = first_sentence + '.'

    return summary

# Function to convert text to speech
def convert_to_speech(text):
    """Synthesize English speech for ``text`` with gTTS.

    The audio is written to an in-memory buffer rather than a file on disk.

    Args:
        text: The text to be spoken.

    Returns:
        The complete synthesized audio as a bytes object.
    """
    audio_stream = BytesIO()
    gTTS(text, lang='en').write_to_fp(audio_stream)
    # getvalue() yields the whole buffer regardless of the stream position.
    return audio_stream.getvalue()

# Function to process PDF and generate summary
def process_pdf(pdf_path):
    """Summarize the abstract of a PDF and render the summary as speech.

    This is the callback wired to the Gradio interface, which declares two
    outputs (text and audio). A pair is therefore returned on every path,
    including failures, so the interface always receives both values.

    Args:
        pdf_path: Path of the uploaded PDF. May be None or empty when no
            file was provided.

    Returns:
        A ``(summary, audio)`` tuple. On success ``summary`` is the written
        summary and ``audio`` is the result of ``convert_to_speech``. When
        there is no file, no abstract, or no summary, ``summary`` holds a
        short explanatory message and ``audio`` is None.
    """
    if not pdf_path:
        return "Please upload a PDF file.", None

    abstract_text = extract_abstract(pdf_path)
    if not abstract_text:
        return "No abstract could be found in the uploaded PDF.", None

    # Cap the amount of text handed to the summarizer.
    summary = summarize_abstract(abstract_text[:1024])
    if not summary:
        return "The abstract could not be summarized.", None

    return summary, convert_to_speech(summary)

# Define Gradio interface
# Gradio components: one PDF upload as input, written and spoken summary as output
inputs = gr.File(label="Upload a PDF with an abstract")
summary_text = gr.Text(label="Written summary of the abstract")
audio_summary = gr.Audio(label="Audio summary of abstract")

# Build the interface around process_pdf, offering one example PDF
iface = gr.Interface(
    process_pdf,
    inputs,
    [summary_text, audio_summary],
    examples=["Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"],
    title="Summarized Abstract",
    description="The app will summarize the abstract of a PDF and read it to the user.",
)

# Start serving the app
iface.launch()