darylalim committed on
Commit
e0285bb
1 Parent(s): 48d3730

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +56 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
+ from optimum.bettertransformer import BetterTransformer
4
+ import gradio as gr
5
+ import pypdf
6
+
7
# Hugging Face Hub checkpoint shared by the tokenizer and the model.
_CHECKPOINT = "facebook/bart-large-cnn"

# Tokenizer for the checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT, use_fast=True)

# Sequence-to-sequence model, loaded with bfloat16 as the requested dtype.
model_hf = AutoModelForSeq2SeqLM.from_pretrained(
    _CHECKPOINT,
    torch_dtype=torch.bfloat16,
)

# BetterTransformer-converted model; this is the object used for generation.
# NOTE(review): keep_original_model=True presumably leaves `model_hf` intact
# alongside the converted copy -- confirm against the Optimum documentation.
model = BetterTransformer.transform(model_hf, keep_original_model=True)
18
+
19
def extract_abstract(pdf_path):
    """Return the abstract section found on the first page of a PDF.

    The abstract is taken to be the text from the first occurrence of
    ``'Abstract'`` up to (not including) the next occurrence of
    ``'Introduction'`` in the text extracted from the first page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text. If ``'Abstract'`` is not found, the slice starts
        at the beginning of the page text; if ``'Introduction'`` is not found,
        it runs to the end of the page text. An empty string is returned when
        the PDF has no pages or the first page yields no text.
    """
    with open(pdf_path, 'rb') as f:
        reader = pypdf.PdfReader(f)
        if len(reader.pages) == 0:
            return ''
        text = reader.pages[0].extract_text() or ''

    # str.find returns -1 on a miss. Used directly as a slice bound, -1 would
    # silently select the wrong span (e.g. drop the final character), so each
    # miss is mapped to the corresponding edge of the page text instead.
    abstract_start = text.find('Abstract')
    if abstract_start == -1:
        abstract_start = 0

    abstract_end = text.find('Introduction', abstract_start)
    if abstract_end == -1:
        abstract_end = len(text)

    return text[abstract_start:abstract_end]
29
+
30
def summarize_abstract(pdf_path):
    """Summarize the abstract of an uploaded PDF.

    Args:
        pdf_path: The uploaded file as delivered by the Gradio ``File`` input.
            Accepts a filesystem path, or an object exposing the path through
            a ``name`` attribute (e.g. a temporary-file wrapper). ``None``
            means nothing was uploaded.

    Returns:
        The decoded summary text generated for the extracted abstract.

    Raises:
        gr.Error: If no file was provided, or if no abstract text could be
            extracted from the PDF.
    """
    if pdf_path is None:
        raise gr.Error("Please upload a PDF file.")

    # Accept both plain paths and file-like objects that carry their path in
    # `.name`. Path-like objects are left alone, because their `.name` is only
    # the final path component.
    if not isinstance(pdf_path, str) and not hasattr(pdf_path, "__fspath__"):
        pdf_path = getattr(pdf_path, "name", pdf_path)

    abstract_text = extract_abstract(pdf_path)
    if not abstract_text.strip():
        # Generating from empty input would return text unrelated to the PDF.
        raise gr.Error("No abstract text could be extracted from the PDF.")

    # Truncation is requested explicitly: passing max_length alone makes the
    # tokenizer fall back to an implicit strategy and log a warning.
    # NOTE(review): max_length here caps the *input* at 130 tokens, not the
    # length of the generated summary -- confirm this is the intent.
    inputs = tokenizer(
        abstract_text,
        max_length=130,
        truncation=True,
        return_tensors="pt",
    )

    summary_ids = model.generate(**inputs)
    summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    return summaries[0]
43
+
44
# Markdown description passed to the interface.
_DESCRIPTION = """
# BART Large CNN Abstract Summarization
[Code](https://github.com/darylalim/bart-large-cnn-abstract-summarization)
"""

# Input and output components, built up front and wired in below.
_pdf_input = gr.File(label="PDF path")
_summary_output = gr.Textbox(label="Abstract summary")

# Interface that routes the uploaded file through summarize_abstract and
# shows the returned text.
demo = gr.Interface(
    fn=summarize_abstract,
    inputs=[_pdf_input],
    outputs=[_summary_output],
    description=_DESCRIPTION,
)

# Enable the request queue, then launch the app.
demo.queue()
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ tokenizers
4
+ optimum
5
+ gradio
6
+ pypdf