Spaces:

piecurus
/

Summarizer

Runtime error

App Files Files Community

Summarizer / app.py

piecurus

Update app.py

df99d98 almost 3 years ago

raw

history blame contribute delete

3.74 kB

	import nltk
	import validators
	import streamlit as st
	from transformers import AutoTokenizer, pipeline

	# local modules
	from extractive_summarizer.model_processors import Summarizer
	from utils import (
	clean_text,
	fetch_article_text,
	preprocess_text_for_abstractive_summarization,
	read_text_from_file,
	)

	if __name__ == "__main__":
	    # ---------------------------------
	    # Main Application
	    # ---------------------------------
	    # Streamlit script: collects input (typed text, a URL, or an uploaded
	    # file) and, on request, shows an extractive or abstractive summary.
	    st.title("Text Summarizer")

	    summarize_type = st.sidebar.selectbox(
	        "Summarization type", options=["Extractive", "Abstractive"]
	    )

	    st.markdown(
	        "Enter a text or a url to get a concise summary of the article while conserving the overall meaning. This app supports text in the following formats:"
	    )
	    st.markdown(
	        """- Raw text in text box
	- URL of article/news to be summarized
	- .txt, .pdf, .docx file formats"""
	    )
	    st.markdown("---")
	    # ---------------------------
	    # SETUP & Constants
	    nltk.download("punkt")
	    abs_tokenizer_name = "facebook/bart-large-cnn"
	    abs_model_name = "facebook/bart-large-cnn"
	    # Loaded once here and shared by the chunking helper and the pipeline.
	    abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
	    abs_max_length = 90
	    abs_min_length = 30
	    # ---------------------------

	    inp_text = st.text_input("Enter text or a url here")
	    st.markdown(
	        "<h3 style='text-align: center; color: green;'>OR</h3>",
	        unsafe_allow_html=True,
	    )
	    uploaded_file = st.file_uploader(
	        "Upload a .txt, .pdf, .docx file for summarization"
	    )

	    # validators.url() yields True for a valid URL and a *falsy failure
	    # object* (never the literal False) otherwise; normalise to a real bool
	    # so later checks cannot be fooled by an identity comparison.
	    is_url = bool(validators.url(inp_text))

	    # Input precedence: URL in the text box, then uploaded file, then the
	    # raw contents of the text box.
	    if is_url:
	        # complete text, chunks to summarize (list of sentences for long docs)
	        _full_text, clean_txt = fetch_article_text(url=inp_text)
	    elif uploaded_file:
	        clean_txt = read_text_from_file(uploaded_file)
	        clean_txt = clean_text(clean_txt)
	    else:
	        clean_txt = clean_text(inp_text)

	    # view input text (expander)
	    with st.expander("View input text"):
	        if is_url:
	            # For URLs clean_txt is a sequence of chunks; only the first
	            # one is previewed.
	            st.write(clean_txt[0])
	        else:
	            st.write(clean_txt)
	    summarize = st.button("Summarize")

	    # called on toggle button [summarize]
	    if summarize and not clean_txt:
	        # Nothing was typed, fetched, or uploaded: skip the models entirely.
	        st.warning("Nothing to summarize: enter text or a URL, or upload a file.")
	    elif summarize:
	        if summarize_type == "Extractive":
	            # The extractive model is called with a single string, so URL
	            # chunks are joined back together first.
	            if is_url:
	                text_to_summarize = " ".join(clean_txt)
	            else:
	                text_to_summarize = clean_txt

	            # extractive summarizer
	            with st.spinner(
	                text="Creating extractive summary. This might take a few seconds ..."
	            ):
	                ext_model = Summarizer()
	                summarized_text = ext_model(text_to_summarize, num_sentences=5)

	        elif summarize_type == "Abstractive":
	            with st.spinner(
	                text="Creating abstractive summary. This might take a few seconds ..."
	            ):
	                text_to_summarize = clean_txt
	                # Reuse the tokenizer instance loaded above instead of
	                # loading it a second time by name.
	                abs_summarizer = pipeline(
	                    "summarization", model=abs_model_name, tokenizer=abs_tokenizer
	                )

	                if not is_url:
	                    # Text from the box or a file is still one string here;
	                    # split it into a list of chunks for the model. (URL
	                    # input already arrives as chunks.)
	                    text_to_summarize = preprocess_text_for_abstractive_summarization(
	                        tokenizer=abs_tokenizer, text=clean_txt
	                    )

	                tmp_sum = abs_summarizer(
	                    text_to_summarize,
	                    max_length=abs_max_length,
	                    min_length=abs_min_length,
	                    do_sample=False,
	                )

	                # One summary dict per chunk; stitch them into one text.
	                summarized_text = " ".join(summ["summary_text"] for summ in tmp_sum)

	        # final summarized output
	        st.subheader("Summarized text")
	        st.info(summarized_text)