# Article_Summarizer / Scrapper_Summarizer.py
# Author: Muhammad Murtaza Naqi
import csv

import requests
from bs4 import BeautifulSoup
from transformers import pipeline
# from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration
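# Every request below uses verify=False to skip SSL certificate verification,
# so requests/urllib3 emit an InsecureRequestWarning per call. If that noise
# is unwanted (note this hides a genuine security signal), it can be silenced
# with:
#   import urllib3
#   urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)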
# Text summarization model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
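# distilbart-cnn-12-6 accepts roughly 1024 input tokens, which is why the
# scrapers below pass only the first ~1020 characters of each article. A
# character slice is a crude proxy for a token limit; a sketch of an
# alternative is to let the pipeline's tokenizer truncate instead:
#   summary = summarizer(full_text, truncation=True)[0]['summary_text']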
# Function to scrape articles from Dawn
def scrape_dawn():
url = 'https://www.dawn.com/business'
response = requests.get(url, verify=False)
soup = BeautifulSoup(response.text, 'html.parser')
articles = []
count = 0 # Counter to track the number of articles scraped
for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
break
title_tag = item.find('h2', class_='story__title')
if title_tag:
title = title_tag.get_text(strip=True)
link = title_tag.find('a')['href']
full_text = get_full_article_dawn(link)
            # Summarize the article (first ~1020 characters; see the model note above)
            summary_obj = summarizer(full_text[:1020])
# Convert the summary object to a string
summary = summary_obj[0]['summary_text'] if summary_obj else ""
articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
count += 1 # Increment the counter
return articles
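# Each scraper returns a list of dicts with the same four keys, e.g.:
#   {'title': '...', 'link': 'https://...', 'content': '...', 'summary': '...'}
# which is what save_to_csv() below uses as the CSV column headers.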
# Function to get the full text of an article from Dawn
def get_full_article_dawn(url):
    response = requests.get(url, verify=False)
soup = BeautifulSoup(response.text, 'html.parser')
content_div = soup.find('div', class_='story__content')
if content_div:
paragraphs = content_div.find_all('p')
full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
return full_text
return "Content not found."
# Function to scrape articles from Business Recorder
def scrape_brecorder():
url = 'https://www.brecorder.com/business-finance'
response = requests.get(url, verify=False)
soup = BeautifulSoup(response.text, 'html.parser')
articles = []
count = 0 # Counter to track the number of articles scraped
for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
break
title_tag = item.find('h2', class_='story__title')
if title_tag:
title = title_tag.get_text(strip=True)
link = title_tag.find('a')['href']
full_text = get_full_article_brecorder(link)
            # Summarize the article (first ~1020 characters; see the model note above)
            summary_obj = summarizer(full_text[:1020])
# Convert the summary object to a string
summary = summary_obj[0]['summary_text'] if summary_obj else ""
articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
count += 1 # Increment the counter
return articles
# Function to get the full text of an article from Business Recorder
def get_full_article_brecorder(url):
    response = requests.get(url, verify=False)
soup = BeautifulSoup(response.text, 'html.parser')
content_div = soup.find('div', class_='story__content')
if content_div:
paragraphs = content_div.find_all(['p', 'li'])
full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
return full_text
return "Content not found."
# Function to scrape articles from The News
def scrape_tnews():
url = 'https://www.thenews.com.pk/latest/category/business'
response = requests.get(url, verify=False)
soup = BeautifulSoup(response.text, 'html.parser')
articles = []
count = 0 # Counter to track the number of articles scraped
for item in soup.find_all('div', class_='most-popular-box'):
        if count >= 5:  # Stop after 5 articles
break
# Extract the title from the <h2> tag
title_tag = item.find('h2')
if title_tag:
title = title_tag.get_text(strip=True)
            # Extract the link from the first <a> tag inside the item
            link = item.find('a')['href']
            # Fetch the full article text
            full_text = get_full_article_tnews(link)
            # Summarize the article (first ~1020 characters; see the model note above)
            summary_obj = summarizer(full_text[:1020])
summary = summary_obj[0]['summary_text'] if summary_obj else ""
# Append the article details
articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
count += 1 # Increment the counter
return articles
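# Defensive note: the hrefs above are used as-is, which assumes the sites
# return absolute URLs. If a site ever emits relative links, urljoin can
# normalize them before fetching (a hypothetical hardening sketch):
#   from urllib.parse import urljoin
#   link = urljoin(url, item.find('a')['href'])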
def get_full_article_tnews(url):
    response = requests.get(url, verify=False)
soup = BeautifulSoup(response.text, 'html.parser')
content_div = soup.find('div', class_='detail-content')
if content_div:
paragraphs = content_div.find_all(['p', 'li'])
full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
return full_text
return "Content not found."
# Function to save articles to a CSV file
def save_to_csv(filename, articles):
if not articles:
print(f"No articles found to save in {filename}.")
return
keys = articles[0].keys()
with open(filename, 'w', newline='', encoding='utf-8') as output_file:
dict_writer = csv.DictWriter(output_file, fieldnames=keys)
dict_writer.writeheader()
dict_writer.writerows(articles)
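# Note: DictWriter takes its column order from the first article's keys, so
# every dict passed in must share the same keys (the scrapers above guarantee
# this).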
# Main function: scrape articles from all three sources and save each to CSV
def main():
    dawn_articles = scrape_dawn()
    save_to_csv('dawn_articles_full.csv', dawn_articles)
    print("Dawn articles saved to CSV file successfully.")

    brecorder_articles = scrape_brecorder()
    save_to_csv('brecorder_articles_full.csv', brecorder_articles)
    print("Business Recorder articles saved to CSV file successfully.")

    tnews_articles = scrape_tnews()
    save_to_csv('tnews_articles_full.csv', tnews_articles)
    print("The News articles saved to CSV file successfully.")


if __name__ == '__main__':
    main()