Spaces:

MurtazaNaqi
/

Article_Summarizer

Running

File size: 7,351 Bytes

712d86b

import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
from transformers import pipeline
# from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

# Text summarization model: a Hugging Face `pipeline` for the "summarization"
# task, loaded from the "sshleifer/distilbart-cnn-12-6" checkpoint. It is
# created once when this module is imported and called by the scrape_*
# functions in this file.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")



def scrape_dawn(limit=5):
    """Scrape the Dawn business listing page and summarize its top stories.

    Args:
        limit: Maximum number of articles to collect (defaults to 5).

    Returns:
        A list of dicts, one per article, with the keys ``'title'``,
        ``'link'``, ``'content'`` (text returned by
        ``get_full_article_dawn``) and ``'summary'`` (summarizer output,
        or ``""`` when the summarizer returns nothing).

    Raises:
        requests.exceptions.RequestException: If the listing page cannot be
            fetched, including when the request times out.
    """
    url = 'https://www.dawn.com/business'
    # NOTE(review): verify=False turns off TLS certificate verification.
    # Kept to preserve existing behaviour; confirm it is really required.
    # The timeout prevents a stalled server from hanging the app forever.
    response = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []

    for item in soup.find_all('article', class_='story'):
        if len(articles) >= limit:
            break

        title_tag = item.find('h2', class_='story__title')
        if not title_tag:
            continue

        # A headline without a usable anchor cannot be followed; skip it
        # instead of crashing on `None['href']` or a missing attribute.
        anchor = title_tag.find('a')
        link = anchor.get('href') if anchor else None
        if not link:
            continue

        title = title_tag.get_text(strip=True)
        full_text = get_full_article_dawn(link)

        # Only the first 1020 characters of the article are summarized.
        # NOTE(review): presumably chosen to keep the input within the
        # model's input limit - confirm.
        summary_obj = summarizer(full_text[:1020])
        summary = summary_obj[0]['summary_text'] if summary_obj else ""

        articles.append({
            'title': title,
            'link': link,
            'content': full_text,
            'summary': summary,
        })

    return articles

def get_full_article_dawn(url, timeout=30):
    """Fetch a Dawn article page and return its body text.

    Args:
        url: URL of the article page to download.
        timeout: Seconds to wait on the server before the request is
            abandoned, so a stalled connection cannot hang the caller.

    Returns:
        The text of every ``<p>`` inside the first
        ``<div class="story__content">``, joined with single spaces, or the
        placeholder ``"Content not found."`` when that container is absent.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the request times out.
    """
    # NOTE(review): verify=False turns off TLS certificate verification.
    # Kept to preserve existing behaviour; confirm it is really required.
    response = requests.get(url, verify=False, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if not content_div:
        return "Content not found."

    paragraphs = content_div.find_all('p')
    return ' '.join(para.get_text(strip=True) for para in paragraphs)


def scrape_brecorder(limit=5):
    """Scrape the Business Recorder business-finance page and summarize stories.

    Args:
        limit: Maximum number of articles to collect (defaults to 5).

    Returns:
        A list of dicts, one per article, with the keys ``'title'``,
        ``'link'``, ``'content'`` (text returned by
        ``get_full_article_brecorder``) and ``'summary'`` (summarizer
        output, or ``""`` when the summarizer returns nothing).

    Raises:
        requests.exceptions.RequestException: If the listing page cannot be
            fetched, including when the request times out.
    """
    url = 'https://www.brecorder.com/business-finance'
    # NOTE(review): verify=False turns off TLS certificate verification.
    # Kept to preserve existing behaviour; confirm it is really required.
    # The timeout prevents a stalled server from hanging the app forever.
    response = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []

    for item in soup.find_all('article', class_='story'):
        if len(articles) >= limit:
            break

        title_tag = item.find('h2', class_='story__title')
        if not title_tag:
            continue

        # A headline without a usable anchor cannot be followed; skip it
        # instead of crashing on `None['href']` or a missing attribute.
        anchor = title_tag.find('a')
        link = anchor.get('href') if anchor else None
        if not link:
            continue

        title = title_tag.get_text(strip=True)
        full_text = get_full_article_brecorder(link)

        # Only the first 1020 characters of the article are summarized.
        # NOTE(review): presumably chosen to keep the input within the
        # model's input limit - confirm.
        summary_obj = summarizer(full_text[:1020])
        summary = summary_obj[0]['summary_text'] if summary_obj else ""

        articles.append({
            'title': title,
            'link': link,
            'content': full_text,
            'summary': summary,
        })

    return articles

def get_full_article_brecorder(url, timeout=30):
    """Fetch a Business Recorder article page and return its body text.

    Args:
        url: URL of the article page to download.
        timeout: Seconds to wait on the server before the request is
            abandoned, so a stalled connection cannot hang the caller.

    Returns:
        The text of every ``<p>`` and ``<li>`` inside the first
        ``<div class="story__content">``, joined with single spaces, or the
        placeholder ``"Content not found."`` when that container is absent.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the request times out.
    """
    # NOTE(review): verify=False turns off TLS certificate verification.
    # Kept to preserve existing behaviour; confirm it is really required.
    response = requests.get(url, verify=False, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if not content_div:
        return "Content not found."

    # List items are collected along with paragraphs.
    fragments = content_div.find_all(['p', 'li'])
    return ' '.join(node.get_text(strip=True) for node in fragments)

#
# def scrape_tnews():
#     url = 'https://www.thenews.com.pk/latest/category/business'
#     response = requests.get(url, verify=False)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     articles = []
#
#     count = 0  # Counter to track the number of articles scraped
#
#     for item in soup.find_all('div', class_='most-popular-box'):
#         if count >= 2:  # Stop after 10 articles
#             break
#
#         title_tag = item.find('h2', class_='most-popular-list')
#         if title_tag:
#             title = title_tag.get_text(strip=True)
#             link = title_tag.find('a')['href']
#             full_text = get_full_article_tnews(link)
#             # Summarize the full article
#             summary_obj = summarizer(full_text[:1020])
#
#             # Convert the summary object to a string
#             summary = summary_obj[0]['summary_text'] if summary_obj else ""
#             articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
#
#             count += 1  # Increment the counter
#
#     return articles


def scrape_tnews(limit=5):
    """Scrape The News business listing page and summarize its stories.

    Args:
        limit: Maximum number of articles to collect (defaults to 5).

    Returns:
        A list of dicts, one per article, with the keys ``'title'``,
        ``'link'``, ``'content'`` (text returned by
        ``get_full_article_tnews``) and ``'summary'`` (summarizer output,
        or ``""`` when the summarizer returns nothing).

    Raises:
        requests.exceptions.RequestException: If the listing page cannot be
            fetched, including when the request times out.
    """
    url = 'https://www.thenews.com.pk/latest/category/business'
    # NOTE(review): verify=False turns off TLS certificate verification.
    # Kept to preserve existing behaviour; confirm it is really required.
    # The timeout prevents a stalled server from hanging the app forever.
    response = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []

    for item in soup.find_all('div', class_='most-popular-box'):
        if len(articles) >= limit:
            break

        # The headline text comes from the first <h2> in the box.
        title_tag = item.find('h2')
        if not title_tag:
            continue

        # The link is the first <a> anywhere in the box (not necessarily
        # inside the <h2>). Skip boxes without a usable href instead of
        # crashing on `None['href']` or a missing attribute.
        anchor = item.find('a')
        link = anchor.get('href') if anchor else None
        if not link:
            continue

        title = title_tag.get_text(strip=True)
        full_text = get_full_article_tnews(link)

        # Only the first 1020 characters of the article are summarized.
        # NOTE(review): presumably chosen to keep the input within the
        # model's input limit - confirm.
        summary_obj = summarizer(full_text[:1020])
        summary = summary_obj[0]['summary_text'] if summary_obj else ""

        articles.append({
            'title': title,
            'link': link,
            'content': full_text,
            'summary': summary,
        })

    return articles


def get_full_article_tnews(url, timeout=30):
    """Fetch an article page from The News and return its body text.

    Args:
        url: URL of the article page to download.
        timeout: Seconds to wait on the server before the request is
            abandoned, so a stalled connection cannot hang the caller.

    Returns:
        The text of every ``<p>`` and ``<li>`` inside the first
        ``<div class="detail-content">``, joined with single spaces, or the
        placeholder ``"Content not found."`` when that container is absent.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the request times out.
    """
    # NOTE(review): verify=False turns off TLS certificate verification.
    # Kept to preserve existing behaviour; confirm it is really required.
    response = requests.get(url, verify=False, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='detail-content')
    if not content_div:
        return "Content not found."

    # List items are collected along with paragraphs.
    fragments = content_div.find_all(['p', 'li'])
    return ' '.join(node.get_text(strip=True) for node in fragments)

def save_to_csv(filename, articles):
    """Write article dicts to *filename* as a UTF-8 CSV file.

    The header row is taken from the keys of the first article. When
    *articles* is empty, nothing is written and a notice is printed instead.
    """
    if articles:
        column_names = articles[0].keys()
        with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=column_names)
            writer.writeheader()
            for article in articles:
                writer.writerow(article)
        return

    print(f"No articles found to save in {filename}.")


# # Main function to scrape articles from both Dawn and Business Recorder, and save to CSV
# def main():
#     # Scraping articles from Dawn
#     dawn_articles = scrape_tnews()
#     save_to_csv('tnews_articles_full.csv', dawn_articles)
#     print("tnews articles saved to CSV file successfully.")
#
#     # Scraping articles from Business Recorder
#     # brecorder_articles = scrape_brecorder()
#     # save_to_csv('brecorder_articles_full.csv', brecorder_articles)
#     # print("Business Recorder articles saved to CSV file successfully.")
#
#
# if __name__ == '__main__':
#     main()

# url = 'https://www.thenews.com.pk/latest/category/business'
# response = requests.get(url, verify=False)
# soup = BeautifulSoup(response.text, 'html.parser')
# s = soup.find_all('div', class_='most-popular-box')
# print(s)