AutoGPT

Sleeping

App Files Files Community

AutoGPT / autogpt /commands /web_playwright.py

zee2221

Duplicate from aliabid94/AutoGPT

2021719 over 1 year ago

raw

history blame contribute delete

2.1 kB

	"""Web scraping commands using Playwright"""
	from __future__ import annotations

	try:
	from playwright.sync_api import sync_playwright
	except ImportError:
	print(
	"Playwright not installed. Please install it with 'pip install playwright' to use."
	)
	from bs4 import BeautifulSoup

	from autogpt.processing.html import extract_hyperlinks, format_hyperlinks


	def scrape_text(url: str) -> str:
	    """Scrape the visible text of a webpage using headless Chromium.

	    The page is loaded with Playwright, its HTML is parsed with
	    BeautifulSoup, ``<script>`` and ``<style>`` elements are removed, and the
	    remaining text is normalised to one non-empty phrase per line.

	    Args:
	        url (str): The URL to scrape text from

	    Returns:
	        str: The scraped text, or a string of the form ``"Error: <message>"``
	            if creating the page, loading it, or parsing it raised an
	            exception.
	    """
	    with sync_playwright() as p:
	        browser = p.chromium.launch()

	        try:
	            # Create the page inside the try block so the finally clause
	            # always closes the browser, even when page creation fails.
	            page = browser.new_page()
	            page.goto(url)
	            soup = BeautifulSoup(page.content(), "html.parser")

	            # Script and style elements carry no human-readable text.
	            for tag in soup(["script", "style"]):
	                tag.extract()

	            lines = (line.strip() for line in soup.get_text().splitlines())
	            # Split on a run of TWO spaces (not one): this keeps multi-word
	            # phrases intact while still breaking apart fragments that the
	            # page layout separated by wider gaps.
	            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
	            # Drop empty chunks so the result has no blank lines.
	            text = "\n".join(chunk for chunk in chunks if chunk)

	        except Exception as e:
	            # Errors are reported to the caller as text rather than raised.
	            text = f"Error: {str(e)}"

	        finally:
	            browser.close()

	    return text


	def scrape_links(url: str) -> str | list[str]:
	    """Scrape the hyperlinks of a webpage using headless Chromium.

	    The page is loaded with Playwright, its HTML is parsed with
	    BeautifulSoup, ``<script>`` and ``<style>`` elements are removed, and the
	    remaining hyperlinks are collected with ``extract_hyperlinks`` and
	    rendered with ``format_hyperlinks``.

	    Args:
	        url (str): The URL to scrape links from

	    Returns:
	        The formatted links produced by ``format_hyperlinks``, or a string
	        of the form ``"Error: <message>"`` if creating the page, loading it,
	        or parsing it raised an exception.
	    """
	    with sync_playwright() as p:
	        browser = p.chromium.launch()

	        try:
	            # Create the page inside the try block so the finally clause
	            # always closes the browser, even when page creation fails.
	            page = browser.new_page()
	            page.goto(url)
	            soup = BeautifulSoup(page.content(), "html.parser")

	            # Remove script and style elements before collecting links.
	            for tag in soup(["script", "style"]):
	                tag.extract()

	            # The page URL is passed along with the parsed document.
	            hyperlinks = extract_hyperlinks(soup, url)
	            formatted_links = format_hyperlinks(hyperlinks)

	        except Exception as e:
	            # Errors are reported to the caller as text rather than raised.
	            formatted_links = f"Error: {str(e)}"

	        finally:
	            browser.close()

	    return formatted_links