Spaces:
Running
Running
moritalous
committed on
Commit
•
b3702ea
1
Parent(s):
41bf566
Upload 2 files
Browse files- app.py +47 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import requests
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
from markdownify import MarkdownConverter
|
5 |
+
|
6 |
+
|
7 |
+
def md(soup, **options):
    """Convert an already-parsed BeautifulSoup node to Markdown text.

    Args:
        soup: The BeautifulSoup object (or tag) to convert.
        **options: Keyword options forwarded unchanged to
            ``MarkdownConverter``.

    Returns:
        Whatever ``MarkdownConverter.convert_soup`` returns for ``soup``.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)
|
9 |
+
|
10 |
+
|
11 |
+
def main_fn(url: str, check: list[str]):
    """Download the page at ``url`` and return its content as Markdown.

    ``<script>`` and ``<style>`` elements are removed before conversion.
    The ``<main>`` element is converted when the page has one; otherwise the
    ``<body>`` element is used, and if neither exists the whole parsed
    document is converted.

    Args:
        url: Address of the HTML page to fetch.
        check: Tag names selected in the "Ignore tags" checkbox group; passed
            to the converter as its ``strip`` option.

    Returns:
        The Markdown produced by ``md`` for the selected root element.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # A timeout keeps the UI from hanging forever on an unresponsive host.
    response = requests.get(url, timeout=30)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    # The original wrote ``t.clear`` without calling it, so scripts and
    # styles were never removed. ``decompose()`` drops each element entirely.
    for element in soup.find_all(["script", "style"]):
        element.decompose()

    # Prefer <main>, then <body>, then the whole document, so the converter
    # never receives None.
    root = soup.find("main") or soup.find("body") or soup

    # Apply the ignore-tag selection on every path, not only for <main>.
    return md(root, strip=check)
|
28 |
+
|
29 |
+
|
30 |
+
# Description shown under the title, in English and Japanese.
_description = """<div style="width: fit-content; margin: 0 auto;">Gets HTML given by URL and converts it to Markdown.Does not support dynamically generated HTML such as React.</div>
<div style="width: fit-content; margin: 0 auto;">URLで与えたHTMLを取得してMarkdownに変換します。Reactなどの動的に生成されるHTMLには対応していません</div>"""

# Input 1: the URL of the page to convert.
_url_input = gr.Text(label="URL", placeholder="https://*****")

# Input 2: tag names handed to main_fn as its ``check`` argument.
_ignore_tags_input = gr.CheckboxGroup(
    label="Ignore tags(無視するタグ)",
    choices=["a", "img", "noscript"],
    value=["a", "img"],
)

# Output: the generated Markdown, with a copy button.
_markdown_output = gr.TextArea(label="Markdown", show_copy_button=True)

# Wire main_fn into a simple two-input, one-output Gradio interface.
demo = gr.Interface(
    main_fn,
    title="URL to Markdown",
    description=_description,
    inputs=[_url_input, _ignore_tags_input],
    outputs=[_markdown_output],
    allow_flagging="never",
)

demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
beautifulsoup4
|
2 |
+
gradio
|
3 |
+
markdownify
|
4 |
+
requests
|