moritalous committed on
Commit
b3702ea
1 Parent(s): 41bf566

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +47 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from markdownify import MarkdownConverter
5
+
6
+
7
def md(soup, **options):
    """Render a parsed BeautifulSoup node as Markdown text.

    Keyword arguments are passed straight through to ``MarkdownConverter``.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)
9
+
10
+
11
def main_fn(url: str, check: list[str]) -> str:
    """Download the page at *url* and return its main content as Markdown.

    Args:
        url: Address of the HTML page to fetch.
        check: Tag names (e.g. ``"a"``, ``"img"``) to strip from the Markdown
            output, as selected in the UI.

    Returns:
        Markdown for the ``<main>`` element when the page has one, otherwise
        for ``<body>``, otherwise for the whole parsed document.
    """
    # A timeout keeps a stalled server from blocking the request forever.
    response = requests.get(url, timeout=30)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(response.text, "html.parser")

    # Drop script/style elements so their source text does not leak into the
    # Markdown. The previous code referenced ``t.clear`` without calling it,
    # which removed nothing.
    for element in soup.find_all(["script", "style"]):
        element.decompose()

    # Prefer <main>; fall back to <body>, then to the whole document so a
    # page without either element does not hand ``None`` to the converter.
    root = soup.find("main") or soup.find("body") or soup

    # Apply the ignore list on every path, not only when <main> exists.
    return md(root, strip=check)
28
+
29
+
30
# Gradio front end: a URL box plus an "ignore tags" checklist feed main_fn,
# and the resulting Markdown is shown in a copyable text area.
url_input = gr.Text(label="URL", placeholder="https://*****")
ignore_tags_input = gr.CheckboxGroup(
    label="Ignore tags(無視するタグ)",
    choices=["a", "img", "noscript"],
    value=["a", "img"],
)
markdown_output = gr.TextArea(label="Markdown", show_copy_button=True)

demo = gr.Interface(
    fn=main_fn,
    inputs=[url_input, ignore_tags_input],
    outputs=[markdown_output],
    title="URL to Markdown",
    description="""<div style="width: fit-content; margin: 0 auto;">Gets HTML given by URL and converts it to Markdown.Does not support dynamically generated HTML such as React.</div>
<div style="width: fit-content; margin: 0 auto;">URLで与えたHTMLを取得してMarkdownに変換します。Reactなどの動的に生成されるHTMLには対応していません</div>""",
    allow_flagging="never",
)

# Start the web server for the interface.
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ beautifulsoup4
2
+ gradio
3
+ markdownify
4
+ requests