JTRNS
committed on
Commit
•
cc6362f
1
Parent(s):
167811c
initial setup
Browse files- Dockerfile +13 -0
- README.md +2 -2
- app.py +35 -0
- requirements.txt +3 -0
- templates/index.html +118 -0
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10
|
2 |
+
|
3 |
+
RUN useradd -m -u 1000 user
|
4 |
+
USER user
|
5 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
6 |
+
|
7 |
+
WORKDIR /app
|
8 |
+
|
9 |
+
COPY --chown=user ./requirements.txt requirements.txt
|
10 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
11 |
+
|
12 |
+
COPY --chown=user . /app
|
13 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
title: Chunky
|
3 |
-
emoji:
|
4 |
colorFrom: pink
|
5 |
colorTo: green
|
6 |
sdk: docker
|
@@ -8,4 +8,4 @@ pinned: false
|
|
8 |
license: mit
|
9 |
---
|
10 |
|
11 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
title: Chunky
|
3 |
+
emoji: 📚
|
4 |
colorFrom: pink
|
5 |
colorTo: green
|
6 |
sdk: docker
|
|
|
8 |
license: mit
|
9 |
---
|
10 |
|
11 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, Form, Request
|
2 |
+
from fastapi.concurrency import asynccontextmanager
|
3 |
+
from fastapi.responses import HTMLResponse
|
4 |
+
from fastapi.templating import Jinja2Templates
|
5 |
+
from typing import Annotated
|
6 |
+
from wtpsplit import SaT
|
7 |
+
|
8 |
+
sat_models = {}
|
9 |
+
|
10 |
+
|
11 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the segmentation model around the application's lifetime.

    Everything before ``yield`` stores a ``SaT`` instance in the module-level
    ``sat_models`` dict under the key ``"sat-3l-sm"``; everything after
    ``yield`` empties that dict again.

    Args:
        app: The FastAPI application this lifespan is attached to (unused
            in the body).
    """
    model_name = "sat-3l-sm"
    # Build the model once and keep it in the shared registry.
    sat_models[model_name] = SaT(model_name)
    yield
    # Drop every registered model so its resources can be released.
    sat_models.clear()
|
18 |
+
|
19 |
+
|
20 |
+
# Create the application exactly once, wired to ``lifespan`` so the model
# registry is populated before requests are served. Re-assigning ``app`` to a
# bare ``FastAPI()`` afterwards would discard the lifespan hook and leave
# ``sat_models`` empty, making ``/split`` fail with ``KeyError``.
app = FastAPI(lifespan=lifespan)

# Jinja2 templates are looked up in the local ``templates`` directory.
templates = Jinja2Templates(directory="templates")
|
24 |
+
|
25 |
+
@app.get("/", response_class=HTMLResponse)
def root(request: Request):
    """Render ``index.html`` for the landing page with no extra context.

    Args:
        request: The incoming request, passed through to the template.

    Returns:
        The template response produced from ``index.html``.
    """
    page = templates.TemplateResponse(request=request, name="index.html")
    return page
|
28 |
+
|
29 |
+
|
30 |
+
@app.post("/split", response_class=HTMLResponse)
async def split_text(request: Request, text: Annotated[str, Form()] = ""):
    """Segment the submitted text and re-render the page with the result.

    Args:
        request: The incoming request, passed through to the template.
        text: Value of the ``text`` form field; an empty string when the
            field is not supplied.

    Returns:
        ``index.html`` rendered with ``sentences`` set to the output of the
        ``"sat-3l-sm"`` model's ``split`` call.
    """
    # NOTE(review): ``split`` is called synchronously inside an async
    # handler, so it presumably occupies the event loop while it runs —
    # confirm this is acceptable for the expected load.
    segmenter = sat_models["sat-3l-sm"]
    context = {"sentences": segmenter.split(text)}
    return templates.TemplateResponse(
        request=request, name="index.html", context=context
    )
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
fastapi
|
2 |
+
uvicorn[standard]
|
3 |
+
wtpsplit
# FastAPI needs python-multipart to parse the Form() field used by /split.
python-multipart
|
templates/index.html
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<html>
|
2 |
+
|
3 |
+
<head>
|
4 |
+
<title>Chunky | Sentence Segmentation Service</title>
|
5 |
+
<meta name="description" content="Chunky is a sentence segmentation service.">
|
6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
7 |
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
8 |
+
<style>
|
9 |
+
body {
|
10 |
+
margin: 0;
|
11 |
+
font-family: ui-monospace,
|
12 |
+
Menlo, Monaco,
|
13 |
+
"Cascadia Mono", "Segoe UI Mono",
|
14 |
+
"Roboto Mono",
|
15 |
+
"Oxygen Mono",
|
16 |
+
"Ubuntu Mono",
|
17 |
+
"Source Code Pro",
|
18 |
+
"Fira Mono",
|
19 |
+
"Droid Sans Mono",
|
20 |
+
"Consolas", "Courier New", monospace;
|
21 |
+
}
|
22 |
+
|
23 |
+
main {
|
24 |
+
margin: 0 auto;
|
25 |
+
padding: 1rem;
|
26 |
+
max-width: 73ch;
|
27 |
+
}
|
28 |
+
|
29 |
+
ul {
|
30 |
+
list-style: square;
|
31 |
+
}
|
32 |
+
|
33 |
+
div[data-autogrow]:has(textarea) {
|
34 |
+
display: grid;
|
35 |
+
}
|
36 |
+
|
37 |
+
div[data-autogrow]:has(textarea)::after {
|
38 |
+
content: attr(data-autogrow) ' ';
|
39 |
+
white-space: pre-wrap;
|
40 |
+
visibility: hidden;
|
41 |
+
}
|
42 |
+
|
43 |
+
div[data-autogrow]:has(textarea)>textarea {
|
44 |
+
resize: none;
|
45 |
+
overflow: hidden;
|
46 |
+
}
|
47 |
+
|
48 |
+
div[data-autogrow]:has(textarea)>textarea,
|
49 |
+
div[data-autogrow]:has(textarea)::after {
|
50 |
+
grid-area: 1 / 1 / 2 / 2;
|
51 |
+
border: 1px solid currentColor;
|
52 |
+
padding: 0.5rem;
|
53 |
+
font: inherit;
|
54 |
+
text-wrap: stable;
|
55 |
+
}
|
56 |
+
</style>
|
57 |
+
</head>
|
58 |
+
|
59 |
+
<body>
|
60 |
+
<main>
|
61 |
+
|
62 |
+
<h1>chunky</h1>
|
63 |
+
<p>Sentence Segmentation Service</p>
|
64 |
+
<form action="/split" method="post" enctype="multipart/form-data">
|
65 |
+
|
66 |
+
|
67 |
+
<label for="text">Text:</label><br>
|
68 |
+
<div data-autogrow="">
|
69 |
+
<textarea name="text" id="text" rows="1" maxlength="16000"></textarea>
|
70 |
+
</div>
|
71 |
+
|
72 |
+
<input type="submit" value="Split Sentences">
|
73 |
+
</form>
|
74 |
+
{% if sentences %}
|
75 |
+
<button id="copy" type="button">copy</button>
|
76 |
+
{%endif %}
|
77 |
+
<ul>
|
78 |
+
{% for sentence in sentences %}
|
79 |
+
<li>{{ sentence }}</li>
|
80 |
+
{% endfor %}
|
81 |
+
</ul>
|
82 |
+
|
83 |
+
<section>
|
84 |
+
<h2>References</h2>
|
85 |
+
<article>
|
86 |
+
<header>
|
87 |
+
<h3>Segment Any Text: A Universal Approach for Robust, Efficient and Adaptable Sentence Segmentation</h3>
|
88 |
+
<p>by Markus Frohmann, Igor Sterner, Ivan Vulić, Benjamin Minixhofer, and Markus Schedl</p>
|
89 |
+
</header>
|
90 |
+
<p>
|
91 |
+
<cite>
|
92 |
+
Frohmann, M., Sterner, I., Vulić, I., Minixhofer, B., & Schedl, M. (2024). Segment Any Text: A Universal
|
93 |
+
Approach for Robust, Efficient and Adaptable Sentence Segmentation.
|
94 |
+
<em>arXiv preprint arXiv:2406.16678</em>.
|
95 |
+
<a href="https://doi.org/10.48550/arXiv.2406.16678">https://doi.org/10.48550/arXiv.2406.16678</a>
|
96 |
+
</cite>
|
97 |
+
</p>
|
98 |
+
</article>
|
99 |
+
</section>
|
100 |
+
</main>
|
101 |
+
|
102 |
+
<noscript>
|
103 |
+
{# Serialise with tojson rather than |safe: tojson escapes <, >, & and ' so submitted text cannot close this element and inject markup. default([]) keeps the plain landing page (no sentences) rendering. #}
{{ sentences|default([])|tojson }}
|
104 |
+
</noscript>
|
105 |
+
<script>
|
106 |
+
const textarea = document.querySelector("textarea");
|
107 |
+
if (textarea) {
|
108 |
+
textarea.addEventListener("input", (e) => {
|
109 |
+
textarea.parentElement.dataset["autogrow"] = textarea.value;
|
110 |
+
})
|
111 |
+
}
|
112 |
+
document.querySelector("button#copy").addEventListener("click", (e) => {
|
113 |
+
// trim() always yields a string, so use || (not ??) to fall back when it is empty.
navigator.clipboard.writeText(document.querySelector("noscript").textContent.trim() || "[]")
|
114 |
+
})
|
115 |
+
</script>
|
116 |
+
</body>
|
117 |
+
|
118 |
+
</html>
|