JTRNS committed on
Commit
cc6362f
1 Parent(s): 167811c

initial setup

Browse files
Files changed (5) hide show
  1. Dockerfile +13 -0
  2. README.md +2 -2
  3. app.py +35 -0
  4. requirements.txt +3 -0
  5. templates/index.html +118 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10
2
+
3
+ RUN useradd -m -u 1000 user
4
+ USER user
5
+ ENV PATH="/home/user/.local/bin:$PATH"
6
+
7
+ WORKDIR /app
8
+
9
+ COPY --chown=user ./requirements.txt requirements.txt
10
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
+
12
+ COPY --chown=user . /app
13
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Chunky
3
- emoji: 📉
4
  colorFrom: pink
5
  colorTo: green
6
  sdk: docker
@@ -8,4 +8,4 @@ pinned: false
8
  license: mit
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Chunky
3
+ emoji: 📚
4
  colorFrom: pink
5
  colorTo: green
6
  sdk: docker
 
8
  license: mit
9
  ---
10
 
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, Form, Request
2
+ from fastapi.concurrency import asynccontextmanager
3
+ from fastapi.responses import HTMLResponse
4
+ from fastapi.templating import Jinja2Templates
5
+ from typing import Annotated
6
+ from wtpsplit import SaT
7
+
8
# Registry of loaded segmentation models, keyed by model name.
sat_models = {}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the segmentation model across the application's lifetime.

    The code before ``yield`` loads the SaT model into ``sat_models``;
    the code after ``yield`` empties the registry again.
    """
    model_name = "sat-3l-sm"
    # Startup phase: instantiate the model and register it under its name.
    sat_models[model_name] = SaT(model_name)
    yield
    # Shutdown phase: drop every model reference held by the registry.
    sat_models.clear()
18
+
19
+
20
# Build the application exactly once, bound to the lifespan handler.
# Re-assigning ``app`` to a bare ``FastAPI()`` afterwards would replace this
# instance with one that has no lifespan, so the model would never be loaded
# and every request to /split would fail with a KeyError on ``sat_models``.
app = FastAPI(lifespan=lifespan)

# Template loader rooted at the local "templates" directory.
templates = Jinja2Templates(directory="templates")
24
+
25
@app.get("/", response_class=HTMLResponse)
def root(request: Request):
    """Render ``index.html`` for the landing page.

    No extra context is passed, so the template sees no ``sentences`` value.
    """
    page = templates.TemplateResponse(request=request, name="index.html")
    return page
28
+
29
+
30
@app.post("/split", response_class=HTMLResponse)
async def split_text(request: Request, text: Annotated[str, Form()] = ""):
    """Split the submitted form text into sentences and render the result.

    Args:
        request: Incoming request, forwarded to the template response.
        text: Value of the ``text`` form field; defaults to an empty string.

    Returns:
        The rendered ``index.html`` with the segmentation result exposed to
        the template as ``sentences``.

    Raises:
        KeyError: If the "sat-3l-sm" model is not present in ``sat_models``.
    """
    # Local import keeps this handler self-contained.
    from fastapi.concurrency import run_in_threadpool

    splitter = sat_models["sat-3l-sm"]
    # Model inference is synchronous; calling it directly inside this
    # coroutine would block the event loop and stall all other requests,
    # so it is handed off to the worker thread pool instead.
    sentences = await run_in_threadpool(splitter.split, text)
    return templates.TemplateResponse(
        request=request, name="index.html", context={"sentences": sentences}
    )
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ wtpsplit
templates/index.html ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <html>
2
+
3
+ <head>
4
+ <title>Chunky | Sentence Segmentation Service</title>
5
+ <meta name="description" content="Chunky is a sentence segmentation service.">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1">
7
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
8
+ <style>
9
+ body {
10
+ margin: 0;
11
+ font-family: ui-monospace,
12
+ Menlo, Monaco,
13
+ "Cascadia Mono", "Segoe UI Mono",
14
+ "Roboto Mono",
15
+ "Oxygen Mono",
16
+ "Ubuntu Mono",
17
+ "Source Code Pro",
18
+ "Fira Mono",
19
+ "Droid Sans Mono",
20
+ "Consolas", "Courier New", monospace;
21
+ }
22
+
23
+ main {
24
+ margin: 0 auto;
25
+ padding: 1rem;
26
+ max-width: 73ch;
27
+ }
28
+
29
+ ul {
30
+ list-style: square;
31
+ }
32
+
33
+ div[data-autogrow]:has(textarea) {
34
+ display: grid;
35
+ }
36
+
37
+ div[data-autogrow]:has(textarea)::after {
38
+ content: attr(data-autogrow) ' ';
39
+ white-space: pre-wrap;
40
+ visibility: hidden;
41
+ }
42
+
43
+ div[data-autogrow]:has(textarea)>textarea {
44
+ resize: none;
45
+ overflow: hidden;
46
+ }
47
+
48
+ div[data-autogrow]:has(textarea)>textarea,
49
+ div[data-autogrow]:has(textarea)::after {
50
+ grid-area: 1 / 1 / 2 / 2;
51
+ border: 1px solid currentColor;
52
+ padding: 0.5rem;
53
+ font: inherit;
54
+ text-wrap: stable;
55
+ }
56
+ </style>
57
+ </head>
58
+
59
+ <body>
60
+ <main>
61
+
62
+ <h1>chunky</h1>
63
+ <p>Sentence Segmentation Service</p>
64
+ <form action="/split" method="post" enctype="multipart/form-data">
65
+
66
+
67
+ <label for="text">Text:</label><br>
68
+ <div data-autogrow="">
69
+ <textarea name="text" id="text" rows="1" maxlength="16000"></textarea>
70
+ </div>
71
+
72
+ <input type="submit" value="Split Sentences">
73
+ </form>
74
+ {% if sentences %}
75
+ <button id="copy" type="button">copy</button>
76
+ {%endif %}
77
+ <ul>
78
+ {% for sentence in sentences %}
79
+ <li>{{ sentence }}</li>
80
+ {% endfor %}
81
+ </ul>
82
+
83
+ <section>
84
+ <h2>References</h2>
85
+ <article>
86
+ <header>
87
+ <h3>Segment Any Text: A Universal Approach for Robust, Efficient and Adaptable Sentence Segmentation</h3>
88
+ <p>by Markus Frohmann, Igor Sterner, Ivan Vulić, Benjamin Minixhofer, and Markus Schedl</p>
89
+ </header>
90
+ <p>
91
+ <cite>
92
+ Frohmann, M., Sterner, I., Vulić, I., Minixhofer, B., & Schedl, M. (2024). Segment Any Text: A Universal
93
+ Approach for Robust, Efficient and Adaptable Sentence Segmentation.
94
+ <em>arXiv preprint arXiv:2406.16678</em>.
95
+ <a href="https://doi.org/10.48550/arXiv.2406.16678">https://doi.org/10.48550/arXiv.2406.16678</a>
96
+ </cite>
97
+ </p>
98
+ </article>
99
+ </section>
100
+ </main>
101
+
102
+ <noscript>
103
+ {{sentences|safe|trim}}
104
+ </noscript>
105
+ <script>
106
// Mirror the textarea's value into the wrapper's data-autogrow attribute.
// The stylesheet renders that attribute in a hidden ::after element that
// shares the textarea's grid cell, so the cell grows with the text.
const textarea = document.querySelector("textarea");
if (textarea) {
  textarea.addEventListener("input", () => {
    textarea.parentElement.dataset.autogrow = textarea.value;
  });
}

// The copy button is only rendered when the template received sentences,
// so it is missing on a plain page load. Look it up first and attach the
// handler only when it exists, instead of calling a method on null.
const copyButton = document.querySelector("button#copy");
const sentencesHolder = document.querySelector("noscript");
if (copyButton && sentencesHolder) {
  copyButton.addEventListener("click", () => {
    // trim() always returns a string, so no nullish fallback is needed.
    navigator.clipboard.writeText(sentencesHolder.textContent.trim());
  });
}

// NOTE(review): the <noscript> payload is emitted with the `safe` filter,
// i.e. without HTML escaping, although it contains user-submitted text.
// Consider serialising it with `tojson` into a JSON <script> element.
115
+ </script>
116
+ </body>
117
+
118
+ </html>