Spaces:
Runtime error
Runtime error
Sharathhebbar24
committed on
Commit
•
d28ba37
1
Parent(s):
65bf04b
First HF Space app
Browse files- .env +10 -0
- .gitignore +3 -0
- Dockerfile +13 -0
- README.md +9 -10
- folder_creation.py +12 -0
- main.py +65 -0
- minio_services.py +36 -0
- pdf_to_img.py +26 -0
- requirements.txt +6 -0
- static/fastapi.png +0 -0
- static/minIO.png +0 -0
- table_extraction.py +65 -0
.env
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Directories
|
2 |
+
IMAGES_DIR = images/
|
3 |
+
OUTPUTS_DIR = outputs/
|
4 |
+
PDF_DIR = pdfs/
|
5 |
+
|
6 |
+
# Bucket
|
7 |
+
MINIO_KEY = QUK5tI3fsjStPYrCKs18eb3OPTFzPLGeVOLXrsMc
|
8 |
+
HOST = localhost:9000
|
9 |
+
BUCKET_NAME = table-detection
|
10 |
+
ACCESS_KEY = Table extraction
|
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
junks/
|
2 |
+
demo.py
|
3 |
+
__pycache__
|
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9-slim-bullseye

WORKDIR /

# Copy the dependency list first so the install layer is cached until
# requirements.txt itself changes.
COPY requirements.txt requirements.txt

RUN pip install --upgrade pip
RUN pip install -r requirements.txt

COPY . .

# PORT was never defined, so `EXPOSE $PORT` expanded to nothing. Give it a
# default (7860 is the default app port of a Docker Space) that can still be
# overridden at run time.
ENV PORT=7860
EXPOSE ${PORT}

# Bind to all interfaces on the exposed port; uvicorn's default of
# 127.0.0.1:8000 is unreachable from outside the container. `--reload` is a
# development-only file watcher and is dropped here. `exec` makes uvicorn
# PID 1 so it receives stop signals directly.
CMD ["sh", "-c", "exec uvicorn main:app --host 0.0.0.0 --port ${PORT}"]
|
README.md
CHANGED
@@ -1,10 +1,9 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
# Table Detection
|
2 |
+
|
3 |
+
- To run the FastAPI server: ```uvicorn main:app --reload```
|
4 |
+
|
5 |
+
![FastAPI-Server](static/fastapi.png)
|
6 |
+
|
7 |
+
- To run the MinIO server: ```.\minio.exe server C:\minio --console-address :9090```
|
8 |
+
|
9 |
+
![minIO-server](static/minIO.png)
|
|
folder_creation.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
def remove_files(dirs):
    """Delete every entry directly inside the directory ``dirs``.

    The directory itself is kept. Regular files and symlinks are unlinked;
    sub-directories are removed recursively, because ``os.remove`` raises on
    a directory and would abort the clean-up half-way.

    Args:
        dirs: Path of the directory to empty.

    Raises:
        OSError: If ``dirs`` cannot be listed or an entry cannot be removed.
    """
    import shutil  # local import: only ``os`` is imported at file level

    for entry in os.listdir(dirs):
        path = os.path.join(dirs, entry)
        # A symlink to a directory must be unlinked, not followed.
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            os.remove(path)
|
6 |
+
|
7 |
+
|
8 |
+
def make_directory_if_not_exists(dir_name):
    """Ensure ``dir_name`` exists, emptying it if it already does.

    A missing directory is created together with any missing parents. An
    existing path has its contents deleted through ``remove_files``.

    Args:
        dir_name: Path of the directory to create or empty.

    Raises:
        OSError: If the directory cannot be created or emptied.
    """
    if not os.path.exists(dir_name):
        # makedirs creates intermediate directories, and exist_ok tolerates
        # another caller creating the same path between the check and here.
        os.makedirs(dir_name, exist_ok=True)
    else:
        remove_files(dir_name)
|
main.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
from pathlib import Path, PurePath
|
4 |
+
|
5 |
+
from fastapi.middleware.cors import CORSMiddleware
|
6 |
+
from fastapi import FastAPI, Form, HTTPException
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
|
9 |
+
from folder_creation import make_directory_if_not_exists
|
10 |
+
from pdf_to_img import pdf_to_image
|
11 |
+
from table_extraction import Table_extraction
|
12 |
+
from minio_services import MINIO
|
13 |
+
|
14 |
+
load_dotenv()
|
15 |
+
|
16 |
+
IMAGES_DIR = os.getenv('IMAGES_DIR')
|
17 |
+
PDF_DIR = os.getenv('PDF_DIR')
|
18 |
+
OUTPUTS_DIR = os.getenv('OUTPUTS_DIR')
|
19 |
+
HOST = os.getenv('HOST')
|
20 |
+
ACCESS_KEY = os.getenv('ACCESS_KEY')
|
21 |
+
MINIO_KEY = os.getenv('MINIO_KEY')
|
22 |
+
BUCKET_NAME = os.getenv('BUCKET_NAME')
|
23 |
+
app = FastAPI()
|
24 |
+
|
25 |
+
origins = ["*"]
|
26 |
+
|
27 |
+
app.add_middleware(
|
28 |
+
CORSMiddleware,
|
29 |
+
allow_origins=origins,
|
30 |
+
allow_credentials=True,
|
31 |
+
allow_methods=["*"],
|
32 |
+
allow_headers=["*"],
|
33 |
+
)
|
34 |
+
|
35 |
+
@app.post("/")
async def main(file_name: str = Form(None), uid: str = Form(None)):
    """Detect tables in a document and return the stored result's object name.

    The input (a PDF or an image already present on the server's filesystem)
    is rendered or copied into a per-request image directory, passed through
    ``Table_extraction``, and the rendered result is uploaded to MinIO.

    Args:
        file_name: Path of the input file on the server. A ``.pdf`` suffix
            (any letter case) selects PDF rendering; anything else is
            treated as an image and copied as-is.
        uid: Identifier used as the per-request sub-directory name and passed
            to ``MINIO`` together with the result path.

    Returns:
        Whatever ``MINIO.download_from_minio`` returns for the result.

    Raises:
        HTTPException: 400 if ``file_name``/``uid`` is missing or ``uid`` is
            not a plain name; 404 (with the error text as detail) if any
            processing step fails.
    """
    # Validate before touching the filesystem. Both fields default to None,
    # and ``uid`` becomes part of a path that the directory helper empties
    # when it already exists, so separators or dot segments must not be
    # allowed to point outside the scratch directories.
    if not file_name or not uid:
        raise HTTPException(
            status_code=400, detail="Both 'file_name' and 'uid' are required"
        )
    if uid in (".", "..") or "/" in uid or "\\" in uid:
        raise HTTPException(status_code=400, detail="Invalid 'uid'")

    # NOTE(review): ``file_name`` is a client-supplied server-side path, so a
    # caller can have any file readable by this process copied and uploaded.
    # Consider accepting an upload or restricting it to a known directory.
    try:
        make_directory_if_not_exists(IMAGES_DIR)
        # The trailing '/' is kept because the helpers receiving these
        # values build file paths by plain string concatenation.
        IMAGES_DIR1 = os.path.join(IMAGES_DIR, uid) + '/'
        make_directory_if_not_exists(IMAGES_DIR1)

        make_directory_if_not_exists(OUTPUTS_DIR)
        OUTPUTS_DIR1 = os.path.join(OUTPUTS_DIR, uid) + '/'
        make_directory_if_not_exists(OUTPUTS_DIR1)
        print(file_name)
        if PurePath(file_name).suffix.lower() == '.pdf':
            imagename = pdf_to_image(file_name, IMAGES_DIR1)
        else:
            imagename = IMAGES_DIR1 + Path(file_name).name
            shutil.copy(file_name, imagename)

        model = Table_extraction(imagename, OUTPUTS_DIR1)
        op_img = model.get_results()

        minio = MINIO(HOST, ACCESS_KEY, MINIO_KEY, BUCKET_NAME, uid, op_img)
        minio.upload_to_minio()
        return minio.download_from_minio()

    except Exception as e:
        raise HTTPException(status_code=404, detail=str(e)) from e

    finally:
        # Runs on success and on failure. ignore_errors keeps a directory
        # that was never created from raising here and hiding the real error.
        for scratch_dir in (IMAGES_DIR, OUTPUTS_DIR):
            if scratch_dir:
                shutil.rmtree(scratch_dir, ignore_errors=True)
|
minio_services.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from minio import Minio
|
3 |
+
from minio.error import (ResponseError, BucketAlreadyOwnedByYou,
|
4 |
+
BucketAlreadyExists)
|
5 |
+
class MINIO():
    """Upload one result file to a MinIO bucket and fetch it back.

    The object is stored under the key ``"<UID>/<file name of op>"``.
    """

    def __init__(self, HOST, ACCESS_KEY, MINIO_KEY, BUCKET_NAME, UID, op):
        """Create the MinIO client and remember what to transfer.

        Args:
            HOST: MinIO endpoint passed to ``Minio``.
            ACCESS_KEY: Access key for the endpoint.
            MINIO_KEY: Secret key for the endpoint.
            BUCKET_NAME: Bucket that receives the object.
            UID: Identifier used as the key prefix inside the bucket.
            op: Local path of the file to upload.
        """
        self.minioClient = Minio(HOST,
                                 access_key=ACCESS_KEY,
                                 secret_key=MINIO_KEY,
                                 secure=False)
        self.BUCKET_NAME = BUCKET_NAME
        self.UID = UID
        self.op = op

    def _object_name(self):
        """Return the bucket key shared by the upload and the download."""
        return str(self.UID) + '/' + Path(self.op).name

    def upload_to_minio(self):
        """Create the bucket if needed and upload ``op`` into it.

        Errors other than "bucket already exists" propagate to the caller; a
        failed upload is no longer printed and swallowed, since the download
        that follows could only fail later with a less useful message.
        """
        try:
            self.minioClient.make_bucket(self.BUCKET_NAME, location="us-east-1")
        except (BucketAlreadyOwnedByYou, BucketAlreadyExists):
            # The bucket is already there, which is all that is needed.
            pass

        # Put an object 'A' with contents from 'B'.
        self.minioClient.fput_object(self.BUCKET_NAME, self._object_name(), self.op)

    def download_from_minio(self):
        """Download the uploaded object and return its object name.

        The object is requested under the same ``"<UID>/<name>"`` key that
        ``upload_to_minio`` wrote (the bare file name used previously is not
        a key the upload ever creates). It is saved to the relative local
        path ``"<UID>/<name>"``, the destination the original call used.
        """
        object_name = self._object_name()
        val = self.minioClient.fget_object(self.BUCKET_NAME, object_name, object_name)
        return val.object_name
|
35 |
+
|
36 |
+
|
pdf_to_img.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pypdfium2 as pdfium
|
2 |
+
|
3 |
+
|
4 |
+
def pdf_to_image(pdf_name, IMAGES_DIR):
    """Render every page of a PDF to ``<IMAGES_DIR>/<page number>.png``.

    Pages are numbered from 1 and rendered at 3x scale on a white background.

    Args:
        pdf_name: Path of the PDF file to render.
        IMAGES_DIR: Directory that receives the PNG files.

    Returns:
        Path of the image written for the last page.

    Raises:
        ValueError: If the PDF contains no pages.
    """
    import os  # local import: this module only imports pypdfium2 at file level

    scale_value = 3
    pdf = pdfium.PdfDocument(pdf_name)
    try:
        n_pages = len(pdf)
        if n_pages == 0:
            # Without this guard the function would fail on the final
            # ``return`` with an UnboundLocalError.
            raise ValueError(f"PDF has no pages: {pdf_name}")

        imagename = None
        for page_number in range(n_pages):
            page = pdf.get_page(page_number)
            try:
                pil_image = page.render_to(
                    pdfium.BitmapConv.pil_image,
                    scale=scale_value,
                    rotation=0,
                    fill_colour=(255, 255, 255, 255),
                    crop=(0, 0, 0, 0),
                    greyscale=False,
                    optimise_mode=pdfium.OptimiseMode.NONE,
                )

                imagename = os.path.join(IMAGES_DIR, str(page_number + 1) + ".png")
                print(imagename)
                pil_image.save(imagename)
            finally:
                # Release the page handle even if rendering or saving fails.
                page.close()
        return imagename
    finally:
        # Release the document handle on every exit path.
        pdf.close()
|
25 |
+
|
26 |
+
# pdf_to_image('junks\\Attention is all u need.pdf')
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ultralyticsplus==0.0.23
|
2 |
+
ultralytics==8.0.21
|
3 |
+
transformers
|
4 |
+
pypdfium2==3.15.0
|
5 |
+
ak-minio==5.0.7.post2
|
6 |
+
fastapi[all]
|
static/fastapi.png
ADDED
static/minIO.png
ADDED
table_extraction.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ultralyticsplus import YOLO, render_result
|
2 |
+
from pathlib import Path, PurePath
|
3 |
+
|
4 |
+
class Table_extraction():
    """Run the YOLOv8 table-extraction model on one image and save the render."""

    def __init__(self, image, OUTPUTS_DIR):
        """Load the model and store the input image and output directory.

        Args:
            image: Path of the image to analyse.
            OUTPUTS_DIR: Directory that receives the rendered result.
        """
        self.model = YOLO('keremberke/yolov8m-table-extraction')
        # Inference settings applied to every prediction made by this model.
        self.model.overrides['conf'] = 0.25  # confidence threshold
        self.model.overrides['iou'] = 0.45  # IoU threshold
        self.model.overrides['agnostic_nms'] = False  # class-agnostic NMS off
        self.model.overrides['max_det'] = 1000  # max detections per image
        self.image = image
        self.OUTPUTS_DIR = OUTPUTS_DIR

    def get_results(self):
        """Predict on the image, save the rendered detections, return the path.

        The raw prediction is kept on ``self.results``. The render of the
        first result is saved in ``OUTPUTS_DIR`` under the input image's
        file name.

        Returns:
            Path of the saved render, as a string.
        """
        self.results = self.model(self.image)
        render = render_result(model=self.model, image=self.image, result=self.results[0])
        # Join as paths so OUTPUTS_DIR works with or without a trailing
        # separator; plain concatenation turned 'outputs/123' + '9.png'
        # into 'outputs/1239.png'.
        op_img = str(Path(self.OUTPUTS_DIR) / Path(self.image).name)
        render.save(op_img)
        return op_img
|
22 |
+
|
23 |
+
# def recognize_coords(self):
|
24 |
+
# final_results = []
|
25 |
+
# result = str(self.results)
|
26 |
+
# print(result)
|
27 |
+
# result = result.split('(')
|
28 |
+
# print('Result', result)
|
29 |
+
# if '\n' not in result[1]:
|
30 |
+
# print('In if')
|
31 |
+
# coords = []
|
32 |
+
# result = result[1][2: -3].split(',')
|
33 |
+
# for i in result:
|
34 |
+
# coords.append(float(i))
|
35 |
+
# final_results.append(coords)
|
36 |
+
# return final_results
|
37 |
+
# else:
|
38 |
+
# result = result[1:][0]
|
39 |
+
# result = result.split('\n')
|
40 |
+
# j = 0
|
41 |
+
# print('Results: ', result)
|
42 |
+
# for i in result:
|
43 |
+
# coords = []
|
44 |
+
# i = i.strip()
|
45 |
+
# if j == 0:
|
46 |
+
# print(i)
|
47 |
+
# i = i[2:-2]
|
48 |
+
# print(i)
|
49 |
+
# elif j == len(result) - 1:
|
50 |
+
# i = i[1:-3]
|
51 |
+
# else:
|
52 |
+
# i = i[1:-2]
|
53 |
+
# j+=1
|
54 |
+
# print(i)
|
55 |
+
# i = i.split(',')
|
56 |
+
# print(i)
|
57 |
+
# for k in i:
|
58 |
+
# coords.append(float(k))
|
59 |
+
# final_results.append(coords)
|
60 |
+
# return final_results
|
61 |
+
|
62 |
+
|
63 |
+
# te = Table_extraction('junks\9.png', 'outputs/123')
|
64 |
+
# te.get_results()
|
65 |
+
# print(te.recognize_coords())
|