jordyvl committed on
Commit
6720717
1 Parent(s): eab9f1c

Create app.py

Browse files

still to adapt with sliders, base functions added

Files changed (1) hide show
  1. app.py +164 -0
app.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import pandas as pd
4
+ import gradio as gr
5
+ from collections import OrderedDict
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ from io import BytesIO
8
+ import PyPDF2
9
+ import pdf2image
10
+
11
+ MAX_PAGES = 50
12
+ MAX_PDF_SIZE = 100000000 # almost 100MB
13
+ MIN_WIDTH, MIN_HEIGHT = 150, 150
14
+
15
+
16
def equal_image_grid(images):
    """Tile images into a single, roughly square grid image.

    Images with a zero width or height are dropped. The remaining images are
    rescaled to the smallest width among them (aspect ratio preserved) and
    pasted row by row, left to right, onto an RGB canvas whose cells are as
    tall as the tallest rescaled image.

    Args:
        images: PIL images to combine.

    Returns:
        The composed ``PIL.Image.Image``, or ``None`` when no image with a
        non-zero size is left to draw.
    """

    def compute_grid(n, max_cols=6):
        """Return ``(rows, cols)`` covering ``n`` cells, ``cols <= max_cols``."""
        side = int(n ** 0.5)
        rows, cols = side, min(side, max_cols)
        # Prefer one extra column (keeps the grid close to square), but never
        # go past the cap; any remaining shortfall is made up with rows.
        if rows * cols < n and cols < max_cols:
            cols += 1
        while rows * cols < n:
            rows += 1
        return rows, cols

    # Filter first so the grid is sized for the images that are actually
    # drawn, and so an empty result is reported instead of crashing in min().
    images = [im for im in images if im.height > 0 and im.width > 0]  # could be NA
    if not images:
        return None

    rows, cols = compute_grid(len(images))

    # Rescale everything to the smallest width; heights differ and are padded
    # by the cell height below. max(1, ...) guards against extremely wide
    # images whose scaled height would truncate to zero.
    min_width = min(im.width for im in images)
    images = [
        im.resize(
            (min_width, max(1, int(im.height * min_width / im.width))),
            resample=Image.BICUBIC,
        )
        for im in images
    ]

    cell_w = max(img.size[0] for img in images)
    cell_h = max(img.size[1] for img in images)

    grid = Image.new("RGB", size=(cols * cell_w, rows * cell_h))
    for i, img in enumerate(images):
        grid.paste(img, box=(i % cols * cell_w, i // cols * cell_h))
    return grid
47
+
48
+
49
def add_pagenumbers(im_list, height_scale=40):
    """Draw a 1-based running number near the bottom-right of each image.

    The images are modified in place; nothing is returned.

    Args:
        im_list: PIL images to annotate. The number drawn is the image's
            position in this list plus one.
        height_scale: Divisor applied to ``sqrt(width * height)`` to derive
            the font size; larger values give smaller numbers.
    """

    def add_pagenumber(image, i):
        """Draw ``i + 1`` on ``image``, sized relative to the image area."""
        width, height = image.size
        draw = ImageDraw.Draw(image)
        # Tiny images would otherwise truncate to a font size of 0.
        fontsize = max(1, int((width * height) ** 0.5 / height_scale))
        try:
            font = ImageFont.truetype("Arial.ttf", fontsize)
        except OSError:
            # Arial is not available on every host (e.g. most Linux images);
            # fall back to Pillow's built-in font rather than failing.
            font = ImageFont.load_default()
        margin = 2 * fontsize
        draw.text(
            (width - margin, height - margin),
            str(i + 1),
            fill="#D00917",
            font=font,
            spacing=4,
            align="right",
        )

    for i, image in enumerate(im_list):
        add_pagenumber(image, i)
67
+
68
+
69
def pdf_to_grid(pdf_path):
    """Build a numbered image grid from the first ``MAX_PAGES`` pages of a PDF.

    Embedded images are extracted with PyPDF2; images smaller than both
    ``MIN_WIDTH`` and ``MIN_HEIGHT`` are skipped. If extraction raises, the
    pages are rendered with pdf2image instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The grid produced by ``equal_image_grid``, or ``None`` when no image
        was collected.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    images = []
    try:
        for p, page in enumerate(reader.pages):
            # Enforce the page cap (the original flag was never set).
            if p >= MAX_PAGES:
                break
            for image in page.images:
                im = Image.open(BytesIO(image.data))
                # Skip only images that are small in both dimensions.
                if im.width < MIN_WIDTH and im.height < MIN_HEIGHT:
                    continue
                images.append(im)
    except Exception as e:
        # Deliberate best-effort: any extraction failure falls back to
        # rendering the pages, with the same page cap applied.
        print(f"{pdf_path} PyPDF get_images {e}")
        images = pdf2image.convert_from_path(pdf_path, last_page=MAX_PAGES)

    # simpler but slower
    # images = pdf2image.convert_from_path(pdf_path)

    if not images:
        return None
    add_pagenumbers(images)
    return equal_image_grid(images)
93
+
94
+
95
def main(complexity, evidence, form=None, operation=None, type=None):
    """Sample one document matching the selected filters and visualise it.

    The given values are paired, in order, with the keys of ``meta_cats``;
    every truthy value contributes a ``<key>_<value> == True`` clause to a
    pandas query on ``DIAGNOSTIC_TEST``. Matching rows are shuffled and the
    first one whose PDF exists and yields an image grid is returned.

    Args:
        complexity, evidence, form, operation, type: Selected filter values;
            falsy values are ignored. The last three are optional so the
            function can be called with fewer inputs.

    Returns:
        A ``(text, grid, pdf_path)`` tuple matching the three interface
        outputs. When nothing can be shown, ``text`` is an explanatory
        message and the other two entries are ``None``.
    """
    # need to write a query on diagnostic test and sample from it based on slider values
    # then return the sample
    selected = [complexity, evidence, form, operation, type]
    # Backticks keep the expression valid when a value contains spaces.
    # NOTE(review): assumes DIAGNOSTIC_TEST has boolean columns named
    # "<key>_<value>" -- confirm against the dataframe.
    query = " and ".join(
        f"`{cat}_{val}` == True"
        for cat, val in zip(meta_cats.keys(), selected)
        if val
    )
    # An empty expression is rejected by DataFrame.query; no filter means
    # every row is a candidate.
    results = DIAGNOSTIC_TEST.query(query) if query else DIAGNOSTIC_TEST
    if len(results) == 0:
        return f"No results found for query {query}", None, None

    for _, sample in results.sample(frac=1).iterrows():
        nhash = sample["nhash"]
        # Skip missing identifiers (empty or non-string, e.g. NaN).
        if not isinstance(nhash, str) or not nhash:
            continue
        print("Sampled: ", nhash)

        # first get PDF file
        pdf_path = PDF_PATH / "test" / (nhash + ".pdf")
        if not os.path.exists(pdf_path):
            continue
        grid = pdf_to_grid(pdf_path)
        if grid is None:
            continue
        label = sample["label"]  # might need to translate
        return str(label), grid, pdf_path

    # Every candidate was unusable (missing file or no extractable image).
    return f"No usable PDF found for query {query}", None, None
126
+
127
# Document class names offered in the "label" dropdown.
_CLASSES = [
    "letter",
    "form",
    "email",
    "handwritten",
    "advertisement",
    "scientific report",
    "scientific publication",
    "specification",
    "file folder",
    "news article",
    "budget",
    "invoice",
    "presentation",
    "questionnaire",
    "resume",
    "memo",
]
# test
# l, im, f = main(*slider_defaults)

# load both datasets in memory? --> easier retrieval
# NOTE(review): main() reads DIAGNOSTIC_TEST and PDF_PATH, which are not
# defined in the visible part of this file -- they must exist before the
# interface is used.

# Filter name -> selectable values; one dropdown is created per entry.
meta_cats = {
    "dataset": ["rvl_cdip", "rvl_cdip_N"],
    "label": _CLASSES,
}
sliders = [
    gr.Dropdown(choices=choices, value=choices[-1], label=label)
    for label, choices in meta_cats.items()
]
# Derived from the dropdowns so the defaults cannot drift from the inputs.
slider_defaults = [slider.value for slider in sliders]


# One output component per element of the tuple returned by main().
outputs = [
    gr.Textbox(label="label"),
    gr.Image(label="image grid of PDF"),
    gr.File(label="PDF"),
]

iface = gr.Interface(
    fn=main,
    inputs=sliders,
    outputs=outputs,
    description="Visualize PDF samples from multi-page (PDF) document classification datasets",
    title='Beyond Document Page Classification: Examples',
)
iface.launch(share=True)