sxandie committed
Commit 43a08bd
1 Parent(s): c28b7d5

Create new file

Files changed (1)
  1. dataset.py +155 -0
dataset.py ADDED
@@ -0,0 +1,155 @@
### Create file named dataset.py
### Paste
# coding=utf-8
import json
import os
from pathlib import Path

import datasets
import pandas as pd
from PIL import Image

logger = datasets.logging.get_logger(__name__)

_CITATION = """{}"""
_DESCRIPTION = """Discharge Summary"""


def load_image(image_path):
    """Open an image and return it together with its (width, height)."""
    image = Image.open(image_path)
    w, h = image.size
    return image, (w, h)


def normalize_bbox(bbox, size):
    """Scale a [xmin, ymin, xmax, ymax] box to the 0-1000 range used by LayoutLM-style models."""
    return [
        int(1000 * bbox[0] / size[0]),
        int(1000 * bbox[1] / size[1]),
        int(1000 * bbox[2] / size[0]),
        int(1000 * bbox[3] / size[1]),
    ]


class SroieConfig(datasets.BuilderConfig):
    """BuilderConfig for SROIE."""

    def __init__(self, **kwargs):
        """BuilderConfig for SROIE.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super(SroieConfig, self).__init__(**kwargs)


class Sroie(datasets.GeneratorBasedBuilder):
    BUILDER_CONFIGS = [
        SroieConfig(name="discharge", version=datasets.Version("1.0.0"), description="Discharge summary dataset"),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "words": datasets.Sequence(datasets.Value("string")),
                    "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            names=[
                                "others",
                                "produttore_key",
                                "produttore_value",
                                "cliente_key",
                                "cliente_value",
                                "unitloc_key",
                                "unitloc_value",
                                "operatore_key",
                                "operatore_value",
                                "referente_key",
                                "referente_value",
                                "cfproduttore_key",
                                "cfproduttore_value",
                                "telefono_key",
                                "telefono_value",
                                "emailcliente_key",
                                "emailcliente_value",
                                "datarichiesta_key",
                                "datarichiesta_value",
                                "orariorichiesta_key",
                                "orariorichiesta_value",
                                "emailproduttore_key",
                                "emailproduttore_value",
                                "mattina_key",
                                "mattina_value",
                                "pomeriggio_key",
                                "pomeriggio_value",
                                "cer_key",
                                "cer_value",
                                "descrizione_key",
                                "descrizione_value",
                                "sf_key",
                                "sf_value",
                                "classpericolo_key",
                                "classpericolo_value",
                                "destino_key",
                                "destino_value",
                                "confezionamento_key",
                                "confezionamento_value",
                                "destinazione_key",
                                "destinazione_value",
                            ]
                        )
                    ),
                    # "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"),
                    "image_path": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            citation=_CITATION,
            homepage="",
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators. Uses local files located under the data directory."""
        # downloaded_file = dl_manager.download_and_extract(_URLS)
        # Splits are read from a local "dataset" directory instead of a download.
        dest = Path("dataset")

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"filepath": dest / "train"}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST, gen_kwargs={"filepath": dest / "test"}
            ),
        ]

    def _generate_examples(self, filepath):
        logger.info("⏳ Generating examples from = %s", filepath)
        ann_dir = os.path.join(filepath, "annotation_dir")
        img_dir = os.path.join(filepath, "img_dir")

        for guid, fname in enumerate(sorted(os.listdir(img_dir))):
            # Each image has a matching CSV annotation with the same base name.
            name, ext = os.path.splitext(fname)
            file_path = os.path.join(ann_dir, name + ".csv")

            df = pd.read_csv(file_path)

            image_path = os.path.join(img_dir, fname)
            image, size = load_image(image_path)

            # Build [xmin, ymin, xmax, ymax] boxes from the left/top/width/height columns.
            boxes = [
                [xmin, ymin, xmax, ymax]
                for xmin, ymin, xmax, ymax in zip(
                    df["left"], df["top"], df["left"] + df["width"], df["top"] + df["height"]
                )
            ]
            text = df["text"].tolist()
            label = df["label"].tolist()

            boxes = [normalize_bbox(box, size) for box in boxes]

            # Sanity check: flag any normalized coordinate outside the 0-1000 range.
            for box in boxes:
                for coord in box:
                    if coord > 1000:
                        logger.warning("Out-of-range coordinate %s in %s", coord, image_path)

            yield guid, {
                "id": str(guid),
                "words": text,
                "bboxes": boxes,
                "ner_tags": label,
                "image_path": image_path,
            }
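
Once dataset.py is in place, the builder can be loaded through the standard datasets API. The snippet below is a minimal usage sketch, not part of the commit: it assumes a local dataset/ directory next to dataset.py containing train/ and test/ folders, each with an img_dir/ of page images and an annotation_dir/ of CSV files (columns left, top, width, height, text, label) whose base names match the image files, as the script above expects.

# usage_example.py (hypothetical helper, run from the directory containing dataset.py)
from datasets import load_dataset

# Load the "discharge" configuration defined in dataset.py.
# Assumes the local dataset/{train,test}/{img_dir,annotation_dir} layout described above.
ds = load_dataset("dataset.py", name="discharge")

# Inspect one example and the NER label set.
print(ds["train"][0]["words"][:5])
print(ds["train"].features["ner_tags"].feature.names[:5])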