sxandie committed on
Commit
c28b7d5
1 Parent(s): a7be311

Delete dataset.py

Browse files
Files changed (1) hide show
  1. dataset.py +0 -155
dataset.py DELETED
@@ -1,155 +0,0 @@
1
- ### Create file named dataset.py
2
- ### Paste
3
- # coding=utf-8
4
- import json
5
- import os
6
- from pathlib import Path
7
- import datasets
8
- from PIL import Image
9
- import pandas as pd
10
-
11
- logger = datasets.logging.get_logger(__name__)
12
- _CITATION = """{}"""
13
- _DESCRIPTION = """Discharge Summary"""
14
-
15
-
16
def load_image(image_path):
    """Open the image at ``image_path`` and report its pixel dimensions.

    Args:
        image_path: Location of the image file, passed to ``Image.open``.

    Returns:
        A pair ``(image, (width, height))`` where ``image`` is the object
        returned by ``Image.open`` and the inner tuple is taken from its
        ``size`` attribute.
    """
    opened = Image.open(image_path)
    width, height = opened.size
    dimensions = (width, height)
    return opened, dimensions
20
-
21
def normalize_bbox(bbox, size):
    """Rescale a pixel-space box onto a 0-1000 grid relative to the image.

    Args:
        bbox: Sequence ``[x_min, y_min, x_max, y_max]`` in pixels.
        size: Sequence whose first two items are the image width and height
            in pixels.

    Returns:
        A list of four ints ``[x_min, y_min, x_max, y_max]``. Each value is
        truncated toward zero by ``int`` and then clamped to ``[0, 1000]``,
        so a box that overshoots the image edge cannot produce a coordinate
        outside the grid.

    Raises:
        ZeroDivisionError: If the width or height in ``size`` is 0.
    """
    width, height = size[0], size[1]

    def _scale(value, extent):
        # Clamp after scaling: annotations that extend past the image border
        # would otherwise yield values above 1000 (or below 0).
        return max(0, min(1000, int(1000 * value / extent)))

    return [
        _scale(bbox[0], width),
        _scale(bbox[1], height),
        _scale(bbox[2], width),
        _scale(bbox[3], height),
    ]
28
-
29
-
30
class SroieConfig(datasets.BuilderConfig):
    """Builder configuration used by this dataset script.

    It defines no options of its own; every keyword argument is handed to
    ``datasets.BuilderConfig`` unchanged.
    """

    def __init__(self, **kwargs):
        """Create the config.

        Args:
            **kwargs: Keyword arguments forwarded as-is to the parent class.
        """
        super().__init__(**kwargs)
38
-
39
-
40
class Sroie(datasets.GeneratorBasedBuilder):
    """Dataset builder that pairs local images with per-image CSV annotations.

    Each example carries the words of one image, their bounding boxes
    rescaled by ``normalize_bbox``, a tag per word and the image path.
    """

    BUILDER_CONFIGS = [
        SroieConfig(name="discharge", version=datasets.Version("1.0.0"), description="Discharge summary dataset"),
    ]

    def _info(self):
        """Return the ``DatasetInfo`` describing the feature schema."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "words": datasets.Sequence(datasets.Value("string")),
                    "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            names=[
                                'others',
                                'produttore_key',
                                'produttore_value',
                                'cliente_key',
                                'cliente_value',
                                'unitloc_key',
                                'unitloc_value',
                                'operatore_key',
                                'operatore_value',
                                'referente_key',
                                'referente_value',
                                'cfproduttore_key',
                                'cfproduttore_value',
                                'telefono_key',
                                'telefono_value',
                                'emailcliente_key',
                                'emailcliente_value',
                                'datarichiesta_key',
                                'datarichiesta_value',
                                'orariorichiesta_key',
                                'orariorichiesta_value',
                                'emailproduttore_key',
                                'emailproduttore_value',
                                'mattina_key',
                                'mattina_value',
                                'pomeriggio_key',
                                'pomeriggio_value',
                                'cer_key',
                                'cer_value',
                                'descrizione_key',
                                'descrizione_value',
                                'sf_key',
                                'sf_value',
                                'classpericolo_key',
                                'classpericolo_value',
                                'destino_key',
                                'destino_value',
                                'confezionamento_key',
                                'confezionamento_value',
                                'destinazione_key',
                                'destinazione_value',
                            ]
                        )
                    ),
                    "image_path": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            citation=_CITATION,
            homepage="",
        )

    def _split_generators(self, dl_manager):
        """Return the train and test ``SplitGenerator`` objects.

        Nothing is downloaded: ``dl_manager`` is unused and both splits are
        read from the relative directory ``dataset`` (``dataset/train`` and
        ``dataset/test``).
        """
        # NOTE(review): the path is hard-coded relative to the working
        # directory; a configured data_dir is not consulted here.
        dest = Path('dataset')

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"filepath": dest/"train"}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST, gen_kwargs={"filepath": dest/"test"}
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs for every file in ``filepath/img_dir``.

        Args:
            filepath: Split directory holding ``img_dir`` (images) and
                ``annotation_dir`` (one ``<image stem>.csv`` per image with
                ``left``, ``top``, ``width``, ``height``, ``text`` and
                ``label`` columns).

        Yields:
            ``(guid, example)`` where ``guid`` is the position of the image
            in the sorted directory listing and ``example`` has the keys
            ``id``, ``words``, ``bboxes``, ``ner_tags`` and ``image_path``.
        """
        logger.info("⏳ Generating examples from = %s", filepath)
        ann_dir = os.path.join(filepath, "annotation_dir")
        img_dir = os.path.join(filepath, "img_dir")

        # Sorted listing keeps the guid assignment stable between runs.
        for guid, fname in enumerate(sorted(os.listdir(img_dir))):
            stem = os.path.splitext(fname)[0]
            df = pd.read_csv(os.path.join(ann_dir, stem + ".csv"))

            image_path = os.path.join(img_dir, fname)
            image, size = load_image(image_path)
            # Only the dimensions are needed; release the file handle now
            # instead of leaving one open per example.
            image.close()

            # Annotations store left/top/width/height; convert to corner form.
            right = df['left'] + df['width']
            bottom = df['top'] + df['height']
            boxes = [
                normalize_bbox([xmin, ymin, xmax, ymax], size)
                for xmin, ymin, xmax, ymax in zip(df['left'], df['top'], right, bottom)
            ]

            # Report boxes that fall outside the image through the logger
            # rather than printing to stdout for every example.
            too_large = [value for box in boxes for value in box if value > 1000]
            if too_large:
                logger.warning(
                    "Normalized coordinates above 1000 in %s: %s", image_path, too_large
                )

            yield guid, {
                "id": str(guid),
                "words": list(df['text']),
                "bboxes": boxes,
                "ner_tags": list(df['label']),
                "image_path": image_path,
            }