Galuh Sahid committed
Commit: ba7a003
1 Parent(s): bb13925

Add download_logs and scripts
Browse files
- .gitattributes +1 -0
- data/download_logs/cc12m_2m_download.log +3 -0
- data/download_logs/cc12m_download.log +3 -0
- data/download_logs/cc3m_download.log +3 -0
- data/download_logs/wit_download.log +3 -0
- data/scripts/.DS_Store +0 -0
- data/scripts/cc12m.py +49 -0
- data/scripts/cc12m_disk1.py +58 -0
- data/scripts/cc12m_disk2.py +58 -0
- data/scripts/cc3m.py +58 -0
- data/scripts/cc3m_modified.py +58 -0
- data/scripts/cc_propn.py +72 -0
- data/scripts/coco.py +40 -0
- data/scripts/flicker8k.py +42 -0
- data/scripts/flickr30k.py +48 -0
- data/scripts/subcaption.py +59 -0
- data/scripts/wit.py +56 -0
- data/scripts/wit_propn.py +68 -0
.gitattributes
CHANGED
@@ -16,3 +16,4 @@
 *.pth filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 flax_model.msgpack filter=lfs diff=lfs merge=lfs -text
+*.log filter=lfs diff=lfs merge=lfs -text
data/download_logs/cc12m_2m_download.log
ADDED
@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:7e00288c04e8d226862a37c1ca4d4953a40770d30f7f7f696aafcba2ed57212d
size 145262942
data/download_logs/cc12m_download.log
ADDED
@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:c25de5a6a1bffc69309fdf30d53a27978bb0de8b49a80429b14f880c6470495b
size 262929928
data/download_logs/cc3m_download.log
ADDED
@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:0f5aeaaf47c1370a5da33fc6a97303a1ef5d020670e06fbc9e8474b41a5eb3ba
size 126513213
data/download_logs/wit_download.log
ADDED
@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:361c1a8fd3704ec101be204690949ff20b02eac54c39c9fb4ed934d0497ff6ba
size 233568
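All four download logs are tracked by Git LFS (via the .gitattributes rule above), so the repository stores only three-line pointer files: spec version, sha256 content hash, and byte size. A minimal sketch of parsing such a pointer from a checkout where the LFS content has not been pulled; the helper below is illustrative, not part of the commit:

def read_lfs_pointer(path):
    # Each pointer line is "key value", e.g. "size 233568".
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

print(read_lfs_pointer("data/download_logs/wit_download.log"))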
data/scripts/.DS_Store
ADDED
Binary file (8.2 kB)
data/scripts/cc12m.py
ADDED
@@ -0,0 +1,49 @@

import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Usage: python cc12m.py <annotations.tsv> <images_dir> <output_file>")
    sys.exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing cc12m dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

df = df[["caption", "url"]]

print(f"Loaded {len(df)} images.")

for index, caption_reference_description, image_url in df.itertuples():
    index += 1
    base_url = os.path.basename(image_url)   # final path component of the URL
    stem, ext = os.path.splitext(base_url)   # split into stem and extension
    filename = f'{index:08d}---{stem}.jpg'   # downloader's naming scheme: zero-padded row number + stem

    full_image_path = images_dir + "/" + filename

    if os.path.isfile(full_image_path):
        lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_reference_description]}))
    else:
        # Image was never downloaded; record the missing path.
        logging.error(full_image_path)

with open(output_file, "w") as f:
    f.write("\n".join(lines))

logging.info(f"Processing cc12m dataset done. {len(lines)} images processed.")
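Each processing script in this commit emits one JSON object per line, of the form {"image_path": ..., "captions": [...]}. A minimal sketch of loading such a file back, with "cc12m_dataset.json" standing in for whatever was passed as the output file:

import json

# "cc12m_dataset.json" is a placeholder for the script's <output_file> argument.
with open("cc12m_dataset.json") as f:
    records = [json.loads(line) for line in f if line.strip()]

print(len(records), records[0]["image_path"], records[0]["captions"][0])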
data/scripts/cc12m_disk1.py
ADDED
@@ -0,0 +1,58 @@

import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer
import numpy as np

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Usage: python cc12m_disk1.py <annotations.tsv> <images_dir> <output_prefix>")
    sys.exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing cc12m dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

df = df[["caption", "url"]]

df = df.replace('', np.nan)
df = df.dropna()

print(f"Loaded {len(df)} images.")

for index, caption_reference_description, image_url in df.itertuples():
    index += 1
    base_url = os.path.basename(image_url)   # final path component of the URL
    stem, ext = os.path.splitext(base_url)   # split into stem and extension
    filename = f'{index:08d}---{stem}.jpg'

    full_image_path = images_dir + "/" + filename

    if os.path.isfile(full_image_path):
        lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_reference_description]}))
    else:
        # Image was never downloaded; record the missing path.
        logging.error(full_image_path)

# Hold out the last 150,001 records for validation.
train_lines = lines[:-150_001]
valid_lines = lines[-150_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing cc12m dataset done. {len(lines)} images processed.")
data/scripts/cc12m_disk2.py
ADDED
@@ -0,0 +1,58 @@

import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer
import numpy as np

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Usage: python cc12m_disk2.py <annotations.tsv> <images_dir> <output_prefix>")
    sys.exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing cc12m dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

df = df[["caption", "url"]]

df = df.replace('', np.nan)
df = df.dropna()

print(f"Loaded {len(df)} images.")

for index, caption_reference_description, image_url in df.itertuples():
    index += 1
    base_url = os.path.basename(image_url)   # final path component of the URL
    stem, ext = os.path.splitext(base_url)   # split into stem and extension
    filename = f'{index:08d}---{stem}.jpg'

    full_image_path = images_dir + "/" + filename

    if os.path.isfile(full_image_path):
        lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_reference_description]}))
    else:
        # Image was never downloaded; record the missing path.
        logging.error(full_image_path)

# Hold out the last 500,001 records for validation.
train_lines = lines[:-500_001]
valid_lines = lines[-500_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing cc12m dataset done. {len(lines)} images processed.")
data/scripts/cc3m.py
ADDED
@@ -0,0 +1,58 @@

import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer
import numpy as np

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Usage: python cc3m.py <annotations.tsv> <images_dir> <output_prefix>")
    sys.exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing cc3m dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

df = df[["caption", "url"]]

df = df.replace('', np.nan)
df = df.dropna()

print(f"Loaded {len(df)} images.")

for index, caption_reference_description, image_url in df.itertuples():
    index += 1
    base_url = os.path.basename(image_url)   # final path component of the URL
    stem, ext = os.path.splitext(base_url)   # split into stem and extension
    filename = f'{index:08d}---{stem}.jpg'

    full_image_path = images_dir + "/" + filename

    if os.path.isfile(full_image_path):
        lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_reference_description]}))
    else:
        # Image was never downloaded; record the missing path.
        logging.error(full_image_path)

# Hold out the last 300,001 records for validation.
train_lines = lines[:-300_001]
valid_lines = lines[-300_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing cc3m dataset done. {len(lines)} images processed.")
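The train/validation split in these scripts is plain list slicing over the accumulated JSON lines: the last N+1 records become the validation set. A toy check of the arithmetic used by cc3m.py:

# Stand-in list; the real scripts slice the accumulated JSON lines.
lines = [f"record-{i}" for i in range(1_000_000)]

train_lines = lines[:-300_001]
valid_lines = lines[-300_001:]

assert len(valid_lines) == 300_001
assert len(train_lines) + len(valid_lines) == len(lines)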
data/scripts/cc3m_modified.py
ADDED
@@ -0,0 +1,58 @@

import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer
import numpy as np

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Usage: python cc3m_modified.py <annotations.tsv> <images_dir> <output_prefix>")
    sys.exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing cc3m dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

df = df[["caption", "url", "index_row"]]

df = df.replace('', np.nan)
df = df.dropna()

print(f"Loaded {len(df)} images.")

for index, caption_reference_description, image_url, index_row in df.itertuples():
    # Filenames are derived from the precomputed index_row column (written by
    # cc_propn.py) rather than the filtered frame's positional index.
    index_row += 1
    base_url = os.path.basename(image_url)   # final path component of the URL
    stem, ext = os.path.splitext(base_url)   # split into stem and extension
    filename = f'{index_row:08d}---{stem}.jpg'

    full_image_path = images_dir + "/" + filename

    if os.path.isfile(full_image_path):
        lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_reference_description]}))
    else:
        # Image was never downloaded; record the missing path.
        logging.error(full_image_path)

# Hold out the last 300,001 records for validation.
train_lines = lines[:-300_001]
valid_lines = lines[-300_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing cc3m dataset done. {len(lines)} images processed.")
data/scripts/cc_propn.py
ADDED
@@ -0,0 +1,72 @@

import sys
from datetime import datetime
import pandas as pd
import contexttimer
import requests
import logging

from nltk.tag import CRFTagger

# Indonesian POS tagger used to spot captions that are mostly proper nouns.
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

headers = {
    "User-Agent": "Googlebot-Image/1.0",  # Pretend to be googlebot
    "X-Forwarded-For": "64.18.15.200",
}

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

'''if len(sys.argv) != 3:
    print("Provide .tsv file name & output directory. e.g. python downloader.py Train-GCC-training.tsv training")
    exit(1)'''

# Load data
print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(sys.argv[1], delimiter='\t')
    df = df[["caption", "url"]]

def drop_no(text):
    # Drop a caption if it is empty, longer than 96 characters, or if at
    # least 80% of its tokens are tagged as proper nouns (NNP).
    try:
        if len(text) == 0:
            return True
        elif len(text) > 96:
            return True
        text = text.split()
        result = ct.tag_sents([text])
        nnp_cnt = 0
        total = len(result[0])

        for x in result[0]:
            if x[1] == "NNP":
                nnp_cnt += 1

        if (nnp_cnt / total) >= 0.8:
            return True
        return False
    except Exception as e:
        print(e)
        return True

df["to_drop"] = df["caption"].apply(drop_no)
df = df[df["to_drop"] == False]
df = df.drop("to_drop", axis=1)

# Preserve the original row index so downstream scripts (cc3m_modified.py)
# can reconstruct the downloader's filenames after filtering.
df["index_row"] = df.index

df.to_csv(sys.argv[2], sep='\t', index=False)
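cc_propn.py keeps a caption only when fewer than 80% of its tokens are tagged NNP (proper noun) by the Indonesian CRF tagger. A sketch of the underlying tagging call, assuming the model file named in the script is available locally; the sample caption is invented:

from nltk.tag import CRFTagger

ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

tokens = "Presiden mengunjungi pasar tradisional".split()  # invented sample caption
tagged = ct.tag_sents([tokens])[0]                         # list of (token, tag) pairs
nnp_ratio = sum(tag == "NNP" for _, tag in tagged) / len(tagged)
print(tagged, nnp_ratio)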
data/scripts/coco.py
ADDED
@@ -0,0 +1,40 @@

import json
import collections
import logging
import sys

if len(sys.argv) != 4:
    print("Usage: python coco.py <coco_captions.json> <images_dir> <output_prefix>")
    sys.exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing COCO dataset")

with open(annotation_file, "r") as f:
    annotations = json.load(f)["annotations"]

# Group all captions belonging to the same image id.
image_path_to_caption = collections.defaultdict(list)
for element in annotations:
    caption = f"{element['caption'].lower().rstrip('.')}"
    image_path = images_dir + "/%012d.jpg" % (element["image_id"])
    image_path_to_caption[image_path].append(caption)

lines = []
for image_path, captions in image_path_to_caption.items():
    lines.append(json.dumps({"image_path": image_path, "captions": captions}))

# Hold out the last 10,001 records for validation.
train_lines = lines[:-10_001]
valid_lines = lines[-10_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing COCO dataset done. {len(lines)} images processed.")

# e.g. python scripts/coco.py annotations/coco_captions_train2017.json <images_dir> coco_dataset_train.json
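coco.py reconstructs each image's filename by zero-padding the annotation's image_id to 12 digits, matching COCO's on-disk naming. A quick illustration:

image_id = 9  # arbitrary example id
print("/%012d.jpg" % image_id)  # -> /000000000009.jpg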
data/scripts/flicker8k.py
ADDED
@@ -0,0 +1,42 @@

import json
import logging
import sys
import os.path

if len(sys.argv) != 4:
    print("Usage: python flicker8k.py <annotations.json> <images_dir> <output_prefix>")
    sys.exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing Flickr 8k dataset")

with open(annotation_file, "r") as f:
    annotations = json.load(f)

lines = []
for image_path, captions in annotations.items():
    edited_captions = []
    for caption in captions:
        if len(caption) > 0:
            # Strip the tokenizer's sentence markers from each caption.
            edited_captions.append(caption.replace("<start> ", "").replace(" <end>", ""))
    full_image_path = images_dir + "/" + image_path
    if os.path.isfile(full_image_path):
        if len(edited_captions) > 0:
            lines.append(json.dumps({"image_path": full_image_path, "captions": edited_captions}))
    else:
        print(f"{full_image_path} doesn't exist")

# Hold out the last 801 records for validation.
train_lines = lines[:-801]
valid_lines = lines[-801:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing Flickr 8k dataset done. {len(lines)} images processed.")
data/scripts/flickr30k.py
ADDED
@@ -0,0 +1,48 @@

import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer

if len(sys.argv) != 4:
    print("Usage: python flickr30k.py <annotations.tsv> <images_dir> <output_prefix>")
    sys.exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing Flickr 30k dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

# Group all captions belonging to the same image file.
images_dict = {}

for index, caption, image_name in df.itertuples():
    if image_name in images_dict:
        images_dict[image_name] += [caption]
    else:
        images_dict[image_name] = [caption]

lines = []

for image_path, captions in images_dict.items():
    full_image_path = images_dir + "/" + image_path
    if os.path.isfile(full_image_path):
        lines.append(json.dumps({"image_path": full_image_path, "captions": captions}))
    else:
        print(f"{full_image_path} doesn't exist")

# Hold out the last 3,001 records for validation.
train_lines = lines[:-3_001]
valid_lines = lines[-3_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing Flickr 30k dataset done. {len(lines)} images processed.")
data/scripts/subcaption.py
ADDED
@@ -0,0 +1,59 @@

import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer
import numpy as np

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Usage: python subcaption.py <annotations.tsv> <images_dir> <output_prefix>")
    sys.exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing subcaption dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

df = df[["caption", "url"]]

df = df.replace('', np.nan)
df = df.dropna()

print(f"Loaded {len(df)} images.")

for index, caption_reference_description, image_url in df.itertuples():
    # Unlike cc12m.py, the zero-based index is used as-is (no +1 offset).
    base_url = os.path.basename(image_url)   # final path component of the URL
    stem, ext = os.path.splitext(base_url)   # split into stem and extension
    filename = f'{index:08d}---{stem}.jpg'

    full_image_path = images_dir + "/" + filename

    if os.path.isfile(full_image_path):
        lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_reference_description]}))
    else:
        # Image was never downloaded; record the missing path.
        logging.error(full_image_path)

# Hold out the last 100,001 records for validation.
train_lines = lines[:-100_001]
valid_lines = lines[-100_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing subcaption dataset done. {len(lines)} images processed.")
data/scripts/wit.py
ADDED
@@ -0,0 +1,56 @@

import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer
import numpy as np

if len(sys.argv) != 4:
    print("Usage: python wit.py <annotations.tsv> <images_dir> <output_prefix>")
    sys.exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing WIT dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

df = df[["caption_reference_description", "image_url"]]

df = df.replace('', np.nan)
df = df.dropna()

for index, caption_reference_description, image_url in df.itertuples():
    base_url = os.path.basename(image_url)   # final path component of the URL
    stem, ext = os.path.splitext(base_url)   # split into stem and extension
    filename = f'{stem}.jpg'                 # WIT filenames carry no row-index prefix

    full_image_path = images_dir + "/" + filename

    if os.path.isfile(full_image_path):
        lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_reference_description]}))
    else:
        print(f"{full_image_path} doesn't exist")

# Hold out the last 9,001 records for validation.
train_lines = lines[:-9_001]
valid_lines = lines[-9_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing WIT dataset done. {len(lines)} images processed.")
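Unlike the Conceptual Captions scripts, wit.py derives each on-disk filename from the URL stem alone, with no zero-padded row prefix. A quick illustration of the stem extraction; the URL is a made-up example:

import os.path

image_url = "https://example.org/wiki/images/a/ab/Candi_Borobudur.png"  # made-up URL
base_url = os.path.basename(image_url)   # 'Candi_Borobudur.png'
stem, ext = os.path.splitext(base_url)   # ('Candi_Borobudur', '.png')
print(f"{stem}.jpg")                     # 'Candi_Borobudur.jpg'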
data/scripts/wit_propn.py
ADDED
@@ -0,0 +1,68 @@

import sys
from datetime import datetime
import pandas as pd
import contexttimer
import requests
import logging

from nltk.tag import CRFTagger

# Indonesian POS tagger used to spot captions that are mostly proper nouns.
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

headers = {
    "User-Agent": "Googlebot-Image/1.0",  # Pretend to be googlebot
    "X-Forwarded-For": "64.18.15.200",
}

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

'''if len(sys.argv) != 3:
    print("Provide .tsv file name & output directory. e.g. python downloader.py Train-GCC-training.tsv training")
    exit(1)'''

# Load data
print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(sys.argv[1], delimiter='\t')
    df = df[["caption_reference_description", "image_url"]]

def drop_no(text):
    # Drop a caption if it is empty or if at least 80% of its tokens are
    # tagged as proper nouns (NNP).
    try:
        if len(text) == 0:
            return True
        text = text.split()
        result = ct.tag_sents([text])
        nnp_cnt = 0
        total = len(result[0])

        for x in result[0]:
            if x[1] == "NNP":
                nnp_cnt += 1

        if (nnp_cnt / total) >= 0.8:
            return True
        return False
    except Exception as e:
        print(e)
        return True

df["to_drop"] = df["caption_reference_description"].apply(drop_no)
df = df[df["to_drop"] == False]
df = df.drop("to_drop", axis=1)

df.to_csv(sys.argv[2], sep='\t')