from collections import defaultdict

import cv2
import numpy as np
import torch


def tensor_to_numpy(image):
    # Float tensor in [0, 1] -> uint8 numpy array in [0, 255]
    image_np = (image.numpy() * 255).astype('uint8')
    return image_np


def tensor_to_np_float(image):
    # Tensor -> float32 numpy array, values unchanged
    image_np = image.numpy().astype('float32')
    return image_np


def detach_to_cpu(x):
    # Drop the autograd graph and move the tensor to the CPU
    return x.detach().cpu()


def transpose_np(x):
    # CHW -> HWC
    return np.transpose(x, [1, 2, 0])


def tensor_to_gray_im(x):
    # CHW tensor -> HWC uint8 image (no clamping)
    x = detach_to_cpu(x)
    x = tensor_to_numpy(x)
    x = transpose_np(x)
    return x


def tensor_to_im(x):
    # CHW tensor -> HWC uint8 image, clamped to [0, 1] first
    x = detach_to_cpu(x).clamp(0, 1)
    x = tensor_to_numpy(x)
    x = transpose_np(x)
    return x
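

# Illustrative sketch (not part of the original file): how the helpers above are
# typically chained on a hypothetical CHW float tensor in [0, 1].
def _demo_tensor_to_im():
    x = torch.rand(3, 64, 64)
    im = tensor_to_im(x)  # detach -> cpu -> clamp -> uint8 -> HWC
    assert im.shape == (64, 64, 3) and im.dtype == np.uint8
    return im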


# Predefined key <-> caption dict
key_captions = {
    'im': 'Image',
    'gt': 'GT',
}
"""
Return an image array with captions
keys in dictionary will be used as caption if not provided
values should contain lists of cv2 images
"""


def get_image_array(images, grid_shape, captions={}):
    h, w = grid_shape
    cate_counts = len(images)
    rows_counts = len(next(iter(images.values())))

    font = cv2.FONT_HERSHEY_SIMPLEX

    output_image = np.zeros([w * cate_counts, h * (rows_counts + 1), 3], dtype=np.uint8)
    col_cnt = 0
    for k, v in images.items():

        # Default as key value itself
        caption = captions.get(k, k)

        # Handles new line character
        dy = 40
        for i, line in enumerate(caption.split('\n')):
            cv2.putText(output_image, line, (10, col_cnt * w + 100 + i * dy), font, 0.8,
                        (255, 255, 255), 2, cv2.LINE_AA)

        # Put images
        for row_cnt, img in enumerate(v):
            im_shape = img.shape
            if len(im_shape) == 2:
                # Promote single-channel images so they broadcast into the RGB canvas
                img = img[..., np.newaxis]

            # Float image in [0, 1] -> uint8
            img = (img * 255).astype('uint8')

            output_image[(col_cnt + 0) * w:(col_cnt + 1) * w,
                         (row_cnt + 1) * h:(row_cnt + 2) * h, :] = img

        col_cnt += 1

    return output_image
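

# Illustrative sketch (not part of the original file): building a tiny grid with
# two categories of two float images each; the 64x64 cell size is an assumption.
def _demo_get_image_array():
    images = {
        'im': [np.random.rand(64, 64, 3), np.random.rand(64, 64, 3)],
        'gt': [np.random.rand(64, 64), np.random.rand(64, 64)],
    }
    grid = get_image_array(images, (64, 64), key_captions)
    # Two category rows; one caption column plus two image columns
    assert grid.shape == (64 * 2, 64 * 3, 3)
    return grid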


def base_transform(im, size):
    im = tensor_to_np_float(im)
    if len(im.shape) == 3:
        im = im.transpose((1, 2, 0))  # CHW -> HWC
    else:
        im = im[:, :, None]  # HW -> HW1

    # Resize only if needed; size is (width, height), the order cv2.resize expects
    if im.shape[:2] != tuple(size[::-1]):
        im = cv2.resize(im, size, interpolation=cv2.INTER_NEAREST)

    return im.clip(0, 1)


def im_transform(im, size):
    return base_transform(detach_to_cpu(im), size=size)


def mask_transform(mask, size):
    return base_transform(detach_to_cpu(mask), size=size)


def logits_transform(mask, size):
    return base_transform(detach_to_cpu(torch.sigmoid(mask)), size=size)
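

# Illustrative sketch (not part of the original file): the three wrappers above
# differ only in preprocessing; logits_transform squashes raw logits through a
# sigmoid before the shared resize/clip in base_transform.
def _demo_logits_transform():
    logits = torch.randn(1, 32, 32)            # hypothetical 1xHxW logit map
    prob = logits_transform(logits, (64, 64))  # resized float map in [0, 1]
    assert prob.shape[:2] == (64, 64)
    assert 0.0 <= prob.min() <= prob.max() <= 1.0
    return prob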


def add_attention(mask, pos):
    # Draw one dot per attention point on top of a single-channel mask.
    # pos holds (y, x) coordinates normalised to [-1, 1].
    mask = mask[:, :, None].repeat(3, axis=2)  # HW -> HW3 so the dots can be coloured
    pos = (pos + 1) / 2  # [-1, 1] -> [0, 1]
    for i in range(pos.shape[0]):
        y = int(pos[i][0] * mask.shape[0])
        x = int(pos[i][1] * mask.shape[1])
        # Clamp to valid pixel coordinates
        y = max(min(y, mask.shape[0] - 1), 0)
        x = max(min(x, mask.shape[1] - 1), 0)
        cv2.circle(mask, (x, y), 5, (1, 0, 0), -1)
    return mask
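

# Illustrative sketch (not part of the original file): pos is read above as an
# Nx2 array of (y, x) coordinates normalised to [-1, 1]; the mask is a
# single-channel float map in [0, 1].
def _demo_add_attention():
    mask = np.zeros((64, 64), dtype=np.float32)
    pos = np.array([[0.0, 0.0], [-1.0, -1.0]])  # centre and top-left corner
    vis_mask = add_attention(mask, pos)
    assert vis_mask.shape == (64, 64, 3)
    return vis_mask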


def vis(images, size, num_objects):
    req_images = defaultdict(list)

    b, t = images['rgb'].shape[:2]

    # limit the number of images saved
    b = min(2, b)

    # find max num objects
    max_num_objects = max(num_objects[:b])

    GT_suffix = ''
    for bi in range(b):
        GT_suffix += ' \n%s' % images['info']['name'][bi][-25:-4]

    for bi in range(b):
        for ti in range(t):
            req_images['RGB'].append(im_transform(images['rgb'][bi, ti], size))
            aux = images[f'aux_{max(ti, 1)}']  # no aux_0, use aux_1 for shape
            if 'sensory_logits' in aux:
                sensory_aux = aux['sensory_logits'][bi].softmax(dim=0)
            # q_logits: batch_size * num_objects * num_levels * H * W
            q_mask_aux = aux['q_logits'][bi].softmax(dim=0)
            num_levels = q_mask_aux.shape[1]

            for oi in range(max_num_objects):
                # First frame, or an object slot unused in this sequence:
                # fall back to the first-frame GT mask as a placeholder
                if ti == 0 or oi >= num_objects[bi]:
                    req_images[f'Mask_{oi}'].append(
                        mask_transform(images['first_frame_gt'][bi][0, oi], size))
                    req_images[f'S-Aux_{oi}'].append(
                        mask_transform(images['first_frame_gt'][bi][0, oi], size))
                    for l in range(num_levels):
                        req_images[f'Q-Aux-L{l}_{oi}'].append(
                            mask_transform(images['first_frame_gt'][bi][0, oi], size))
                else:
                    mask = mask_transform(images[f'masks_{ti}'][bi][oi], size)
                    req_images[f'Mask_{oi}'].append(mask)
                    if 'sensory_logits' in aux:
                        req_images[f'S-Aux_{oi}'].append(mask_transform(sensory_aux[oi + 1], size))

                    for l in range(num_levels):
                        mask = mask_transform(q_mask_aux[oi + 1, l], size)
                        req_images[f'Q-Aux-L{l}_{oi}'].append(mask)

                req_images[f'GT_{oi}_{GT_suffix}'].append(
                    mask_transform(images['cls_gt'][bi, ti, 0] == (oi + 1), size))

    return get_image_array(req_images, size, key_captions)
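

# Note (added for illustration; inferred from the indexing in vis above and
# vis_debug below): the `images` dict is assumed to provide at least
#   'rgb':            B x T x 3 x H x W image tensor
#   'info':           dict whose 'name' entry lists one sequence name per batch item
#   'first_frame_gt': first-frame GT masks, indexed as [bi][0, oi]
#   'cls_gt':         B x T x 1 x H x W integer label map
#   f'masks_{ti}':    predicted masks for frame ti >= 1, indexed as [bi][oi]
#   f'aux_{ti}':      auxiliary outputs for frame ti >= 1 with 'q_logits'
#                     (plus 'sensory_logits' when available and, for vis_debug,
#                     'attn_mask')
# `num_objects` gives the object count per batch item and `size` is the
# (width, height) of one grid cell.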


def vis_debug(images, size, num_objects):
    req_images = defaultdict(list)

    b, t = images['rgb'].shape[:2]

    # limit the number of images saved
    b = min(2, b)

    # find max num objects
    max_num_objects = max(num_objects[:b])

    GT_suffix = ''
    for bi in range(b):
        GT_suffix += ' \n%s' % images['info']['name'][bi][-25:-4]

    for bi in range(b):
        for ti in range(t):
            req_images['RGB'].append(im_transform(images['rgb'][bi, ti], size))
            aux = images[f'aux_{max(ti, 1)}']  # no aux_0, use aux_1 for shape
            sensory_aux = aux['sensory_logits'][bi].softmax(dim=0)
            # q_logits: batch_size * num_objects * num_levels * H * W
            q_mask_aux = aux['q_logits'][bi].softmax(dim=0)
            attn_mask = aux['attn_mask'][bi]
            num_levels = q_mask_aux.shape[1]
            num_queries = attn_mask.shape[1]

            for oi in range(max_num_objects):
                # First frame, or an object slot unused in this sequence:
                # fall back to the first-frame GT mask as a placeholder
                if ti == 0 or oi >= num_objects[bi]:
                    req_images[f'Mask_{oi}'].append(
                        mask_transform(images['first_frame_gt'][bi][0, oi], size))
                    req_images[f'S-Aux_{oi}'].append(
                        mask_transform(images['first_frame_gt'][bi][0, oi], size))
                    for l in range(num_levels):
                        req_images[f'Q-Aux-L{l}_{oi}'].append(
                            mask_transform(images['first_frame_gt'][bi][0, oi], size))
                    for q in range(num_queries):
                        req_images[f'Attn-Mask-Q{q}_{oi}'].append(
                            mask_transform(images['first_frame_gt'][bi][0, oi], size))
                else:
                    mask = mask_transform(images[f'masks_{ti}'][bi][oi], size)
                    req_images[f'Mask_{oi}'].append(mask)
                    req_images[f'S-Aux_{oi}'].append(mask_transform(sensory_aux[oi + 1], size))

                    for l in range(num_levels):
                        mask = mask_transform(q_mask_aux[oi + 1, l], size)
                        req_images[f'Q-Aux-L{l}_{oi}'].append(mask)
                    for q in range(num_queries):
                        mask = mask_transform(1 - attn_mask[oi, q].float(), size)
                        req_images[f'Attn-Mask-Q{q}_{oi}'].append(mask)

                req_images[f'GT_{oi}_{GT_suffix}'].append(
                    mask_transform(images['cls_gt'][bi, ti, 0] == (oi + 1), size))

    return get_image_array(req_images, size, key_captions)