Imagroune committed on
Commit a3f9aa4
1 Parent(s): 92d617e
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/__init__-checkpoint.py ADDED
File without changes
.ipynb_checkpoints/modeling_feynmodel-checkpoint.py ADDED
@@ -0,0 +1,1528 @@
1
+ # modeling_feynmodel : Imed MAGROUNE / 2024-09
2
+ # original code from modeling_FeynModel
3
+ # add DaViT Vision Tower
4
+ #
5
+ # update the generate and forward functions
6
+ #
7
+ # add lora adapters
8
+ #
9
+ # train on COCO OD (object detection) and vision reasoning
10
+ # train on ScienceQA
11
+ #
12
+ # todo: add Mamba layer
13
+ #
14
+ # todo: train on ARC-AGI
15
+
16
+
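Since this file ships as custom modeling code on the Hub, a model built from it is normally loaded with trust_remote_code=True. A minimal loading sketch, assuming a hypothetical repository id (the placeholder below is not the actual repo name):

from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "<user>/<feynmodel-repo>"  # hypothetical placeholder, replace with the real repo id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)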
17
+ from transformers.modeling_utils import PreTrainedModel
18
+ from transformers.utils import (
19
+ ModelOutput,
20
+ add_start_docstrings,
21
+ add_start_docstrings_to_model_forward,
22
+ is_flash_attn_2_available,
23
+ logging,
24
+ replace_return_docstrings,
25
+ is_flash_attn_2_available,
26
+ is_flash_attn_greater_or_equal_2_10,
27
+ )
28
+ from transformers.activations import ACT2FN
29
+ from transformers.modeling_attn_mask_utils import (
30
+ _prepare_4d_attention_mask,
31
+ _prepare_4d_attention_mask_for_sdpa,
32
+ _prepare_4d_causal_attention_mask,
33
+ _prepare_4d_causal_attention_mask_for_sdpa,
34
+ )
35
+ from transformers.modeling_outputs import (
36
+ BaseModelOutput,
37
+ BaseModelOutputWithPastAndCrossAttentions,
38
+ Seq2SeqLMOutput,
39
+ Seq2SeqModelOutput,
40
+ )
41
+
42
+ from transformers.cache_utils import Cache, HybridCache
43
+ from transformers.modeling_outputs import (
44
+ BaseModelOutputWithPast,
45
+ CausalLMOutputWithPast,
46
+ SequenceClassifierOutputWithPast,
47
+ TokenClassifierOutput,
48
+ )
49
+
50
+ from typing import List, Optional, Tuple, Union
51
+
52
+ from transformers.models.gemma2.modeling_gemma2 import Gemma2Model, Gemma2ForCausalLM,Gemma2DecoderLayer,Gemma2RMSNorm
53
+ from configuration_feynmodel import FeynModelConfig,Florence2VisionConfig
54
+
55
+ from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM
56
+ import json
57
+ import math
58
+ import torch
59
+ from torch import nn
60
+ import torch.nn.functional as F
+ import torch.utils.checkpoint as checkpoint  # needed by DaViT.forward_features_unpool when enable_checkpoint=True
61
+ import logging
62
+
63
+ from transformers.utils import (
64
+ ModelOutput,
65
+ add_start_docstrings,
66
+ add_start_docstrings_to_model_forward,
67
+ is_flash_attn_2_available,
68
+ logging,
69
+ replace_return_docstrings,
70
+ is_flash_attn_2_available,
71
+ is_flash_attn_greater_or_equal_2_10,
72
+ )
73
+
74
+ from transformers.modeling_utils import PreTrainedModel
75
+
76
+ from collections import OrderedDict
77
+ from einops import rearrange
78
+ from timm.models.layers import DropPath, trunc_normal_
79
+
80
+ logger = logging.get_logger(__name__)
81
+
82
+ class MySequential(nn.Sequential):
83
+ def forward(self, *inputs):
84
+ for module in self._modules.values():
85
+ if isinstance(inputs, tuple):
86
+ inputs = module(*inputs)
87
+ else:
88
+ inputs = module(inputs)
89
+ return inputs
90
+
91
+
92
+ class PreNorm(nn.Module):
93
+ def __init__(self, norm, fn, drop_path=None):
94
+ super().__init__()
95
+ self.norm = norm
96
+ self.fn = fn
97
+ self.drop_path = drop_path
98
+
99
+ def forward(self, x, *args, **kwargs):
100
+ shortcut = x
101
+ if self.norm is not None:
102
+ x, size = self.fn(self.norm(x), *args, **kwargs)
103
+ else:
104
+ x, size = self.fn(x, *args, **kwargs)
105
+
106
+ if self.drop_path:
107
+ x = self.drop_path(x)
108
+
109
+ x = shortcut + x
110
+
111
+ return x, size
112
+
113
+
114
+ class Mlp(nn.Module):
115
+ def __init__(
116
+ self,
117
+ in_features,
118
+ hidden_features=None,
119
+ out_features=None,
120
+ act_layer=nn.GELU,
121
+ ):
122
+ super().__init__()
123
+ out_features = out_features or in_features
124
+ hidden_features = hidden_features or in_features
125
+ self.net = nn.Sequential(OrderedDict([
126
+ ("fc1", nn.Linear(in_features, hidden_features)),
127
+ ("act", act_layer()),
128
+ ("fc2", nn.Linear(hidden_features, out_features))
129
+ ]))
130
+
131
+ def forward(self, x, size):
132
+ return self.net(x), size
133
+
134
+
135
+ class DepthWiseConv2d(nn.Module):
136
+ def __init__(
137
+ self,
138
+ dim_in,
139
+ kernel_size,
140
+ padding,
141
+ stride,
142
+ bias=True,
143
+ ):
144
+ super().__init__()
145
+ self.dw = nn.Conv2d(
146
+ dim_in, dim_in,
147
+ kernel_size=kernel_size,
148
+ padding=padding,
149
+ groups=dim_in,
150
+ stride=stride,
151
+ bias=bias
152
+ )
153
+
154
+ def forward(self, x, size):
155
+ B, N, C = x.shape
156
+ H, W = size
157
+ assert N == H * W
158
+
159
+ x = self.dw(x.transpose(1, 2).view(B, C, H, W))
160
+ size = (x.size(-2), x.size(-1))
161
+ x = x.flatten(2).transpose(1, 2)
162
+ return x, size
163
+
164
+
165
+ class ConvEmbed(nn.Module):
166
+ """ Image to Patch Embedding
167
+ """
168
+
169
+ def __init__(
170
+ self,
171
+ patch_size=7,
172
+ in_chans=3,
173
+ embed_dim=64,
174
+ stride=4,
175
+ padding=2,
176
+ norm_layer=None,
177
+ pre_norm=True
178
+ ):
179
+ super().__init__()
180
+ self.patch_size = patch_size
181
+
182
+ self.proj = nn.Conv2d(
183
+ in_chans, embed_dim,
184
+ kernel_size=patch_size,
185
+ stride=stride,
186
+ padding=padding
187
+ )
188
+
189
+ dim_norm = in_chans if pre_norm else embed_dim
190
+ self.norm = norm_layer(dim_norm) if norm_layer else None
191
+
192
+ self.pre_norm = pre_norm
193
+
194
+ def forward(self, x, size):
195
+ H, W = size
196
+ if len(x.size()) == 3:
197
+ if self.norm and self.pre_norm:
198
+ x = self.norm(x)
199
+ x = rearrange(
200
+ x, 'b (h w) c -> b c h w',
201
+ h=H, w=W
202
+ )
203
+
204
+ x = self.proj(x)
205
+
206
+ _, _, H, W = x.shape
207
+ x = rearrange(x, 'b c h w -> b (h w) c')
208
+ if self.norm and not self.pre_norm:
209
+ x = self.norm(x)
210
+
211
+ return x, (H, W)
212
+
213
+
214
+ class ChannelAttention(nn.Module):
215
+
216
+ def __init__(self, dim, groups=8, qkv_bias=True):
217
+ super().__init__()
218
+
219
+ self.groups = groups
220
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
221
+ self.proj = nn.Linear(dim, dim)
222
+
223
+ def forward(self, x, size):
224
+ B, N, C = x.shape
225
+
226
+ qkv = self.qkv(x).reshape(B, N, 3, self.groups, C // self.groups).permute(2, 0, 3, 1, 4)
227
+ q, k, v = qkv[0], qkv[1], qkv[2]
228
+
229
+ q = q * (float(N) ** -0.5)
230
+ attention = q.transpose(-1, -2) @ k
231
+ attention = attention.softmax(dim=-1)
232
+ x = (attention @ v.transpose(-1, -2)).transpose(-1, -2)
233
+ x = x.transpose(1, 2).reshape(B, N, C)
234
+ x = self.proj(x)
235
+ return x, size
236
+
237
+
238
+ class ChannelBlock(nn.Module):
239
+
240
+ def __init__(self, dim, groups, mlp_ratio=4., qkv_bias=True,
241
+ drop_path_rate=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
242
+ conv_at_attn=True, conv_at_ffn=True):
243
+ super().__init__()
244
+
245
+ drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
246
+
247
+ self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
248
+ self.channel_attn = PreNorm(
249
+ norm_layer(dim),
250
+ ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
251
+ drop_path
252
+ )
253
+ self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
254
+ self.ffn = PreNorm(
255
+ norm_layer(dim),
256
+ Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
257
+ drop_path
258
+ )
259
+
260
+ def forward(self, x, size):
261
+ if self.conv1:
262
+ x, size = self.conv1(x, size)
263
+ x, size = self.channel_attn(x, size)
264
+
265
+ if self.conv2:
266
+ x, size = self.conv2(x, size)
267
+ x, size = self.ffn(x, size)
268
+
269
+ return x, size
270
+
271
+
272
+ def window_partition(x, window_size: int):
273
+ B, H, W, C = x.shape
274
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
275
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
276
+ return windows
277
+
278
+
279
+ def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
280
+ B = batch_size
281
+ # computing B as below would make ONNX conversion fail for dynamic axes, because it would be treated as a constant:
282
+ # int(windows.shape[0] / (H * W / window_size / window_size))
283
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
284
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
285
+ return x
286
+
287
+
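window_partition and window_reverse only shuffle a (B, H, W, C) tensor into per-window tiles and back; a minimal sketch of the round trip (illustrative only, not part of the committed file):

import torch

x = torch.randn(1, 14, 14, 96)                   # (B, H, W, C)
windows = window_partition(x, 7)                 # -> (4, 7, 7, 96): 2 x 2 windows of size 7
restored = window_reverse(windows, 1, 7, 14, 14)
assert torch.equal(restored, x)                  # partition followed by reverse is lossless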
288
+ class WindowAttention(nn.Module):
289
+ def __init__(self, dim, num_heads, window_size, qkv_bias=True):
290
+
291
+ super().__init__()
292
+ self.dim = dim
293
+ self.window_size = window_size
294
+ self.num_heads = num_heads
295
+ head_dim = dim // num_heads
296
+ self.scale = float(head_dim) ** -0.5
297
+
298
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
299
+ self.proj = nn.Linear(dim, dim)
300
+
301
+ self.softmax = nn.Softmax(dim=-1)
302
+
303
+ def forward(self, x, size):
304
+
305
+ H, W = size
306
+ B, L, C = x.shape
307
+ assert L == H * W, "input feature has wrong size"
308
+
309
+ x = x.view(B, H, W, C)
310
+
311
+ pad_l = pad_t = 0
312
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
313
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
314
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
315
+ _, Hp, Wp, _ = x.shape
316
+
317
+ x = window_partition(x, self.window_size)
318
+ x = x.view(-1, self.window_size * self.window_size, C)
319
+
320
+ # W-MSA/SW-MSA
321
+ # attn_windows = self.attn(x_windows)
322
+
323
+ B_, N, C = x.shape
324
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
325
+ q, k, v = qkv[0], qkv[1], qkv[2]
326
+
327
+ q = q * self.scale
328
+ attn = (q @ k.transpose(-2, -1))
329
+ attn = self.softmax(attn)
330
+
331
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
332
+ x = self.proj(x)
333
+
334
+ # merge windows
335
+ x = x.view(
336
+ -1, self.window_size, self.window_size, C
337
+ )
338
+ x = window_reverse(x, B, self.window_size, Hp, Wp)
339
+
340
+ if pad_r > 0 or pad_b > 0:
341
+ x = x[:, :H, :W, :].contiguous()
342
+
343
+ x = x.view(B, H * W, C)
344
+
345
+ return x, size
346
+
347
+
348
+ class SpatialBlock(nn.Module):
349
+
350
+ def __init__(self, dim, num_heads, window_size,
351
+ mlp_ratio=4., qkv_bias=True, drop_path_rate=0., act_layer=nn.GELU,
352
+ norm_layer=nn.LayerNorm, conv_at_attn=True, conv_at_ffn=True):
353
+ super().__init__()
354
+
355
+ drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
356
+
357
+ self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
358
+ self.window_attn = PreNorm(
359
+ norm_layer(dim),
360
+ WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
361
+ drop_path
362
+ )
363
+ self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
364
+ self.ffn = PreNorm(
365
+ norm_layer(dim),
366
+ Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
367
+ drop_path
368
+ )
369
+
370
+ def forward(self, x, size):
371
+ if self.conv1:
372
+ x, size = self.conv1(x, size)
373
+ x, size = self.window_attn(x, size)
374
+
375
+ if self.conv2:
376
+ x, size = self.conv2(x, size)
377
+ x, size = self.ffn(x, size)
378
+ return x, size
379
+
380
+
381
+ class DaViT(nn.Module):
382
+ """ DaViT: Dual-Attention Transformer
383
+
384
+ Args:
385
+ in_chans (int): Number of input image channels. Default: 3.
386
+ num_classes (int): Number of classes for classification head. Default: 1000.
387
+ patch_size (tuple(int)): Patch size of convolution in different stages. Default: (7, 2, 2, 2).
388
+ patch_stride (tuple(int)): Patch stride of convolution in different stages. Default: (4, 2, 2, 2).
389
+ patch_padding (tuple(int)): Patch padding of convolution in different stages. Default: (3, 0, 0, 0).
390
+ patch_prenorm (tuple(bool)): If True, perform norm before the convolution layer. Default: (False, False, False, False).
391
+ embed_dims (tuple(int)): Patch embedding dimension in different stages. Default: (64, 128, 192, 256).
392
+ num_heads (tuple(int)): Number of spatial attention heads in different stages. Default: (3, 6, 12, 24).
393
+ num_groups (tuple(int)): Number of channel groups in different stages. Default: (3, 6, 12, 24).
394
+ window_size (int): Window size. Default: 7.
395
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
396
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True.
397
+ drop_path_rate (float): Stochastic depth rate. Default: 0.1.
398
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
399
+ enable_checkpoint (bool): If True, enable checkpointing. Default: False.
400
+ conv_at_attn (bool): If True, perform a depthwise convolution before the attention layer. Default: True.
401
+ conv_at_ffn (bool): If True, perform a depthwise convolution before the FFN layer. Default: True.
402
+ """
403
+
404
+ def __init__(
405
+ self,
406
+ in_chans=3,
407
+ num_classes=1000,
408
+ depths=(1, 1, 3, 1),
409
+ patch_size=(7, 2, 2, 2),
410
+ patch_stride=(4, 2, 2, 2),
411
+ patch_padding=(3, 0, 0, 0),
412
+ patch_prenorm=(False, False, False, False),
413
+ embed_dims=(64, 128, 192, 256),
414
+ num_heads=(3, 6, 12, 24),
415
+ num_groups=(3, 6, 12, 24),
416
+ window_size=7,
417
+ mlp_ratio=4.,
418
+ qkv_bias=True,
419
+ drop_path_rate=0.1,
420
+ norm_layer=nn.LayerNorm,
421
+ enable_checkpoint=False,
422
+ conv_at_attn=True,
423
+ conv_at_ffn=True,
424
+ ):
425
+ super().__init__()
426
+
427
+ self.num_classes = num_classes
428
+ self.embed_dims = embed_dims
429
+ self.num_heads = num_heads
430
+ self.num_groups = num_groups
431
+ self.num_stages = len(self.embed_dims)
432
+ self.enable_checkpoint = enable_checkpoint
433
+ assert self.num_stages == len(self.num_heads) == len(self.num_groups)
434
+
435
+ num_stages = len(embed_dims)
436
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)*2)]
437
+
438
+ depth_offset = 0
439
+ convs = []
440
+ blocks = []
441
+ for i in range(num_stages):
442
+ conv_embed = ConvEmbed(
443
+ patch_size=patch_size[i],
444
+ stride=patch_stride[i],
445
+ padding=patch_padding[i],
446
+ in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
447
+ embed_dim=self.embed_dims[i],
448
+ norm_layer=norm_layer,
449
+ pre_norm=patch_prenorm[i]
450
+ )
451
+ convs.append(conv_embed)
452
+
453
+ block = MySequential(
454
+ *[
455
+ MySequential(OrderedDict([
456
+ (
457
+ 'spatial_block', SpatialBlock(
458
+ embed_dims[i],
459
+ num_heads[i],
460
+ window_size,
461
+ drop_path_rate=dpr[depth_offset+j*2],
462
+ qkv_bias=qkv_bias,
463
+ mlp_ratio=mlp_ratio,
464
+ conv_at_attn=conv_at_attn,
465
+ conv_at_ffn=conv_at_ffn,
466
+ )
467
+ ),
468
+ (
469
+ 'channel_block', ChannelBlock(
470
+ embed_dims[i],
471
+ num_groups[i],
472
+ drop_path_rate=dpr[depth_offset+j*2+1],
473
+ qkv_bias=qkv_bias,
474
+ mlp_ratio=mlp_ratio,
475
+ conv_at_attn=conv_at_attn,
476
+ conv_at_ffn=conv_at_ffn,
477
+ )
478
+ )
479
+ ])) for j in range(depths[i])
480
+ ]
481
+ )
482
+ blocks.append(block)
483
+ depth_offset += depths[i]*2
484
+
485
+ self.convs = nn.ModuleList(convs)
486
+ self.blocks = nn.ModuleList(blocks)
487
+
488
+ self.norms = norm_layer(self.embed_dims[-1])
489
+ self.avgpool = nn.AdaptiveAvgPool1d(1)
490
+ self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
491
+
492
+ self.apply(self._init_weights)
493
+
494
+ @property
495
+ def dim_out(self):
496
+ return self.embed_dims[-1]
497
+
498
+ def _init_weights(self, m):
499
+ if isinstance(m, nn.Linear):
500
+ trunc_normal_(m.weight, std=0.02)
501
+ if m.bias is not None:
502
+ nn.init.constant_(m.bias, 0)
503
+ elif isinstance(m, nn.Conv2d):
504
+ nn.init.normal_(m.weight, std=0.02)
505
+ for name, _ in m.named_parameters():
506
+ if name in ['bias']:
507
+ nn.init.constant_(m.bias, 0)
508
+ elif isinstance(m, nn.LayerNorm):
509
+ nn.init.constant_(m.weight, 1.0)
510
+ nn.init.constant_(m.bias, 0)
511
+ elif isinstance(m, nn.BatchNorm2d):
512
+ nn.init.constant_(m.weight, 1.0)
513
+ nn.init.constant_(m.bias, 0)
514
+
515
+ def forward_features_unpool(self, x):
516
+ """
517
+ forward until avg pooling
518
+ Args:
519
+ x (torch.Tensor): input image tensor
520
+ """
521
+ input_size = (x.size(2), x.size(3))
522
+ for conv, block in zip(self.convs, self.blocks):
523
+ x, input_size = conv(x, input_size)
524
+ if self.enable_checkpoint:
525
+ x, input_size = checkpoint.checkpoint(block, x, input_size)
526
+ else:
527
+ x, input_size = block(x, input_size)
528
+ return x
529
+
530
+ def forward_features(self, x):
531
+ x = self.forward_features_unpool(x)
532
+
533
+ # (batch_size, num_tokens, token_dim)
534
+ x = self.avgpool(x.transpose(1, 2))
535
+ # (batch_size, 1, num_tokens)
536
+ x = torch.flatten(x, 1)
537
+ x = self.norms(x)
538
+
539
+ return x
540
+
541
+ def forward(self, x):
542
+ x = self.forward_features(x)
543
+ x = self.head(x)
544
+ return x
545
+
546
+ @classmethod
547
+ def from_config(cls, config):
548
+ return cls(
549
+ depths=config.depths,
550
+ embed_dims=config.dim_embed,
551
+ num_heads=config.num_heads,
552
+ num_groups=config.num_groups,
553
+ patch_size=config.patch_size,
554
+ patch_stride=config.patch_stride,
555
+ patch_padding=config.patch_padding,
556
+ patch_prenorm=config.patch_prenorm,
557
+ drop_path_rate=config.drop_path_rate,
558
+ window_size=config.window_size,
559
+ )
560
+
561
+
562
+
563
+
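A minimal sketch of how the DaViT tower above behaves with its default hyper-parameters (illustrative only, not part of the committed file): the patch strides (4, 2, 2, 2) downsample a 224x224 input by 32, so forward_features_unpool yields 7*7 = 49 tokens of width embed_dims[-1] = 256.

import torch

davit = DaViT()                                  # default configuration
pixels = torch.randn(2, 3, 224, 224)             # (B, C, H, W)
tokens = davit.forward_features_unpool(pixels)   # -> (2, 49, 256), the pre-pooling vision features
logits = davit(pixels)                           # -> (2, 1000) after average pooling and the classification head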
564
+ _CONFIG_FOR_DOC = "FeynModelConfig"
565
+
566
+ FEYNMODEL_START_DOCSTRING = r"""
567
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
568
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
569
+ etc.)
570
+
571
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
572
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
573
+ and behavior.
574
+
575
+ Parameters:
576
+ config ([`FeynModelConfig`]):
577
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
578
+ load the weights associated with the model, only the configuration. Check out the
579
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
580
+ """
581
+ FEYNMODEL_INPUTS_DOCSTRING = r"""
582
+ Args:
583
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
584
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
585
+ it.
586
+
587
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
588
+ [`PreTrainedTokenizer.__call__`] for details.
589
+
590
+ [What are input IDs?](../glossary#input-ids)
591
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
592
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
593
+
594
+ - 1 for tokens that are **not masked**,
595
+ - 0 for tokens that are **masked**.
596
+
597
+ [What are attention masks?](../glossary#attention-mask)
598
+
599
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
600
+ [`PreTrainedTokenizer.__call__`] for details.
601
+
602
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
603
+ `past_key_values`).
604
+
605
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
606
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
607
+ information on the default strategy.
608
+
609
+ - 1 indicates the head is **not masked**,
610
+ - 0 indicates the head is **masked**.
611
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
612
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
613
+ config.n_positions - 1]`.
614
+
615
+ [What are position IDs?](../glossary#position-ids)
616
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
617
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
618
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
619
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
620
+
621
+ Two formats are allowed:
622
+ - a [`~cache_utils.Cache`] instance;
623
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
624
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
625
+ cache format.
626
+
627
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
628
+ legacy cache format will be returned.
629
+
630
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
631
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
632
+ of shape `(batch_size, sequence_length)`.
633
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
634
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
635
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
636
+ model's internal embedding lookup matrix.
637
+ use_cache (`bool`, *optional*):
638
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
639
+ `past_key_values`).
640
+ output_attentions (`bool`, *optional*):
641
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
642
+ tensors for more detail.
643
+ output_hidden_states (`bool`, *optional*):
644
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
645
+ more detail.
646
+ return_dict (`bool`, *optional*):
647
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
648
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
649
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
650
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
651
+ the complete sequence length.
652
+ """
653
+
654
+ # Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
655
+ def _prepare_4d_causal_attention_mask_with_cache_position(
656
+ attention_mask: torch.Tensor,
657
+ sequence_length: int,
658
+ target_length: int,
659
+ dtype: torch.dtype,
660
+ device: torch.device,
661
+ min_dtype: float,
662
+ cache_position: torch.Tensor,
663
+ batch_size: int,
664
+ ):
665
+
666
+ #print(f" +++++++++ prepare 4K +++++++++++++++ rec {attention_mask.size()} sequence_length {sequence_length}")
667
+ if attention_mask is not None and attention_mask.dim() == 4:
668
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
669
+ #print("+++++++++++++++++ return it")
670
+ #causal_mask = attention_mask
671
+ # In this case we assume that the mask comes already in inverted form.
672
+ causal_mask = attention_mask[:, :, -sequence_length:, :]
673
+ #print(f"+++++++++++++++++ truncated causal_mask to last {sequence_length} elements, size: {causal_mask.size()}")
674
+ #print(f"+++++++++++++++++ return it causal_mask {causal_mask.size()} !!!!!!!!! attention_mask {attention_mask.size()}")
675
+ else:
676
+ #print("+++++++++++++++++++++ else +++++++++++++++++")
677
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
678
+ #print(f"++++++++++++++++ causal_mask {causal_mask.size()} ++++++++++++++++++ sequence_length = {sequence_length} ")
679
+ if sequence_length != 1:
680
+ causal_mask = torch.triu(causal_mask, diagonal=1)
681
+ #print(f"++++++++++++++++++ causal_mask = torch.triu ++++++++++ {causal_mask.size()} ")
682
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
683
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
684
+ #print(f"+++++++++++++++++++++ avant if attention_mask is not None:, causal_mask={causal_mask.size()}")
685
+ if attention_mask is not None:
686
+ #print(" +++++++++++++ attention_mask is None++++++++++++")
687
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
688
+ mask_length = attention_mask.shape[-1]
689
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
690
+ padding_mask = padding_mask == 0
691
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
692
+ padding_mask, min_dtype
693
+ )
694
+ #print(f"+++++++++++++++++++ 4K returning causal_mask {causal_mask.size()} +++++++++++++++++++")
695
+
696
+ return causal_mask
697
+
698
+ class LearnedAbsolutePositionEmbedding2D(nn.Module):
699
+ """
700
+ This module learns positional embeddings up to a fixed maximum size.
701
+ """
702
+
703
+ def __init__(self, embedding_dim=256, num_pos=50):
704
+ super().__init__()
705
+ self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
706
+ self.column_embeddings = nn.Embedding(num_pos, embedding_dim - (embedding_dim // 2))
707
+
708
+ def forward(self, pixel_values):
709
+ """
710
+ pixel_values: (batch_size, height, width, num_channels)
711
+ returns: (batch_size, height, width, embedding_dim * 2)
712
+ """
713
+ if len(pixel_values.shape) != 4:
714
+ raise ValueError('pixel_values must be a 4D tensor')
715
+ height, width = pixel_values.shape[1:3]
716
+ width_values = torch.arange(width, device=pixel_values.device)
717
+ height_values = torch.arange(height, device=pixel_values.device)
718
+ x_emb = self.column_embeddings(width_values)
719
+ y_emb = self.row_embeddings(height_values)
720
+ # (height, width, embedding_dim * 2)
721
+ pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
722
+ # (embedding_dim * 2, height, width)
723
+ pos = pos.permute(2, 0, 1)
724
+ pos = pos.unsqueeze(0)
725
+ # (batch_size, embedding_dim * 2, height, width)
726
+ pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
727
+ # (batch_size, height, width, embedding_dim * 2)
728
+ pos = pos.permute(0, 2, 3, 1)
729
+ return pos
730
+
731
+ class PositionalEmbeddingCosine1D(nn.Module):
732
+ """
733
+ This class implements a very simple positional encoding. It follows closely
734
+ the encoder from the link below:
735
+ https://pytorch.org/tutorials/beginner/translation_transformer.html
736
+ Args:
737
+ embed_dim: The dimension of the embeddings.
738
+ dropout_prob: The dropout probability.
739
+ max_seq_len: The maximum length to precompute the positional encodings.
740
+ """
741
+ def __init__(
742
+ self,
743
+ embed_dim: int = 512,
744
+ max_seq_len: int = 1024) -> None:
745
+ super(PositionalEmbeddingCosine1D, self).__init__()
746
+ self.embed_dim = embed_dim
747
+ self.max_seq_len = max_seq_len
748
+ # Generate the sinusoidal arrays.
749
+ factor = math.log(10000)
750
+ denominator = torch.exp(
751
+ -factor * torch.arange(0, self.embed_dim, 2) / self.embed_dim)
752
+ # Matrix where rows correspond to a positional embedding as a function
753
+ # of the position index (i.e., the row index).
754
+ frequencies = \
755
+ torch.arange(0, self.max_seq_len) \
756
+ .reshape(self.max_seq_len, 1) * denominator
757
+ pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
758
+ # Populate uneven entries.
759
+ pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
760
+ pos_idx_to_embed[:, 1::2] = torch.cos(frequencies)
761
+ # Save the positional embeddings in a constant buffer.
762
+ self.register_buffer("pos_idx_to_embed", pos_idx_to_embed)
763
+
764
+ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
765
+ """
766
+ Args:
767
+ seq_embeds: The sequence embeddings in order. Allowed size:
768
+ 1. [T, D], where T is the length of the sequence, and D is the
769
+ frame embedding dimension.
770
+ 2. [B, T, D], where B is the batch size and T and D are the
771
+ same as above.
772
+ Returns a tensor with the same dimensions as the input: i.e.,
773
+ [1, T, D] or [T, D].
774
+ """
775
+ shape_len = len(seq_embeds.shape)
776
+ assert 2 <= shape_len <= 3
777
+ len_seq = seq_embeds.size(-2)
778
+ assert len_seq <= self.max_seq_len
779
+ pos_embeds = self.pos_idx_to_embed[0:seq_embeds.size(-2), :]
780
+ # Adapt pre-computed positional embeddings to the input.
781
+ if shape_len == 3:
782
+ pos_embeds = pos_embeds.view(
783
+ (1, pos_embeds.size(0), pos_embeds.size(1)))
784
+ return pos_embeds
785
+
786
+
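A quick sanity check for PositionalEmbeddingCosine1D (illustrative only, not part of the committed file): even feature indices carry the sine terms, odd indices the cosine terms, and the result is broadcastable over the batch.

import torch

pe = PositionalEmbeddingCosine1D(embed_dim=512, max_seq_len=1024)
seq = torch.randn(4, 10, 512)                    # (B, T, D)
pos = pe(seq)                                    # -> (1, 10, 512), added to seq by the caller
assert pos.shape == (1, 10, 512)
assert torch.allclose(pos[0, 0, 0::2], torch.zeros(256))  # sin(0) = 0 at position 0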
787
+ class LearnedAbsolutePositionEmbedding1D(nn.Module):
788
+ """
789
+ Learnable absolute positional embeddings for 1D sequences.
790
+ Args:
791
+ embedding_dim: The dimension of the embeddings.
792
+ num_pos: The maximum number of positions to embed.
793
+ """
794
+ def __init__(
795
+ self,
796
+ embedding_dim: int = 512,
797
+ num_pos: int = 1024) -> None:
798
+ super(LearnedAbsolutePositionEmbedding1D, self).__init__()
799
+ self.embeddings = nn.Embedding(num_pos, embedding_dim)
800
+ self.num_pos = num_pos
801
+
802
+ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
803
+ """
804
+ Args:
805
+ seq_embeds: The sequence embeddings in order. Allowed size:
806
+ 1. [T, D], where T is the length of the sequence, and D is the
807
+ frame embedding dimension.
808
+ 2. [B, T, D], where B is the batch size and T and D are the
809
+ same as above.
810
+ Returns a tensor with the same dimensions as the input: i.e.,
811
+ [1, T, D] or [T, D].
812
+ """
813
+ shape_len = len(seq_embeds.shape)
814
+ assert 2 <= shape_len <= 3
815
+ len_seq = seq_embeds.size(-2)
816
+ assert len_seq <= self.num_pos
817
+ # [T, D]
818
+ pos_embeds = self.embeddings(torch.arange(len_seq).to(seq_embeds.device))
819
+ # Adapt pre-computed positional embeddings to the input.
820
+ if shape_len == 3:
821
+ pos_embeds = pos_embeds.view(
822
+ (1, pos_embeds.size(0), pos_embeds.size(1)))
823
+ return pos_embeds
824
+
825
+ def create_git_attention_mask(
826
+ tgt: torch.Tensor,
827
+ memory: torch.Tensor,
828
+ max_length: int
829
+ ) -> torch.Tensor:
830
+ # Obtain the dimensions of the target text and memory
831
+ batch_size = tgt.size(0)
832
+ num_tgt = tgt.shape[1]
833
+ num_memory = memory.shape[1]
834
+ total_length = num_memory + num_tgt
835
+
836
+ # Create the top left part of the attention matrix
837
+ top_left = torch.zeros((num_memory, num_memory)) # Attention enabled in this region
838
+ top_right = torch.full((num_memory, num_tgt), float(-3.4028e+38)) # Attention disabled here
839
+
840
+ # Bottom left part of the attention matrix
841
+ bottom_left = torch.zeros((num_tgt, num_memory)) # Attention enabled here
842
+
843
+ # Create a lower triangular matrix for the bottom right part
844
+ bottom_right = torch.tril(torch.ones(num_tgt, num_tgt))
845
+
846
+ # Transform 1s to 0 to enable attention, and 0s to -inf to block attention
847
+ bottom_right = bottom_right.masked_fill(bottom_right == 0, float(-3.4028e+38))
848
+ bottom_right = bottom_right.masked_fill(bottom_right == 1, float(0))
849
+
850
+ # Concatenate matrices to form the full mask
851
+ left = torch.cat((top_left, bottom_left), dim=0)
852
+ right = torch.cat((top_right, bottom_right), dim=0)
853
+
854
+ # Combine left and right parts
855
+ full_attention_mask = torch.cat((left, right), dim=1)
856
+
857
+ # Add padding to reach max_length
858
+ padding = torch.full((total_length, max_length - total_length), float(-3.4028e+38))
859
+ full_attention_mask = torch.cat((full_attention_mask, padding), dim=1)
860
+
861
+ # Add an axis for multi-heads and batch_size
862
+ full_attention_mask = full_attention_mask[None, None, :, :]
863
+
864
+ # Expand the mask to have shape (batch_size, 1, seq_length, max_length)
865
+ full_attention_mask = full_attention_mask.expand(batch_size, 1, full_attention_mask.size(-2), full_attention_mask.size(-1))
866
+
867
+ return full_attention_mask
868
+
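A tiny worked example of the GIT-style mask built by create_git_attention_mask (illustrative only, not part of the committed file): with 2 image tokens, 3 text tokens and max_length=8, image tokens attend only to each other, text tokens attend to all image tokens plus earlier text, and the padding columns stay blocked.

import torch

tgt = torch.zeros(1, 3, dtype=torch.long)        # stands in for input_ids (3 text tokens)
memory = torch.zeros(1, 2, 16)                   # stands in for image features (2 tokens)
mask = create_git_attention_mask(tgt, memory, max_length=8)
print(mask.shape)                                # torch.Size([1, 1, 5, 8])
print((mask[0, 0] == 0).int())                   # 1 where attention is allowed, 0 where blocked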
869
+ def get_position_ids_from_binary_attention_mask(mask):
870
+ """
871
+ Extract position IDs from a binary attention mask.
872
+
873
+ Args:
874
+ mask (torch.Tensor): The attention mask tensor of shape (1, 1, seq_len, seq_len),
875
+ where 1 indicates allowed attention and 0 indicates blocked attention.
876
+
877
+ Returns:
878
+ torch.Tensor: Position IDs of shape (1, seq_len), one index per token position.
879
+ """
880
+ # Assuming the mask is of shape (1, 1, seq_len, seq_len)
881
+ _, _, seq_len, _ = mask.shape
882
+
883
+ # Create a tensor with position IDs from 0 to seq_len - 1
884
+ position_ids = torch.arange(seq_len, dtype=torch.long, device=mask.device)
885
+
886
+ # Add a batch dimension
887
+ position_ids = position_ids.unsqueeze(0)
888
+
889
+ return position_ids
890
+
891
+ def ensure_tensor(variable):
892
+ # Check if the variable is a torch.Tensor
893
+ if isinstance(variable, torch.Tensor):
894
+ # print("Variable is already a tensor.")
895
+ return variable
896
+ else:
897
+ #print("Variable is not a tensor, converting...")
898
+ try:
899
+ # Convert the variable to a tensor
900
+ tensor = torch.tensor(variable)
901
+ #print("Conversion successful.")
902
+ return tensor
903
+ except Exception as e:
904
+ print(f"Error converting to tensor: {e}")
905
+ raise
906
+
907
+ @add_start_docstrings(
908
+ "The bare Model outputting raw hidden-states without any specific head on top.",
909
+ FEYNMODEL_START_DOCSTRING,
910
+ )
911
+ class FeynModel(Gemma2Model):
912
+ """
913
+ Transformer decoder consisting of *config.num_hidden_layers* layers.
914
+ Each layer is a [`FeynModelDecoderLayer`] + [`LoraLayer`] for the *proj* modules.
915
+ NB: LoraLayers are added and activated on the proj modules only if pixel_values is not None.
916
+
917
+ Args:
918
+ config: FeynModelConfig
919
+ """
920
+
921
+ def __init__(self, config: FeynModelConfig):
922
+ super().__init__(config)
923
+ # Initialize weights and apply final processing
924
+ self.mode='llm'
925
+ '''
926
+ self.image_patch_tokens = int(
927
+ (config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1
928
+ )
929
+
930
+ if config.num_image_with_embedding is not None:
931
+ self.image_patch_tokens *= config.num_image_with_embedding
932
+ '''
933
+ self.image_patch_tokens = 577
934
+ self.post_init()
935
+
936
+ def get_input_embeddings(self):
937
+ return self.embed_tokens
938
+
939
+ def set_input_embeddings(self, value):
940
+ self.embed_tokens = value
941
+
942
+
943
+
944
+
945
+ @add_start_docstrings_to_model_forward(FEYNMODEL_INPUTS_DOCSTRING)
946
+ def forward(
947
+ self,
948
+ input_ids: torch.LongTensor = None,
949
+ attention_mask: Optional[torch.Tensor] = None,
950
+ position_ids: Optional[torch.LongTensor] = None,
951
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
952
+ inputs_embeds: Optional[torch.FloatTensor] = None,
953
+ use_cache: Optional[bool] = None,
954
+ output_attentions: Optional[bool] = None,
955
+ output_hidden_states: Optional[bool] = None,
956
+ return_dict: Optional[bool] = None,
957
+ cache_position: Optional[torch.LongTensor] = None,
958
+ causal_attention_mask: Optional[torch.Tensor] = None,
959
+ **kwargs,
960
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
961
+
962
+ # print(f" self.mode = {self.mode}")
963
+ # Ensure cache_position is initialized if not provided
964
+
965
+
966
+ if cache_position is None:
967
+ batch_size = input_ids.size(0) if input_ids is not None else inputs_embeds.size(0)
968
+ cache_position = torch.zeros((batch_size,), dtype=torch.long, device=input_ids.device if input_ids is not None else inputs_embeds.device)
969
+
970
+
971
+
972
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
973
+ output_hidden_states = (
974
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
975
+ )
976
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
977
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
978
+
979
+ if (input_ids is None) ^ (inputs_embeds is not None):
980
+ raise ValueError(
981
+ "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
982
+ )
983
+
984
+ if self.gradient_checkpointing and self.training and use_cache:
985
+ logger.warning_once(
986
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
987
+ )
988
+ use_cache = False
989
+
990
+ if inputs_embeds is None:
991
+ inputs_embeds = self.embed_tokens(input_ids)
992
+ causal_mask = self._update_causal_mask(
993
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
994
+ )
995
+ else:
996
+ causal_mask = ensure_tensor(causal_attention_mask)
997
+ position_ids = get_position_ids_from_binary_attention_mask(attention_mask)
998
+
999
+ #print(f" causal_mask = {causal_mask} ")
1000
+
1001
+ if cache_position is None:
1002
+ cache_position = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device)
1003
+
1004
+ if position_ids is None :
1005
+ position_ids = cache_position.unsqueeze(0)
1006
+
1007
+
1008
+
1009
+ # Convert position_ids to a tensor if not already
1010
+ if not isinstance(position_ids, torch.Tensor):
1011
+
1012
+ position_ids = torch.tensor(position_ids, dtype=torch.long, device=inputs_embeds.device)
1013
+
1014
+
1015
+ # embed positions
1016
+ hidden_states = inputs_embeds
1017
+
1018
+ # normalized
1019
+ # FeynModel downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
1020
+ # See https://github.com/huggingface/transformers/pull/29402
1021
+ normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
1022
+ hidden_states = hidden_states * normalizer
1023
+
1024
+ all_hidden_states = () if output_hidden_states else None
1025
+ all_self_attns = () if output_attentions else None
1026
+
1027
+ for decoder_layer in self.layers:
1028
+ if output_hidden_states:
1029
+ all_hidden_states += (hidden_states,)
1030
+
1031
+ if self.gradient_checkpointing and self.training:
1032
+ layer_outputs = self._gradient_checkpointing_func(
1033
+ decoder_layer.__call__,
1034
+ hidden_states,
1035
+ causal_mask,
1036
+ position_ids,
1037
+ past_key_values,
1038
+ output_attentions,
1039
+ use_cache,
1040
+ cache_position,
1041
+ )
1042
+ else:
1043
+ layer_outputs = decoder_layer(
1044
+ hidden_states,
1045
+ attention_mask=causal_mask,
1046
+ position_ids=position_ids,
1047
+ past_key_value=past_key_values,
1048
+ output_attentions=output_attentions,
1049
+ use_cache=use_cache,
1050
+ cache_position=cache_position,
1051
+ )
1052
+
1053
+ hidden_states = layer_outputs[0]
1054
+
1055
+ if output_attentions:
1056
+ all_self_attns += (layer_outputs[1],)
1057
+
1058
+ hidden_states = self.norm(hidden_states)
1059
+
1060
+ # add hidden states from the last decoder layer
1061
+ if output_hidden_states:
1062
+ all_hidden_states += (hidden_states,)
1063
+
1064
+ next_cache = past_key_values if use_cache else None
1065
+
1066
+ if not return_dict:
1067
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1068
+ return BaseModelOutputWithPast(
1069
+ last_hidden_state=hidden_states,
1070
+ past_key_values=next_cache,
1071
+ hidden_states=all_hidden_states,
1072
+ attentions=all_self_attns,
1073
+ )
1074
+
1075
+
1076
+
1077
+ def _update_causal_mask(
1078
+ self,
1079
+ attention_mask: torch.Tensor,
1080
+ input_tensor: torch.Tensor,
1081
+ cache_position: torch.Tensor,
1082
+ past_key_values: Cache,
1083
+ output_attentions: bool,
1084
+ ):
1085
+
1086
+ # print(f" _start _____ _update_causal_mask attention_mask {attention_mask.size()} {attention_mask} ")
1087
+ # Flash Attention currently doesn't support static cache, but FeynModel works only with a static cache.
1089
+ # So we will pass in the attention mask as is in any case, not only when there's padding. Then we'll use its shape
1090
+ # to cut out the trailing 0s that the static cache appends to keys/values. This workaround should be compile compatible
1090
+ # as it doesn't cause dynamic control issues.
1091
+ if self.config._attn_implementation == "flash_attention_2":
1092
+ return attention_mask
1093
+
1094
+ dtype, device = input_tensor.dtype, input_tensor.device
1095
+ min_dtype = torch.finfo(dtype).min
1096
+ sequence_length = input_tensor.shape[1]
1097
+ if isinstance(past_key_values, HybridCache):
1098
+ target_length = past_key_values.get_max_length()
1099
+ else:
1100
+ target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1]
1101
+
1102
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
1103
+ causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
1104
+ attention_mask,
1105
+ sequence_length=sequence_length,
1106
+ target_length=target_length,
1107
+ dtype=dtype,
1108
+ device=device,
1109
+ min_dtype=min_dtype,
1110
+ cache_position=cache_position,
1111
+ batch_size=input_tensor.shape[0],
1112
+ )
1113
+ #print(f" _end ______ _update_causal_mask causal_mask {causal_mask.size()} {causal_mask} ")
1114
+ return causal_mask
1115
+
1116
+
1117
+
1118
+ class FeynModelForCausalLM(Gemma2ForCausalLM):
1119
+ _tied_weights_keys = ["lm_head.weight"]
1120
+ config_class = FeynModelConfig
1121
+ def __init__(self, config):
1122
+ super().__init__(config)
1123
+ config.vision_config=Florence2VisionConfig.from_dict(config.vision_config)
1124
+ self.model = FeynModel(config)
1125
+
1126
+ # assert config.vision_config.model_type== 'davit', 'only DaViT is supported for now'
1127
+ self.vision_tower = DaViT.from_config(config=config.vision_config)
1128
+ self._build_image_projection_layers(config)
1129
+
1130
+ self.__causal_attention_mask = None
1131
+
1132
+ # Initialize weights and apply final processing
1133
+ self.post_init()
1134
+
1135
+ ################ Vision Tower ########################
1136
+ def _build_image_projection_layers(self, config):
1137
+ image_dim_out = config.vision_config.dim_embed[-1]
1138
+ dim_projection = config.vision_config.projection_dim
1139
+ self.image_projection = nn.Parameter(
1140
+ torch.empty(image_dim_out, dim_projection)
1141
+ )
1142
+ self.image_proj_norm = nn.LayerNorm(dim_projection)
1143
+ image_pos_embed_config = config.vision_config.image_pos_embed
1144
+ if image_pos_embed_config['type'] == 'learned_abs_2d':
1145
+ self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
1146
+ embedding_dim=image_dim_out,
1147
+ num_pos=image_pos_embed_config['max_pos_embeddings']
1148
+ )
1149
+ else:
1150
+ raise NotImplementedError('Not implemented yet')
1151
+
1152
+ self.image_feature_source = config.vision_config.image_feature_source
1153
+
1154
+ # temporal embedding
1155
+ visual_temporal_embedding_config = config.vision_config.visual_temporal_embedding
1156
+ if visual_temporal_embedding_config['type'] == 'COSINE':
1157
+ self.visual_temporal_embed = PositionalEmbeddingCosine1D(
1158
+ embed_dim=image_dim_out,
1159
+ max_seq_len=visual_temporal_embedding_config['max_temporal_embeddings']
1160
+ )
1161
+ else:
1162
+ raise NotImplementedError('Not implemented yet')
1163
+
1164
+
1165
+
1166
+ def _merge_input_ids_with_image_features(self, image_features, inputs_embeds):
1167
+ batch_size, image_token_length = image_features.size()[:-1]
1168
+ device = image_features.device
1169
+ image_attention_mask = torch.ones(batch_size, image_token_length, device=device)
1170
+
1171
+ if inputs_embeds is None:
1172
+ return image_features, image_attention_mask
1173
+
1174
+ task_prefix_embeds = inputs_embeds
1175
+ task_prefix_attention_mask = torch.ones(batch_size, task_prefix_embeds.size(1), device=device)
1176
+
1177
+ # Ensure the attention masks are two-dimensional
1178
+ if len(task_prefix_attention_mask.shape) == 3:
1179
+ task_prefix_attention_mask = task_prefix_attention_mask.squeeze(1)
1180
+
1181
+ # Check the batch dimension and adjust if necessary
1182
+ if image_features.size(0) != task_prefix_embeds.size(0):
1183
+ raise ValueError("Batch sizes of image_features and task_prefix_embeds do not match")
1184
+
1185
+ # Add a dummy dimension if the dimensions are not aligned
1186
+ if image_features.dim() < task_prefix_embeds.dim():
1187
+ image_features = image_features.unsqueeze(-1)
1188
+ elif task_prefix_embeds.dim() < image_features.dim():
1189
+ task_prefix_embeds = task_prefix_embeds.unsqueeze(-1)
1190
+
1191
+ # Ensure that all dimensions except dim=1 are identical
1192
+ if image_features.size(2) != task_prefix_embeds.size(2):
1193
+ # Adjust or raise an error if the inner dimensions are not compatible
1194
+ raise ValueError("Internal dimensions of image_features and task_prefix_embeds do not match")
1195
+
1196
+ inputs_embeds = torch.cat([image_features, task_prefix_embeds], dim=1)
1197
+ attention_mask = torch.cat([image_attention_mask, task_prefix_attention_mask], dim=1)
1198
+
1199
+ return inputs_embeds, attention_mask
1200
+
1201
+ def _encode_image(self, pixel_values):
1202
+ if len(pixel_values.shape) == 4:
1203
+ batch_size, C, H, W = pixel_values.shape
1204
+ T = 1
1205
+ x = self.vision_tower.forward_features_unpool(pixel_values)
1206
+ else:
1207
+ # Add a batch dimension at the front if 'pixel_values' only has 3 dimensions (C, H, W)
1208
+ pixel_values = pixel_values.unsqueeze(0) # add a batch dimension
1209
+ batch_size, C, H, W = pixel_values.shape
1210
+ T = 1
1211
+ x = self.vision_tower.forward_features_unpool(pixel_values)
1212
+
1213
+ if self.image_pos_embed is not None:
1214
+ x = x.view(batch_size * T, -1, x.shape[-1])
1215
+ num_tokens = x.shape[-2]
1216
+ h, w = int(num_tokens ** 0.5), int(num_tokens ** 0.5)
1217
+ assert h * w == num_tokens, 'only support square feature maps for now'
1218
+ x = x.view(batch_size * T, h, w, x.shape[-1])
1219
+ pos_embed = self.image_pos_embed(x)
1220
+ x = x + pos_embed
1221
+ x = x.view(batch_size, T * h*w, x.shape[-1])
1222
+
1223
+ if self.visual_temporal_embed is not None:
1224
+ visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
1225
+ x = x.view(batch_size, T, -1, x.shape[-1]) + visual_temporal_embed.view(1, T, 1, x.shape[-1])
1226
+
1227
+ x_feat_dict = {}
1228
+
1229
+ spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
1230
+ x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
1231
+
1232
+ temporal_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=1)
1233
+ x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
1234
+
1235
+ x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
1236
+ x_feat_dict['last_frame'] = x
1237
+
1238
+ new_x = []
1239
+ for _image_feature_source in self.image_feature_source:
1240
+ if _image_feature_source not in x_feat_dict:
1241
+ raise ValueError('invalid image feature source: {}'.format(_image_feature_source))
1242
+ new_x.append(x_feat_dict[_image_feature_source])
1243
+
1244
+ x = torch.cat(new_x, dim=1)
1245
+
1246
+ x = x @ self.image_projection
1247
+ x = self.image_proj_norm(x)
1248
+
1249
+ return x
1250
+ #######################################################
1251
+
1252
+ def get_input_embeddings(self):
1253
+ return self.model.embed_tokens
1254
+
1255
+ def set_input_embeddings(self, value):
1256
+ self.model.embed_tokens = value
1257
+
1258
+ def get_output_embeddings(self):
1259
+ return self.lm_head
1260
+
1261
+ def set_output_embeddings(self, new_embeddings):
1262
+ self.lm_head = new_embeddings
1263
+
1264
+ def set_decoder(self, decoder):
1265
+ self.model = decoder
1266
+
1267
+ def get_decoder(self):
1268
+ return self.model
1269
+
1270
+ @add_start_docstrings_to_model_forward(FEYNMODEL_INPUTS_DOCSTRING)
1271
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1272
+ def forward(
1273
+ self,
1274
+ input_ids: torch.LongTensor = None,
1275
+ pixel_values: Optional[torch.Tensor] = None,
1276
+ attention_mask: Optional[torch.Tensor] = None,
1277
+ position_ids: Optional[torch.LongTensor] = None,
1278
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1279
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1280
+ labels: Optional[torch.LongTensor] = None,
1281
+ use_cache: Optional[bool] = None,
1282
+ output_attentions: Optional[bool] = None,
1283
+ output_hidden_states: Optional[bool] = None,
1284
+ return_dict: Optional[bool] = None,
1285
+ cache_position: Optional[torch.LongTensor] = None,
1286
+ **kwargs,
1287
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1288
+ r"""
1289
+ Args:
1290
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1291
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1292
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1293
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1294
+
1295
+ Returns:
1296
+
1297
+ Example:
1298
+
1299
+ ```python
1300
+ >>> from transformers import AutoTokenizer, GemmaForCausalLM
1301
+
1302
+ >>> model = GemmaForCausalLM.from_pretrained("google/gemma-2-9b")
1303
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
1304
+
1305
+ >>> prompt = "What is your favorite condiment?"
1306
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1307
+
1308
+ >>> # Generate
1309
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1310
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1311
+ "What is your favorite condiment?"
1312
+ ```"""
1313
+
1314
+
1315
+ if self.training and self.config._attn_implementation != "eager":
1316
+ logger.warning_once(
1317
+ "It is strongly recommended to train FeynModel models with the `eager` attention implementation "
1318
+ f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
1319
+ )
1320
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1321
+ output_hidden_states = (
1322
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1323
+ )
1324
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1325
+
1326
+ if pixel_values is not None:
1327
+ self.model.mode='vlm'
1328
+
1329
+ if input_ids is not None:
1330
+ inputs_embeds = self.get_input_embeddings()(input_ids)
1331
+ image_features = self._encode_image(pixel_values)
1332
+ inputs_embeds, causal_attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds )
1333
+ causal_attention_mask = create_git_attention_mask(tgt=input_ids, memory=image_features,max_length=2048)
1334
+ causal_attention_mask=causal_attention_mask.to(input_ids.device)
1335
+ self.__causal_attention_mask=causal_attention_mask
1336
+
1337
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1338
+ if pixel_values is not None:
1339
+ outputs = self.model(
1340
+ input_ids=None,
1341
+ attention_mask=causal_attention_mask,
1342
+ position_ids=position_ids,
1343
+ past_key_values=past_key_values,
1344
+ inputs_embeds=inputs_embeds,
1345
+ use_cache=use_cache,
1346
+ output_attentions=output_attentions,
1347
+ output_hidden_states=output_hidden_states,
1348
+ return_dict=return_dict,
1349
+ cache_position=cache_position,
1350
+ causal_attention_mask=causal_attention_mask,
1351
+ )
1352
+ else:
1353
+ outputs = self.model(
1354
+ input_ids=input_ids,
1355
+ attention_mask=attention_mask,
1356
+ position_ids=position_ids,
1357
+ past_key_values=past_key_values,
1358
+ inputs_embeds=inputs_embeds,
1359
+ use_cache=use_cache,
1360
+ output_attentions=output_attentions,
1361
+ output_hidden_states=output_hidden_states,
1362
+ return_dict=return_dict,
1363
+ cache_position=cache_position,
1364
+ causal_attention_mask=self.__causal_attention_mask,
1365
+ )
1366
+
1367
+
1368
+ hidden_states = outputs[0]
1369
+ logits = self.lm_head(hidden_states)
1370
+
1371
+ if self.config.final_logit_softcapping is not None:
1372
+ logits = logits / self.config.final_logit_softcapping
1373
+ logits = torch.tanh(logits)
1374
+ logits = logits * self.config.final_logit_softcapping
1375
+
1376
+
1377
+ logits = logits.float()
1378
+ loss = None
1379
+ if labels is not None:
1380
+ # we are doing next-token prediction; shift prediction scores and input ids by one
1381
+ num_image_tokens = self.model.image_patch_tokens
1382
+ shifted_logits = logits[:, num_image_tokens:-1, :].contiguous()
1383
+ labels = labels[:, 1:].contiguous()
1384
+ loss_fct = nn.CrossEntropyLoss()
1385
+ loss = loss_fct(shifted_logits.view(-1, self.config.vocab_size), labels.view(-1))
1386
+
1387
+ if not return_dict:
1388
+
1389
+ output = (logits,) + outputs[1:]
1390
+ return (loss,) + output if loss is not None else output
1391
+
1392
+ return CausalLMOutputWithPast(
1393
+ loss=loss,
1394
+ logits=logits,
1395
+ past_key_values=outputs.past_key_values,
1396
+ hidden_states=outputs.hidden_states,
1397
+ attentions=outputs.attentions,
1398
+ )
1399
+
1400
+ def prepare_inputs_for_generation(
1401
+ self,
1402
+ input_ids,
1403
+ past_key_values=None,
1404
+ attention_mask=None,
1405
+ inputs_embeds=None,
1406
+ cache_position=None,
1407
+ position_ids=None,
1408
+ use_cache=True,
1409
+ **kwargs,
1410
+ ):
1411
+
1412
+
1413
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
1414
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
1415
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
1416
+ if past_key_values is not None:
1417
+ if inputs_embeds is not None: # Exception 1
1418
+ input_ids = input_ids[:, -cache_position.shape[0] :]
1419
+ elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
1420
+ input_ids = input_ids[:, cache_position]
1421
+
1422
+ if attention_mask is not None and position_ids is None:
1423
+ # create position_ids on the fly for batch generation
1424
+ position_ids = attention_mask.long().cumsum(-1) - 1
1425
+ position_ids.masked_fill_(attention_mask == 0, 1)
1426
+ if past_key_values:
1427
+ # print(f"past_key_values present; position_ids size: {position_ids.size()}")
1428
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1429
+ # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
1430
+ # `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride
1431
+ # during the decoding. Here, simply using `.contiguous()` is not sufficient as in the
1432
+ # batch size = 1 case, `position_ids` is already contiguous but with varying stride
1433
+ # which retriggers a capture.
1434
+ position_ids = position_ids.clone(memory_format=torch.contiguous_format)
1435
+ # print(f"position_ids after clone: {position_ids.size()}")
1436
+
1437
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1438
+ if inputs_embeds is not None and cache_position[0] == 0:
1439
+ # print("first generation step: passing inputs_embeds only")
1440
+ model_inputs = {"inputs_embeds": inputs_embeds}
1441
+ else:
1442
+ # The clone here is for the same reason as for `position_ids`.
1443
+ # print("cloning input_ids for model_inputs")
1444
+ model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format)}
1445
+
1446
+ if isinstance(past_key_values, HybridCache) and attention_mask.ndim == 2:
1447
+ if inputs_embeds is not None and input_ids.size(1) != 0:
1448
+ # unpack (batch_size, sequence_length, _) from inputs_embeds.shape
1449
+ batch_size, sequence_length, _ = inputs_embeds.shape
1450
+ device = inputs_embeds.device
1451
+ # print(f"sequence_length taken from inputs_embeds: {sequence_length}")
1452
+ else:
1453
+ batch_size, sequence_length = position_ids.shape
1454
+ device = input_ids.device
1455
+ # print(f"sequence_length taken from position_ids: {sequence_length}")
1456
+
1457
+ dtype = self.lm_head.weight.dtype
1458
+ min_dtype = torch.finfo(dtype).min
1459
+
1460
+ attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
1461
+ attention_mask,
1462
+ sequence_length=sequence_length,
1463
+ target_length=past_key_values.get_max_length(),
1464
+ dtype=dtype,
1465
+ device=device,
1466
+ min_dtype=min_dtype,
1467
+ cache_position=cache_position,
1468
+ batch_size=batch_size,
1469
+ )
1470
+
1471
+
1472
+ model_inputs.update(
1473
+ {
1474
+ "position_ids": position_ids,
1475
+ "cache_position": cache_position,
1476
+ "past_key_values": past_key_values,
1477
+ "use_cache": use_cache,
1478
+ "attention_mask": attention_mask,
1479
+ }
1480
+ )
1481
+ return model_inputs
1482
+
1483
+ def generate(
1484
+ self,
1485
+ input_ids,
1486
+ pixel_values=None,
1487
+ max_length=None,
1488
+ do_sample=True,
1489
+ temperature=0.7,
1490
+ **kwargs
1491
+ ):
1492
+ print("Custom generate() override called")
1493
+
1494
+ if pixel_values is not None:
1495
+ if input_ids is not None:
1496
+ print("embedding input_ids")
1497
+ inputs_embeds = self.get_input_embeddings()(input_ids)
1498
+ print("encoding pixel_values")
1499
+ image_features = self._encode_image(pixel_values)
1500
+ inputs_embeds, causal_attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
1501
+ causal_attention_mask = create_git_attention_mask(tgt=input_ids, memory=image_features, max_length=max_length)
1502
+ causal_attention_mask = causal_attention_mask.to(input_ids.device)
1503
+ self.__causal_attention_mask = causal_attention_mask
1504
+ self.model.mode = 'vlm'
1505
+ result = super().generate(
1506
+ input_ids=None,
1507
+ inputs_embeds=inputs_embeds,
1508
+ max_length=max_length,
1509
+ do_sample=do_sample,
1510
+ temperature=temperature,
1511
+ **kwargs
1512
+ )
1513
+
1514
+ else:
1515
+ print("text-only generation (llm mode)")
1516
+ self.model.mode = 'llm'
1517
+ result = super().generate(
1518
+ input_ids=input_ids,
1519
+ #inputs_embeds=None,
1520
+ max_length=max_length,
1521
+ do_sample=do_sample,
1522
+ temperature=temperature,
1523
+ **kwargs
1524
+ )
1525
+ self.__causal_attention_mask = None
1526
+
1527
+ return result
1528
+
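The overridden `generate` above dispatches on `pixel_values`: if an image is supplied, the text embeddings are merged with the Florence-2 image features under a GIT-style causal mask and the backbone switches to `'vlm'` mode; otherwise it falls back to plain `'llm'` text generation. A minimal usage sketch follows; it assumes the Hub id `Imagroune/feynmodel` (the `_name_or_path` recorded in config.json), that the repo's tokenizer loads via `AutoTokenizer`, and a hypothetical `image_processor` for producing `pixel_values` -- none of these are defined in the file above.

```python
# Minimal sketch -- not part of the repo code. Checkpoint id and preprocessing
# details are assumptions; only the generate() signature comes from the file above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "Imagroune/feynmodel",
    trust_remote_code=True,       # required by the auto_map in config.json
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained("Imagroune/feynmodel")

# Text-only path: no pixel_values, so the override stays in 'llm' mode.
inputs = tokenizer("What is your favorite condiment?", return_tensors="pt")
out = model.generate(input_ids=inputs.input_ids, max_length=64)
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])

# Vision-language path: pixel_values must come from a suitable image processor
# (hypothetical `image_processor` below); the override then builds the GIT-style
# attention mask and switches to 'vlm' mode.
# pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
# out = model.generate(input_ids=inputs.input_ids, pixel_values=pixel_values, max_length=64)
```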
__init__.py ADDED
File without changes
config.json ADDED
@@ -0,0 +1,253 @@
1
+ {
2
+ "architectures": [
3
+ "FeynModelForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "attn_logit_softcapping": 50.0,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_feynmodel.FeynModelConfig",
10
+ "AutoModelForCausalLM": "modeling_feynmodel.FeynModelForCausalLM"
11
+ },
12
+ "cache_implementation": "hybrid",
13
+ "final_logit_softcapping": 30.0,
14
+ "head_dim": 256,
15
+ "hidden_act": "gelu_pytorch_tanh",
16
+ "hidden_activation": "gelu_pytorch_tanh",
17
+ "hidden_size": 2304,
18
+ "ignore_index": -100,
19
+ "init_std": 0.02,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 9216,
22
+ "max_position_embeddings": 8192,
23
+ "model_type": "FeynModel",
24
+ "num_attention_heads": 8,
25
+ "num_hidden_layers": 26,
26
+ "num_key_value_heads": 4,
27
+ "projection_dim": 1024,
28
+ "query_pre_attn_scalar": 256,
29
+ "rms_norm_eps": 1e-06,
30
+ "rope_theta": 10000.0,
31
+ "sliding_window": 4096,
32
+ "text_config": {
33
+ "_name_or_path": "Imagroune/feynmodel",
34
+ "add_cross_attention": false,
35
+ "architectures": [
36
+ "FeynModelForCausalLM"
37
+ ],
38
+ "attention_bias": false,
39
+ "attention_dropout": 0.0,
40
+ "attn_logit_softcapping": 50.0,
41
+ "bad_words_ids": null,
42
+ "begin_suppress_tokens": null,
43
+ "bos_token_id": 2,
44
+ "cache_implementation": "hybrid",
45
+ "chunk_size_feed_forward": 0,
46
+ "cross_attention_hidden_size": null,
47
+ "decoder_start_token_id": null,
48
+ "diversity_penalty": 0.0,
49
+ "do_sample": false,
50
+ "early_stopping": false,
51
+ "encoder_no_repeat_ngram_size": 0,
52
+ "eos_token_id": [
53
+ 1,
54
+ 107
55
+ ],
56
+ "exponential_decay_length_penalty": null,
57
+ "final_logit_softcapping": 30.0,
58
+ "finetuning_task": null,
59
+ "forced_bos_token_id": null,
60
+ "forced_eos_token_id": null,
61
+ "head_dim": 256,
62
+ "hidden_act": "gelu_pytorch_tanh",
63
+ "hidden_activation": "gelu_pytorch_tanh",
64
+ "hidden_size": 2304,
65
+ "id2label": {
66
+ "0": "LABEL_0",
67
+ "1": "LABEL_1"
68
+ },
69
+ "init_std": 0.02,
70
+ "initializer_range": 0.02,
71
+ "intermediate_size": 9216,
72
+ "is_decoder": false,
73
+ "is_encoder_decoder": false,
74
+ "label2id": {
75
+ "LABEL_0": 0,
76
+ "LABEL_1": 1
77
+ },
78
+ "length_penalty": 1.0,
79
+ "max_length": 20,
80
+ "max_position_embeddings": 8192,
81
+ "min_length": 0,
82
+ "model_type": "FeynModel",
83
+ "no_repeat_ngram_size": 0,
84
+ "num_attention_heads": 8,
85
+ "num_beam_groups": 1,
86
+ "num_beams": 1,
87
+ "num_hidden_layers": 26,
88
+ "num_key_value_heads": 4,
89
+ "num_return_sequences": 1,
90
+ "output_attentions": false,
91
+ "output_hidden_states": false,
92
+ "output_scores": false,
93
+ "pad_token_id": 0,
94
+ "prefix": null,
95
+ "problem_type": null,
96
+ "pruned_heads": {},
97
+ "query_pre_attn_scalar": 256,
98
+ "remove_invalid_values": false,
99
+ "repetition_penalty": 1.0,
100
+ "return_dict": true,
101
+ "return_dict_in_generate": false,
102
+ "rms_norm_eps": 1e-06,
103
+ "rope_theta": 10000.0,
104
+ "sep_token_id": null,
105
+ "sliding_window": 4096,
106
+ "suppress_tokens": null,
107
+ "task_specific_params": null,
108
+ "temperature": 1.0,
109
+ "tf_legacy_loss": false,
110
+ "tie_encoder_decoder": false,
111
+ "tie_word_embeddings": true,
112
+ "tokenizer_class": null,
113
+ "top_k": 50,
114
+ "top_p": 1.0,
115
+ "torch_dtype": "float16",
116
+ "torchscript": false,
117
+ "typical_p": 1.0,
118
+ "use_bfloat16": false,
119
+ "use_cache": true,
120
+ "vocab_size": 256000
121
+ },
122
+ "torch_dtype": "float32",
123
+ "transformers_version": "4.44.2",
124
+ "use_cache": true,
125
+ "vision_config": {
126
+ "_name_or_path": "",
127
+ "add_cross_attention": false,
128
+ "architectures": null,
129
+ "bad_words_ids": null,
130
+ "begin_suppress_tokens": null,
131
+ "bos_token_id": null,
132
+ "chunk_size_feed_forward": 0,
133
+ "cross_attention_hidden_size": null,
134
+ "decoder_start_token_id": null,
135
+ "depths": [
136
+ 1,
137
+ 1,
138
+ 9,
139
+ 1
140
+ ],
141
+ "dim_embed": [
142
+ 128,
143
+ 256,
144
+ 512,
145
+ 1024
146
+ ],
147
+ "diversity_penalty": 0.0,
148
+ "do_sample": false,
149
+ "drop_path_rate": 0.1,
150
+ "early_stopping": false,
151
+ "enable_checkpoint": false,
152
+ "encoder_no_repeat_ngram_size": 0,
153
+ "eos_token_id": null,
154
+ "exponential_decay_length_penalty": null,
155
+ "finetuning_task": null,
156
+ "forced_bos_token_id": null,
157
+ "forced_eos_token_id": null,
158
+ "id2label": {
159
+ "0": "LABEL_0",
160
+ "1": "LABEL_1"
161
+ },
162
+ "image_feature_source": [
163
+ "spatial_avg_pool",
164
+ "temporal_avg_pool"
165
+ ],
166
+ "image_pos_embed": {
167
+ "max_pos_embeddings": 50,
168
+ "type": "learned_abs_2d"
169
+ },
170
+ "is_decoder": false,
171
+ "is_encoder_decoder": false,
172
+ "label2id": {
173
+ "LABEL_0": 0,
174
+ "LABEL_1": 1
175
+ },
176
+ "length_penalty": 1.0,
177
+ "max_length": 20,
178
+ "min_length": 0,
179
+ "model_type": "florence2_vision",
180
+ "no_repeat_ngram_size": 0,
181
+ "num_beam_groups": 1,
182
+ "num_beams": 1,
183
+ "num_groups": [
184
+ 4,
185
+ 8,
186
+ 16,
187
+ 32
188
+ ],
189
+ "num_heads": [
190
+ 4,
191
+ 8,
192
+ 16,
193
+ 32
194
+ ],
195
+ "num_return_sequences": 1,
196
+ "output_attentions": false,
197
+ "output_hidden_states": false,
198
+ "output_scores": false,
199
+ "pad_token_id": null,
200
+ "patch_padding": [
201
+ 3,
202
+ 1,
203
+ 1,
204
+ 1
205
+ ],
206
+ "patch_prenorm": [
207
+ false,
208
+ true,
209
+ true,
210
+ true
211
+ ],
212
+ "patch_size": [
213
+ 7,
214
+ 3,
215
+ 3,
216
+ 3
217
+ ],
218
+ "patch_stride": [
219
+ 4,
220
+ 2,
221
+ 2,
222
+ 2
223
+ ],
224
+ "prefix": null,
225
+ "problem_type": null,
226
+ "projection_dim": 2304,
227
+ "pruned_heads": {},
228
+ "remove_invalid_values": false,
229
+ "repetition_penalty": 1.0,
230
+ "return_dict": true,
231
+ "return_dict_in_generate": false,
232
+ "sep_token_id": null,
233
+ "suppress_tokens": null,
234
+ "task_specific_params": null,
235
+ "temperature": 1.0,
236
+ "tf_legacy_loss": false,
237
+ "tie_encoder_decoder": false,
238
+ "tie_word_embeddings": true,
239
+ "tokenizer_class": null,
240
+ "top_k": 50,
241
+ "top_p": 1.0,
242
+ "torch_dtype": null,
243
+ "torchscript": false,
244
+ "typical_p": 1.0,
245
+ "use_bfloat16": false,
246
+ "visual_temporal_embedding": {
247
+ "max_temporal_embeddings": 100,
248
+ "type": "COSINE"
249
+ },
250
+ "window_size": 12
251
+ },
252
+ "vocab_size": 256000
253
+ }
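The `auto_map` block above routes `AutoConfig` and `AutoModelForCausalLM` to the custom classes shipped in this repo, so loading requires `trust_remote_code=True`. A small sketch, assuming the Hub id `Imagroune/feynmodel` and that the nested configs stay plain dicts (as `FeynModelConfig` below stores whatever it is given):

```python
# Sketch: resolving the custom config class through config.json's auto_map.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Imagroune/feynmodel", trust_remote_code=True)
print(type(cfg).__name__)                      # FeynModelConfig
print(cfg.hidden_size, cfg.num_hidden_layers)  # 2304 26 (promoted from text_config)
print(cfg.vision_config["model_type"])         # "florence2_vision"
```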
configuration_feynmodel.py ADDED
@@ -0,0 +1,159 @@
1
+ from transformers import PretrainedConfig
2
+ import copy
3
+
4
+ class Florence2VisionConfig(PretrainedConfig):
5
+
6
+ model_type = "florence2_vision"
7
+ keys_to_ignore_at_inference = ["past_key_values"]
8
+
9
+ def __init__(
10
+ self,
11
+ drop_path_rate=0.1,
12
+ patch_size=[7, 3, 3, 3],
13
+ patch_stride=[4, 2, 2, 2],
14
+ patch_padding=[3, 1, 1, 1],
15
+ patch_prenorm=[False, True, True, True],
16
+ enable_checkpoint=False,
17
+ dim_embed=[256, 512, 1024, 2048],
18
+ num_heads=[8, 16, 32, 64],
19
+ num_groups=[8, 16, 32, 64],
20
+ depths=[1, 1, 9, 1],
21
+ window_size=12,
22
+ projection_dim=1024,
23
+ visual_temporal_embedding=None,
24
+ image_pos_embed=None,
25
+ image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
26
+ **kwargs,
27
+ ):
28
+ self.drop_path_rate = drop_path_rate
29
+ self.patch_size = patch_size
30
+ self.patch_stride = patch_stride
31
+ self.patch_padding = patch_padding
32
+ self.patch_prenorm = patch_prenorm
33
+ self.enable_checkpoint = enable_checkpoint
34
+ self.dim_embed = dim_embed
35
+ self.num_heads = num_heads
36
+ self.num_groups = num_groups
37
+ self.depths = depths
38
+ self.window_size = window_size
39
+ self.projection_dim = projection_dim
40
+ self.visual_temporal_embedding = visual_temporal_embedding
41
+ self.image_pos_embed = image_pos_embed
42
+ self.image_feature_source = image_feature_source
43
+
44
+ super().__init__(**kwargs)
45
+
46
+
47
+
48
+ class Gemma2Config(PretrainedConfig):
49
+
50
+ model_type = "gemma2"
51
+ keys_to_ignore_at_inference = ["past_key_values"]
52
+
53
+ def __init__(
54
+ self,
55
+ vocab_size=256000,
56
+ hidden_size=3072,
57
+ intermediate_size=24576,
58
+ num_hidden_layers=28,
59
+ num_attention_heads=16,
60
+ num_key_value_heads=16,
61
+ head_dim=256,
62
+ hidden_activation="gelu_pytorch_tanh",
63
+ max_position_embeddings=8192,
64
+ initializer_range=0.02,
65
+ rms_norm_eps=1e-6,
66
+ use_cache=True,
67
+ pad_token_id=0,
68
+ eos_token_id=1,
69
+ bos_token_id=2,
70
+ tie_word_embeddings=True,
71
+ rope_theta=10000.0,
72
+ attention_bias=False,
73
+ attention_dropout=0.0,
74
+ final_logit_softcapping=30.0,
75
+ attn_logit_softcapping=50.0,
76
+ query_pre_attn_scalar=224,
77
+ sliding_window=4096,
78
+ **kwargs,
79
+ ):
80
+ self.vocab_size = vocab_size
81
+ self.max_position_embeddings = max_position_embeddings
82
+ self.hidden_size = hidden_size
83
+ self.intermediate_size = intermediate_size
84
+ self.num_hidden_layers = num_hidden_layers
85
+ self.num_attention_heads = num_attention_heads
86
+ self.head_dim = head_dim
87
+ self.num_key_value_heads = num_key_value_heads
88
+ self.hidden_activation = hidden_activation
89
+ self.initializer_range = initializer_range
90
+ self.rms_norm_eps = rms_norm_eps
91
+ self.use_cache = use_cache
92
+ self.rope_theta = rope_theta
93
+ self.attention_bias = attention_bias
94
+ self.attention_dropout = attention_dropout
95
+ self.attn_logit_softcapping = attn_logit_softcapping
96
+
97
+ super().__init__(
98
+ pad_token_id=pad_token_id,
99
+ bos_token_id=bos_token_id,
100
+ eos_token_id=eos_token_id,
101
+ tie_word_embeddings=tie_word_embeddings,
102
+ **kwargs,
103
+ )
104
+ self.final_logit_softcapping = final_logit_softcapping
105
+ self.query_pre_attn_scalar = query_pre_attn_scalar
106
+ self.sliding_window = sliding_window
107
+ self.cache_implementation = "hybrid"
108
+
109
+
110
+
111
+ class FeynModelConfig(PretrainedConfig):
112
+ r"""
113
+ This is the configuration class to store the configuration of a [`FeynModel`]. It is used to instantiate a FeynModel
114
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
115
+ defaults will yield a similar configuration to that of the Gemma2-2B + Florence-2-Base + FeynModel V0.1.0.
116
+ ```python
117
+ >>> from transformers import FeynModel, FeynModelConfig
118
+ >>> # Initializing a FeynModel style configuration
119
+ >>> configuration = FeynModelConfig()
120
+ >>> # Initializing a model
121
+ >>> model = FeynModel(configuration)
122
+ >>> # Accessing the model configuration
123
+ >>> configuration = model.config
124
+ ```"""
125
+
126
+ # model_type = "gemma2"
127
+ # is_composition = False
128
+ model_type = "FeynModel"
129
+ keys_to_ignore_at_inference = ["past_key_values"]
130
+
131
+ def __init__(
132
+ self,
133
+ vision_config=None,
134
+ text_config=None,
135
+ ignore_index=-100,
136
+ vocab_size=256000,
137
+ projection_dim=1024,
138
+ **kwargs,
139
+ ):
140
+ self.ignore_index = ignore_index
141
+ self.vocab_size = vocab_size
142
+ self.projection_dim = projection_dim
143
+ self.vision_config = vision_config
144
+ self.vocab_size = self.vocab_size
145
+
146
+ self.text_config = text_config
147
+ # self.sliding_window = text_config.sliding_window
148
+ # Copy the attributes of text_config onto this config instance
149
+
150
+ if text_config is not None:
151
+ for attr, value in text_config.items():
152
+ setattr(self, attr, value)
153
+
154
+
155
+ super().__init__(**kwargs)
156
+
157
+
158
+
159
+
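Note that `FeynModelConfig.__init__` copies every key of the `text_config` dict onto the top-level config via `setattr`, which is why config.json repeats fields such as `hidden_size` both at the root and inside `text_config`. A minimal sketch of that flattening, assuming the file above is importable from the working directory:

```python
# Sketch of the text_config flattening performed by FeynModelConfig.__init__.
from configuration_feynmodel import FeynModelConfig  # assumes this repo is on sys.path

cfg = FeynModelConfig(
    text_config={"hidden_size": 2304, "num_hidden_layers": 26, "sliding_window": 4096},
    vision_config={"model_type": "florence2_vision"},
)
print(cfg.hidden_size)      # 2304, promoted from text_config
print(cfg.sliding_window)   # 4096
print(cfg.vocab_size)       # 256000 (constructor default)
```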
generation_config.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "cache_implementation": "hybrid",
5
+ "eos_token_id": [
6
+ 1,
7
+ 107
8
+ ],
9
+ "pad_token_id": 0,
10
+ "transformers_version": "4.44.2"
11
+ }
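These defaults (BOS 2, PAD 0, the two EOS ids 1 and 107, and the hybrid cache) are what `generate` picks up when no explicit `GenerationConfig` is passed. A quick way to inspect them, again assuming the Hub id `Imagroune/feynmodel`:

```python
# Sketch: reading the decoding defaults stored in generation_config.json.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("Imagroune/feynmodel")
print(gen_cfg.bos_token_id)          # 2
print(gen_cfg.eos_token_id)          # [1, 107] -- generation stops on either id
print(gen_cfg.pad_token_id)          # 0
print(gen_cfg.cache_implementation)  # "hybrid"
```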
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4214710d3c4b31d9a89527da2c196e344c9a41fbf6e4a7e942a8a626b9e911c5
3
+ size 4917078632
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e74788d2d95dc3174fd671a7e987fb4fb0243e25b1c8803a1fef8e084117638e
3
+ size 4983443424
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24ca5c47d98179d7796291d5d4d6c2b8706f5173d2dcfc3fa57a6d394575f9fd
3
+ size 932581696
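The three `.safetensors` entries above are Git LFS pointer files: each records only the `sha256` digest and byte size of the real shard. After downloading, the shards can be checked against those pointers, as in this sketch (local path is an assumption; the digest and size come from the first pointer above):

```python
# Sketch: verifying a downloaded shard against its Git LFS pointer.
import hashlib
import os

def verify_shard(path: str, expected_sha256: str, expected_size: int) -> bool:
    """Return True if the file's byte size and sha256 match the LFS pointer."""
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_sha256

print(verify_shard(
    "model-00001-of-00003.safetensors",  # assumed local download path
    "4214710d3c4b31d9a89527da2c196e344c9a41fbf6e4a7e942a8a626b9e911c5",
    4917078632,
))
```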
model.safetensors.index.json ADDED
@@ -0,0 +1,705 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 10833017760
4
+ },
5
+ "weight_map": {
6
+ "image_pos_embed.column_embeddings.weight": "model-00003-of-00003.safetensors",
7
+ "image_pos_embed.row_embeddings.weight": "model-00003-of-00003.safetensors",
8
+ "image_proj_norm.bias": "model-00003-of-00003.safetensors",
9
+ "image_proj_norm.weight": "model-00003-of-00003.safetensors",
10
+ "image_projection": "model-00001-of-00003.safetensors",
11
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.0.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.0.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
26
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
27
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
28
+ "model.layers.1.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
29
+ "model.layers.1.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
31
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
35
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
36
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
37
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
38
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
39
+ "model.layers.10.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
40
+ "model.layers.10.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
41
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
42
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
43
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
44
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
45
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
46
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
47
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
48
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
49
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
50
+ "model.layers.11.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
51
+ "model.layers.11.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
52
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
53
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
54
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
61
+ "model.layers.12.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
62
+ "model.layers.12.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
63
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
64
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
66
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
67
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
68
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
69
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
70
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.13.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
73
+ "model.layers.13.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
77
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.14.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.14.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
91
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.15.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.15.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
98
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.16.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.16.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
113
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.17.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
117
+ "model.layers.17.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
118
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
119
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
120
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
121
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
122
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
123
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
124
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
125
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.18.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.18.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.19.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.19.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
144
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
145
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
146
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
147
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
148
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
149
+ "model.layers.2.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
150
+ "model.layers.2.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
151
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
152
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
153
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
154
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
155
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
156
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
157
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
158
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
159
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
160
+ "model.layers.20.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
161
+ "model.layers.20.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
162
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
163
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
164
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
165
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
166
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
167
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
168
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
169
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
170
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
171
+ "model.layers.21.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
172
+ "model.layers.21.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
173
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
174
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
175
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
176
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
177
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
178
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
179
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
180
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
181
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
182
+ "model.layers.22.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
183
+ "model.layers.22.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
184
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
185
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
186
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
187
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
188
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
189
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
190
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
191
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
192
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
193
+ "model.layers.23.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
194
+ "model.layers.23.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
195
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
196
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
197
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
198
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
199
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
200
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
201
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
202
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
203
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
204
+ "model.layers.24.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
205
+ "model.layers.24.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
206
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
207
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
208
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
209
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
210
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
211
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
212
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
213
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
214
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
215
+ "model.layers.25.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
216
+ "model.layers.25.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
217
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
218
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
219
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
220
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
221
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
222
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
223
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
224
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
225
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
226
+ "model.layers.3.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
227
+ "model.layers.3.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
228
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
229
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
230
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
231
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
232
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
233
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
234
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
235
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
236
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
237
+ "model.layers.4.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
238
+ "model.layers.4.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
239
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
240
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
241
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
242
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
243
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
244
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
245
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
246
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
247
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
248
+ "model.layers.5.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
249
+ "model.layers.5.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
250
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
251
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
252
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
253
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
254
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
255
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
256
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
257
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
258
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
259
+ "model.layers.6.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
260
+ "model.layers.6.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
261
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
262
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
263
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
264
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
265
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
266
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
267
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
268
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
269
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
270
+ "model.layers.7.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
271
+ "model.layers.7.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
272
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
273
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
274
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
275
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
276
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors",
277
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
278
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
279
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
280
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
281
+ "model.layers.8.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
282
+ "model.layers.8.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
283
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
284
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors",
288
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
289
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
290
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
291
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
292
+ "model.layers.9.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
293
+ "model.layers.9.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
294
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
295
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
296
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
297
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
298
+ "model.norm.weight": "model-00003-of-00003.safetensors",
299
+ "vision_tower.blocks.0.0.channel_block.channel_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
300
+ "vision_tower.blocks.0.0.channel_block.channel_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
301
+ "vision_tower.blocks.0.0.channel_block.channel_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
302
+ "vision_tower.blocks.0.0.channel_block.channel_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
303
+ "vision_tower.blocks.0.0.channel_block.channel_attn.norm.bias": "model-00003-of-00003.safetensors",
304
+ "vision_tower.blocks.0.0.channel_block.channel_attn.norm.weight": "model-00003-of-00003.safetensors",
305
+ "vision_tower.blocks.0.0.channel_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
306
+ "vision_tower.blocks.0.0.channel_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
307
+ "vision_tower.blocks.0.0.channel_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
308
+ "vision_tower.blocks.0.0.channel_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
309
+ "vision_tower.blocks.0.0.channel_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
310
+ "vision_tower.blocks.0.0.channel_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
311
+ "vision_tower.blocks.0.0.channel_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
312
+ "vision_tower.blocks.0.0.channel_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
313
+ "vision_tower.blocks.0.0.channel_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
314
+ "vision_tower.blocks.0.0.channel_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
315
+ "vision_tower.blocks.0.0.spatial_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
316
+ "vision_tower.blocks.0.0.spatial_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
317
+ "vision_tower.blocks.0.0.spatial_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
318
+ "vision_tower.blocks.0.0.spatial_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
319
+ "vision_tower.blocks.0.0.spatial_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
320
+ "vision_tower.blocks.0.0.spatial_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
321
+ "vision_tower.blocks.0.0.spatial_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
322
+ "vision_tower.blocks.0.0.spatial_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
323
+ "vision_tower.blocks.0.0.spatial_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
324
+ "vision_tower.blocks.0.0.spatial_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
325
+ "vision_tower.blocks.0.0.spatial_block.window_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
326
+ "vision_tower.blocks.0.0.spatial_block.window_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
327
+ "vision_tower.blocks.0.0.spatial_block.window_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
328
+ "vision_tower.blocks.0.0.spatial_block.window_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
329
+ "vision_tower.blocks.0.0.spatial_block.window_attn.norm.bias": "model-00003-of-00003.safetensors",
330
+ "vision_tower.blocks.0.0.spatial_block.window_attn.norm.weight": "model-00003-of-00003.safetensors",
331
+ "vision_tower.blocks.1.0.channel_block.channel_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
332
+ "vision_tower.blocks.1.0.channel_block.channel_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
333
+ "vision_tower.blocks.1.0.channel_block.channel_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
334
+ "vision_tower.blocks.1.0.channel_block.channel_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
335
+ "vision_tower.blocks.1.0.channel_block.channel_attn.norm.bias": "model-00003-of-00003.safetensors",
336
+ "vision_tower.blocks.1.0.channel_block.channel_attn.norm.weight": "model-00003-of-00003.safetensors",
337
+ "vision_tower.blocks.1.0.channel_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
338
+ "vision_tower.blocks.1.0.channel_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
339
+ "vision_tower.blocks.1.0.channel_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
340
+ "vision_tower.blocks.1.0.channel_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
341
+ "vision_tower.blocks.1.0.channel_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
342
+ "vision_tower.blocks.1.0.channel_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
343
+ "vision_tower.blocks.1.0.channel_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
344
+ "vision_tower.blocks.1.0.channel_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
345
+ "vision_tower.blocks.1.0.channel_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
346
+ "vision_tower.blocks.1.0.channel_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
347
+ "vision_tower.blocks.1.0.spatial_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
348
+ "vision_tower.blocks.1.0.spatial_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
349
+ "vision_tower.blocks.1.0.spatial_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
350
+ "vision_tower.blocks.1.0.spatial_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
351
+ "vision_tower.blocks.1.0.spatial_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
352
+ "vision_tower.blocks.1.0.spatial_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
353
+ "vision_tower.blocks.1.0.spatial_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
354
+ "vision_tower.blocks.1.0.spatial_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
355
+ "vision_tower.blocks.1.0.spatial_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
356
+ "vision_tower.blocks.1.0.spatial_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
357
+ "vision_tower.blocks.1.0.spatial_block.window_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
358
+ "vision_tower.blocks.1.0.spatial_block.window_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
359
+ "vision_tower.blocks.1.0.spatial_block.window_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
360
+ "vision_tower.blocks.1.0.spatial_block.window_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
361
+ "vision_tower.blocks.1.0.spatial_block.window_attn.norm.bias": "model-00003-of-00003.safetensors",
362
+ "vision_tower.blocks.1.0.spatial_block.window_attn.norm.weight": "model-00003-of-00003.safetensors",
363
+ "vision_tower.blocks.2.0.channel_block.channel_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
364
+ "vision_tower.blocks.2.0.channel_block.channel_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
365
+ "vision_tower.blocks.2.0.channel_block.channel_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
366
+ "vision_tower.blocks.2.0.channel_block.channel_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
367
+ "vision_tower.blocks.2.0.channel_block.channel_attn.norm.bias": "model-00003-of-00003.safetensors",
368
+ "vision_tower.blocks.2.0.channel_block.channel_attn.norm.weight": "model-00003-of-00003.safetensors",
369
+ "vision_tower.blocks.2.0.channel_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
370
+ "vision_tower.blocks.2.0.channel_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
371
+ "vision_tower.blocks.2.0.channel_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
372
+ "vision_tower.blocks.2.0.channel_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
373
+ "vision_tower.blocks.2.0.channel_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
374
+ "vision_tower.blocks.2.0.channel_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
375
+ "vision_tower.blocks.2.0.channel_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
376
+ "vision_tower.blocks.2.0.channel_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
377
+ "vision_tower.blocks.2.0.channel_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
378
+ "vision_tower.blocks.2.0.channel_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
379
+ "vision_tower.blocks.2.0.spatial_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
380
+ "vision_tower.blocks.2.0.spatial_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
381
+ "vision_tower.blocks.2.0.spatial_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
382
+ "vision_tower.blocks.2.0.spatial_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
383
+ "vision_tower.blocks.2.0.spatial_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
384
+ "vision_tower.blocks.2.0.spatial_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
385
+ "vision_tower.blocks.2.0.spatial_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
386
+ "vision_tower.blocks.2.0.spatial_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
387
+ "vision_tower.blocks.2.0.spatial_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
388
+ "vision_tower.blocks.2.0.spatial_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
389
+ "vision_tower.blocks.2.0.spatial_block.window_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
390
+ "vision_tower.blocks.2.0.spatial_block.window_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
391
+ "vision_tower.blocks.2.0.spatial_block.window_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
392
+ "vision_tower.blocks.2.0.spatial_block.window_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
393
+ "vision_tower.blocks.2.0.spatial_block.window_attn.norm.bias": "model-00003-of-00003.safetensors",
394
+ "vision_tower.blocks.2.0.spatial_block.window_attn.norm.weight": "model-00003-of-00003.safetensors",
395
+ "vision_tower.blocks.2.1.channel_block.channel_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
396
+ "vision_tower.blocks.2.1.channel_block.channel_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
397
+ "vision_tower.blocks.2.1.channel_block.channel_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
398
+ "vision_tower.blocks.2.1.channel_block.channel_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
399
+ "vision_tower.blocks.2.1.channel_block.channel_attn.norm.bias": "model-00003-of-00003.safetensors",
400
+ "vision_tower.blocks.2.1.channel_block.channel_attn.norm.weight": "model-00003-of-00003.safetensors",
401
+ "vision_tower.blocks.2.1.channel_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
402
+ "vision_tower.blocks.2.1.channel_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
403
+ "vision_tower.blocks.2.1.channel_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
404
+ "vision_tower.blocks.2.1.channel_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
405
+ "vision_tower.blocks.2.1.channel_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
406
+ "vision_tower.blocks.2.1.channel_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
407
+ "vision_tower.blocks.2.1.channel_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
408
+ "vision_tower.blocks.2.1.channel_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
409
+ "vision_tower.blocks.2.1.channel_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
410
+ "vision_tower.blocks.2.1.channel_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
411
+ "vision_tower.blocks.2.1.spatial_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
412
+ "vision_tower.blocks.2.1.spatial_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
413
+ "vision_tower.blocks.2.1.spatial_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
414
+ "vision_tower.blocks.2.1.spatial_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
415
+ "vision_tower.blocks.2.1.spatial_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
416
+ "vision_tower.blocks.2.1.spatial_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
417
+ "vision_tower.blocks.2.1.spatial_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
418
+ "vision_tower.blocks.2.1.spatial_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
419
+ "vision_tower.blocks.2.1.spatial_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
420
+ "vision_tower.blocks.2.1.spatial_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
421
+ "vision_tower.blocks.2.1.spatial_block.window_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
422
+ "vision_tower.blocks.2.1.spatial_block.window_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
423
+ "vision_tower.blocks.2.1.spatial_block.window_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
424
+ "vision_tower.blocks.2.1.spatial_block.window_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
425
+ "vision_tower.blocks.2.1.spatial_block.window_attn.norm.bias": "model-00003-of-00003.safetensors",
426
+ "vision_tower.blocks.2.1.spatial_block.window_attn.norm.weight": "model-00003-of-00003.safetensors",
427
+ "vision_tower.blocks.2.2.channel_block.channel_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
428
+ "vision_tower.blocks.2.2.channel_block.channel_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
429
+ "vision_tower.blocks.2.2.channel_block.channel_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
430
+ "vision_tower.blocks.2.2.channel_block.channel_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
431
+ "vision_tower.blocks.2.2.channel_block.channel_attn.norm.bias": "model-00003-of-00003.safetensors",
432
+ "vision_tower.blocks.2.2.channel_block.channel_attn.norm.weight": "model-00003-of-00003.safetensors",
433
+ "vision_tower.blocks.2.2.channel_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
434
+ "vision_tower.blocks.2.2.channel_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
435
+ "vision_tower.blocks.2.2.channel_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
436
+ "vision_tower.blocks.2.2.channel_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
437
+ "vision_tower.blocks.2.2.channel_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
438
+ "vision_tower.blocks.2.2.channel_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
439
+ "vision_tower.blocks.2.2.channel_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
440
+ "vision_tower.blocks.2.2.channel_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
441
+ "vision_tower.blocks.2.2.channel_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
442
+ "vision_tower.blocks.2.2.channel_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
443
+ "vision_tower.blocks.2.2.spatial_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
444
+ "vision_tower.blocks.2.2.spatial_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
445
+ "vision_tower.blocks.2.2.spatial_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
446
+ "vision_tower.blocks.2.2.spatial_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
447
+ "vision_tower.blocks.2.2.spatial_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
448
+ "vision_tower.blocks.2.2.spatial_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
449
+ "vision_tower.blocks.2.2.spatial_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
450
+ "vision_tower.blocks.2.2.spatial_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
451
+ "vision_tower.blocks.2.2.spatial_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
452
+ "vision_tower.blocks.2.2.spatial_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
453
+ "vision_tower.blocks.2.2.spatial_block.window_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
454
+ "vision_tower.blocks.2.2.spatial_block.window_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
455
+ "vision_tower.blocks.2.2.spatial_block.window_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
456
+ "vision_tower.blocks.2.2.spatial_block.window_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
457
+ "vision_tower.blocks.2.2.spatial_block.window_attn.norm.bias": "model-00003-of-00003.safetensors",
458
+ "vision_tower.blocks.2.2.spatial_block.window_attn.norm.weight": "model-00003-of-00003.safetensors",
459
+ "vision_tower.blocks.2.3.channel_block.channel_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
460
+ "vision_tower.blocks.2.3.channel_block.channel_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
461
+ "vision_tower.blocks.2.3.channel_block.channel_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
462
+ "vision_tower.blocks.2.3.channel_block.channel_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
463
+ "vision_tower.blocks.2.3.channel_block.channel_attn.norm.bias": "model-00003-of-00003.safetensors",
464
+ "vision_tower.blocks.2.3.channel_block.channel_attn.norm.weight": "model-00003-of-00003.safetensors",
465
+ "vision_tower.blocks.2.3.channel_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
466
+ "vision_tower.blocks.2.3.channel_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
467
+ "vision_tower.blocks.2.3.channel_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
468
+ "vision_tower.blocks.2.3.channel_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
469
+ "vision_tower.blocks.2.3.channel_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
470
+ "vision_tower.blocks.2.3.channel_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
471
+ "vision_tower.blocks.2.3.channel_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
472
+ "vision_tower.blocks.2.3.channel_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
473
+ "vision_tower.blocks.2.3.channel_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
474
+ "vision_tower.blocks.2.3.channel_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
475
+ "vision_tower.blocks.2.3.spatial_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
476
+ "vision_tower.blocks.2.3.spatial_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
477
+ "vision_tower.blocks.2.3.spatial_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
478
+ "vision_tower.blocks.2.3.spatial_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
479
+ "vision_tower.blocks.2.3.spatial_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
480
+ "vision_tower.blocks.2.3.spatial_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
481
+ "vision_tower.blocks.2.3.spatial_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
482
+ "vision_tower.blocks.2.3.spatial_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
483
+ "vision_tower.blocks.2.3.spatial_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
484
+ "vision_tower.blocks.2.3.spatial_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
485
+ "vision_tower.blocks.2.3.spatial_block.window_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
486
+ "vision_tower.blocks.2.3.spatial_block.window_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
487
+ "vision_tower.blocks.2.3.spatial_block.window_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
488
+ "vision_tower.blocks.2.3.spatial_block.window_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
489
+ "vision_tower.blocks.2.3.spatial_block.window_attn.norm.bias": "model-00003-of-00003.safetensors",
490
+ "vision_tower.blocks.2.3.spatial_block.window_attn.norm.weight": "model-00003-of-00003.safetensors",
491
+ "vision_tower.blocks.2.4.channel_block.channel_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
492
+ "vision_tower.blocks.2.4.channel_block.channel_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
493
+ "vision_tower.blocks.2.4.channel_block.channel_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
494
+ "vision_tower.blocks.2.4.channel_block.channel_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
495
+ "vision_tower.blocks.2.4.channel_block.channel_attn.norm.bias": "model-00003-of-00003.safetensors",
496
+ "vision_tower.blocks.2.4.channel_block.channel_attn.norm.weight": "model-00003-of-00003.safetensors",
497
+ "vision_tower.blocks.2.4.channel_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
498
+ "vision_tower.blocks.2.4.channel_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
499
+ "vision_tower.blocks.2.4.channel_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
500
+ "vision_tower.blocks.2.4.channel_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
501
+ "vision_tower.blocks.2.4.channel_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
502
+ "vision_tower.blocks.2.4.channel_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
503
+ "vision_tower.blocks.2.4.channel_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
504
+ "vision_tower.blocks.2.4.channel_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
505
+ "vision_tower.blocks.2.4.channel_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
506
+ "vision_tower.blocks.2.4.channel_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
507
+ "vision_tower.blocks.2.4.spatial_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
508
+ "vision_tower.blocks.2.4.spatial_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
509
+ "vision_tower.blocks.2.4.spatial_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
510
+ "vision_tower.blocks.2.4.spatial_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
511
+ "vision_tower.blocks.2.4.spatial_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
512
+ "vision_tower.blocks.2.4.spatial_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
513
+ "vision_tower.blocks.2.4.spatial_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
514
+ "vision_tower.blocks.2.4.spatial_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
515
+ "vision_tower.blocks.2.4.spatial_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
516
+ "vision_tower.blocks.2.4.spatial_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
517
+ "vision_tower.blocks.2.4.spatial_block.window_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
518
+ "vision_tower.blocks.2.4.spatial_block.window_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
519
+ "vision_tower.blocks.2.4.spatial_block.window_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
520
+ "vision_tower.blocks.2.4.spatial_block.window_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
521
+ "vision_tower.blocks.2.4.spatial_block.window_attn.norm.bias": "model-00003-of-00003.safetensors",
522
+ "vision_tower.blocks.2.4.spatial_block.window_attn.norm.weight": "model-00003-of-00003.safetensors",
523
+ "vision_tower.blocks.2.5.channel_block.channel_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
524
+ "vision_tower.blocks.2.5.channel_block.channel_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
525
+ "vision_tower.blocks.2.5.channel_block.channel_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
526
+ "vision_tower.blocks.2.5.channel_block.channel_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
527
+ "vision_tower.blocks.2.5.channel_block.channel_attn.norm.bias": "model-00003-of-00003.safetensors",
528
+ "vision_tower.blocks.2.5.channel_block.channel_attn.norm.weight": "model-00003-of-00003.safetensors",
529
+ "vision_tower.blocks.2.5.channel_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
530
+ "vision_tower.blocks.2.5.channel_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
531
+ "vision_tower.blocks.2.5.channel_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
532
+ "vision_tower.blocks.2.5.channel_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
533
+ "vision_tower.blocks.2.5.channel_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
534
+ "vision_tower.blocks.2.5.channel_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
535
+ "vision_tower.blocks.2.5.channel_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
536
+ "vision_tower.blocks.2.5.channel_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
537
+ "vision_tower.blocks.2.5.channel_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
538
+ "vision_tower.blocks.2.5.channel_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
539
+ "vision_tower.blocks.2.5.spatial_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
540
+ "vision_tower.blocks.2.5.spatial_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
541
+ "vision_tower.blocks.2.5.spatial_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
542
+ "vision_tower.blocks.2.5.spatial_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
543
+ "vision_tower.blocks.2.5.spatial_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
544
+ "vision_tower.blocks.2.5.spatial_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
545
+ "vision_tower.blocks.2.5.spatial_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
546
+ "vision_tower.blocks.2.5.spatial_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
547
+ "vision_tower.blocks.2.5.spatial_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
548
+ "vision_tower.blocks.2.5.spatial_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
549
+ "vision_tower.blocks.2.5.spatial_block.window_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
550
+ "vision_tower.blocks.2.5.spatial_block.window_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
551
+ "vision_tower.blocks.2.5.spatial_block.window_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
552
+ "vision_tower.blocks.2.5.spatial_block.window_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
553
+ "vision_tower.blocks.2.5.spatial_block.window_attn.norm.bias": "model-00003-of-00003.safetensors",
554
+ "vision_tower.blocks.2.5.spatial_block.window_attn.norm.weight": "model-00003-of-00003.safetensors",
555
+ "vision_tower.blocks.2.6.channel_block.channel_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
556
+ "vision_tower.blocks.2.6.channel_block.channel_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
557
+ "vision_tower.blocks.2.6.channel_block.channel_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
558
+ "vision_tower.blocks.2.6.channel_block.channel_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
559
+ "vision_tower.blocks.2.6.channel_block.channel_attn.norm.bias": "model-00003-of-00003.safetensors",
560
+ "vision_tower.blocks.2.6.channel_block.channel_attn.norm.weight": "model-00003-of-00003.safetensors",
561
+ "vision_tower.blocks.2.6.channel_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
562
+ "vision_tower.blocks.2.6.channel_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
563
+ "vision_tower.blocks.2.6.channel_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
564
+ "vision_tower.blocks.2.6.channel_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
565
+ "vision_tower.blocks.2.6.channel_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
566
+ "vision_tower.blocks.2.6.channel_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
567
+ "vision_tower.blocks.2.6.channel_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
568
+ "vision_tower.blocks.2.6.channel_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
569
+ "vision_tower.blocks.2.6.channel_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
570
+ "vision_tower.blocks.2.6.channel_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
571
+ "vision_tower.blocks.2.6.spatial_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
572
+ "vision_tower.blocks.2.6.spatial_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
573
+ "vision_tower.blocks.2.6.spatial_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
574
+ "vision_tower.blocks.2.6.spatial_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
575
+ "vision_tower.blocks.2.6.spatial_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
576
+ "vision_tower.blocks.2.6.spatial_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
577
+ "vision_tower.blocks.2.6.spatial_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
578
+ "vision_tower.blocks.2.6.spatial_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
579
+ "vision_tower.blocks.2.6.spatial_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
580
+ "vision_tower.blocks.2.6.spatial_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
581
+ "vision_tower.blocks.2.6.spatial_block.window_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
582
+ "vision_tower.blocks.2.6.spatial_block.window_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
583
+ "vision_tower.blocks.2.6.spatial_block.window_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
584
+ "vision_tower.blocks.2.6.spatial_block.window_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
585
+ "vision_tower.blocks.2.6.spatial_block.window_attn.norm.bias": "model-00003-of-00003.safetensors",
586
+ "vision_tower.blocks.2.6.spatial_block.window_attn.norm.weight": "model-00003-of-00003.safetensors",
587
+ "vision_tower.blocks.2.7.channel_block.channel_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
588
+ "vision_tower.blocks.2.7.channel_block.channel_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
589
+ "vision_tower.blocks.2.7.channel_block.channel_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
590
+ "vision_tower.blocks.2.7.channel_block.channel_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
591
+ "vision_tower.blocks.2.7.channel_block.channel_attn.norm.bias": "model-00003-of-00003.safetensors",
592
+ "vision_tower.blocks.2.7.channel_block.channel_attn.norm.weight": "model-00003-of-00003.safetensors",
593
+ "vision_tower.blocks.2.7.channel_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
594
+ "vision_tower.blocks.2.7.channel_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
595
+ "vision_tower.blocks.2.7.channel_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
596
+ "vision_tower.blocks.2.7.channel_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
597
+ "vision_tower.blocks.2.7.channel_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
598
+ "vision_tower.blocks.2.7.channel_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
599
+ "vision_tower.blocks.2.7.channel_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
600
+ "vision_tower.blocks.2.7.channel_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
601
+ "vision_tower.blocks.2.7.channel_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
602
+ "vision_tower.blocks.2.7.channel_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
603
+ "vision_tower.blocks.2.7.spatial_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
604
+ "vision_tower.blocks.2.7.spatial_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
605
+ "vision_tower.blocks.2.7.spatial_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
606
+ "vision_tower.blocks.2.7.spatial_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
607
+ "vision_tower.blocks.2.7.spatial_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
608
+ "vision_tower.blocks.2.7.spatial_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
609
+ "vision_tower.blocks.2.7.spatial_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
610
+ "vision_tower.blocks.2.7.spatial_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
611
+ "vision_tower.blocks.2.7.spatial_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
612
+ "vision_tower.blocks.2.7.spatial_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
613
+ "vision_tower.blocks.2.7.spatial_block.window_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
614
+ "vision_tower.blocks.2.7.spatial_block.window_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
615
+ "vision_tower.blocks.2.7.spatial_block.window_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
616
+ "vision_tower.blocks.2.7.spatial_block.window_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
617
+ "vision_tower.blocks.2.7.spatial_block.window_attn.norm.bias": "model-00003-of-00003.safetensors",
618
+ "vision_tower.blocks.2.7.spatial_block.window_attn.norm.weight": "model-00003-of-00003.safetensors",
619
+ "vision_tower.blocks.2.8.channel_block.channel_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
620
+ "vision_tower.blocks.2.8.channel_block.channel_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
621
+ "vision_tower.blocks.2.8.channel_block.channel_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
622
+ "vision_tower.blocks.2.8.channel_block.channel_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
623
+ "vision_tower.blocks.2.8.channel_block.channel_attn.norm.bias": "model-00003-of-00003.safetensors",
624
+ "vision_tower.blocks.2.8.channel_block.channel_attn.norm.weight": "model-00003-of-00003.safetensors",
625
+ "vision_tower.blocks.2.8.channel_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
626
+ "vision_tower.blocks.2.8.channel_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
627
+ "vision_tower.blocks.2.8.channel_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
628
+ "vision_tower.blocks.2.8.channel_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
629
+ "vision_tower.blocks.2.8.channel_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
630
+ "vision_tower.blocks.2.8.channel_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
631
+ "vision_tower.blocks.2.8.channel_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
632
+ "vision_tower.blocks.2.8.channel_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
633
+ "vision_tower.blocks.2.8.channel_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
634
+ "vision_tower.blocks.2.8.channel_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
635
+ "vision_tower.blocks.2.8.spatial_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
636
+ "vision_tower.blocks.2.8.spatial_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
637
+ "vision_tower.blocks.2.8.spatial_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
638
+ "vision_tower.blocks.2.8.spatial_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
639
+ "vision_tower.blocks.2.8.spatial_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
640
+ "vision_tower.blocks.2.8.spatial_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
641
+ "vision_tower.blocks.2.8.spatial_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
642
+ "vision_tower.blocks.2.8.spatial_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
643
+ "vision_tower.blocks.2.8.spatial_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
644
+ "vision_tower.blocks.2.8.spatial_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
645
+ "vision_tower.blocks.2.8.spatial_block.window_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
646
+ "vision_tower.blocks.2.8.spatial_block.window_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
647
+ "vision_tower.blocks.2.8.spatial_block.window_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
648
+ "vision_tower.blocks.2.8.spatial_block.window_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
649
+ "vision_tower.blocks.2.8.spatial_block.window_attn.norm.bias": "model-00003-of-00003.safetensors",
650
+ "vision_tower.blocks.2.8.spatial_block.window_attn.norm.weight": "model-00003-of-00003.safetensors",
651
+ "vision_tower.blocks.3.0.channel_block.channel_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
652
+ "vision_tower.blocks.3.0.channel_block.channel_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
653
+ "vision_tower.blocks.3.0.channel_block.channel_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
654
+ "vision_tower.blocks.3.0.channel_block.channel_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
655
+ "vision_tower.blocks.3.0.channel_block.channel_attn.norm.bias": "model-00003-of-00003.safetensors",
656
+ "vision_tower.blocks.3.0.channel_block.channel_attn.norm.weight": "model-00003-of-00003.safetensors",
657
+ "vision_tower.blocks.3.0.channel_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
658
+ "vision_tower.blocks.3.0.channel_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
659
+ "vision_tower.blocks.3.0.channel_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
660
+ "vision_tower.blocks.3.0.channel_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
661
+ "vision_tower.blocks.3.0.channel_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
662
+ "vision_tower.blocks.3.0.channel_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
663
+ "vision_tower.blocks.3.0.channel_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
664
+ "vision_tower.blocks.3.0.channel_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
665
+ "vision_tower.blocks.3.0.channel_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
666
+ "vision_tower.blocks.3.0.channel_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
667
+ "vision_tower.blocks.3.0.spatial_block.conv1.fn.dw.bias": "model-00003-of-00003.safetensors",
668
+ "vision_tower.blocks.3.0.spatial_block.conv1.fn.dw.weight": "model-00003-of-00003.safetensors",
669
+ "vision_tower.blocks.3.0.spatial_block.conv2.fn.dw.bias": "model-00003-of-00003.safetensors",
670
+ "vision_tower.blocks.3.0.spatial_block.conv2.fn.dw.weight": "model-00003-of-00003.safetensors",
671
+ "vision_tower.blocks.3.0.spatial_block.ffn.fn.net.fc1.bias": "model-00003-of-00003.safetensors",
672
+ "vision_tower.blocks.3.0.spatial_block.ffn.fn.net.fc1.weight": "model-00003-of-00003.safetensors",
673
+ "vision_tower.blocks.3.0.spatial_block.ffn.fn.net.fc2.bias": "model-00003-of-00003.safetensors",
674
+ "vision_tower.blocks.3.0.spatial_block.ffn.fn.net.fc2.weight": "model-00003-of-00003.safetensors",
675
+ "vision_tower.blocks.3.0.spatial_block.ffn.norm.bias": "model-00003-of-00003.safetensors",
676
+ "vision_tower.blocks.3.0.spatial_block.ffn.norm.weight": "model-00003-of-00003.safetensors",
677
+ "vision_tower.blocks.3.0.spatial_block.window_attn.fn.proj.bias": "model-00003-of-00003.safetensors",
678
+ "vision_tower.blocks.3.0.spatial_block.window_attn.fn.proj.weight": "model-00003-of-00003.safetensors",
679
+ "vision_tower.blocks.3.0.spatial_block.window_attn.fn.qkv.bias": "model-00003-of-00003.safetensors",
680
+ "vision_tower.blocks.3.0.spatial_block.window_attn.fn.qkv.weight": "model-00003-of-00003.safetensors",
681
+ "vision_tower.blocks.3.0.spatial_block.window_attn.norm.bias": "model-00003-of-00003.safetensors",
682
+ "vision_tower.blocks.3.0.spatial_block.window_attn.norm.weight": "model-00003-of-00003.safetensors",
683
+ "vision_tower.convs.0.norm.bias": "model-00003-of-00003.safetensors",
684
+ "vision_tower.convs.0.norm.weight": "model-00003-of-00003.safetensors",
685
+ "vision_tower.convs.0.proj.bias": "model-00003-of-00003.safetensors",
686
+ "vision_tower.convs.0.proj.weight": "model-00003-of-00003.safetensors",
687
+ "vision_tower.convs.1.norm.bias": "model-00003-of-00003.safetensors",
688
+ "vision_tower.convs.1.norm.weight": "model-00003-of-00003.safetensors",
689
+ "vision_tower.convs.1.proj.bias": "model-00003-of-00003.safetensors",
690
+ "vision_tower.convs.1.proj.weight": "model-00003-of-00003.safetensors",
691
+ "vision_tower.convs.2.norm.bias": "model-00003-of-00003.safetensors",
692
+ "vision_tower.convs.2.norm.weight": "model-00003-of-00003.safetensors",
693
+ "vision_tower.convs.2.proj.bias": "model-00003-of-00003.safetensors",
694
+ "vision_tower.convs.2.proj.weight": "model-00003-of-00003.safetensors",
695
+ "vision_tower.convs.3.norm.bias": "model-00003-of-00003.safetensors",
696
+ "vision_tower.convs.3.norm.weight": "model-00003-of-00003.safetensors",
697
+ "vision_tower.convs.3.proj.bias": "model-00003-of-00003.safetensors",
698
+ "vision_tower.convs.3.proj.weight": "model-00003-of-00003.safetensors",
699
+ "vision_tower.head.bias": "model-00003-of-00003.safetensors",
700
+ "vision_tower.head.weight": "model-00003-of-00003.safetensors",
701
+ "vision_tower.norms.bias": "model-00003-of-00003.safetensors",
702
+ "vision_tower.norms.weight": "model-00003-of-00003.safetensors",
703
+ "visual_temporal_embed.pos_idx_to_embed": "model-00003-of-00003.safetensors"
704
+ }
705
+ }
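The mapping above is the tail of the sharded checkpoint index: its weight_map assigns every parameter name, including all of the DaViT vision_tower.* weights, to one of the three shard files. As a minimal sketch (assuming the index sits next to the shards under the usual name model.safetensors.index.json), a single tensor can be located and read without loading the whole checkpoint:

import json
from safetensors import safe_open

# Look up which shard holds a given parameter.
with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "vision_tower.head.weight"            # key taken from the weight_map above
shard = index["weight_map"][name]            # e.g. "model-00003-of-00003.safetensors"

# Read only that tensor from the shard.
with safe_open(shard, framework="pt", device="cpu") as f:
    tensor = f.get_tensor(name)
print(name, tuple(tensor.shape))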
modeling_feynmodel.py ADDED
@@ -0,0 +1,1528 @@
1
+ # modeling_feynmodel : Imed MAGROUNE / 2024 - 09
2
+ # original code from modeling_FeynModel
3
+ # add DaViT Vision Tower
4
+ #
5
+ # update generate forward function
6
+ #
7
+ # add LoRA adapters
8
+ #
9
+ # train on COCO object detection and vision reasoning
10
+ # train on ScienceQA
11
+ #
12
+ # todo: add Mamba layer
13
+ #
14
+ # todo: train on ARC-AGI
15
+
16
+
17
+ from transformers.modeling_utils import PreTrainedModel
18
+ from transformers.utils import (
19
+ ModelOutput,
20
+ add_start_docstrings,
21
+ add_start_docstrings_to_model_forward,
22
+ is_flash_attn_2_available,
23
+ logging,
24
+ replace_return_docstrings,
26
+ is_flash_attn_greater_or_equal_2_10,
27
+ )
28
+ from transformers.activations import ACT2FN
29
+ from transformers.modeling_attn_mask_utils import (
30
+ _prepare_4d_attention_mask,
31
+ _prepare_4d_attention_mask_for_sdpa,
32
+ _prepare_4d_causal_attention_mask,
33
+ _prepare_4d_causal_attention_mask_for_sdpa,
34
+ )
35
+ from transformers.modeling_outputs import (
36
+ BaseModelOutput,
37
+ BaseModelOutputWithPastAndCrossAttentions,
38
+ Seq2SeqLMOutput,
39
+ Seq2SeqModelOutput,
40
+ )
41
+
42
+ from transformers.cache_utils import Cache, HybridCache
43
+ from transformers.modeling_outputs import (
44
+ BaseModelOutputWithPast,
45
+ CausalLMOutputWithPast,
46
+ SequenceClassifierOutputWithPast,
47
+ TokenClassifierOutput,
48
+ )
49
+
50
+ from typing import List, Optional, Tuple, Union
51
+
52
+ from transformers.models.gemma2.modeling_gemma2 import Gemma2Model, Gemma2ForCausalLM, Gemma2DecoderLayer, Gemma2RMSNorm
53
+ from .configuration_feynmodel import FeynModelConfig, Florence2VisionConfig
54
+
55
+ from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM
56
+ import json
57
+ import math
58
+ import torch
59
+ from torch import nn
60
+ import torch.nn.functional as F
+ import torch.utils.checkpoint as checkpoint  # assumed missing: used by DaViT.forward_features_unpool when enable_checkpoint is True
61
+ import logging
62
+
63
+ from transformers.utils import (
64
+ ModelOutput,
65
+ add_start_docstrings,
66
+ add_start_docstrings_to_model_forward,
67
+ is_flash_attn_2_available,
68
+ logging,
69
+ replace_return_docstrings,
71
+ is_flash_attn_greater_or_equal_2_10,
72
+ )
73
+
74
+ from transformers.modeling_utils import PreTrainedModel
75
+
76
+ from collections import OrderedDict
77
+ from einops import rearrange
78
+ from timm.models.layers import DropPath, trunc_normal_
79
+
80
+ logger = logging.get_logger(__name__)
81
+
82
+ class MySequential(nn.Sequential):
83
+ def forward(self, *inputs):
84
+ for module in self._modules.values():
85
+ if isinstance(inputs, tuple):
86
+ inputs = module(*inputs)
87
+ else:
88
+ inputs = module(inputs)
89
+ return inputs
90
+
91
+
92
+ class PreNorm(nn.Module):
93
+ def __init__(self, norm, fn, drop_path=None):
94
+ super().__init__()
95
+ self.norm = norm
96
+ self.fn = fn
97
+ self.drop_path = drop_path
98
+
99
+ def forward(self, x, *args, **kwargs):
100
+ shortcut = x
101
+ if self.norm is not None:
102
+ x, size = self.fn(self.norm(x), *args, **kwargs)
103
+ else:
104
+ x, size = self.fn(x, *args, **kwargs)
105
+
106
+ if self.drop_path:
107
+ x = self.drop_path(x)
108
+
109
+ x = shortcut + x
110
+
111
+ return x, size
112
+
113
+
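PreNorm wraps an inner module with an optional normalization, an optional DropPath, and a residual connection, threading the (x, size) pair that every DaViT sub-module returns. A minimal shape sketch of the pattern, assuming PreNorm and the Mlp class defined just below are in scope (toy dimensions, random weights):

import torch
from torch import nn

dim, H, W = 64, 8, 8
block = PreNorm(nn.LayerNorm(dim), Mlp(in_features=dim, hidden_features=4 * dim))

x = torch.randn(2, H * W, dim)               # (batch, tokens, channels)
y, size = block(x, (H, W))                   # y = x + mlp(norm(x)); size is passed through
assert y.shape == x.shape and size == (H, W)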
114
+ class Mlp(nn.Module):
115
+ def __init__(
116
+ self,
117
+ in_features,
118
+ hidden_features=None,
119
+ out_features=None,
120
+ act_layer=nn.GELU,
121
+ ):
122
+ super().__init__()
123
+ out_features = out_features or in_features
124
+ hidden_features = hidden_features or in_features
125
+ self.net = nn.Sequential(OrderedDict([
126
+ ("fc1", nn.Linear(in_features, hidden_features)),
127
+ ("act", act_layer()),
128
+ ("fc2", nn.Linear(hidden_features, out_features))
129
+ ]))
130
+
131
+ def forward(self, x, size):
132
+ return self.net(x), size
133
+
134
+
135
+ class DepthWiseConv2d(nn.Module):
136
+ def __init__(
137
+ self,
138
+ dim_in,
139
+ kernel_size,
140
+ padding,
141
+ stride,
142
+ bias=True,
143
+ ):
144
+ super().__init__()
145
+ self.dw = nn.Conv2d(
146
+ dim_in, dim_in,
147
+ kernel_size=kernel_size,
148
+ padding=padding,
149
+ groups=dim_in,
150
+ stride=stride,
151
+ bias=bias
152
+ )
153
+
154
+ def forward(self, x, size):
155
+ B, N, C = x.shape
156
+ H, W = size
157
+ assert N == H * W
158
+
159
+ x = self.dw(x.transpose(1, 2).view(B, C, H, W))
160
+ size = (x.size(-2), x.size(-1))
161
+ x = x.flatten(2).transpose(1, 2)
162
+ return x, size
163
+
164
+
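DepthWiseConv2d takes the flattened token layout (B, N, C) together with the spatial size, folds it back to (B, C, H, W) for a per-channel (grouped) convolution, and flattens again, returning the possibly strided new size. A quick shape check, assuming the class above:

import torch

dw = DepthWiseConv2d(dim_in=64, kernel_size=3, padding=1, stride=1)
x = torch.randn(2, 16 * 16, 64)              # a 16x16 grid of 64-channel tokens
y, size = dw(x, (16, 16))
assert y.shape == (2, 256, 64) and size == (16, 16)   # stride 1 keeps H and W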
165
+ class ConvEmbed(nn.Module):
166
+ """ Image to Patch Embedding
167
+ """
168
+
169
+ def __init__(
170
+ self,
171
+ patch_size=7,
172
+ in_chans=3,
173
+ embed_dim=64,
174
+ stride=4,
175
+ padding=2,
176
+ norm_layer=None,
177
+ pre_norm=True
178
+ ):
179
+ super().__init__()
180
+ self.patch_size = patch_size
181
+
182
+ self.proj = nn.Conv2d(
183
+ in_chans, embed_dim,
184
+ kernel_size=patch_size,
185
+ stride=stride,
186
+ padding=padding
187
+ )
188
+
189
+ dim_norm = in_chans if pre_norm else embed_dim
190
+ self.norm = norm_layer(dim_norm) if norm_layer else None
191
+
192
+ self.pre_norm = pre_norm
193
+
194
+ def forward(self, x, size):
195
+ H, W = size
196
+ if len(x.size()) == 3:
197
+ if self.norm and self.pre_norm:
198
+ x = self.norm(x)
199
+ x = rearrange(
200
+ x, 'b (h w) c -> b c h w',
201
+ h=H, w=W
202
+ )
203
+
204
+ x = self.proj(x)
205
+
206
+ _, _, H, W = x.shape
207
+ x = rearrange(x, 'b c h w -> b (h w) c')
208
+ if self.norm and not self.pre_norm:
209
+ x = self.norm(x)
210
+
211
+ return x, (H, W)
212
+
213
+
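ConvEmbed is the per-stage patch embedding: a strided convolution that also re-folds a 3-D token sequence back into an image grid before projecting. With the first-stage settings used by the DaViT defaults further down (patch_size 7, stride 4, padding 3, embed_dim 64), a 224x224 image becomes a 56x56 grid of 64-dimensional tokens, since floor((224 + 2*3 - 7) / 4) + 1 = 56. A sketch, assuming ConvEmbed from this file:

import torch
from torch import nn

embed = ConvEmbed(patch_size=7, in_chans=3, embed_dim=64,
                  stride=4, padding=3, norm_layer=nn.LayerNorm, pre_norm=False)
img = torch.randn(1, 3, 224, 224)            # 4-D input skips the token-unfolding branch
tokens, (H, W) = embed(img, (224, 224))
assert (H, W) == (56, 56) and tokens.shape == (1, 56 * 56, 64)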
214
+ class ChannelAttention(nn.Module):
215
+
216
+ def __init__(self, dim, groups=8, qkv_bias=True):
217
+ super().__init__()
218
+
219
+ self.groups = groups
220
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
221
+ self.proj = nn.Linear(dim, dim)
222
+
223
+ def forward(self, x, size):
224
+ B, N, C = x.shape
225
+
226
+ qkv = self.qkv(x).reshape(B, N, 3, self.groups, C // self.groups).permute(2, 0, 3, 1, 4)
227
+ q, k, v = qkv[0], qkv[1], qkv[2]
228
+
229
+ q = q * (float(N) ** -0.5)
230
+ attention = q.transpose(-1, -2) @ k
231
+ attention = attention.softmax(dim=-1)
232
+ x = (attention @ v.transpose(-1, -2)).transpose(-1, -2)
233
+ x = x.transpose(1, 2).reshape(B, N, C)
234
+ x = self.proj(x)
235
+ return x, size
236
+
237
+
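ChannelAttention transposes the usual formulation: queries and keys are contracted over the token axis, so the attention matrix is (C/groups) x (C/groups) per group and the cost grows linearly with the number of tokens rather than quadratically. Shape sketch, assuming the class above:

import torch

attn = ChannelAttention(dim=64, groups=8)
x = torch.randn(2, 14 * 14, 64)              # (B, N, C)
y, size = attn(x, (14, 14))
assert y.shape == x.shape                    # mixing happens across channels, not across tokens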
238
+ class ChannelBlock(nn.Module):
239
+
240
+ def __init__(self, dim, groups, mlp_ratio=4., qkv_bias=True,
241
+ drop_path_rate=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
242
+ conv_at_attn=True, conv_at_ffn=True):
243
+ super().__init__()
244
+
245
+ drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
246
+
247
+ self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
248
+ self.channel_attn = PreNorm(
249
+ norm_layer(dim),
250
+ ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
251
+ drop_path
252
+ )
253
+ self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
254
+ self.ffn = PreNorm(
255
+ norm_layer(dim),
256
+ Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
257
+ drop_path
258
+ )
259
+
260
+ def forward(self, x, size):
261
+ if self.conv1:
262
+ x, size = self.conv1(x, size)
263
+ x, size = self.channel_attn(x, size)
264
+
265
+ if self.conv2:
266
+ x, size = self.conv2(x, size)
267
+ x, size = self.ffn(x, size)
268
+
269
+ return x, size
270
+
271
+
272
+ def window_partition(x, window_size: int):
273
+ B, H, W, C = x.shape
274
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
275
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
276
+ return windows
277
+
278
+
279
+ def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
280
+ B = batch_size
281
+ # deriving the batch size as below would make ONNX conversion fail for dynamic axes, because the value would be traced as a constant:
282
+ # int(windows.shape[0] / (H * W / window_size / window_size))
283
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
284
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
285
+ return x
286
+
287
+
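window_partition and window_reverse are exact inverses whenever H and W are multiples of the window size; WindowAttention below relies on this and pads first when they are not. A round-trip check under that divisibility assumption:

import torch

x = torch.randn(2, 14, 14, 32)               # (B, H, W, C); 14 is a multiple of 7
windows = window_partition(x, window_size=7)
assert windows.shape == (2 * 2 * 2, 7, 7, 32)          # B * (H/7) * (W/7) windows
x_back = window_reverse(windows, batch_size=2, window_size=7, H=14, W=14)
assert torch.equal(x_back, x)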
288
+ class WindowAttention(nn.Module):
289
+ def __init__(self, dim, num_heads, window_size, qkv_bias=True):
290
+
291
+ super().__init__()
292
+ self.dim = dim
293
+ self.window_size = window_size
294
+ self.num_heads = num_heads
295
+ head_dim = dim // num_heads
296
+ self.scale = float(head_dim) ** -0.5
297
+
298
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
299
+ self.proj = nn.Linear(dim, dim)
300
+
301
+ self.softmax = nn.Softmax(dim=-1)
302
+
303
+ def forward(self, x, size):
304
+
305
+ H, W = size
306
+ B, L, C = x.shape
307
+ assert L == H * W, "input feature has wrong size"
308
+
309
+ x = x.view(B, H, W, C)
310
+
311
+ pad_l = pad_t = 0
312
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
313
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
314
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
315
+ _, Hp, Wp, _ = x.shape
316
+
317
+ x = window_partition(x, self.window_size)
318
+ x = x.view(-1, self.window_size * self.window_size, C)
319
+
320
+ # W-MSA/SW-MSA
321
+ # attn_windows = self.attn(x_windows)
322
+
323
+ B_, N, C = x.shape
324
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
325
+ q, k, v = qkv[0], qkv[1], qkv[2]
326
+
327
+ q = q * self.scale
328
+ attn = (q @ k.transpose(-2, -1))
329
+ attn = self.softmax(attn)
330
+
331
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
332
+ x = self.proj(x)
333
+
334
+ # merge windows
335
+ x = x.view(
336
+ -1, self.window_size, self.window_size, C
337
+ )
338
+ x = window_reverse(x, B, self.window_size, Hp, Wp)
339
+
340
+ if pad_r > 0 or pad_b > 0:
341
+ x = x[:, :H, :W, :].contiguous()
342
+
343
+ x = x.view(B, H * W, C)
344
+
345
+ return x, size
346
+
347
+
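When H or W is not a multiple of window_size, WindowAttention pads on the right and bottom up to the next multiple, attends within each window, and crops the padding away at the end. The pad amounts come straight from the modulo arithmetic above; for example, a 50x50 grid with a window of 7 is padded to 56x56:

window_size, H, W = 7, 50, 50
pad_r = (window_size - W % window_size) % window_size   # 6
pad_b = (window_size - H % window_size) % window_size   # 6
Hp, Wp = H + pad_b, W + pad_r                            # 56 x 56 -> 8 * 8 = 64 windows of 7x7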
348
+ class SpatialBlock(nn.Module):
349
+
350
+ def __init__(self, dim, num_heads, window_size,
351
+ mlp_ratio=4., qkv_bias=True, drop_path_rate=0., act_layer=nn.GELU,
352
+ norm_layer=nn.LayerNorm, conv_at_attn=True, conv_at_ffn=True):
353
+ super().__init__()
354
+
355
+ drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
356
+
357
+ self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
358
+ self.window_attn = PreNorm(
359
+ norm_layer(dim),
360
+ WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
361
+ drop_path
362
+ )
363
+ self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
364
+ self.ffn = PreNorm(
365
+ norm_layer(dim),
366
+ Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
367
+ drop_path
368
+ )
369
+
370
+ def forward(self, x, size):
371
+ if self.conv1:
372
+ x, size = self.conv1(x, size)
373
+ x, size = self.window_attn(x, size)
374
+
375
+ if self.conv2:
376
+ x, size = self.conv2(x, size)
377
+ x, size = self.ffn(x, size)
378
+ return x, size
379
+
380
+
381
+ class DaViT(nn.Module):
382
+ """ DaViT: Dual-Attention Transformer
383
+
384
+ Args:
385
+ in_chans (int): Number of input image channels. Default: 3.
386
+ num_classes (int): Number of classes for classification head. Default: 1000.
+ depths (tuple(int)): Number of (spatial block, channel block) pairs in each stage. Default: (1, 1, 3, 1).
387
+ patch_size (tuple(int)): Patch size of convolution in different stages. Default: (7, 2, 2, 2).
388
+ patch_stride (tuple(int)): Patch stride of convolution in different stages. Default: (4, 2, 2, 2).
389
+ patch_padding (tuple(int)): Patch padding of convolution in different stages. Default: (3, 0, 0, 0).
390
+ patch_prenorm (tuple(bool)): If True, perform norm before the convolution layer. Default: (False, False, False, False).
391
+ embed_dims (tuple(int)): Patch embedding dimension in different stages. Default: (64, 128, 192, 256).
392
+ num_heads (tuple(int)): Number of spatial attention heads in different stages. Default: (3, 6, 12, 24).
393
+ num_groups (tuple(int)): Number of channel groups in different stages. Default: (3, 6, 12, 24).
394
+ window_size (int): Window size. Default: 7.
395
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
396
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True.
397
+ drop_path_rate (float): Stochastic depth rate. Default: 0.1.
398
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
399
+ enable_checkpoint (bool): If True, enable checkpointing. Default: False.
400
+ conv_at_attn (bool): If True, perform a depthwise convolution before the attention layer. Default: True.
401
+ conv_at_ffn (bool): If True, perform a depthwise convolution before the ffn layer. Default: True.
402
+ """
403
+
404
+ def __init__(
405
+ self,
406
+ in_chans=3,
407
+ num_classes=1000,
408
+ depths=(1, 1, 3, 1),
409
+ patch_size=(7, 2, 2, 2),
410
+ patch_stride=(4, 2, 2, 2),
411
+ patch_padding=(3, 0, 0, 0),
412
+ patch_prenorm=(False, False, False, False),
413
+ embed_dims=(64, 128, 192, 256),
414
+ num_heads=(3, 6, 12, 24),
415
+ num_groups=(3, 6, 12, 24),
416
+ window_size=7,
417
+ mlp_ratio=4.,
418
+ qkv_bias=True,
419
+ drop_path_rate=0.1,
420
+ norm_layer=nn.LayerNorm,
421
+ enable_checkpoint=False,
422
+ conv_at_attn=True,
423
+ conv_at_ffn=True,
424
+ ):
425
+ super().__init__()
426
+
427
+ self.num_classes = num_classes
428
+ self.embed_dims = embed_dims
429
+ self.num_heads = num_heads
430
+ self.num_groups = num_groups
431
+ self.num_stages = len(self.embed_dims)
432
+ self.enable_checkpoint = enable_checkpoint
433
+ assert self.num_stages == len(self.num_heads) == len(self.num_groups)
434
+
435
+ num_stages = len(embed_dims)
436
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)*2)]
437
+
438
+ depth_offset = 0
439
+ convs = []
440
+ blocks = []
441
+ for i in range(num_stages):
442
+ conv_embed = ConvEmbed(
443
+ patch_size=patch_size[i],
444
+ stride=patch_stride[i],
445
+ padding=patch_padding[i],
446
+ in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
447
+ embed_dim=self.embed_dims[i],
448
+ norm_layer=norm_layer,
449
+ pre_norm=patch_prenorm[i]
450
+ )
451
+ convs.append(conv_embed)
452
+
453
+ block = MySequential(
454
+ *[
455
+ MySequential(OrderedDict([
456
+ (
457
+ 'spatial_block', SpatialBlock(
458
+ embed_dims[i],
459
+ num_heads[i],
460
+ window_size,
461
+ drop_path_rate=dpr[depth_offset+j*2],
462
+ qkv_bias=qkv_bias,
463
+ mlp_ratio=mlp_ratio,
464
+ conv_at_attn=conv_at_attn,
465
+ conv_at_ffn=conv_at_ffn,
466
+ )
467
+ ),
468
+ (
469
+ 'channel_block', ChannelBlock(
470
+ embed_dims[i],
471
+ num_groups[i],
472
+ drop_path_rate=dpr[depth_offset+j*2+1],
473
+ qkv_bias=qkv_bias,
474
+ mlp_ratio=mlp_ratio,
475
+ conv_at_attn=conv_at_attn,
476
+ conv_at_ffn=conv_at_ffn,
477
+ )
478
+ )
479
+ ])) for j in range(depths[i])
480
+ ]
481
+ )
482
+ blocks.append(block)
483
+ depth_offset += depths[i]*2
484
+
485
+ self.convs = nn.ModuleList(convs)
486
+ self.blocks = nn.ModuleList(blocks)
487
+
488
+ self.norms = norm_layer(self.embed_dims[-1])
489
+ self.avgpool = nn.AdaptiveAvgPool1d(1)
490
+ self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
491
+
492
+ self.apply(self._init_weights)
493
+
494
+ @property
495
+ def dim_out(self):
496
+ return self.embed_dims[-1]
497
+
498
+ def _init_weights(self, m):
499
+ if isinstance(m, nn.Linear):
500
+ trunc_normal_(m.weight, std=0.02)
501
+ if m.bias is not None:
502
+ nn.init.constant_(m.bias, 0)
503
+ elif isinstance(m, nn.Conv2d):
504
+ nn.init.normal_(m.weight, std=0.02)
505
+ for name, _ in m.named_parameters():
506
+ if name in ['bias']:
507
+ nn.init.constant_(m.bias, 0)
508
+ elif isinstance(m, nn.LayerNorm):
509
+ nn.init.constant_(m.weight, 1.0)
510
+ nn.init.constant_(m.bias, 0)
511
+ elif isinstance(m, nn.BatchNorm2d):
512
+ nn.init.constant_(m.weight, 1.0)
513
+ nn.init.constant_(m.bias, 0)
514
+
515
+ def forward_features_unpool(self, x):
516
+ """
517
+ forward until avg pooling
518
+ Args:
519
+ x (torch.Tensor): input image tensor of shape (batch_size, channels, height, width)
520
+ """
521
+ input_size = (x.size(2), x.size(3))
522
+ for conv, block in zip(self.convs, self.blocks):
523
+ x, input_size = conv(x, input_size)
524
+ if self.enable_checkpoint:
525
+ x, input_size = checkpoint.checkpoint(block, x, input_size)
526
+ else:
527
+ x, input_size = block(x, input_size)
528
+ return x
529
+
530
+ def forward_features(self, x):
531
+ x = self.forward_features_unpool(x)
532
+
533
+ # (batch_size, num_tokens, token_dim)
534
+ x = self.avgpool(x.transpose(1, 2))
535
+ # (batch_size, token_dim, 1) after pooling; flattened to (batch_size, token_dim)
536
+ x = torch.flatten(x, 1)
537
+ x = self.norms(x)
538
+
539
+ return x
540
+
541
+ def forward(self, x):
542
+ x = self.forward_features(x)
543
+ x = self.head(x)
544
+ return x
545
+
546
+ @classmethod
547
+ def from_config(cls, config):
548
+ return cls(
549
+ depths=config.depths,
550
+ embed_dims=config.dim_embed,
551
+ num_heads=config.num_heads,
552
+ num_groups=config.num_groups,
553
+ patch_size=config.patch_size,
554
+ patch_stride=config.patch_stride,
555
+ patch_padding=config.patch_padding,
556
+ patch_prenorm=config.patch_prenorm,
557
+ drop_path_rate=config.drop_path_rate,
558
+ window_size=config.window_size,
559
+ )
560
+
561
+
562
+
563
+
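Putting the pieces together: each DaViT stage is a ConvEmbed followed by depths[i] pairs of (SpatialBlock, ChannelBlock), and forward_features_unpool returns the unpooled token grid that the vision tower hands to the language model. A minimal sketch with the constructor defaults above, i.e. a small randomly initialised tower rather than the released checkpoint:

import torch

davit = DaViT(depths=(1, 1, 3, 1), embed_dims=(64, 128, 192, 256),
              num_heads=(3, 6, 12, 24), num_groups=(3, 6, 12, 24))
davit.eval()

img = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    feats = davit.forward_features_unpool(img)
print(feats.shape)                           # (1, 7 * 7, 256): strides 4, 2, 2, 2 shrink 224 to 7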
564
+ _CONFIG_FOR_DOC = "FeynModelConfig"
565
+
566
+ FEYNMODEL_START_DOCSTRING = r"""
567
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
568
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
569
+ etc.)
570
+
571
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
572
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
573
+ and behavior.
574
+
575
+ Parameters:
576
+ config ([`FeynModelConfig`]):
577
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
578
+ load the weights associated with the model, only the configuration. Check out the
579
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
580
+ """
581
+ FEYNMODEL_INPUTS_DOCSTRING = r"""
582
+ Args:
583
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
584
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
585
+ it.
586
+
587
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
588
+ [`PreTrainedTokenizer.__call__`] for details.
589
+
590
+ [What are input IDs?](../glossary#input-ids)
591
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
592
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
593
+
594
+ - 1 for tokens that are **not masked**,
595
+ - 0 for tokens that are **masked**.
596
+
597
+ [What are attention masks?](../glossary#attention-mask)
598
+
599
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
600
+ [`PreTrainedTokenizer.__call__`] for details.
601
+
602
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
603
+ `past_key_values`).
604
+
605
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
606
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
607
+ information on the default strategy.
608
+
609
+ - 1 indicates the head is **not masked**,
610
+ - 0 indicates the head is **masked**.
611
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
612
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
613
+ config.n_positions - 1]`.
614
+
615
+ [What are position IDs?](../glossary#position-ids)
616
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
617
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
618
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
619
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
620
+
621
+ Two formats are allowed:
622
+ - a [`~cache_utils.Cache`] instance;
623
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
624
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
625
+ cache format.
626
+
627
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
628
+ legacy cache format will be returned.
629
+
630
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
631
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
632
+ of shape `(batch_size, sequence_length)`.
633
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
634
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
635
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
636
+ model's internal embedding lookup matrix.
637
+ use_cache (`bool`, *optional*):
638
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
639
+ `past_key_values`).
640
+ output_attentions (`bool`, *optional*):
641
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
642
+ tensors for more detail.
643
+ output_hidden_states (`bool`, *optional*):
644
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
645
+ more detail.
646
+ return_dict (`bool`, *optional*):
647
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
648
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
649
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
650
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
651
+ the complete sequence length.
652
+ """
653
+
654
+ # Adapted from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
655
+ def _prepare_4d_causal_attention_mask_with_cache_position(
656
+ attention_mask: torch.Tensor,
657
+ sequence_length: int,
658
+ target_length: int,
659
+ dtype: torch.dtype,
660
+ device: torch.device,
661
+ min_dtype: float,
662
+ cache_position: torch.Tensor,
663
+ batch_size: int,
664
+ ):
665
+
666
+ #print(f" +++++++++ prepare 4K +++++++++++++++ rec {attention_mask.size()} sequence_length {sequence_length}")
667
+ if attention_mask is not None and attention_mask.dim() == 4:
668
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
669
+ #print("+++++++++++++++++ return it")
670
+ #causal_mask = attention_mask
671
+ # In this case we assume that the mask comes already in inverted form.
672
+ causal_mask = attention_mask[:, :, -sequence_length:, :]
673
+ #print(f"+++++++++++++++++ truncated causal_mask to last {sequence_length} elements, size: {causal_mask.size()}")
674
+ #print(f"+++++++++++++++++ return it causal_mask {causal_mask.size()} !!!!!!!!! attention_mask {attention_mask.size()}")
675
+ else:
676
+ #print("+++++++++++++++++++++ else +++++++++++++++++")
677
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
678
+ #print(f"++++++++++++++++ causal_mask {causal_mask.size()} ++++++++++++++++++ sequence_length = {sequence_length} ")
679
+ if sequence_length != 1:
680
+ causal_mask = torch.triu(causal_mask, diagonal=1)
681
+ #print(f"++++++++++++++++++ causal_mask = torch.triu ++++++++++ {causal_mask.size()} ")
682
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
683
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
684
+ #print(f"+++++++++++++++++++++ avant if attention_mask is not None:, causal_mask={causal_mask.size()}")
685
+ if attention_mask is not None:
686
+ #print(" +++++++++++++ attention_mask is None++++++++++++")
687
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
688
+ mask_length = attention_mask.shape[-1]
689
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
690
+ padding_mask = padding_mask == 0
691
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
692
+ padding_mask, min_dtype
693
+ )
694
+ #print(f"+++++++++++++++++++ 4K returning causal_mask {causal_mask.size()} +++++++++++++++++++")
695
+
696
+ return causal_mask
697
+
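+ # Illustrative sketch (not part of the upstream code): a tiny example of the
+ # 4D mask produced above. Shapes are hypothetical; the output has shape
+ # (batch_size, 1, sequence_length, target_length), with 0 where attention is
+ # allowed and `min_dtype` where it is blocked.
+ def _sketch_prepare_4d_causal_mask():
+     import torch
+
+     mask_2d = torch.ones(1, 5)  # hypothetical 2D padding mask: batch 1, 5 kept tokens
+     causal = _prepare_4d_causal_attention_mask_with_cache_position(
+         attention_mask=mask_2d,
+         sequence_length=5,
+         target_length=8,
+         dtype=torch.float32,
+         device=torch.device("cpu"),
+         min_dtype=torch.finfo(torch.float32).min,
+         cache_position=torch.arange(5),
+         batch_size=1,
+     )
+     assert causal.shape == (1, 1, 5, 8)
+     assert causal[0, 0, 0, 1] < 0   # token 0 cannot attend to the future token 1
+     assert causal[0, 0, 1, 0] == 0  # token 1 attends to token 0
+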
698
+ class LearnedAbsolutePositionEmbedding2D(nn.Module):
699
+ """
700
+ This module learns positional embeddings up to a fixed maximum size.
701
+ """
702
+
703
+ def __init__(self, embedding_dim=256, num_pos=50):
704
+ super().__init__()
705
+ self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
706
+ self.column_embeddings = nn.Embedding(num_pos, embedding_dim - (embedding_dim // 2))
707
+
708
+ def forward(self, pixel_values):
709
+ """
710
+ pixel_values: (batch_size, height, width, num_channels)
711
+ returns: (batch_size, height, width, embedding_dim)
712
+ """
713
+ if len(pixel_values.shape) != 4:
714
+ raise ValueError('pixel_values must be a 4D tensor')
715
+ height, width = pixel_values.shape[1:3]
716
+ width_values = torch.arange(width, device=pixel_values.device)
717
+ height_values = torch.arange(height, device=pixel_values.device)
718
+ x_emb = self.column_embeddings(width_values)
719
+ y_emb = self.row_embeddings(height_values)
720
+ # (height, width, embedding_dim)
721
+ pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
722
+ # (embedding_dim, height, width)
723
+ pos = pos.permute(2, 0, 1)
724
+ pos = pos.unsqueeze(0)
725
+ # (batch_size, embedding_dim, height, width)
726
+ pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
727
+ # (batch_size, height, width, embedding_dim)
728
+ pos = pos.permute(0, 2, 3, 1)
729
+ return pos
730
+
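+ # Illustrative sketch (not part of the upstream code): the 2D embedding is
+ # broadcast over a (batch, height, width, channels) feature map; its last
+ # dimension equals `embedding_dim` (the row and column halves are concatenated).
+ # The feature-map shape below is hypothetical.
+ def _sketch_learned_2d_position_embedding():
+     import torch
+
+     pos_embed = LearnedAbsolutePositionEmbedding2D(embedding_dim=256, num_pos=50)
+     feature_map = torch.zeros(2, 24, 24, 1024)  # hypothetical vision-tower features
+     pos = pos_embed(feature_map)
+     assert pos.shape == (2, 24, 24, 256)
+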
731
+ class PositionalEmbeddingCosine1D(nn.Module):
732
+ """
733
+ This class implements a very simple positional encoding. It follows closely
734
+ the encoder from the link below:
735
+ https://pytorch.org/tutorials/beginner/translation_transformer.html
736
+ Args:
737
+ embed_dim: The dimension of the embeddings.
738
+ dropout_prob: The dropout probability.
739
+ max_seq_len: The maximum length to precompute the positional encodings.
740
+ """
741
+ def __init__(
742
+ self,
743
+ embed_dim: int = 512,
744
+ max_seq_len: int = 1024) -> None:
745
+ super(PositionalEmbeddingCosine1D, self).__init__()
746
+ self.embed_dim = embed_dim
747
+ self.max_seq_len = max_seq_len
748
+ # Generate the sinusoidal arrays.
749
+ factor = math.log(10000)
750
+ denominator = torch.exp(
751
+ -factor * torch.arange(0, self.embed_dim, 2) / self.embed_dim)
752
+ # Matrix where rows correspond to a positional embedding as a function
753
+ # of the position index (i.e., the row index).
754
+ frequencies = \
755
+ torch.arange(0, self.max_seq_len) \
756
+ .reshape(self.max_seq_len, 1) * denominator
757
+ pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
758
+ # Populate uneven entries.
759
+ pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
760
+ pos_idx_to_embed[:, 1::2] = torch.cos(frequencies)
761
+ # Save the positional embeddings in a constant buffer.
762
+ self.register_buffer("pos_idx_to_embed", pos_idx_to_embed)
763
+
764
+ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
765
+ """
766
+ Args:
767
+ seq_embeds: The sequence embeddings in order. Allowed size:
768
+ 1. [T, D], where T is the length of the sequence, and D is the
769
+ frame embedding dimension.
770
+ 2. [B, T, D], where B is the batch size and T and D are the
771
+ same as above.
772
+ Returns a tensor with the same dimensions as the input, i.e.,
773
+ [1, T, D] or [T, D].
774
+ """
775
+ shape_len = len(seq_embeds.shape)
776
+ assert 2 <= shape_len <= 3
777
+ len_seq = seq_embeds.size(-2)
778
+ assert len_seq <= self.max_seq_len
779
+ pos_embeds = self.pos_idx_to_embed[0:seq_embeds.size(-2), :]
780
+ # Adapt pre-computed positional embeddings to the input.
781
+ if shape_len == 3:
782
+ pos_embeds = pos_embeds.view(
783
+ (1, pos_embeds.size(0), pos_embeds.size(1)))
784
+ return pos_embeds
785
+
786
+
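+ # Illustrative sketch (not part of the upstream code): the sinusoidal table is
+ # precomputed up to `max_seq_len` and simply sliced to the input length, then
+ # reshaped so it broadcasts over the batch dimension. Input shapes are hypothetical.
+ def _sketch_cosine_positional_embedding():
+     import torch
+
+     pos_enc = PositionalEmbeddingCosine1D(embed_dim=512, max_seq_len=1024)
+     frames = torch.zeros(2, 7, 512)  # hypothetical [B, T, D] sequence embeddings
+     pos = pos_enc(frames)
+     assert pos.shape == (1, 7, 512)  # broadcastable against the [B, T, D] input
+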
787
+ class LearnedAbsolutePositionEmbedding1D(nn.Module):
788
+ """
789
+ Learnable absolute positional embeddings for 1D sequences.
790
+ Args:
791
+ embedding_dim: The dimension of the embeddings.
792
+ num_pos: The maximum number of positions to precompute embeddings for.
793
+ """
794
+ def __init__(
795
+ self,
796
+ embedding_dim: int = 512,
797
+ num_pos: int = 1024) -> None:
798
+ super(LearnedAbsolutePositionEmbedding1D, self).__init__()
799
+ self.embeddings = nn.Embedding(num_pos, embedding_dim)
800
+ self.num_pos = num_pos
801
+
802
+ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
803
+ """
804
+ Args:
805
+ seq_embeds: The sequence embeddings in order. Allowed size:
806
+ 1. [T, D], where T is the length of the sequence, and D is the
807
+ frame embedding dimension.
808
+ 2. [B, T, D], where B is the batch size and T and D are the
809
+ same as above.
810
+ Returns a tensor with the same dimensions as the input, i.e.,
811
+ [1, T, D] or [T, D].
812
+ """
813
+ shape_len = len(seq_embeds.shape)
814
+ assert 2 <= shape_len <= 3
815
+ len_seq = seq_embeds.size(-2)
816
+ assert len_seq <= self.num_pos
817
+ # [T, D]
818
+ pos_embeds = self.embeddings(torch.arange(len_seq).to(seq_embeds.device))
819
+ # Adapt pre-computed positional embeddings to the input.
820
+ if shape_len == 3:
821
+ pos_embeds = pos_embeds.view(
822
+ (1, pos_embeds.size(0), pos_embeds.size(1)))
823
+ return pos_embeds
824
+
825
+ def create_git_attention_mask(
826
+ tgt: torch.Tensor,
827
+ memory: torch.Tensor,
828
+ max_length: int
829
+ ) -> torch.Tensor:
830
+ # Obtain the dimensions of the target text and memory
831
+ batch_size = tgt.size(0)
832
+ num_tgt = tgt.shape[1]
833
+ num_memory = memory.shape[1]
834
+ total_length = num_memory + num_tgt
835
+
836
+ # Create the top left part of the attention matrix
837
+ top_left = torch.zeros((num_memory, num_memory)) # Attention enabled in this region
838
+ top_right = torch.full((num_memory, num_tgt), float(-3.4028e+38)) # Attention disabled here
839
+
840
+ # Bottom left part of the attention matrix
841
+ bottom_left = torch.zeros((num_tgt, num_memory)) # Attention enabled here
842
+
843
+ # Create a lower triangular matrix for the bottom right part
844
+ bottom_right = torch.tril(torch.ones(num_tgt, num_tgt))
845
+
846
+ # Transform 1s to 0 to enable attention, and 0s to -inf to block attention
847
+ bottom_right = bottom_right.masked_fill(bottom_right == 0, float(-3.4028e+38))
848
+ bottom_right = bottom_right.masked_fill(bottom_right == 1, float(0))
849
+
850
+ # Concatenate matrices to form the full mask
851
+ left = torch.cat((top_left, bottom_left), dim=0)
852
+ right = torch.cat((top_right, bottom_right), dim=0)
853
+
854
+ # Combine left and right parts
855
+ full_attention_mask = torch.cat((left, right), dim=1)
856
+
857
+ # Add padding to reach max_length
858
+ padding = torch.full((total_length, max_length - total_length), float(-3.4028e+38))
859
+ full_attention_mask = torch.cat((full_attention_mask, padding), dim=1)
860
+
861
+ # Add an axis for multi-heads and batch_size
862
+ full_attention_mask = full_attention_mask[None, None, :, :]
863
+
864
+ # Expand the mask to have shape (batch_size, 1, seq_length, max_length)
865
+ full_attention_mask = full_attention_mask.expand(batch_size, 1, full_attention_mask.size(-2), full_attention_mask.size(-1))
866
+
867
+ return full_attention_mask
868
+
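+ # Illustrative sketch (not part of the upstream code): the GIT-style mask built
+ # above for 3 hypothetical image tokens and 2 text tokens, padded to
+ # max_length=8. Image tokens attend to each other but never to text; text
+ # tokens attend to every image token and causally to earlier text tokens.
+ def _sketch_git_attention_mask():
+     import torch
+
+     tgt = torch.zeros(1, 2, dtype=torch.long)  # dummy text token ids
+     memory = torch.zeros(1, 3, 16)             # dummy image features
+     mask = create_git_attention_mask(tgt=tgt, memory=memory, max_length=8)
+     assert mask.shape == (1, 1, 5, 8)          # (batch, heads, memory+text, max_length)
+     assert mask[0, 0, 0, 3] < 0                # image token blocked from text
+     assert mask[0, 0, 4, 0] == 0               # last text token sees image tokens
+     assert mask[0, 0, 3, 4] < 0                # text token blocked from future text
+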
869
+ def get_position_ids_from_binary_attention_mask(mask):
870
+ """
871
+ Extract position IDs from a binary attention mask.
872
+
873
+ Args:
874
+ mask (torch.Tensor): The attention mask tensor of shape (1, 1, seq_len, seq_len),
875
+ where 1 indicates allowed attention and 0 indicates blocked attention.
876
+
877
+ Returns:
878
+ list: A list of lists where each sublist contains the allowed position IDs for each query position.
879
+ """
880
+ # Assuming the mask is of shape (1, 1, seq_len, seq_len)
881
+ _, _, seq_len, _ = mask.shape
882
+
883
+ # Create a tensor with position IDs from 0 to seq_len - 1
884
+ position_ids = torch.arange(seq_len, dtype=torch.long, device=mask.device)
885
+
886
+ # Add a batch dimension
887
+ position_ids = position_ids.unsqueeze(0)
888
+
889
+ return position_ids
890
+
891
+ def ensure_tensor(variable):
892
+ # Check if the variable is a torch.Tensor
893
+ if isinstance(variable, torch.Tensor):
894
+ # print("Variable is already a tensor.")
895
+ return variable
896
+ else:
897
+ #print("Variable is not a tensor, converting...")
898
+ try:
899
+ # Convert the variable to a tensor
900
+ tensor = torch.tensor(variable)
901
+ #print("Conversion successful.")
902
+ return tensor
903
+ except Exception as e:
904
+ print(f"Error converting to tensor: {e}")
905
+ raise
906
+
907
+ @add_start_docstrings(
908
+ "The bare Model outputting raw hidden-states without any specific head on top.",
909
+ FEYNMODEL_START_DOCSTRING,
910
+ )
911
+ class FeynModel(Gemma2Model):
912
+ """
913
+ Transformer decoder consisting of *config.num_hidden_layers* layers.
914
+ Each layer is a [`FeynModelDecoderLayer`] + [`LoraLayer`] for the *proj* modules.
915
+ NB: LoraLayers will be added and activated on the proj modules only if pixel_values is not None
916
+
917
+ Args:
918
+ config: FeynModelConfig
919
+ """
920
+
921
+ def __init__(self, config: FeynModelConfig):
922
+ super().__init__(config)
923
+ # Initialize weights and apply final processing
924
+ self.mode='llm'
925
+ '''
926
+ self.image_patch_tokens = int(
927
+ (config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1
928
+ )
929
+
930
+ if config.num_image_with_embedding is not None:
931
+ self.image_patch_tokens *= config.num_image_with_embedding
932
+ '''
933
+ self.image_patch_tokens = 577
934
+ self.post_init()
935
+
936
+ def get_input_embeddings(self):
937
+ return self.embed_tokens
938
+
939
+ def set_input_embeddings(self, value):
940
+ self.embed_tokens = value
941
+
942
+
943
+
944
+
945
+ @add_start_docstrings_to_model_forward(FEYNMODEL_INPUTS_DOCSTRING)
946
+ def forward(
947
+ self,
948
+ input_ids: torch.LongTensor = None,
949
+ attention_mask: Optional[torch.Tensor] = None,
950
+ position_ids: Optional[torch.LongTensor] = None,
951
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
952
+ inputs_embeds: Optional[torch.FloatTensor] = None,
953
+ use_cache: Optional[bool] = None,
954
+ output_attentions: Optional[bool] = None,
955
+ output_hidden_states: Optional[bool] = None,
956
+ return_dict: Optional[bool] = None,
957
+ cache_position: Optional[torch.LongTensor] = None,
958
+ causal_attention_mask: Optional[torch.Tensor] = None,
959
+ **kwargs,
960
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
961
+
962
+ # print(f" self.mode = {self.mode}")
963
+ # Ensure cache_position is initialized if not provided
964
+
965
+
966
+ if cache_position is None:
967
+ batch_size = input_ids.size(0) if input_ids is not None else inputs_embeds.size(0)
968
+ cache_position = torch.zeros((batch_size,), dtype=torch.long, device=input_ids.device if input_ids is not None else inputs_embeds.device)
969
+
970
+
971
+
972
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
973
+ output_hidden_states = (
974
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
975
+ )
976
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
977
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
978
+
979
+ if (input_ids is None) ^ (inputs_embeds is not None):
980
+ raise ValueError(
981
+ "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
982
+ )
983
+
984
+ if self.gradient_checkpointing and self.training and use_cache:
985
+ logger.warning_once(
986
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
987
+ )
988
+ use_cache = False
989
+
990
+ if inputs_embeds is None:
991
+ inputs_embeds = self.embed_tokens(input_ids)
992
+ causal_mask = self._update_causal_mask(
993
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
994
+ )
995
+ else:
996
+ causal_mask = ensure_tensor(causal_attention_mask)
997
+ position_ids = get_position_ids_from_binary_attention_mask(attention_mask)
998
+
999
+ #print(f" causal_mask = {causal_mask} ")
1000
+
1001
+ if cache_position is None:
1002
+ cache_position = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device)
1003
+
1004
+ if position_ids is None :
1005
+ position_ids = cache_position.unsqueeze(0)
1006
+
1007
+
1008
+
1009
+ # Convert position_ids to a tensor if not already
1010
+ if not isinstance(position_ids, torch.Tensor):
1011
+
1012
+ position_ids = torch.tensor(position_ids, dtype=torch.long, device=inputs_embeds.device)
1013
+
1014
+
1015
+ # embed positions
1016
+ hidden_states = inputs_embeds
1017
+
1018
+ # normalized
1019
+ # FeynModel downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
1020
+ # See https://github.com/huggingface/transformers/pull/29402
1021
+ normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
1022
+ hidden_states = hidden_states * normalizer
1023
+
1024
+ all_hidden_states = () if output_hidden_states else None
1025
+ all_self_attns = () if output_attentions else None
1026
+
1027
+ for decoder_layer in self.layers:
1028
+ if output_hidden_states:
1029
+ all_hidden_states += (hidden_states,)
1030
+
1031
+ if self.gradient_checkpointing and self.training:
1032
+ layer_outputs = self._gradient_checkpointing_func(
1033
+ decoder_layer.__call__,
1034
+ hidden_states,
1035
+ causal_mask,
1036
+ position_ids,
1037
+ past_key_values,
1038
+ output_attentions,
1039
+ use_cache,
1040
+ cache_position,
1041
+ )
1042
+ else:
1043
+ layer_outputs = decoder_layer(
1044
+ hidden_states,
1045
+ attention_mask=causal_mask,
1046
+ position_ids=position_ids,
1047
+ past_key_value=past_key_values,
1048
+ output_attentions=output_attentions,
1049
+ use_cache=use_cache,
1050
+ cache_position=cache_position,
1051
+ )
1052
+
1053
+ hidden_states = layer_outputs[0]
1054
+
1055
+ if output_attentions:
1056
+ all_self_attns += (layer_outputs[1],)
1057
+
1058
+ hidden_states = self.norm(hidden_states)
1059
+
1060
+ # add hidden states from the last decoder layer
1061
+ if output_hidden_states:
1062
+ all_hidden_states += (hidden_states,)
1063
+
1064
+ next_cache = past_key_values if use_cache else None
1065
+
1066
+ if not return_dict:
1067
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1068
+ return BaseModelOutputWithPast(
1069
+ last_hidden_state=hidden_states,
1070
+ past_key_values=next_cache,
1071
+ hidden_states=all_hidden_states,
1072
+ attentions=all_self_attns,
1073
+ )
1074
+
1075
+
1076
+
1077
+ def _update_causal_mask(
1078
+ self,
1079
+ attention_mask: torch.Tensor,
1080
+ input_tensor: torch.Tensor,
1081
+ cache_position: torch.Tensor,
1082
+ past_key_values: Cache,
1083
+ output_attentions: bool,
1084
+ ):
1085
+
1086
+ # print(f" _start _____ _update_causal_mask attention_mask {attention_mask.size()} {attention_mask} ")
1087
+ # Flash Attention currently doesn't support static cache, but FeynModel works only with static cache.
1088
+ # So we will pass in the attention mask as is in any case, not only when there's padding. Then we'll use its shape
1089
+ # to cut out keys/values trailing 0 used in static cache. This workaround should be compile compatible
1090
+ # as it doesn't cause dynamic control issues.
1091
+ if self.config._attn_implementation == "flash_attention_2":
1092
+ return attention_mask
1093
+
1094
+ dtype, device = input_tensor.dtype, input_tensor.device
1095
+ min_dtype = torch.finfo(dtype).min
1096
+ sequence_length = input_tensor.shape[1]
1097
+ if isinstance(past_key_values, HybridCache):
1098
+ target_length = past_key_values.get_max_length()
1099
+ else:
1100
+ target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1]
1101
+
1102
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
1103
+ causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
1104
+ attention_mask,
1105
+ sequence_length=sequence_length,
1106
+ target_length=target_length,
1107
+ dtype=dtype,
1108
+ device=device,
1109
+ min_dtype=min_dtype,
1110
+ cache_position=cache_position,
1111
+ batch_size=input_tensor.shape[0],
1112
+ )
1113
+ #print(f" _end ______ _update_causal_mask causal_mask {causal_mask.size()} {causal_mask} ")
1114
+ return causal_mask
1115
+
1116
+
1117
+
1118
+ class FeynModelForCausalLM(Gemma2ForCausalLM):
1119
+ _tied_weights_keys = ["lm_head.weight"]
1120
+ config_class = FeynModelConfig
1121
+ def __init__(self, config):
1122
+ super().__init__(config)
1123
+ config.vision_config=Florence2VisionConfig.from_dict(config.vision_config)
1124
+ self.model = FeynModel(config)
1125
+
1126
+ # assert config.vision_config.model_type== 'davit', 'only DaViT is supported for now'
1127
+ self.vision_tower = DaViT.from_config(config=config.vision_config)
1128
+ self._build_image_projection_layers(config)
1129
+
1130
+ self.__causal_attention_mask = None
1131
+
1132
+ # Initialize weights and apply final processing
1133
+ self.post_init()
1134
+
1135
+ ################ Vision Tower ########################
1136
+ def _build_image_projection_layers(self, config):
1137
+ image_dim_out = config.vision_config.dim_embed[-1]
1138
+ dim_projection = config.vision_config.projection_dim
1139
+ self.image_projection = nn.Parameter(
1140
+ torch.empty(image_dim_out, dim_projection)
1141
+ )
1142
+ self.image_proj_norm = nn.LayerNorm(dim_projection)
1143
+ image_pos_embed_config = config.vision_config.image_pos_embed
1144
+ if image_pos_embed_config['type'] == 'learned_abs_2d':
1145
+ self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
1146
+ embedding_dim=image_dim_out,
1147
+ num_pos=image_pos_embed_config['max_pos_embeddings']
1148
+ )
1149
+ else:
1150
+ raise NotImplementedError('Not implemented yet')
1151
+
1152
+ self.image_feature_source = config.vision_config.image_feature_source
1153
+
1154
+ # temporal embedding
1155
+ visual_temporal_embedding_config = config.vision_config.visual_temporal_embedding
1156
+ if visual_temporal_embedding_config['type'] == 'COSINE':
1157
+ self.visual_temporal_embed = PositionalEmbeddingCosine1D(
1158
+ embed_dim=image_dim_out,
1159
+ max_seq_len=visual_temporal_embedding_config['max_temporal_embeddings']
1160
+ )
1161
+ else:
1162
+ raise NotImplementedError('Not implemented yet')
1163
+
1164
+
1165
+
1166
+ def _merge_input_ids_with_image_features(self, image_features, inputs_embeds):
1167
+ batch_size, image_token_length = image_features.size()[:-1]
1168
+ device = image_features.device
1169
+ image_attention_mask = torch.ones(batch_size, image_token_length, device=device)
1170
+
1171
+ if inputs_embeds is None:
1172
+ return image_features, image_attention_mask
1173
+
1174
+ task_prefix_embeds = inputs_embeds
1175
+ task_prefix_attention_mask = torch.ones(batch_size, task_prefix_embeds.size(1), device=device)
1176
+
1177
+ # Ensure the attention masks are two-dimensional
1178
+ if len(task_prefix_attention_mask.shape) == 3:
1179
+ task_prefix_attention_mask = task_prefix_attention_mask.squeeze(1)
1180
+
1181
+ # Check the batch dimension and adjust if necessary
1182
+ if image_features.size(0) != task_prefix_embeds.size(0):
1183
+ raise ValueError("Batch sizes of image_features and task_prefix_embeds do not match")
1184
+
1185
+ # Add a dummy dimension if the dimensions are not aligned
1186
+ if image_features.dim() < task_prefix_embeds.dim():
1187
+ image_features = image_features.unsqueeze(-1)
1188
+ elif task_prefix_embeds.dim() < image_features.dim():
1189
+ task_prefix_embeds = task_prefix_embeds.unsqueeze(-1)
1190
+
1191
+ # Ensure that all dimensions except dim=1 are identical
1192
+ if image_features.size(2) != task_prefix_embeds.size(2):
1193
+ # Adjust or raise an error if the internal dimensions are not compatible
1194
+ raise ValueError("Internal dimensions of image_features and task_prefix_embeds do not match")
1195
+
1196
+ inputs_embeds = torch.cat([image_features, task_prefix_embeds], dim=1)
1197
+ attention_mask = torch.cat([image_attention_mask, task_prefix_attention_mask], dim=1)
1198
+
1199
+ return inputs_embeds, attention_mask
1200
+
1201
+ def _encode_image(self, pixel_values):
1202
+ if len(pixel_values.shape) == 4:
1203
+ batch_size, C, H, W = pixel_values.shape
1204
+ T = 1
1205
+ x = self.vision_tower.forward_features_unpool(pixel_values)
1206
+ else:
1207
+ # Add a batch dimension at the front if 'pixel_values' has only 3 dimensions (C, H, W)
1208
+ pixel_values = pixel_values.unsqueeze(0) # Add a batch dimension
1209
+ batch_size, C, H, W = pixel_values.shape
1210
+ T = 1
1211
+ x = self.vision_tower.forward_features_unpool(pixel_values)
1212
+
1213
+ if self.image_pos_embed is not None:
1214
+ x = x.view(batch_size * T, -1, x.shape[-1])
1215
+ num_tokens = x.shape[-2]
1216
+ h, w = int(num_tokens ** 0.5), int(num_tokens ** 0.5)
1217
+ assert h * w == num_tokens, 'only support square feature maps for now'
1218
+ x = x.view(batch_size * T, h, w, x.shape[-1])
1219
+ pos_embed = self.image_pos_embed(x)
1220
+ x = x + pos_embed
1221
+ x = x.view(batch_size, T * h*w, x.shape[-1])
1222
+
1223
+ if self.visual_temporal_embed is not None:
1224
+ visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
1225
+ x = x.view(batch_size, T, -1, x.shape[-1]) + visual_temporal_embed.view(1, T, 1, x.shape[-1])
1226
+
1227
+ x_feat_dict = {}
1228
+
1229
+ spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
1230
+ x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
1231
+
1232
+ temporal_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=1)
1233
+ x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
1234
+
1235
+ x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
1236
+ x_feat_dict['last_frame'] = x
1237
+
1238
+ new_x = []
1239
+ for _image_feature_source in self.image_feature_source:
1240
+ if _image_feature_source not in x_feat_dict:
1241
+ raise ValueError('invalid image feature source: {}'.format(_image_feature_source))
1242
+ new_x.append(x_feat_dict[_image_feature_source])
1243
+
1244
+ x = torch.cat(new_x, dim=1)
1245
+
1246
+ x = x @ self.image_projection
1247
+ x = self.image_proj_norm(x)
1248
+
1249
+ return x
1250
+ #######################################################
1251
+
1252
+ def get_input_embeddings(self):
1253
+ return self.model.embed_tokens
1254
+
1255
+ def set_input_embeddings(self, value):
1256
+ self.model.embed_tokens = value
1257
+
1258
+ def get_output_embeddings(self):
1259
+ return self.lm_head
1260
+
1261
+ def set_output_embeddings(self, new_embeddings):
1262
+ self.lm_head = new_embeddings
1263
+
1264
+ def set_decoder(self, decoder):
1265
+ self.model = decoder
1266
+
1267
+ def get_decoder(self):
1268
+ return self.model
1269
+
1270
+ @add_start_docstrings_to_model_forward(FEYNMODEL_INPUTS_DOCSTRING)
1271
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1272
+ def forward(
1273
+ self,
1274
+ input_ids: torch.LongTensor = None,
1275
+ pixel_values: Optional[torch.Tensor] = None,
1276
+ attention_mask: Optional[torch.Tensor] = None,
1277
+ position_ids: Optional[torch.LongTensor] = None,
1278
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1279
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1280
+ labels: Optional[torch.LongTensor] = None,
1281
+ use_cache: Optional[bool] = None,
1282
+ output_attentions: Optional[bool] = None,
1283
+ output_hidden_states: Optional[bool] = None,
1284
+ return_dict: Optional[bool] = None,
1285
+ cache_position: Optional[torch.LongTensor] = None,
1286
+ **kwargs,
1287
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1288
+ r"""
1289
+ Args:
1290
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1291
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1292
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1293
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1294
+
1295
+ Returns:
1296
+
1297
+ Example:
1298
+
1299
+ ```python
1300
+ >>> from transformers import AutoTokenizer, GemmaForCausalLM
1301
+
1302
+ >>> model = GemmaForCausalLM.from_pretrained("google/gemma-2-9b")
1303
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
1304
+
1305
+ >>> prompt = "What is your favorite condiment?"
1306
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1307
+
1308
+ >>> # Generate
1309
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1310
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1311
+ "What is your favorite condiment?"
1312
+ ```"""
1313
+
1314
+
1315
+ if self.training and self.config._attn_implementation != "eager":
1316
+ logger.warning_once(
1317
+ "It is strongly recommended to train FeynModel models with the `eager` attention implementation "
1318
+ f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
1319
+ )
1320
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1321
+ output_hidden_states = (
1322
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1323
+ )
1324
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1325
+
1326
+ if pixel_values is not None:
1327
+ self.model.mode='vlm'
1328
+
1329
+ if input_ids is not None:
1330
+ inputs_embeds = self.get_input_embeddings()(input_ids)
1331
+ image_features = self._encode_image(pixel_values)
1332
+ inputs_embeds, causal_attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds )
1333
+ causal_attention_mask = create_git_attention_mask(tgt=input_ids, memory=image_features,max_length=2048)
1334
+ causal_attention_mask=causal_attention_mask.to(input_ids.device)
1335
+ self.__causal_attention_mask=causal_attention_mask
1336
+
1337
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1338
+ if pixel_values is not None:
1339
+ outputs = self.model(
1340
+ input_ids=None,
1341
+ attention_mask=causal_attention_mask,
1342
+ position_ids=position_ids,
1343
+ past_key_values=past_key_values,
1344
+ inputs_embeds=inputs_embeds,
1345
+ use_cache=use_cache,
1346
+ output_attentions=output_attentions,
1347
+ output_hidden_states=output_hidden_states,
1348
+ return_dict=return_dict,
1349
+ cache_position=cache_position,
1350
+ causal_attention_mask=causal_attention_mask,
1351
+ )
1352
+ else:
1353
+ outputs = self.model(
1354
+ input_ids=input_ids,
1355
+ attention_mask=attention_mask,
1356
+ position_ids=position_ids,
1357
+ past_key_values=past_key_values,
1358
+ inputs_embeds=inputs_embeds,
1359
+ use_cache=use_cache,
1360
+ output_attentions=output_attentions,
1361
+ output_hidden_states=output_hidden_states,
1362
+ return_dict=return_dict,
1363
+ cache_position=cache_position,
1364
+ causal_attention_mask=self.__causal_attention_mask,
1365
+ )
1366
+
1367
+
1368
+ hidden_states = outputs[0]
1369
+ logits = self.lm_head(hidden_states)
1370
+
1371
+ if self.config.final_logit_softcapping is not None:
1372
+ logits = logits / self.config.final_logit_softcapping
1373
+ logits = torch.tanh(logits)
1374
+ logits = logits * self.config.final_logit_softcapping
1375
+
1376
+
1377
+ logits = logits.float()
1378
+ loss = None
1379
+ if labels is not None:
1380
+ # we are doing next-token prediction; shift prediction scores and input ids by one
1381
+ num_image_tokens = self.model.image_patch_tokens
1382
+ shifted_logits = logits[:, num_image_tokens:-1, :].contiguous()
1383
+ labels = labels[:, 1:].contiguous()
1384
+ loss_fct = CrossEntropyLoss()
1385
+ loss = loss_fct(shifted_logits.view(-1, self.config.vocab_size), labels.view(-1))
1386
+
1387
+ if not return_dict:
1388
+
1389
+ output = (logits,) + outputs[1:]
1390
+ return (loss,) + output if loss is not None else output
1391
+
1392
+ return CausalLMOutputWithPast(
1393
+ loss=loss,
1394
+ logits=logits,
1395
+ past_key_values=outputs.past_key_values,
1396
+ hidden_states=outputs.hidden_states,
1397
+ attentions=outputs.attentions,
1398
+ )
1399
+
1400
+ def prepare_inputs_for_generation(
1401
+ self,
1402
+ input_ids,
1403
+ past_key_values=None,
1404
+ attention_mask=None,
1405
+ inputs_embeds=None,
1406
+ cache_position=None,
1407
+ position_ids=None,
1408
+ use_cache=True,
1409
+ **kwargs,
1410
+ ):
1411
+
1412
+
1413
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
1414
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
1415
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
1416
+ if past_key_values is not None:
1417
+ if inputs_embeds is not None: # Exception 1
1418
+ input_ids = input_ids[:, -cache_position.shape[0] :]
1419
+ elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
1420
+ input_ids = input_ids[:, cache_position]
1421
+
1422
+ if attention_mask is not None and position_ids is None:
1423
+ # create position_ids on the fly for batch generation
1424
+ position_ids = attention_mask.long().cumsum(-1) - 1
1425
+ position_ids.masked_fill_(attention_mask == 0, 1)
1426
+ if past_key_values:
1427
+ # print(f"+-+-+-+-+-+-+++ past_key_values +-+-+++- position_ids {position_ids.size()} ================= ")
1428
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1429
+ # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
1430
+ # `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride
1431
+ # during the decoding. Here, simply using `.contiguous()` is not sufficient as in the
1432
+ # batch size = 1 case, `position_ids` is already contiguous but with varying stride
1433
+ # which retriggers a capture.
1434
+ position_ids = position_ids.clone(memory_format=torch.contiguous_format)
1435
+ # print(f"+-+-+-+-+-+-+++ past_key_values +-+-+++- position_ids cmlone ==> {position_ids.size()} ================= ")
1436
+
1437
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1438
+ if inputs_embeds is not None and cache_position[0] == 0:
1439
+ #print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> first generation step>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><")
1440
+ model_inputs = {"inputs_embeds": inputs_embeds}
1441
+ else:
1442
+ # The clone here is for the same reason as for `position_ids`.
1443
+ # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> The clone here is for the same reason as for `position_ids` ==> input_ids input_ids.clone.>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><")
1444
+ model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format)}
1445
+
1446
+ if isinstance(past_key_values, HybridCache) and attention_mask.ndim == 2:
1447
+ if inputs_embeds is not None and input_ids.size(1)!= 0 :
1448
+ # NB: unpack the hidden dimension as well: batch_size, sequence_length, _ = inputs_embeds.shape
1449
+ batch_size, sequence_length, _ = inputs_embeds.shape
1450
+ device = inputs_embeds.device
1451
+ #print(f"1111111 +-+-+-+-+-+-+-+-+-+- sequence_length = inputs_embeds {sequence_length}")
1452
+ else:
1453
+ batch_size, sequence_length = position_ids.shape
1454
+ device = input_ids.device
1455
+ #print(f"22222222 +-+-+-+-+-+-+-+-+-+- sequence_length = input_ids.shape {sequence_length}")
1456
+
1457
+ dtype = self.lm_head.weight.dtype
1458
+ min_dtype = torch.finfo(dtype).min
1459
+
1460
+ attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
1461
+ attention_mask,
1462
+ sequence_length=sequence_length,
1463
+ target_length=past_key_values.get_max_length(),
1464
+ dtype=dtype,
1465
+ device=device,
1466
+ min_dtype=min_dtype,
1467
+ cache_position=cache_position,
1468
+ batch_size=batch_size,
1469
+ )
1470
+
1471
+
1472
+ model_inputs.update(
1473
+ {
1474
+ "position_ids": position_ids,
1475
+ "cache_position": cache_position,
1476
+ "past_key_values": past_key_values,
1477
+ "use_cache": use_cache,
1478
+ "attention_mask": attention_mask,
1479
+ }
1480
+ )
1481
+ return model_inputs
1482
+
1483
+ def generate(
1484
+ self,
1485
+ input_ids,
1486
+ pixel_values=None,
1487
+ max_length=None,
1488
+ do_sample=True,
1489
+ temperature=0.7,
1490
+ **kwargs
1491
+ ):
1492
+
1493
+
1494
+ if pixel_values is not None:
1495
+ if input_ids is not None:
1496
+
1497
+ inputs_embeds = self.get_input_embeddings()(input_ids)
1498
+ print("pixels")
1499
+ image_features = self._encode_image(pixel_values)
1500
+ inputs_embeds, causal_attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds )
1501
+ causal_attention_mask = create_git_attention_mask(tgt=input_ids, memory=image_features,max_length=max_length)
1502
+ causal_attention_mask=causal_attention_mask.to(input_ids.device)
1503
+ self.__causal_attention_mask=causal_attention_mask
1504
+ self.model.mode='vlm'
1505
+ result = super().generate(
1506
+ input_ids=None,
1507
+ inputs_embeds=inputs_embeds,
1508
+ max_length=max_length,
1509
+ do_sample=do_sample,
1510
+ temperature=temperature,
1511
+ **kwargs
1512
+ )
1513
+
1514
+ else:
1515
+
1516
+ self.model.mode = 'llm'
1517
+ result = super().generate(
1518
+ input_ids=input_ids,
1519
+ #inputs_embeds=None,
1520
+ max_length=max_length,
1521
+ do_sample=do_sample,
1522
+ temperature=temperature,
1523
+ **kwargs
1524
+ )
1525
+ self.__causal_attention_mask = None
1526
+
1527
+ return result
1528
+
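+ # Illustrative sketch (not part of the upstream code) of the loss alignment in
+ # FeynModelForCausalLM.forward: logits are cut at [num_image_tokens:-1] and
+ # labels at [1:], so each kept logit predicts the next text token. The token
+ # counts below are hypothetical (the real model uses 577 image patch tokens).
+ def _sketch_label_shift_with_image_tokens():
+     import torch
+
+     num_image_tokens, text_len, vocab = 3, 5, 11
+     logits = torch.randn(1, num_image_tokens + text_len, vocab)
+     labels = torch.randint(0, vocab, (1, text_len))
+     shifted_logits = logits[:, num_image_tokens:-1, :]
+     shifted_labels = labels[:, 1:]
+     assert shifted_logits.shape[1] == shifted_labels.shape[1] == text_len - 1
+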
preprocessor_config.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_florence2.Florence2Processor"
4
+ },
5
+ "crop_size": {
6
+ "height": 768,
7
+ "width": 768
8
+ },
9
+ "do_center_crop": false,
10
+ "do_convert_rgb": null,
11
+ "do_normalize": true,
12
+ "do_rescale": true,
13
+ "do_resize": true,
14
+ "image_mean": [
15
+ 0.485,
16
+ 0.456,
17
+ 0.406
18
+ ],
19
+ "image_processor_type": "CLIPImageProcessor",
20
+ "image_seq_length": 577,
21
+ "image_std": [
22
+ 0.229,
23
+ 0.224,
24
+ 0.225
25
+ ],
26
+ "processor_class": "Florence2Processor",
27
+ "resample": 3,
28
+ "rescale_factor": 0.00392156862745098,
29
+ "size": {
30
+ "height": 768,
31
+ "width": 768
32
+ }
33
+ }
processing_florence2.py ADDED
@@ -0,0 +1,1088 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for Florence-2.
17
+ """
18
+
19
+ import re
20
+ import logging
21
+ from typing import List, Optional, Union
22
+ import numpy as np
23
+
24
+ import torch
25
+
26
+ from transformers.feature_extraction_utils import BatchFeature
27
+ from transformers.image_utils import ImageInput, is_valid_image
28
+ from transformers.processing_utils import ProcessorMixin
29
+ from transformers.tokenization_utils_base import (
30
+ PaddingStrategy,
31
+ PreTokenizedInput,
32
+ TextInput,
33
+ TruncationStrategy,
34
+ )
35
+ from transformers.utils import TensorType
36
+
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ # Copied from transformers.models.idefics2.processing_idefics2.is_url
41
+ def is_url(val) -> bool:
42
+ return isinstance(val, str) and val.startswith("http")
43
+
44
+ # Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
45
+ def is_image_or_image_url(elem):
46
+ return is_url(elem) or is_valid_image(elem)
47
+
48
+
49
+ def _is_str_or_image(elem):
50
+ return isinstance(elem, (str)) or is_image_or_image_url(elem)
51
+
52
+
53
+ class Florence2Processor(ProcessorMixin):
54
+ r"""
55
+ Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor.
56
+
57
+ [`Florence2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BartTokenizerFast`]. See the
58
+ [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.
59
+
60
+ Args:
61
+ image_processor ([`CLIPImageProcessor`], *optional*):
62
+ The image processor is a required input.
63
+ tokenizer ([`BartTokenizerFast`], *optional*):
64
+ The tokenizer is a required input.
65
+ """
66
+
67
+ attributes = ["image_processor", "tokenizer"]
68
+ image_processor_class = "CLIPImageProcessor"
69
+ tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
70
+
71
+ def __init__(
72
+ self,
73
+ image_processor=None,
74
+ tokenizer=None,
75
+ ):
76
+ if image_processor is None:
77
+ raise ValueError("You need to specify an `image_processor`.")
78
+ if tokenizer is None:
79
+ raise ValueError("You need to specify a `tokenizer`.")
80
+ if not hasattr(image_processor, "image_seq_length"):
81
+ raise ValueError("Image processor is missing an `image_seq_length` attribute.")
82
+
83
+ self.image_seq_length = image_processor.image_seq_length
84
+
85
+ tokens_to_add = {
86
+ 'additional_special_tokens': \
87
+ tokenizer.additional_special_tokens + \
88
+ ['<od>', '</od>', '<ocr>', '</ocr>'] + \
89
+ [f'<loc_{x}>' for x in range(1000)] + \
90
+ ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
91
+ }
92
+ tokenizer.add_special_tokens(tokens_to_add)
93
+
94
+ self.tasks_answer_post_processing_type = {
95
+ '<OCR>': 'pure_text',
96
+ '<OCR_WITH_REGION>': 'ocr',
97
+ '<CAPTION>': 'pure_text',
98
+ '<DETAILED_CAPTION>': 'pure_text',
99
+ '<MORE_DETAILED_CAPTION>': 'pure_text',
100
+ '<OD>': 'description_with_bboxes',
101
+ '<DENSE_REGION_CAPTION>': 'description_with_bboxes',
102
+ '<CAPTION_TO_PHRASE_GROUNDING>': "phrase_grounding",
103
+ '<REFERRING_EXPRESSION_SEGMENTATION>': 'polygons',
104
+ '<REGION_TO_SEGMENTATION>': 'polygons',
105
+ '<OPEN_VOCABULARY_DETECTION>': 'description_with_bboxes_or_polygons',
106
+ '<REGION_TO_CATEGORY>': 'pure_text',
107
+ '<REGION_TO_DESCRIPTION>': 'pure_text',
108
+ '<REGION_TO_OCR>': 'pure_text',
109
+ '<REGION_PROPOSAL>': 'bboxes'
110
+ }
111
+
112
+ self.task_prompts_without_inputs = {
113
+ '<OCR>': 'What is the text in the image?',
114
+ '<OCR_WITH_REGION>': 'What is the text in the image, with regions?',
115
+ '<CAPTION>': 'What does the image describe?',
116
+ '<DETAILED_CAPTION>': 'Describe in detail what is shown in the image.',
117
+ '<MORE_DETAILED_CAPTION>': 'Describe with a paragraph what is shown in the image.',
118
+ '<OD>': 'Locate the objects with category name in the image.',
119
+ '<DENSE_REGION_CAPTION>': 'Locate the objects in the image, with their descriptions.',
120
+ '<REGION_PROPOSAL>': 'Locate the region proposals in the image.'
121
+ }
122
+
123
+ self.task_prompts_with_input = {
124
+ '<CAPTION_TO_PHRASE_GROUNDING>': "Locate the phrases in the caption: {input}",
125
+ '<REFERRING_EXPRESSION_SEGMENTATION>': 'Locate {input} in the image with mask',
126
+ '<REGION_TO_SEGMENTATION>': 'What is the polygon mask of region {input}',
127
+ '<OPEN_VOCABULARY_DETECTION>': 'Locate {input} in the image.',
128
+ '<REGION_TO_CATEGORY>': 'What is the region {input}?',
129
+ '<REGION_TO_DESCRIPTION>': 'What does the region {input} describe?',
130
+ '<REGION_TO_OCR>': 'What text is in the region {input}?',
131
+ }
132
+
133
+ self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)
134
+
135
+
136
+ super().__init__(image_processor, tokenizer)
137
+
138
+ def _construct_prompts(self, text):
139
+ # replace the task tokens with the task prompts if task token is in the text
140
+ prompts = []
141
+ for _text in text:
142
+ # 1. fixed task prompts without additional inputs
143
+ for task_token, task_prompt in self.task_prompts_without_inputs.items():
144
+ if task_token in _text:
145
+ assert _text == task_token, f"Task token {task_token} should be the only token in the text."
146
+ _text = task_prompt
147
+ break
148
+ # 2. task prompts with additional inputs
149
+ for task_token, task_prompt in self.task_prompts_with_input.items():
150
+ if task_token in _text:
151
+ _text = task_prompt.format(input=_text.replace(task_token, ''))
152
+ break
153
+ prompts.append(_text)
154
+ return prompts
155
+
156
+ def __call__(
157
+ self,
158
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
159
+ images: ImageInput = None,
160
+ tokenize_newline_separately: bool = True,
161
+ padding: Union[bool, str, PaddingStrategy] = False,
162
+ truncation: Union[bool, str, TruncationStrategy] = None,
163
+ max_length=None,
164
+ return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
165
+ do_resize: bool = None,
166
+ do_normalize: bool = None,
167
+ image_mean: Optional[Union[float, List[float]]] = None,
168
+ image_std: Optional[Union[float, List[float]]] = None,
169
+ data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821
170
+ input_data_format: Optional[
171
+ Union[str, "ChannelDimension"] # noqa: F821
172
+ ] = None,
173
+ resample: "PILImageResampling" = None, # noqa: F821
174
+ do_convert_rgb: bool = None,
175
+ do_thumbnail: bool = None,
176
+ do_align_long_axis: bool = None,
177
+ do_rescale: bool = None,
178
+ ) -> BatchFeature:
179
+ """
180
+ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
181
+ and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
182
+ the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
183
+ CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
184
+ of the above two methods for more information.
185
+
186
+ Args:
187
+ text (`str`, `List[str]`, `List[List[str]]`):
188
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
189
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
190
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
191
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
192
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
193
+ tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
194
+ number of channels, H and W are image height and width.
195
+ tokenize_newline_separately (`bool`, defaults to `True`):
196
+ Adds a separately tokenized '\n' at the end of the prompt.
197
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
198
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding
199
+ index) among:
200
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
201
+ sequence if provided).
202
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
203
+ acceptable input length for the model if that argument is not provided.
204
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
205
+ lengths).
206
+ max_length (`int`, *optional*):
207
+ Maximum length of the returned list and optionally padding length (see above).
208
+ truncation (`bool`, *optional*):
209
+ Activates truncation to cut input sequences longer than `max_length` to `max_length`.
210
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
211
+ If set, will return tensors of a particular framework. Acceptable values are:
212
+
213
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
214
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
215
+ - `'np'`: Return NumPy `np.ndarray` objects.
216
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
217
+
218
+ Returns:
219
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
220
+
221
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
222
+ is provided, the `input_ids` will also contain the suffix input ids.
223
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
224
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
225
+ `None`).
226
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
227
+ - **labels** -- Labels compatible with training if `suffix` is not None
228
+ """
229
+
230
+ return_token_type_ids = False
231
+
232
+ if images is None:
233
+ raise ValueError("`images` are expected as arguments to a `Florence2Processor` instance.")
234
+ if text is None:
235
+ logger.warning_once(
236
+ "You are using Florence-2 without a text prompt."
237
+ )
238
+ text = ""
239
+
240
+ if isinstance(text, List) and isinstance(images, List):
241
+ if len(images) < len(text):
242
+ raise ValueError(
243
+ f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
244
+ )
245
+ if _is_str_or_image(text):
246
+ text = [text]
247
+ elif isinstance(text, list) and _is_str_or_image(text[0]):
248
+ pass
249
+
250
+ pixel_values = self.image_processor(
251
+ images,
252
+ do_resize=do_resize,
253
+ do_normalize=do_normalize,
254
+ return_tensors=return_tensors,
255
+ image_mean=image_mean,
256
+ image_std=image_std,
257
+ input_data_format=input_data_format,
258
+ data_format=data_format,
259
+ resample=resample,
260
+ do_convert_rgb=do_convert_rgb,
261
+ )["pixel_values"]
262
+
263
+ if max_length is not None:
264
+ max_length -= self.image_seq_length # max_length has to account for the image tokens
265
+
266
+ text = self._construct_prompts(text)
267
+
268
+ inputs = self.tokenizer(
269
+ text,
270
+ return_tensors=return_tensors,
271
+ padding=padding,
272
+ max_length=max_length,
273
+ truncation=truncation,
274
+ return_token_type_ids=return_token_type_ids,
275
+ )
276
+
277
+ return_data = {**inputs, "pixel_values": pixel_values}
278
+
279
+ if return_token_type_ids:
280
+ labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
281
+ return_data.update({"labels": labels})
282
+ return BatchFeature(data=return_data)
283
+
284
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
285
+ def batch_decode(self, *args, **kwargs):
286
+ """
287
+ This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
288
+ refer to the docstring of this method for more information.
289
+ """
290
+ return self.tokenizer.batch_decode(*args, **kwargs)
291
+
292
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
293
+ def decode(self, *args, **kwargs):
294
+ """
295
+ This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
296
+ the docstring of this method for more information.
297
+ """
298
+ return self.tokenizer.decode(*args, **kwargs)
299
+
300
+ @property
301
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
302
+ def model_input_names(self):
303
+ tokenizer_input_names = self.tokenizer.model_input_names
304
+ image_processor_input_names = self.image_processor.model_input_names
305
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
306
+
307
+ def post_process_generation(self, text, task, image_size):
308
+ """
309
+ Post-process the output of the model to each of the task outputs.
310
+
311
+ Args:
312
+ text (`str`): The text to post-process.
313
+ task (`str`): The task to post-process the text for.
314
+ image_size (`Tuple[int, int]`): The size of the image. height x width.
315
+ """
316
+
317
+ task_answer_post_processing_type = self.tasks_answer_post_processing_type.get(task, 'pure_text')
318
+ task_answer = self.post_processor(
319
+ text=text,
320
+ image_size=image_size,
321
+ parse_tasks=task_answer_post_processing_type,
322
+ )[task_answer_post_processing_type]
323
+
324
+ if task_answer_post_processing_type == 'pure_text':
325
+ final_answer = task_answer
326
+ # remove the special tokens
327
+ final_answer = final_answer.replace('<s>', '').replace('</s>', '')
328
+ elif task_answer_post_processing_type in ['od', 'description_with_bboxes', 'bboxes']:
329
+ od_instances = task_answer
330
+ bboxes_od = [_od_instance['bbox'] for _od_instance in od_instances]
331
+ labels_od = [str(_od_instance['cat_name']) for _od_instance in od_instances]
332
+ final_answer = {'bboxes': bboxes_od, 'labels': labels_od}
333
+ elif task_answer_post_processing_type in ['ocr']:
334
+ bboxes = [_od_instance['quad_box'] for _od_instance in task_answer]
335
+ labels = [str(_od_instance['text']) for _od_instance in task_answer]
336
+ final_answer = {'quad_boxes': bboxes, 'labels': labels}
337
+ elif task_answer_post_processing_type in ['phrase_grounding']:
338
+ bboxes = []
339
+ labels = []
340
+ for _grounded_phrase in task_answer:
341
+ for _bbox in _grounded_phrase['bbox']:
342
+ bboxes.append(_bbox)
343
+ labels.append(_grounded_phrase['cat_name'])
344
+ final_answer = {'bboxes': bboxes, 'labels': labels}
345
+ elif task_answer_post_processing_type in ['description_with_polygons', 'polygons']:
346
+ labels = []
347
+ polygons = []
348
+ for result in task_answer:
349
+ label = result['cat_name']
350
+ _polygons = result['polygons']
351
+ labels.append(label)
352
+ polygons.append(_polygons)
353
+ final_answer = {'polygons': polygons, 'labels': labels}
354
+ elif task_answer_post_processing_type in ['description_with_bboxes_or_polygons']:
355
+ bboxes = []
356
+ bboxes_labels = []
357
+ polygons = []
358
+ polygons_labels = []
359
+ for result in task_answer:
360
+ label = result['cat_name']
361
+ if 'polygons' in result:
362
+ _polygons = result['polygons']
363
+ polygons.append(_polygons)
364
+ polygons_labels.append(label)
365
+ else:
366
+ _bbox = result['bbox']
367
+ bboxes.append(_bbox)
368
+ bboxes_labels.append(label)
369
+ final_answer = {'bboxes': bboxes, 'bboxes_labels': bboxes_labels, 'polygons': polygons, 'polygons_labels': polygons_labels}
370
+ else:
371
+ raise ValueError('Unknown task answer post processing type: {}'.format(task_answer_post_processing_type))
372
+
373
+ final_answer = {
374
+ task: final_answer}
375
+ return final_answer
376
+
377
+ class BoxQuantizer(object):
378
+ def __init__(self, mode, bins):
379
+ self.mode = mode
380
+ self.bins = bins
381
+
382
+ def quantize(self, boxes: torch.Tensor, size):
383
+ bins_w, bins_h = self.bins # Quantization bins.
384
+ size_w, size_h = size # Original image size.
385
+ size_per_bin_w = size_w / bins_w
386
+ size_per_bin_h = size_h / bins_h
387
+ xmin, ymin, xmax, ymax = boxes.split(1, dim=-1) # Shape: 4 * [N, 1].
388
+
389
+ if self.mode == 'floor':
390
+ quantized_xmin = (
391
+ xmin / size_per_bin_w).floor().clamp(0, bins_w - 1)
392
+ quantized_ymin = (
393
+ ymin / size_per_bin_h).floor().clamp(0, bins_h - 1)
394
+ quantized_xmax = (
395
+ xmax / size_per_bin_w).floor().clamp(0, bins_w - 1)
396
+ quantized_ymax = (
397
+ ymax / size_per_bin_h).floor().clamp(0, bins_h - 1)
398
+
399
+ elif self.mode == 'round':
400
+ raise NotImplementedError()
401
+
402
+ else:
403
+ raise ValueError('Incorrect quantization type.')
404
+
405
+ quantized_boxes = torch.cat(
406
+ (quantized_xmin, quantized_ymin, quantized_xmax, quantized_ymax), dim=-1
407
+ ).int()
408
+
409
+ return quantized_boxes
410
+
411
+ def dequantize(self, boxes: torch.Tensor, size):
412
+ bins_w, bins_h = self.bins # Quantization bins.
413
+ size_w, size_h = size # Original image size.
414
+ size_per_bin_w = size_w / bins_w
415
+ size_per_bin_h = size_h / bins_h
416
+ xmin, ymin, xmax, ymax = boxes.split(1, dim=-1) # Shape: 4 * [N, 1].
417
+
418
+ if self.mode == 'floor':
419
+ # Add 0.5 to use the center position of the bin as the coordinate.
420
+ dequantized_xmin = (xmin + 0.5) * size_per_bin_w
421
+ dequantized_ymin = (ymin + 0.5) * size_per_bin_h
422
+ dequantized_xmax = (xmax + 0.5) * size_per_bin_w
423
+ dequantized_ymax = (ymax + 0.5) * size_per_bin_h
424
+
425
+ elif self.mode == 'round':
426
+ raise NotImplementedError()
427
+
428
+ else:
429
+ raise ValueError('Incorrect quantization type.')
430
+
431
+ dequantized_boxes = torch.cat(
432
+ (dequantized_xmin, dequantized_ymin,
433
+ dequantized_xmax, dequantized_ymax), dim=-1
434
+ )
435
+
436
+ return dequantized_boxes
437
+
438
+
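To make the floor-mode round trip above concrete, here is a minimal sketch (not part of the diff), assuming 1000x1000 bins and a 640x480 image; the numbers follow directly from the formulas above:

import torch

quantizer = BoxQuantizer(mode='floor', bins=(1000, 1000))
box = torch.tensor([[32.0, 24.0, 320.0, 240.0]])   # xyxy box in pixels on a 640x480 image
q = quantizer.quantize(box, size=(640, 480))        # -> tensor([[ 50,  50, 500, 500]])
d = quantizer.dequantize(q, size=(640, 480))        # bin centers: [[32.32, 24.24, 320.32, 240.24]]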
439
+ class CoordinatesQuantizer(object):
440
+ """
441
+ Quantize coordinates (Nx2)
442
+ """
443
+
444
+ def __init__(self, mode, bins):
445
+ self.mode = mode
446
+ self.bins = bins
447
+
448
+ def quantize(self, coordinates: torch.Tensor, size):
449
+ bins_w, bins_h = self.bins # Quantization bins.
450
+ size_w, size_h = size # Original image size.
451
+ size_per_bin_w = size_w / bins_w
452
+ size_per_bin_h = size_h / bins_h
453
+ assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
454
+ x, y = coordinates.split(1, dim=-1) # Shape: 2 * [N, 1].
455
+
456
+ if self.mode == 'floor':
457
+ quantized_x = (x / size_per_bin_w).floor().clamp(0, bins_w - 1)
458
+ quantized_y = (y / size_per_bin_h).floor().clamp(0, bins_h - 1)
459
+
460
+ elif self.mode == 'round':
461
+ raise NotImplementedError()
462
+
463
+ else:
464
+ raise ValueError('Incorrect quantization type.')
465
+
466
+ quantized_coordinates = torch.cat(
467
+ (quantized_x, quantized_y), dim=-1
468
+ ).int()
469
+
470
+ return quantized_coordinates
471
+
472
+ def dequantize(self, coordinates: torch.Tensor, size):
473
+ bins_w, bins_h = self.bins # Quantization bins.
474
+ size_w, size_h = size # Original image size.
475
+ size_per_bin_w = size_w / bins_w
476
+ size_per_bin_h = size_h / bins_h
477
+ assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
478
+ x, y = coordinates.split(1, dim=-1) # Shape: 2 * [N, 1].
479
+
480
+ if self.mode == 'floor':
481
+ # Add 0.5 to use the center position of the bin as the coordinate.
482
+ dequantized_x = (x + 0.5) * size_per_bin_w
483
+ dequantized_y = (y + 0.5) * size_per_bin_h
484
+
485
+ elif self.mode == 'round':
486
+ raise NotImplementedError()
487
+
488
+ else:
489
+ raise ValueError('Incorrect quantization type.')
490
+
491
+ dequantized_coordinates = torch.cat(
492
+ (dequantized_x, dequantized_y), dim=-1
493
+ )
494
+
495
+ return dequantized_coordinates
496
+
497
+
498
+ class Florence2PostProcesser(object):
499
+ """
500
+ Florence-2 post process for converting text prediction to various tasks results.
501
+
502
+ Args:
503
+ config: A dict of configs.
504
+ tokenizer: A tokenizer for decoding text to spans.
505
+ sample config:
506
+ UNIFIED_POST_PROCESS:
507
+ # common configs
508
+ NUM_BBOX_HEIGHT_BINS: 1000
509
+ NUM_BBOX_WIDTH_BINS: 1000
510
+ COORDINATES_HEIGHT_BINS: 1000
511
+ COORDINATES_WIDTH_BINS: 1000
512
+ # task specific configs, override the common configs
513
+ PARSE_TASKS:
514
+ - TASK_NAME: 'video_dense_caption'
515
+ PATTERN: '<time_(\d+)><time_(\d+)>([a-zA-Z0-9 ]+)'
516
+ SCORE_MODE: 'avg_cat_name_scores'
517
+ NUM_BINS: 100
518
+ - TASK_NAME: 'od'
519
+ PATTERN: '<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>([a-zA-Z0-9 ]+)'
520
+ SCORE_MODE: 'avg_cat_name_scores'
521
+
522
+ Returns:
523
+ parsed_dict (dict): A dict of parsed results.
524
+ """
525
+ def __init__(
526
+ self,
527
+ tokenizer=None
528
+ ):
529
+ parse_tasks = []
530
+ parse_task_configs = {}
531
+ config = self._create_default_config()
532
+ for task in config['PARSE_TASKS']:
533
+ parse_tasks.append(task['TASK_NAME'])
534
+ parse_task_configs[task['TASK_NAME']] = task
535
+
536
+ self.config = config
537
+ self.parse_tasks = parse_tasks
538
+ self.parse_tasks_configs = parse_task_configs
539
+
540
+ self.tokenizer = tokenizer
541
+ if self.tokenizer is not None:
542
+ self.all_special_tokens = set(self.tokenizer.all_special_tokens)
543
+
544
+ self.init_quantizers()
545
+ self.black_list_of_phrase_grounding = self._create_black_list_of_phrase_grounding()
546
+
547
+ def _create_black_list_of_phrase_grounding(self):
548
+ black_list = {}
549
+
550
+ if 'phrase_grounding' in self.parse_tasks and self.parse_tasks_configs['phrase_grounding']['FILTER_BY_BLACK_LIST']:
551
+ black_list = set(
552
+ ['it', 'I', 'me', 'mine',
553
+ 'you', 'your', 'yours',
554
+ 'he', 'him', 'his',
555
+ 'she', 'her', 'hers',
556
+ 'they', 'them', 'their', 'theirs',
557
+ 'one', 'oneself',
558
+ 'we', 'us', 'our', 'ours',
559
+ 'you', 'your', 'yours',
560
+ 'they', 'them', 'their', 'theirs',
561
+ 'mine', 'yours', 'his', 'hers', 'its',
562
+ 'ours', 'yours', 'theirs',
563
+ 'myself', 'yourself', 'himself', 'herself', 'itself',
564
+ 'ourselves', 'yourselves', 'themselves',
565
+ 'this', 'that',
566
+ 'these', 'those',
567
+ 'who', 'whom', 'whose', 'which', 'what',
568
+ 'who', 'whom', 'whose', 'which', 'that',
569
+ 'all', 'another', 'any', 'anybody', 'anyone', 'anything',
570
+ 'each', 'everybody', 'everyone', 'everything',
571
+ 'few', 'many', 'nobody', 'none', 'one', 'several',
572
+ 'some', 'somebody', 'someone', 'something',
573
+ 'each other', 'one another',
574
+ 'myself', 'yourself', 'himself', 'herself', 'itself',
575
+ 'ourselves', 'yourselves', 'themselves',
576
+ 'the image', 'image', 'images', 'the', 'a', 'an', 'a group',
577
+ 'other objects', 'lots', 'a set',
578
+ ]
579
+ )
580
+
581
+ return black_list
582
+
583
+ def _create_default_config(self):
584
+ config = {
585
+ 'NUM_BBOX_HEIGHT_BINS': 1000,
586
+ 'NUM_BBOX_WIDTH_BINS': 1000,
587
+ 'BOX_QUANTIZATION_MODE': 'floor',
588
+ 'COORDINATES_HEIGHT_BINS': 1000,
589
+ 'COORDINATES_WIDTH_BINS': 1000,
590
+ 'COORDINATES_QUANTIZATION_MODE': 'floor',
591
+ 'PARSE_TASKS': [
592
+ {
593
+ 'TASK_NAME': 'od',
594
+ 'PATTERN': r'([a-zA-Z0-9 ]+)<loc_(\\d+)><loc_(\\d+)><loc_(\\d+)><loc_(\\d+)>'
595
+ },
596
+ {
597
+ 'TASK_NAME': 'ocr',
598
+ 'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
599
+ 'AREA_THRESHOLD': 0.00
600
+ },
601
+ {
602
+ 'TASK_NAME': 'phrase_grounding',
603
+ 'FILTER_BY_BLACK_LIST': True
604
+ },
605
+ {
606
+ 'TASK_NAME': 'pure_text',
607
+ },
608
+ {
609
+ 'TASK_NAME': 'description_with_bboxes',
610
+ },
611
+ {
612
+ 'TASK_NAME': 'description_with_polygons',
613
+ },
614
+ {
615
+ 'TASK_NAME': 'polygons',
616
+ },
617
+ {
618
+ 'TASK_NAME': 'bboxes',
619
+ },
620
+ {
621
+ 'TASK_NAME': 'description_with_bboxes_or_polygons',
622
+ }
623
+ ]
624
+ }
625
+
626
+ return config
627
+
628
+ def init_quantizers(self):
629
+ # we have box_quantizer (od, grounding) and coordinates_quantizer (ocr, referring_segmentation)
630
+ num_bbox_height_bins = self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
631
+ num_bbox_width_bins = self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
632
+ box_quantization_mode = self.config.get('BOX_QUANTIZATION_MODE', 'floor')
633
+ self.box_quantizer = BoxQuantizer(
634
+ box_quantization_mode,
635
+ (num_bbox_width_bins, num_bbox_height_bins),
636
+ )
637
+
638
+ num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
639
+ num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
640
+ box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
641
+ self.coordinates_quantizer = CoordinatesQuantizer(
642
+ box_quantization_mode,
643
+ (num_bbox_width_bins, num_bbox_height_bins),
644
+ )
645
+
646
+ def decode_with_spans(self, tokenizer, token_ids):
647
+ filtered_tokens = tokenizer.convert_ids_to_tokens(
648
+ token_ids, skip_special_tokens=False)
649
+ assert len(filtered_tokens) == len(token_ids)
650
+
651
+ # To avoid mixing byte-level and unicode for byte-level BPE
652
+ # we need to build string separately for added tokens and byte-level tokens
653
+ # cf. https://github.com/huggingface/transformers/issues/1133
654
+ sub_texts = []
655
+ for token in filtered_tokens:
656
+ if token in self.all_special_tokens:
657
+ sub_texts.append(token)
658
+ else:
659
+ if isinstance(tokenizer, (BartTokenizer, BartTokenizerFast)):
660
+ sub_text = tokenizer.convert_tokens_to_string([token])
661
+ elif isinstance(tokenizer, (T5Tokenizer, T5TokenizerFast)):
662
+ # Ref: https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol
663
+ # Note: Do not strip sub_text as it may have functional whitespace
664
+ sub_text = token.replace('▁', ' ')
665
+ else:
666
+ raise ValueError(f'type {type(tokenizer)} not supported')
667
+ sub_texts.append(sub_text)
668
+
669
+ text = ''
670
+ spans = []
671
+ for sub_text in sub_texts:
672
+ span = (len(text), len(text) + len(sub_text)) # [start index, end index).
673
+ text += sub_text
674
+ spans.append(span)
675
+
676
+ # Text format:
677
+ # 1. T5Tokenizer/T5TokenizerFast:
678
+ # "<loc_1><loc_2><loc_3><loc_4> transplanting dog<loc_1><loc_2><loc_3><loc_4> cat</s>"
679
+ # Equivalent to t5_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
680
+ # 2. BartTokenizer (need to double check):
681
+ # "<s><loc_1><loc_2><loc_3><loc_4>transplanting dog<loc_1><loc_2><loc_3><loc_4>cat</s>"
682
+ # Equivalent to bart_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
683
+ return text, spans
684
+
685
+ def parse_od_from_text_and_spans(
686
+ self,
687
+ text,
688
+ pattern,
689
+ image_size,
690
+ phrase_centric=False
691
+ ):
692
+ parsed = list(re.finditer(pattern, text))
693
+
694
+ instances = []
695
+ for i in range(len(parsed)):
696
+ # Prepare instance.
697
+ instance = {}
698
+
699
+ if phrase_centric:
700
+ bbox_bins = [int(parsed[i].group(j)) for j in range(2, 6)]
701
+ else:
702
+ bbox_bins = [int(parsed[i].group(j)) for j in range(1, 5)]
703
+ instance['bbox'] = self.box_quantizer.dequantize(
704
+ boxes=torch.tensor(bbox_bins),
705
+ size=image_size
706
+ ).tolist()
707
+
708
+ if phrase_centric:
709
+ instance['cat_name'] = parsed[i].group(1).lower().strip()
710
+ else:
711
+ instance['cat_name'] = parsed[i].group(5).lower().strip()
712
+ instances.append(instance)
713
+
714
+ return instances
715
+
716
+ def parse_ocr_from_text_and_spans(self,
717
+ text,
718
+ pattern,
719
+ image_size,
720
+ area_threshold=-1.0,
721
+ ):
722
+ bboxes = []
723
+ labels = []
724
+ text = text.replace('<s>', '')
725
+ # ocr with regions
726
+ parsed = re.findall(pattern, text)
727
+ instances = []
728
+ image_width, image_height = image_size
729
+
730
+ for ocr_line in parsed:
731
+ ocr_content = ocr_line[0]
732
+ quad_box = ocr_line[1:]
733
+ quad_box = [int(i) for i in quad_box]
734
+ quad_box = self.coordinates_quantizer.dequantize(
735
+ torch.tensor(np.array(quad_box).reshape(-1, 2)),
736
+ size=image_size
737
+ ).reshape(-1).tolist()
738
+
739
+ if area_threshold > 0:
740
+ x_coords = [i for i in quad_box[0::2]]
741
+ y_coords = [i for i in quad_box[1::2]]
742
+
743
+ # apply the Shoelace formula
744
+ area = 0.5 * abs(sum(x_coords[i] * y_coords[(i + 1) % 4] - x_coords[(i + 1) % 4] * y_coords[i] for i in range(4)))  # closed polygon: include the wrap-around edge
745
+
746
+ if area < (image_width * image_height) * area_threshold:
747
+ continue
748
+
749
+ bboxes.append(quad_box)
750
+ labels.append(ocr_content)
751
+ instances.append({
752
+ 'quad_box': quad_box,
753
+ 'text': ocr_content,
754
+ })
755
+ return instances
756
+
757
+ def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
758
+ # ignore <s> </s> and <pad>
759
+ cur_span = 0
760
+ if text.startswith('<s>'):
761
+ cur_span += 3
762
+
763
+ text = text.replace('<s>', '')
764
+ text = text.replace('</s>', '')
765
+ text = text.replace('<pad>', '')
766
+
767
+ pattern = r"([^<]+(?:<loc_\d+>){4,})"
768
+ phrases = re.findall(pattern, text)
769
+
770
+ # pattern should be text pattern and od pattern
771
+ pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
772
+ box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
773
+
774
+ instances = []
775
+ for pharse_text in phrases:
776
+ phrase_text_strip = pharse_text.replace('<ground>', '', 1)
777
+ phrase_text_strip = phrase_text_strip.replace('<obj>', '', 1)  # chain so both markers are stripped
778
+
779
+ if phrase_text_strip == '':
780
+ cur_span += len(pharse_text)
781
+ continue
782
+
783
+ # Prepare instance.
784
+ instance = {}
785
+
786
+ # parse phrase, get string
787
+ phrase = re.search(pattern, phrase_text_strip)
788
+ if phrase is None:
789
+ cur_span += len(pharse_text)
790
+ continue
791
+
792
+ # parse bboxes by box_pattern
793
+ bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
794
+ if len(bboxes_parsed) == 0:
795
+ cur_span += len(pharse_text)
796
+ continue
797
+
798
+ phrase = phrase.group()
799
+ # remove leading and trailing spaces
800
+ phrase = phrase.strip()
801
+
802
+ if phrase in self.black_list_of_phrase_grounding:
803
+ cur_span += len(pharse_text)
804
+ continue
805
+
806
+ # a list of list
807
+ bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
808
+ instance['bbox'] = self.box_quantizer.dequantize(
809
+ boxes=torch.tensor(bbox_bins),
810
+ size=image_size
811
+ ).tolist()
812
+
813
+ # exclude non-ascii characters
814
+ phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
815
+ instance['cat_name'] = phrase
816
+
817
+ instances.append(instance)
818
+
819
+ return instances
820
+
821
+ def parse_description_with_bboxes_from_text_and_spans(self, text, pattern, image_size, allow_empty_phrase=False):
822
+ # temporary parse solution, split by '.'
823
+ # ignore <s> </s> and <pad>
824
+
825
+ text = text.replace('<s>', '')
826
+ text = text.replace('</s>', '')
827
+ text = text.replace('<pad>', '')
828
+
829
+ if allow_empty_phrase:
830
+ pattern = rf"(?:(?:<loc_\d+>){{4,}})"
831
+ else:
832
+ pattern = r"([^<]+(?:<loc_\d+>){4,})"
833
+ phrases = re.findall(pattern, text)
834
+
835
+ # pattern should be text pattern and od pattern
836
+ pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
837
+ box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
838
+
839
+ instances = []
840
+ for pharse_text in phrases:
841
+ phrase_text_strip = pharse_text.replace('<ground>', '', 1)
842
+ phrase_text_strip = phrase_text_strip.replace('<obj>', '', 1)  # chain so both markers are stripped
843
+
844
+ if phrase_text_strip == '' and not allow_empty_phrase:
845
+ continue
846
+
847
+ # parse phrase, get string
848
+ phrase = re.search(pattern, phrase_text_strip)
849
+ if phrase is None:
850
+ continue
851
+
852
+ phrase = phrase.group()
853
+ # remove leading and trailing spaces
854
+ phrase = phrase.strip()
855
+
856
+ # parse bboxes by box_pattern
857
+ bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
858
+ if len(bboxes_parsed) == 0:
859
+ continue
860
+
861
+ # a list of list
862
+ bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
863
+
864
+ bboxes = self.box_quantizer.dequantize(
865
+ boxes=torch.tensor(bbox_bins),
866
+ size=image_size
867
+ ).tolist()
868
+
869
+ phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
870
+ for _bboxes in bboxes:
871
+ # Prepare instance.
872
+ instance = {}
873
+ instance['bbox'] = _bboxes
874
+ # exclude non-ascii characters
875
+ instance['cat_name'] = phrase
876
+ instances.append(instance)
877
+
878
+ return instances
879
+
880
+ def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
881
+ allow_empty_phrase=False,
882
+ polygon_sep_token='<sep>',
883
+ polygon_start_token='<poly>',
884
+ polygon_end_token='</poly>',
885
+ with_box_at_start=False,
886
+ ):
887
+
888
+ # ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
889
+ # ignore <s> </s> and <pad>
890
+
891
+ text = text.replace('<s>', '')
892
+ text = text.replace('</s>', '')
893
+ text = text.replace('<pad>', '')
894
+
895
+ if allow_empty_phrase:
896
+ pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
897
+ else:
898
+ # [^<]+: This part matches one or more characters that are not the < symbol.
899
+ # The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
900
+ #
901
+ pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
902
+ phrases = re.findall(pattern, text)
903
+
904
+ phrase_string_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|<poly>)'
905
+ box_pattern = rf'((?:<loc_\d+>)+)(?:{re.escape(polygon_sep_token)}|$)'
906
+
907
+ # one polygons instance is separated by polygon_start_token and polygon_end_token
908
+ polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'
909
+
910
+ instances = []
911
+ for phrase_text in phrases:
912
+
913
+ # exclude loc_\d+>
914
+ # need to get span if want to include category score
915
+ phrase_text_strip = re.sub(r'^loc_\d+>', '', phrase_text, count=1)
916
+
917
+ # phrase = phrase.replace('<poly>', '')
918
+ # phrase = phrase.replace('poly>', '')
919
+
920
+ if phrase_text_strip == '' and not allow_empty_phrase:
921
+ continue
922
+
923
+
924
+ # parse phrase, get string
925
+ phrase = re.search(phrase_string_pattern, phrase_text_strip)
926
+ if phrase is None:
927
+ continue
928
+ phrase = phrase.group()
929
+ # remove leading and trailing spaces
930
+ phrase = phrase.strip()
931
+
932
+ # parse bboxes by box_pattern
933
+
934
+ # split by polygon_start_token and polygon_end_token first using polygons_instance_pattern
935
+ if polygon_start_token in phrase_text and polygon_end_token in phrase_text:
936
+ polygons_instances_parsed = list(re.finditer(polygons_instance_pattern, phrase_text))
937
+ else:
938
+ polygons_instances_parsed = [phrase_text]
939
+
940
+ for _polygons_instances_parsed in polygons_instances_parsed:
941
+ # Prepare instance.
942
+ instance = {}
943
+
944
+ # polygons_parsed= list(re.finditer(box_pattern, phrase_text))
945
+ if isinstance(_polygons_instances_parsed, str):
946
+ polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed))
947
+ else:
948
+ polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
949
+ if len(polygons_parsed) == 0:
950
+ continue
951
+
952
+ # a list of list (polygon)
953
+ bbox = []
954
+ polygons = []
955
+ for _polygon_parsed in polygons_parsed:
956
+ # group 1: the full run of <loc_\d+> tokens for this polygon
957
+ _polygon = _polygon_parsed.group(1)
958
+ # parse into list of int
959
+ _polygon = [int(_loc_parsed.group(1)) for _loc_parsed in re.finditer(r'<loc_(\d+)>', _polygon)]
960
+ if with_box_at_start and len(bbox) == 0:
961
+ if len(_polygon) > 4:
962
+ # the leading four coordinates are the bbox prediction; the rest form the polygon
963
+ bbox = _polygon[:4]
964
+ _polygon = _polygon[4:]
965
+ else:
966
+ bbox = [0, 0, 0, 0]  # no valid bbox prediction
967
+ # abandon last element if is not paired
968
+ if len(_polygon) % 2 == 1:
969
+ _polygon = _polygon[:-1]
970
+
971
+ # reshape into (n, 2)
972
+ _polygon = self.coordinates_quantizer.dequantize(
973
+ torch.tensor(np.array(_polygon).reshape(-1, 2)),
974
+ size=image_size
975
+ ).reshape(-1).tolist()
976
+ # reshape back
977
+ polygons.append(_polygon)
978
+
979
+ instance['cat_name'] = phrase
980
+ instance['polygons'] = polygons
981
+ if len(bbox) != 0:
982
+ instance['bbox'] = self.box_quantizer.dequantize(
983
+ boxes=torch.tensor([bbox]),
984
+ size=image_size
985
+ ).tolist()[0]
986
+
987
+ instances.append(instance)
988
+
989
+ return instances
990
+
991
+ def __call__(
992
+ self,
993
+ text=None,
994
+ image_size=None,
995
+ parse_tasks=None,
996
+ ):
997
+ """
998
+ Args:
999
+ text: model outputs
1000
+ image_size: (width, height)
1001
+ parse_tasks: a list of tasks to parse, if None, parse all tasks.
1002
+
1003
+ """
1004
+ if parse_tasks is not None:
1005
+ if isinstance(parse_tasks, str):
1006
+ parse_tasks = [parse_tasks]
1007
+ for _parse_task in parse_tasks:
1008
+ assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'
1009
+
1010
+ # sequence or text should be provided
1011
+ assert text is not None, 'text should be provided'
1012
+
1013
+ parsed_dict = {
1014
+ 'text': text
1015
+ }
1016
+
1017
+ for task in self.parse_tasks:
1018
+ if parse_tasks is not None and task not in parse_tasks:
1019
+ continue
1020
+
1021
+ pattern = self.parse_tasks_configs[task].get('PATTERN', None)
1022
+
1023
+ if task == 'ocr':
1024
+ instances = self.parse_ocr_from_text_and_spans(
1025
+ text,
1026
+ pattern=pattern,
1027
+ image_size=image_size,
1028
+ area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.0),
1029
+ )
1030
+ parsed_dict['ocr'] = instances
1031
+ elif task == 'phrase_grounding':
1032
+ instances = self.parse_phrase_grounding_from_text_and_spans(
1033
+ text,
1034
+ pattern=pattern,
1035
+ image_size=image_size,
1036
+ )
1037
+ parsed_dict['phrase_grounding'] = instances
1038
+ elif task == 'pure_text':
1039
+ parsed_dict['pure_text'] = text
1040
+ elif task == 'description_with_bboxes':
1041
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
1042
+ text,
1043
+ pattern=pattern,
1044
+ image_size=image_size,
1045
+ )
1046
+ parsed_dict['description_with_bboxes'] = instances
1047
+ elif task == 'description_with_polygons':
1048
+ instances = self.parse_description_with_polygons_from_text_and_spans(
1049
+ text,
1050
+ pattern=pattern,
1051
+ image_size=image_size,
1052
+ )
1053
+ parsed_dict['description_with_polygons'] = instances
1054
+ elif task == 'polygons':
1055
+ instances = self.parse_description_with_polygons_from_text_and_spans(
1056
+ text,
1057
+ pattern=pattern,
1058
+ image_size=image_size,
1059
+ allow_empty_phrase=True,
1060
+ )
1061
+ parsed_dict['polygons'] = instances
1062
+ elif task == 'bboxes':
1063
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
1064
+ text,
1065
+ pattern=pattern,
1066
+ image_size=image_size,
1067
+ allow_empty_phrase=True,
1068
+ )
1069
+ parsed_dict['bboxes'] = instances
1070
+ elif task == 'description_with_bboxes_or_polygons':
1071
+ if '<poly>' in text:
1072
+ # only support either polygons or bboxes, not both at the same time
1073
+ instances = self.parse_description_with_polygons_from_text_and_spans(
1074
+ text,
1075
+ pattern=pattern,
1076
+ image_size=image_size,
1077
+ )
1078
+ else:
1079
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
1080
+ text,
1081
+ pattern=pattern,
1082
+ image_size=image_size,
1083
+ )
1084
+ parsed_dict['description_with_bboxes_or_polygons'] = instances
1085
+ else:
1086
+ raise ValueError("task {} is not supported".format(task))
1087
+
1088
+ return parsed_dict
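A minimal usage sketch for the post-processor defined above (not part of the commit; the generated string and image size are invented for illustration):

post_processor = Florence2PostProcesser(tokenizer=None)
generated = 'car<loc_100><loc_200><loc_600><loc_800>dog<loc_50><loc_60><loc_300><loc_400></s>'
parsed = post_processor(
    text=generated,
    image_size=(768, 768),                      # (width, height) of the input image
    parse_tasks='description_with_bboxes',
)
# parsed['description_with_bboxes'] is a list of {'bbox': [...], 'cat_name': ...} dicts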
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_florence2.Florence2Processor"
4
+ },
5
+ "processor_class": "Florence2Processor"
6
+ }
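Because `auto_map` points at custom code, loading the processor requires `trust_remote_code`; a minimal sketch (the repo id below is a placeholder, not confirmed by this commit):

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "Imagroune/FeynModel",  # placeholder repo id; substitute the actual repository
    trust_remote_code=True,
)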
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<bos>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<eos>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
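As a quick sanity check, these entries should surface as the tokenizer's special-token attributes after loading (repo id again a placeholder):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Imagroune/FeynModel", trust_remote_code=True)
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)  # <bos> <eos> <pad> <unk>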
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922
3
+ size 17525357
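The three lines above are a Git LFS pointer; the ~17.5 MB tokenizer.json itself lives in LFS storage. One way to fetch just that file programmatically (repo id is a placeholder):

from huggingface_hub import hf_hub_download

tokenizer_path = hf_hub_download(repo_id="Imagroune/FeynModel", filename="tokenizer.json")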
tokenizer_config.json ADDED
@@ -0,0 +1,2010 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<bos>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<mask>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "5": {
46
+ "content": "<2mass>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "6": {
54
+ "content": "[@BOS@]",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "7": {
62
+ "content": "<unused0>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "8": {
70
+ "content": "<unused1>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "9": {
78
+ "content": "<unused2>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "10": {
86
+ "content": "<unused3>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "11": {
94
+ "content": "<unused4>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "12": {
102
+ "content": "<unused5>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "13": {
110
+ "content": "<unused6>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "14": {
118
+ "content": "<unused7>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "15": {
126
+ "content": "<unused8>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "16": {
134
+ "content": "<unused9>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "17": {
142
+ "content": "<unused10>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "18": {
150
+ "content": "<unused11>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "19": {
158
+ "content": "<unused12>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "20": {
166
+ "content": "<unused13>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "21": {
174
+ "content": "<unused14>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "22": {
182
+ "content": "<unused15>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "23": {
190
+ "content": "<unused16>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "24": {
198
+ "content": "<unused17>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "25": {
206
+ "content": "<unused18>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "26": {
214
+ "content": "<unused19>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "27": {
222
+ "content": "<unused20>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "28": {
230
+ "content": "<unused21>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "29": {
238
+ "content": "<unused22>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "30": {
246
+ "content": "<unused23>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "31": {
254
+ "content": "<unused24>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "32": {
262
+ "content": "<unused25>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "33": {
270
+ "content": "<unused26>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "34": {
278
+ "content": "<unused27>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "35": {
286
+ "content": "<unused28>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "36": {
294
+ "content": "<unused29>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "37": {
302
+ "content": "<unused30>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "38": {
310
+ "content": "<unused31>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ },
317
+ "39": {
318
+ "content": "<unused32>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": false
324
+ },
325
+ "40": {
326
+ "content": "<unused33>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": false
332
+ },
333
+ "41": {
334
+ "content": "<unused34>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": false
340
+ },
341
+ "42": {
342
+ "content": "<unused35>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": false
348
+ },
349
+ "43": {
350
+ "content": "<unused36>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": false
356
+ },
357
+ "44": {
358
+ "content": "<unused37>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": false
364
+ },
365
+ "45": {
366
+ "content": "<unused38>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": false
372
+ },
373
+ "46": {
374
+ "content": "<unused39>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": false
380
+ },
381
+ "47": {
382
+ "content": "<unused40>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": false
388
+ },
389
+ "48": {
390
+ "content": "<unused41>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": false
396
+ },
397
+ "49": {
398
+ "content": "<unused42>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": false
404
+ },
405
+ "50": {
406
+ "content": "<unused43>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": false
412
+ },
413
+ "51": {
414
+ "content": "<unused44>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": false
420
+ },
421
+ "52": {
422
+ "content": "<unused45>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": false
428
+ },
429
+ "53": {
430
+ "content": "<unused46>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": false
436
+ },
437
+ "54": {
438
+ "content": "<unused47>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": false
444
+ },
445
+ "55": {
446
+ "content": "<unused48>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": false
452
+ },
453
+ "56": {
454
+ "content": "<unused49>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": false
460
+ },
461
+ "57": {
462
+ "content": "<unused50>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": false
468
+ },
469
+ "58": {
470
+ "content": "<unused51>",
471
+ "lstrip": false,
472
+ "normalized": false,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": false
476
+ },
477
+ "59": {
478
+ "content": "<unused52>",
479
+ "lstrip": false,
480
+ "normalized": false,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": false
484
+ },
485
+ "60": {
486
+ "content": "<unused53>",
487
+ "lstrip": false,
488
+ "normalized": false,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": false
492
+ },
493
+ "61": {
494
+ "content": "<unused54>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": false
500
+ },
501
+ "62": {
502
+ "content": "<unused55>",
503
+ "lstrip": false,
504
+ "normalized": false,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": false
508
+ },
509
+ "63": {
510
+ "content": "<unused56>",
511
+ "lstrip": false,
512
+ "normalized": false,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": false
516
+ },
517
+ "64": {
518
+ "content": "<unused57>",
519
+ "lstrip": false,
520
+ "normalized": false,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": false
524
+ },
525
+ "65": {
526
+ "content": "<unused58>",
527
+ "lstrip": false,
528
+ "normalized": false,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": false
532
+ },
533
+ "66": {
534
+ "content": "<unused59>",
535
+ "lstrip": false,
536
+ "normalized": false,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": false
540
+ },
541
+ "67": {
542
+ "content": "<unused60>",
543
+ "lstrip": false,
544
+ "normalized": false,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": false
548
+ },
549
+ "68": {
550
+ "content": "<unused61>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": false
556
+ },
557
+ "69": {
558
+ "content": "<unused62>",
559
+ "lstrip": false,
560
+ "normalized": false,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": false
564
+ },
565
+ "70": {
566
+ "content": "<unused63>",
567
+ "lstrip": false,
568
+ "normalized": false,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": false
572
+ },
573
+ "71": {
574
+ "content": "<unused64>",
575
+ "lstrip": false,
576
+ "normalized": false,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": false
580
+ },
581
+ "72": {
582
+ "content": "<unused65>",
583
+ "lstrip": false,
584
+ "normalized": false,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": false
588
+ },
589
+ "73": {
590
+ "content": "<unused66>",
591
+ "lstrip": false,
592
+ "normalized": false,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": false
596
+ },
597
+ "74": {
598
+ "content": "<unused67>",
599
+ "lstrip": false,
600
+ "normalized": false,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": false
604
+ },
605
+ "75": {
606
+ "content": "<unused68>",
607
+ "lstrip": false,
608
+ "normalized": false,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": false
612
+ },
613
+ "76": {
614
+ "content": "<unused69>",
615
+ "lstrip": false,
616
+ "normalized": false,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": false
620
+ },
621
+ "77": {
622
+ "content": "<unused70>",
623
+ "lstrip": false,
624
+ "normalized": false,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": false
628
+ },
629
+ "78": {
630
+ "content": "<unused71>",
631
+ "lstrip": false,
632
+ "normalized": false,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": false
636
+ },
637
+ "79": {
638
+ "content": "<unused72>",
639
+ "lstrip": false,
640
+ "normalized": false,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": false
644
+ },
645
+ "80": {
646
+ "content": "<unused73>",
647
+ "lstrip": false,
648
+ "normalized": false,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": false
652
+ },
653
+ "81": {
654
+ "content": "<unused74>",
655
+ "lstrip": false,
656
+ "normalized": false,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": false
660
+ },
661
+ "82": {
662
+ "content": "<unused75>",
663
+ "lstrip": false,
664
+ "normalized": false,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": false
668
+ },
669
+ "83": {
670
+ "content": "<unused76>",
671
+ "lstrip": false,
672
+ "normalized": false,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": false
676
+ },
677
+ "84": {
678
+ "content": "<unused77>",
679
+ "lstrip": false,
680
+ "normalized": false,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": false
684
+ },
685
+ "85": {
686
+ "content": "<unused78>",
687
+ "lstrip": false,
688
+ "normalized": false,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": false
692
+ },
693
+ "86": {
694
+ "content": "<unused79>",
695
+ "lstrip": false,
696
+ "normalized": false,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": false
700
+ },
701
+ "87": {
702
+ "content": "<unused80>",
703
+ "lstrip": false,
704
+ "normalized": false,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": false
708
+ },
709
+ "88": {
710
+ "content": "<unused81>",
711
+ "lstrip": false,
712
+ "normalized": false,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": false
716
+ },
717
+ "89": {
718
+ "content": "<unused82>",
719
+ "lstrip": false,
720
+ "normalized": false,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": false
724
+ },
725
+ "90": {
726
+ "content": "<unused83>",
727
+ "lstrip": false,
728
+ "normalized": false,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": false
732
+ },
733
+ "91": {
734
+ "content": "<unused84>",
735
+ "lstrip": false,
736
+ "normalized": false,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": false
740
+ },
741
+ "92": {
742
+ "content": "<unused85>",
743
+ "lstrip": false,
744
+ "normalized": false,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": false
748
+ },
749
+ "93": {
750
+ "content": "<unused86>",
751
+ "lstrip": false,
752
+ "normalized": false,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": false
756
+ },
757
+ "94": {
758
+ "content": "<unused87>",
759
+ "lstrip": false,
760
+ "normalized": false,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": false
764
+ },
765
+ "95": {
766
+ "content": "<unused88>",
767
+ "lstrip": false,
768
+ "normalized": false,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": false
772
+ },
773
+ "96": {
774
+ "content": "<unused89>",
775
+ "lstrip": false,
776
+ "normalized": false,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": false
780
+ },
781
+ "97": {
782
+ "content": "<unused90>",
783
+ "lstrip": false,
784
+ "normalized": false,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": false
788
+ },
789
+ "98": {
790
+ "content": "<unused91>",
791
+ "lstrip": false,
792
+ "normalized": false,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": false
796
+ },
797
+ "99": {
798
+ "content": "<unused92>",
799
+ "lstrip": false,
800
+ "normalized": false,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": false
804
+ },
805
+ "100": {
806
+ "content": "<unused93>",
807
+ "lstrip": false,
808
+ "normalized": false,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": false
812
+ },
813
+ "101": {
814
+ "content": "<unused94>",
815
+ "lstrip": false,
816
+ "normalized": false,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": false
820
+ },
821
+ "102": {
822
+ "content": "<unused95>",
823
+ "lstrip": false,
824
+ "normalized": false,
825
+ "rstrip": false,
826
+ "single_word": false,
827
+ "special": false
828
+ },
829
+ "103": {
830
+ "content": "<unused96>",
831
+ "lstrip": false,
832
+ "normalized": false,
833
+ "rstrip": false,
834
+ "single_word": false,
835
+ "special": false
836
+ },
837
+ "104": {
838
+ "content": "<unused97>",
839
+ "lstrip": false,
840
+ "normalized": false,
841
+ "rstrip": false,
842
+ "single_word": false,
843
+ "special": false
844
+ },
845
+ "105": {
846
+ "content": "<unused98>",
847
+ "lstrip": false,
848
+ "normalized": false,
849
+ "rstrip": false,
850
+ "single_word": false,
851
+ "special": false
852
+ },
853
+ "106": {
854
+ "content": "<start_of_turn>",
855
+ "lstrip": false,
856
+ "normalized": false,
857
+ "rstrip": false,
858
+ "single_word": false,
859
+ "special": true
860
+ },
861
+ "107": {
862
+ "content": "<end_of_turn>",
863
+ "lstrip": false,
864
+ "normalized": false,
865
+ "rstrip": false,
866
+ "single_word": false,
867
+ "special": true
868
+ },
869
+ "108": {
870
+ "content": "\n",
871
+ "lstrip": false,
872
+ "normalized": false,
873
+ "rstrip": false,
874
+ "single_word": false,
875
+ "special": false
876
+ },
877
+ "109": {
878
+ "content": "\n\n",
879
+ "lstrip": false,
880
+ "normalized": false,
881
+ "rstrip": false,
882
+ "single_word": false,
883
+ "special": false
884
+ },
885
+ "110": {
886
+ "content": "\n\n\n",
887
+ "lstrip": false,
888
+ "normalized": false,
889
+ "rstrip": false,
890
+ "single_word": false,
891
+ "special": false
892
+ },
893
+ "111": {
894
+ "content": "\n\n\n\n",
895
+ "lstrip": false,
896
+ "normalized": false,
897
+ "rstrip": false,
898
+ "single_word": false,
899
+ "special": false
900
+ },
901
+ "112": {
902
+ "content": "\n\n\n\n\n",
903
+ "lstrip": false,
904
+ "normalized": false,
905
+ "rstrip": false,
906
+ "single_word": false,
907
+ "special": false
908
+ },
909
+ "113": {
910
+ "content": "\n\n\n\n\n\n",
911
+ "lstrip": false,
912
+ "normalized": false,
913
+ "rstrip": false,
914
+ "single_word": false,
915
+ "special": false
916
+ },
917
+ "114": {
918
+ "content": "\n\n\n\n\n\n\n",
919
+ "lstrip": false,
920
+ "normalized": false,
921
+ "rstrip": false,
922
+ "single_word": false,
923
+ "special": false
924
+ },
925
+ "115": {
926
+ "content": "\n\n\n\n\n\n\n\n",
927
+ "lstrip": false,
928
+ "normalized": false,
929
+ "rstrip": false,
930
+ "single_word": false,
931
+ "special": false
932
+ },
933
+ "116": {
934
+ "content": "\n\n\n\n\n\n\n\n\n",
935
+ "lstrip": false,
936
+ "normalized": false,
937
+ "rstrip": false,
938
+ "single_word": false,
939
+ "special": false
940
+ },
941
+ "117": {
942
+ "content": "\n\n\n\n\n\n\n\n\n\n",
943
+ "lstrip": false,
944
+ "normalized": false,
945
+ "rstrip": false,
946
+ "single_word": false,
947
+ "special": false
948
+ },
949
+ "118": {
950
+ "content": "\n\n\n\n\n\n\n\n\n\n\n",
951
+ "lstrip": false,
952
+ "normalized": false,
953
+ "rstrip": false,
954
+ "single_word": false,
955
+ "special": false
956
+ },
957
+ "119": {
958
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n",
959
+ "lstrip": false,
960
+ "normalized": false,
961
+ "rstrip": false,
962
+ "single_word": false,
963
+ "special": false
964
+ },
965
+ "120": {
966
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n",
967
+ "lstrip": false,
968
+ "normalized": false,
969
+ "rstrip": false,
970
+ "single_word": false,
971
+ "special": false
972
+ },
973
+ "121": {
974
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
975
+ "lstrip": false,
976
+ "normalized": false,
977
+ "rstrip": false,
978
+ "single_word": false,
979
+ "special": false
980
+ },
981
+ "122": {
982
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
983
+ "lstrip": false,
984
+ "normalized": false,
985
+ "rstrip": false,
986
+ "single_word": false,
987
+ "special": false
988
+ },
989
+ "123": {
990
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
991
+ "lstrip": false,
992
+ "normalized": false,
993
+ "rstrip": false,
994
+ "single_word": false,
995
+ "special": false
996
+ },
997
+ "124": {
998
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
999
+ "lstrip": false,
1000
+ "normalized": false,
1001
+ "rstrip": false,
1002
+ "single_word": false,
1003
+ "special": false
1004
+ },
1005
+ "125": {
1006
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1007
+ "lstrip": false,
1008
+ "normalized": false,
1009
+ "rstrip": false,
1010
+ "single_word": false,
1011
+ "special": false
1012
+ },
1013
+ "126": {
1014
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1015
+ "lstrip": false,
1016
+ "normalized": false,
1017
+ "rstrip": false,
1018
+ "single_word": false,
1019
+ "special": false
1020
+ },
1021
+ "127": {
1022
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1023
+ "lstrip": false,
1024
+ "normalized": false,
1025
+ "rstrip": false,
1026
+ "single_word": false,
1027
+ "special": false
1028
+ },
1029
+ "128": {
1030
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1031
+ "lstrip": false,
1032
+ "normalized": false,
1033
+ "rstrip": false,
1034
+ "single_word": false,
1035
+ "special": false
1036
+ },
1037
+ "129": {
1038
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1039
+ "lstrip": false,
1040
+ "normalized": false,
1041
+ "rstrip": false,
1042
+ "single_word": false,
1043
+ "special": false
1044
+ },
1045
+ "130": {
1046
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1047
+ "lstrip": false,
1048
+ "normalized": false,
1049
+ "rstrip": false,
1050
+ "single_word": false,
1051
+ "special": false
1052
+ },
1053
+ "131": {
1054
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1055
+ "lstrip": false,
1056
+ "normalized": false,
1057
+ "rstrip": false,
1058
+ "single_word": false,
1059
+ "special": false
1060
+ },
1061
+ "132": {
1062
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1063
+ "lstrip": false,
1064
+ "normalized": false,
1065
+ "rstrip": false,
1066
+ "single_word": false,
1067
+ "special": false
1068
+ },
1069
+ "133": {
1070
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1071
+ "lstrip": false,
1072
+ "normalized": false,
1073
+ "rstrip": false,
1074
+ "single_word": false,
1075
+ "special": false
1076
+ },
1077
+ "134": {
1078
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1079
+ "lstrip": false,
1080
+ "normalized": false,
1081
+ "rstrip": false,
1082
+ "single_word": false,
1083
+ "special": false
1084
+ },
1085
+ "135": {
1086
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1087
+ "lstrip": false,
1088
+ "normalized": false,
1089
+ "rstrip": false,
1090
+ "single_word": false,
1091
+ "special": false
1092
+ },
1093
+ "136": {
1094
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1095
+ "lstrip": false,
1096
+ "normalized": false,
1097
+ "rstrip": false,
1098
+ "single_word": false,
1099
+ "special": false
1100
+ },
1101
+ "137": {
1102
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1103
+ "lstrip": false,
1104
+ "normalized": false,
1105
+ "rstrip": false,
1106
+ "single_word": false,
1107
+ "special": false
1108
+ },
1109
+ "138": {
1110
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1111
+ "lstrip": false,
1112
+ "normalized": false,
1113
+ "rstrip": false,
1114
+ "single_word": false,
1115
+ "special": false
1116
+ },
1117
+ "139": {
1118
+ "content": "▁▁",
1119
+ "lstrip": false,
1120
+ "normalized": false,
1121
+ "rstrip": false,
1122
+ "single_word": false,
1123
+ "special": false
1124
+ },
1125
+ "140": {
1126
+ "content": "▁▁▁",
1127
+ "lstrip": false,
1128
+ "normalized": false,
1129
+ "rstrip": false,
1130
+ "single_word": false,
1131
+ "special": false
1132
+ },
1133
+ "141": {
1134
+ "content": "▁▁▁▁",
1135
+ "lstrip": false,
1136
+ "normalized": false,
1137
+ "rstrip": false,
1138
+ "single_word": false,
1139
+ "special": false
1140
+ },
1141
+ "142": {
1142
+ "content": "▁▁▁▁▁",
1143
+ "lstrip": false,
1144
+ "normalized": false,
1145
+ "rstrip": false,
1146
+ "single_word": false,
1147
+ "special": false
1148
+ },
1149
+ "143": {
1150
+ "content": "▁▁▁▁▁▁",
1151
+ "lstrip": false,
1152
+ "normalized": false,
1153
+ "rstrip": false,
1154
+ "single_word": false,
1155
+ "special": false
1156
+ },
1157
+ "144": {
1158
+ "content": "▁▁▁▁▁▁▁",
1159
+ "lstrip": false,
1160
+ "normalized": false,
1161
+ "rstrip": false,
1162
+ "single_word": false,
1163
+ "special": false
1164
+ },
1165
+ "145": {
1166
+ "content": "▁▁▁▁▁▁▁▁",
1167
+ "lstrip": false,
1168
+ "normalized": false,
1169
+ "rstrip": false,
1170
+ "single_word": false,
1171
+ "special": false
1172
+ },
1173
+ "146": {
1174
+ "content": "▁▁▁▁▁▁▁▁▁",
1175
+ "lstrip": false,
1176
+ "normalized": false,
1177
+ "rstrip": false,
1178
+ "single_word": false,
1179
+ "special": false
1180
+ },
1181
+ "147": {
1182
+ "content": "▁▁▁▁▁▁▁▁▁▁",
1183
+ "lstrip": false,
1184
+ "normalized": false,
1185
+ "rstrip": false,
1186
+ "single_word": false,
1187
+ "special": false
1188
+ },
1189
+ "148": {
1190
+ "content": "▁▁▁▁▁▁▁▁▁▁▁",
1191
+ "lstrip": false,
1192
+ "normalized": false,
1193
+ "rstrip": false,
1194
+ "single_word": false,
1195
+ "special": false
1196
+ },
1197
+ "149": {
1198
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁",
1199
+ "lstrip": false,
1200
+ "normalized": false,
1201
+ "rstrip": false,
1202
+ "single_word": false,
1203
+ "special": false
1204
+ },
1205
+ "150": {
1206
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁",
1207
+ "lstrip": false,
1208
+ "normalized": false,
1209
+ "rstrip": false,
1210
+ "single_word": false,
1211
+ "special": false
1212
+ },
1213
+ "151": {
1214
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1215
+ "lstrip": false,
1216
+ "normalized": false,
1217
+ "rstrip": false,
1218
+ "single_word": false,
1219
+ "special": false
1220
+ },
1221
+ "152": {
1222
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1223
+ "lstrip": false,
1224
+ "normalized": false,
1225
+ "rstrip": false,
1226
+ "single_word": false,
1227
+ "special": false
1228
+ },
1229
+ "153": {
1230
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1231
+ "lstrip": false,
1232
+ "normalized": false,
1233
+ "rstrip": false,
1234
+ "single_word": false,
1235
+ "special": false
1236
+ },
1237
+ "154": {
1238
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1239
+ "lstrip": false,
1240
+ "normalized": false,
1241
+ "rstrip": false,
1242
+ "single_word": false,
1243
+ "special": false
1244
+ },
1245
+ "155": {
1246
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1247
+ "lstrip": false,
1248
+ "normalized": false,
1249
+ "rstrip": false,
1250
+ "single_word": false,
1251
+ "special": false
1252
+ },
1253
+ "156": {
1254
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1255
+ "lstrip": false,
1256
+ "normalized": false,
1257
+ "rstrip": false,
1258
+ "single_word": false,
1259
+ "special": false
1260
+ },
1261
+ "157": {
1262
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1263
+ "lstrip": false,
1264
+ "normalized": false,
1265
+ "rstrip": false,
1266
+ "single_word": false,
1267
+ "special": false
1268
+ },
1269
+ "158": {
1270
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1271
+ "lstrip": false,
1272
+ "normalized": false,
1273
+ "rstrip": false,
1274
+ "single_word": false,
1275
+ "special": false
1276
+ },
1277
+ "159": {
1278
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1279
+ "lstrip": false,
1280
+ "normalized": false,
1281
+ "rstrip": false,
1282
+ "single_word": false,
1283
+ "special": false
1284
+ },
1285
+ "160": {
1286
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1287
+ "lstrip": false,
1288
+ "normalized": false,
1289
+ "rstrip": false,
1290
+ "single_word": false,
1291
+ "special": false
1292
+ },
1293
+ "161": {
1294
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1295
+ "lstrip": false,
1296
+ "normalized": false,
1297
+ "rstrip": false,
1298
+ "single_word": false,
1299
+ "special": false
1300
+ },
1301
+ "162": {
1302
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1303
+ "lstrip": false,
1304
+ "normalized": false,
1305
+ "rstrip": false,
1306
+ "single_word": false,
1307
+ "special": false
1308
+ },
1309
+ "163": {
1310
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1311
+ "lstrip": false,
1312
+ "normalized": false,
1313
+ "rstrip": false,
1314
+ "single_word": false,
1315
+ "special": false
1316
+ },
1317
+ "164": {
1318
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1319
+ "lstrip": false,
1320
+ "normalized": false,
1321
+ "rstrip": false,
1322
+ "single_word": false,
1323
+ "special": false
1324
+ },
1325
+ "165": {
1326
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1327
+ "lstrip": false,
1328
+ "normalized": false,
1329
+ "rstrip": false,
1330
+ "single_word": false,
1331
+ "special": false
1332
+ },
1333
+ "166": {
1334
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1335
+ "lstrip": false,
1336
+ "normalized": false,
1337
+ "rstrip": false,
1338
+ "single_word": false,
1339
+ "special": false
1340
+ },
1341
+ "167": {
1342
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1343
+ "lstrip": false,
1344
+ "normalized": false,
1345
+ "rstrip": false,
1346
+ "single_word": false,
1347
+ "special": false
1348
+ },
1349
+ "168": {
1350
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1351
+ "lstrip": false,
1352
+ "normalized": false,
1353
+ "rstrip": false,
1354
+ "single_word": false,
1355
+ "special": false
1356
+ },
1357
+ "169": {
1358
+ "content": "<table>",
1359
+ "lstrip": false,
1360
+ "normalized": false,
1361
+ "rstrip": false,
1362
+ "single_word": false,
1363
+ "special": false
1364
+ },
1365
+ "170": {
1366
+ "content": "<caption>",
1367
+ "lstrip": false,
1368
+ "normalized": false,
1369
+ "rstrip": false,
1370
+ "single_word": false,
1371
+ "special": false
1372
+ },
1373
+ "171": {
1374
+ "content": "<thead>",
1375
+ "lstrip": false,
1376
+ "normalized": false,
1377
+ "rstrip": false,
1378
+ "single_word": false,
1379
+ "special": false
1380
+ },
1381
+ "172": {
1382
+ "content": "<tbody>",
1383
+ "lstrip": false,
1384
+ "normalized": false,
1385
+ "rstrip": false,
1386
+ "single_word": false,
1387
+ "special": false
1388
+ },
1389
+ "173": {
1390
+ "content": "<tfoot>",
1391
+ "lstrip": false,
1392
+ "normalized": false,
1393
+ "rstrip": false,
1394
+ "single_word": false,
1395
+ "special": false
1396
+ },
1397
+ "174": {
1398
+ "content": "<tr>",
1399
+ "lstrip": false,
1400
+ "normalized": false,
1401
+ "rstrip": false,
1402
+ "single_word": false,
1403
+ "special": false
1404
+ },
1405
+ "175": {
1406
+ "content": "<th>",
1407
+ "lstrip": false,
1408
+ "normalized": false,
1409
+ "rstrip": false,
1410
+ "single_word": false,
1411
+ "special": false
1412
+ },
1413
+ "176": {
1414
+ "content": "<td>",
1415
+ "lstrip": false,
1416
+ "normalized": false,
1417
+ "rstrip": false,
1418
+ "single_word": false,
1419
+ "special": false
1420
+ },
1421
+ "177": {
1422
+ "content": "</table>",
1423
+ "lstrip": false,
1424
+ "normalized": false,
1425
+ "rstrip": false,
1426
+ "single_word": false,
1427
+ "special": false
1428
+ },
1429
+ "178": {
1430
+ "content": "</caption>",
1431
+ "lstrip": false,
1432
+ "normalized": false,
1433
+ "rstrip": false,
1434
+ "single_word": false,
1435
+ "special": false
1436
+ },
1437
+ "179": {
1438
+ "content": "</thead>",
1439
+ "lstrip": false,
1440
+ "normalized": false,
1441
+ "rstrip": false,
1442
+ "single_word": false,
1443
+ "special": false
1444
+ },
1445
+ "180": {
1446
+ "content": "</tbody>",
1447
+ "lstrip": false,
1448
+ "normalized": false,
1449
+ "rstrip": false,
1450
+ "single_word": false,
1451
+ "special": false
1452
+ },
1453
+ "181": {
1454
+ "content": "</tfoot>",
1455
+ "lstrip": false,
1456
+ "normalized": false,
1457
+ "rstrip": false,
1458
+ "single_word": false,
1459
+ "special": false
1460
+ },
1461
+ "182": {
1462
+ "content": "</tr>",
1463
+ "lstrip": false,
1464
+ "normalized": false,
1465
+ "rstrip": false,
1466
+ "single_word": false,
1467
+ "special": false
1468
+ },
1469
+ "183": {
1470
+ "content": "</th>",
1471
+ "lstrip": false,
1472
+ "normalized": false,
1473
+ "rstrip": false,
1474
+ "single_word": false,
1475
+ "special": false
1476
+ },
1477
+ "184": {
1478
+ "content": "</td>",
1479
+ "lstrip": false,
1480
+ "normalized": false,
1481
+ "rstrip": false,
1482
+ "single_word": false,
1483
+ "special": false
1484
+ },
1485
+ "185": {
1486
+ "content": "<h1>",
1487
+ "lstrip": false,
1488
+ "normalized": false,
1489
+ "rstrip": false,
1490
+ "single_word": false,
1491
+ "special": false
1492
+ },
1493
+ "186": {
1494
+ "content": "<h2>",
1495
+ "lstrip": false,
1496
+ "normalized": false,
1497
+ "rstrip": false,
1498
+ "single_word": false,
1499
+ "special": false
1500
+ },
1501
+ "187": {
1502
+ "content": "<h3>",
1503
+ "lstrip": false,
1504
+ "normalized": false,
1505
+ "rstrip": false,
1506
+ "single_word": false,
1507
+ "special": false
1508
+ },
1509
+ "188": {
1510
+ "content": "<h4>",
1511
+ "lstrip": false,
1512
+ "normalized": false,
1513
+ "rstrip": false,
1514
+ "single_word": false,
1515
+ "special": false
1516
+ },
1517
+ "189": {
1518
+ "content": "<h5>",
1519
+ "lstrip": false,
1520
+ "normalized": false,
1521
+ "rstrip": false,
1522
+ "single_word": false,
1523
+ "special": false
1524
+ },
1525
+ "190": {
1526
+ "content": "<h6>",
1527
+ "lstrip": false,
1528
+ "normalized": false,
1529
+ "rstrip": false,
1530
+ "single_word": false,
1531
+ "special": false
1532
+ },
1533
+ "191": {
1534
+ "content": "<blockquote>",
1535
+ "lstrip": false,
1536
+ "normalized": false,
1537
+ "rstrip": false,
1538
+ "single_word": false,
1539
+ "special": false
1540
+ },
1541
+ "192": {
1542
+ "content": "</h1>",
1543
+ "lstrip": false,
1544
+ "normalized": false,
1545
+ "rstrip": false,
1546
+ "single_word": false,
1547
+ "special": false
1548
+ },
1549
+ "193": {
1550
+ "content": "</h2>",
1551
+ "lstrip": false,
1552
+ "normalized": false,
1553
+ "rstrip": false,
1554
+ "single_word": false,
1555
+ "special": false
1556
+ },
1557
+ "194": {
1558
+ "content": "</h3>",
1559
+ "lstrip": false,
1560
+ "normalized": false,
1561
+ "rstrip": false,
1562
+ "single_word": false,
1563
+ "special": false
1564
+ },
1565
+ "195": {
1566
+ "content": "</h4>",
1567
+ "lstrip": false,
1568
+ "normalized": false,
1569
+ "rstrip": false,
1570
+ "single_word": false,
1571
+ "special": false
1572
+ },
1573
+ "196": {
1574
+ "content": "</h5>",
1575
+ "lstrip": false,
1576
+ "normalized": false,
1577
+ "rstrip": false,
1578
+ "single_word": false,
1579
+ "special": false
1580
+ },
1581
+ "197": {
1582
+ "content": "</h6>",
1583
+ "lstrip": false,
1584
+ "normalized": false,
1585
+ "rstrip": false,
1586
+ "single_word": false,
1587
+ "special": false
1588
+ },
1589
+ "198": {
1590
+ "content": "</blockquote>",
1591
+ "lstrip": false,
1592
+ "normalized": false,
1593
+ "rstrip": false,
1594
+ "single_word": false,
1595
+ "special": false
1596
+ },
1597
+ "199": {
1598
+ "content": "<strong>",
1599
+ "lstrip": false,
1600
+ "normalized": false,
1601
+ "rstrip": false,
1602
+ "single_word": false,
1603
+ "special": false
1604
+ },
1605
+ "200": {
1606
+ "content": "<em>",
1607
+ "lstrip": false,
1608
+ "normalized": false,
1609
+ "rstrip": false,
1610
+ "single_word": false,
1611
+ "special": false
1612
+ },
1613
+ "201": {
1614
+ "content": "<b>",
1615
+ "lstrip": false,
1616
+ "normalized": false,
1617
+ "rstrip": false,
1618
+ "single_word": false,
1619
+ "special": false
1620
+ },
1621
+ "202": {
1622
+ "content": "<i>",
1623
+ "lstrip": false,
1624
+ "normalized": false,
1625
+ "rstrip": false,
1626
+ "single_word": false,
1627
+ "special": false
1628
+ },
1629
+ "203": {
1630
+ "content": "<u>",
1631
+ "lstrip": false,
1632
+ "normalized": false,
1633
+ "rstrip": false,
1634
+ "single_word": false,
1635
+ "special": false
1636
+ },
1637
+ "204": {
1638
+ "content": "<s>",
1639
+ "lstrip": false,
1640
+ "normalized": false,
1641
+ "rstrip": false,
1642
+ "single_word": false,
1643
+ "special": false
1644
+ },
1645
+ "205": {
1646
+ "content": "<sub>",
1647
+ "lstrip": false,
1648
+ "normalized": false,
1649
+ "rstrip": false,
1650
+ "single_word": false,
1651
+ "special": false
1652
+ },
1653
+ "206": {
1654
+ "content": "<sup>",
1655
+ "lstrip": false,
1656
+ "normalized": false,
1657
+ "rstrip": false,
1658
+ "single_word": false,
1659
+ "special": false
1660
+ },
1661
+ "207": {
1662
+ "content": "<code>",
1663
+ "lstrip": false,
1664
+ "normalized": false,
1665
+ "rstrip": false,
1666
+ "single_word": false,
1667
+ "special": false
1668
+ },
1669
+ "208": {
1670
+ "content": "</strong>",
1671
+ "lstrip": false,
1672
+ "normalized": false,
1673
+ "rstrip": false,
1674
+ "single_word": false,
1675
+ "special": false
1676
+ },
1677
+ "209": {
1678
+ "content": "</em>",
1679
+ "lstrip": false,
1680
+ "normalized": false,
1681
+ "rstrip": false,
1682
+ "single_word": false,
1683
+ "special": false
1684
+ },
1685
+ "210": {
1686
+ "content": "</b>",
1687
+ "lstrip": false,
1688
+ "normalized": false,
1689
+ "rstrip": false,
1690
+ "single_word": false,
1691
+ "special": false
1692
+ },
1693
+ "211": {
1694
+ "content": "</i>",
1695
+ "lstrip": false,
1696
+ "normalized": false,
1697
+ "rstrip": false,
1698
+ "single_word": false,
1699
+ "special": false
1700
+ },
1701
+ "212": {
1702
+ "content": "</u>",
1703
+ "lstrip": false,
1704
+ "normalized": false,
1705
+ "rstrip": false,
1706
+ "single_word": false,
1707
+ "special": false
1708
+ },
1709
+ "213": {
1710
+ "content": "</s>",
1711
+ "lstrip": false,
1712
+ "normalized": false,
1713
+ "rstrip": false,
1714
+ "single_word": false,
1715
+ "special": false
1716
+ },
1717
+ "214": {
1718
+ "content": "</sub>",
1719
+ "lstrip": false,
1720
+ "normalized": false,
1721
+ "rstrip": false,
1722
+ "single_word": false,
1723
+ "special": false
1724
+ },
1725
+ "215": {
1726
+ "content": "</sup>",
1727
+ "lstrip": false,
1728
+ "normalized": false,
1729
+ "rstrip": false,
1730
+ "single_word": false,
1731
+ "special": false
1732
+ },
1733
+ "216": {
1734
+ "content": "</code>",
1735
+ "lstrip": false,
1736
+ "normalized": false,
1737
+ "rstrip": false,
1738
+ "single_word": false,
1739
+ "special": false
1740
+ },
1741
+ "255968": {
1742
+ "content": "[toxicity=0]",
1743
+ "lstrip": false,
1744
+ "normalized": false,
1745
+ "rstrip": false,
1746
+ "single_word": false,
1747
+ "special": false
1748
+ },
1749
+ "255969": {
1750
+ "content": "\t\t",
1751
+ "lstrip": false,
1752
+ "normalized": false,
1753
+ "rstrip": false,
1754
+ "single_word": false,
1755
+ "special": false
1756
+ },
1757
+ "255970": {
1758
+ "content": "\t\t\t",
1759
+ "lstrip": false,
1760
+ "normalized": false,
1761
+ "rstrip": false,
1762
+ "single_word": false,
1763
+ "special": false
1764
+ },
1765
+ "255971": {
1766
+ "content": "\t\t\t\t",
1767
+ "lstrip": false,
1768
+ "normalized": false,
1769
+ "rstrip": false,
1770
+ "single_word": false,
1771
+ "special": false
1772
+ },
1773
+ "255972": {
1774
+ "content": "\t\t\t\t\t",
1775
+ "lstrip": false,
1776
+ "normalized": false,
1777
+ "rstrip": false,
1778
+ "single_word": false,
1779
+ "special": false
1780
+ },
1781
+ "255973": {
1782
+ "content": "\t\t\t\t\t\t",
1783
+ "lstrip": false,
1784
+ "normalized": false,
1785
+ "rstrip": false,
1786
+ "single_word": false,
1787
+ "special": false
1788
+ },
1789
+ "255974": {
1790
+ "content": "\t\t\t\t\t\t\t",
1791
+ "lstrip": false,
1792
+ "normalized": false,
1793
+ "rstrip": false,
1794
+ "single_word": false,
1795
+ "special": false
1796
+ },
1797
+ "255975": {
1798
+ "content": "\t\t\t\t\t\t\t\t",
1799
+ "lstrip": false,
1800
+ "normalized": false,
1801
+ "rstrip": false,
1802
+ "single_word": false,
1803
+ "special": false
1804
+ },
1805
+ "255976": {
1806
+ "content": "\t\t\t\t\t\t\t\t\t",
1807
+ "lstrip": false,
1808
+ "normalized": false,
1809
+ "rstrip": false,
1810
+ "single_word": false,
1811
+ "special": false
1812
+ },
1813
+ "255977": {
1814
+ "content": "\t\t\t\t\t\t\t\t\t\t",
1815
+ "lstrip": false,
1816
+ "normalized": false,
1817
+ "rstrip": false,
1818
+ "single_word": false,
1819
+ "special": false
1820
+ },
1821
+ "255978": {
1822
+ "content": "\t\t\t\t\t\t\t\t\t\t\t",
1823
+ "lstrip": false,
1824
+ "normalized": false,
1825
+ "rstrip": false,
1826
+ "single_word": false,
1827
+ "special": false
1828
+ },
1829
+ "255979": {
1830
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t",
1831
+ "lstrip": false,
1832
+ "normalized": false,
1833
+ "rstrip": false,
1834
+ "single_word": false,
1835
+ "special": false
1836
+ },
1837
+ "255980": {
1838
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t",
1839
+ "lstrip": false,
1840
+ "normalized": false,
1841
+ "rstrip": false,
1842
+ "single_word": false,
1843
+ "special": false
1844
+ },
1845
+ "255981": {
1846
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1847
+ "lstrip": false,
1848
+ "normalized": false,
1849
+ "rstrip": false,
1850
+ "single_word": false,
1851
+ "special": false
1852
+ },
1853
+ "255982": {
1854
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1855
+ "lstrip": false,
1856
+ "normalized": false,
1857
+ "rstrip": false,
1858
+ "single_word": false,
1859
+ "special": false
1860
+ },
1861
+ "255983": {
1862
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1863
+ "lstrip": false,
1864
+ "normalized": false,
1865
+ "rstrip": false,
1866
+ "single_word": false,
1867
+ "special": false
1868
+ },
1869
+ "255984": {
1870
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1871
+ "lstrip": false,
1872
+ "normalized": false,
1873
+ "rstrip": false,
1874
+ "single_word": false,
1875
+ "special": false
1876
+ },
1877
+ "255985": {
1878
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1879
+ "lstrip": false,
1880
+ "normalized": false,
1881
+ "rstrip": false,
1882
+ "single_word": false,
1883
+ "special": false
1884
+ },
1885
+ "255986": {
1886
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1887
+ "lstrip": false,
1888
+ "normalized": false,
1889
+ "rstrip": false,
1890
+ "single_word": false,
1891
+ "special": false
1892
+ },
1893
+ "255987": {
1894
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1895
+ "lstrip": false,
1896
+ "normalized": false,
1897
+ "rstrip": false,
1898
+ "single_word": false,
1899
+ "special": false
1900
+ },
1901
+ "255988": {
1902
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1903
+ "lstrip": false,
1904
+ "normalized": false,
1905
+ "rstrip": false,
1906
+ "single_word": false,
1907
+ "special": false
1908
+ },
1909
+ "255989": {
1910
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1911
+ "lstrip": false,
1912
+ "normalized": false,
1913
+ "rstrip": false,
1914
+ "single_word": false,
1915
+ "special": false
1916
+ },
1917
+ "255990": {
1918
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1919
+ "lstrip": false,
1920
+ "normalized": false,
1921
+ "rstrip": false,
1922
+ "single_word": false,
1923
+ "special": false
1924
+ },
1925
+ "255991": {
1926
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1927
+ "lstrip": false,
1928
+ "normalized": false,
1929
+ "rstrip": false,
1930
+ "single_word": false,
1931
+ "special": false
1932
+ },
1933
+ "255992": {
1934
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1935
+ "lstrip": false,
1936
+ "normalized": false,
1937
+ "rstrip": false,
1938
+ "single_word": false,
1939
+ "special": false
1940
+ },
1941
+ "255993": {
1942
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1943
+ "lstrip": false,
1944
+ "normalized": false,
1945
+ "rstrip": false,
1946
+ "single_word": false,
1947
+ "special": false
1948
+ },
1949
+ "255994": {
1950
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1951
+ "lstrip": false,
1952
+ "normalized": false,
1953
+ "rstrip": false,
1954
+ "single_word": false,
1955
+ "special": false
1956
+ },
1957
+ "255995": {
1958
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1959
+ "lstrip": false,
1960
+ "normalized": false,
1961
+ "rstrip": false,
1962
+ "single_word": false,
1963
+ "special": false
1964
+ },
1965
+ "255996": {
1966
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1967
+ "lstrip": false,
1968
+ "normalized": false,
1969
+ "rstrip": false,
1970
+ "single_word": false,
1971
+ "special": false
1972
+ },
1973
+ "255997": {
1974
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1975
+ "lstrip": false,
1976
+ "normalized": false,
1977
+ "rstrip": false,
1978
+ "single_word": false,
1979
+ "special": false
1980
+ },
1981
+ "255998": {
1982
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1983
+ "lstrip": false,
1984
+ "normalized": false,
1985
+ "rstrip": false,
1986
+ "single_word": false,
1987
+ "special": false
1988
+ },
1989
+ "255999": {
1990
+ "content": "<unused99>",
1991
+ "lstrip": false,
1992
+ "normalized": false,
1993
+ "rstrip": false,
1994
+ "single_word": false,
1995
+ "special": false
1996
+ }
1997
+ },
+ "bos_token": "<bos>",
+ "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<eos>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "processor_class": "Florence2Processor",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "GemmaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
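
The tokenizer config added above pairs a GemmaTokenizer with the Florence2Processor and defines a Gemma-style chat_template: the "assistant" role is mapped to "model" turns, conversation roles must alternate, and a leading "system" message raises an exception. A minimal usage sketch follows, assuming a hypothetical repository id "Imagroune/FeynModel" and only standard `transformers` APIs; the actual repo id and loading flags may differ.

# Minimal sketch of how this tokenizer_config.json is typically consumed.
# "Imagroune/FeynModel" is a placeholder repository id (an assumption, not from the commit).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Imagroune/FeynModel", trust_remote_code=True)

messages = [
    {"role": "user", "content": "Describe the image."},
]

# apply_chat_template renders the Jinja chat_template shown above;
# add_generation_prompt=True appends "<start_of_turn>model\n" so the model continues the turn.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)
# Expected shape per the template:
# <bos><start_of_turn>user
# Describe the image.<end_of_turn>
# <start_of_turn>model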