czczup commited on
Commit
56dcc06
1 Parent(s): 8e927bb

Upload folder using huggingface_hub

Browse files
__init__.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2023 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torchvision.transforms as T
10
+ from torchvision.transforms import InterpolationMode
11
+ from transformers import LlamaTokenizer
12
+
13
+ from .configuration_intern_vit import InternVisionConfig
14
+ from .configuration_internvl import InternVLConfig
15
+ from .modeling_intern_vit import InternVisionModel
16
+ from .modeling_internvl import InternVL_C, InternVL_G, InternVLModel
17
+
18
+ __all__ = ['InternVisionConfig', 'InternVisionModel', 'InternVLConfig',
19
+ 'InternVLModel', 'InternVL_C', 'InternVL_G']
20
+
21
+
22
+ # Prefix the text "summarize:"
23
+ class InternVLTokenizer(nn.Module):
24
+ def __init__(self, model_path):
25
+ super(InternVLTokenizer, self).__init__()
26
+ self.tokenizer = LlamaTokenizer.from_pretrained(model_path)
27
+ self.tokenizer.pad_token = ' ' # allow padding
28
+ self.tokenizer.add_eos_token = True
29
+
30
+ def forward(self, text, prefix='summarize:'):
31
+ if type(text) == str:
32
+ text = prefix + text
33
+ elif type(text) == list:
34
+ text = [prefix + item for item in text]
35
+ text = self.tokenizer(text, return_tensors='pt', max_length=80, truncation=True, padding='max_length').input_ids
36
+ return text
37
+
38
+
39
+ def build_transform(task, image_size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
40
+ if task == 'retrieval':
41
+ transform = T.Compose([
42
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
43
+ T.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
44
+ T.ToTensor(),
45
+ T.Normalize(mean=mean, std=std)])
46
+ else:
47
+ transform = T.Compose([
48
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
49
+ T.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
50
+ T.CenterCrop(image_size),
51
+ T.ToTensor(),
52
+ T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
53
+ return transform
54
+
55
+
56
+ def load_internvl_c_huggingface(ckpt_path, device, task):
57
+ model = InternVL_C.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
58
+ if model.config.use_backbone_lora:
59
+ model.vision_model.merge_and_unload()
60
+ model.vision_model = model.vision_model.model
61
+ if model.config.use_qllama_lora:
62
+ model.qllama.merge_and_unload()
63
+ model.qllama = model.qllama.model
64
+ if model.config.force_image_size is not None:
65
+ image_size = model.config.force_image_size
66
+ else:
67
+ image_size = model.config.vision_config.image_size
68
+ transform = build_transform(task, image_size)
69
+ tokenizer = InternVLTokenizer(ckpt_path)
70
+ return model, transform, tokenizer
71
+
72
+
73
+ def load_internvl_g_huggingface(ckpt_path, device, task):
74
+ model = InternVL_G.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
75
+ if model.config.use_backbone_lora:
76
+ model.vision_model.merge_and_unload()
77
+ model.vision_model = model.vision_model.model
78
+ if model.config.use_qllama_lora:
79
+ model.qllama.merge_and_unload()
80
+ model.qllama = model.qllama.model
81
+ if model.config.force_image_size is not None:
82
+ image_size = model.config.force_image_size
83
+ else:
84
+ image_size = model.config.vision_config.image_size
85
+ transform = build_transform(task, image_size)
86
+ tokenizer = InternVLTokenizer(ckpt_path)
87
+ return model, transform, tokenizer
config.json ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "./",
4
+ "architectures": [
5
+ "InternVLModel"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_internvl.InternVLConfig",
9
+ "AutoModel": "modeling_internvl.InternVLModel"
10
+ },
11
+ "attn_pool_num_heads": 16,
12
+ "clip_embed_dim": 768,
13
+ "force_image_size": null,
14
+ "hidden_size": 4096,
15
+ "initializer_range": 0.02,
16
+ "label_smoothing": 0.0,
17
+ "max_txt_len": 32,
18
+ "model_type": "internvl",
19
+ "num_query_token": 96,
20
+ "qllama_config": {
21
+ "_name_or_path": "",
22
+ "add_cross_attention": false,
23
+ "architectures": [
24
+ "LlamaForCausalLM"
25
+ ],
26
+ "bad_words_ids": null,
27
+ "begin_suppress_tokens": null,
28
+ "bos_token_id": 1,
29
+ "chunk_size_feed_forward": 0,
30
+ "cross_attention_frequency": 2,
31
+ "cross_attention_hidden_size": null,
32
+ "decoder_start_token_id": null,
33
+ "diversity_penalty": 0.0,
34
+ "do_sample": false,
35
+ "early_stopping": false,
36
+ "encoder_no_repeat_ngram_size": 0,
37
+ "eos_token_id": 2,
38
+ "exponential_decay_length_penalty": null,
39
+ "finetuning_task": null,
40
+ "forced_bos_token_id": null,
41
+ "forced_eos_token_id": null,
42
+ "hidden_act": "silu",
43
+ "hidden_size": 4096,
44
+ "id2label": {
45
+ "0": "LABEL_0",
46
+ "1": "LABEL_1"
47
+ },
48
+ "initializer_range": 0.02,
49
+ "intermediate_size": 11008,
50
+ "is_decoder": false,
51
+ "is_encoder_decoder": false,
52
+ "label2id": {
53
+ "LABEL_0": 0,
54
+ "LABEL_1": 1
55
+ },
56
+ "length_penalty": 1.0,
57
+ "max_length": 20,
58
+ "max_position_embeddings": 2048,
59
+ "max_sequence_length": 2048,
60
+ "min_length": 0,
61
+ "model_type": "llama",
62
+ "no_repeat_ngram_size": 0,
63
+ "num_attention_heads": 32,
64
+ "num_beam_groups": 1,
65
+ "num_beams": 1,
66
+ "num_hidden_layers": 32,
67
+ "num_key_value_heads": 32,
68
+ "num_query_token": 96,
69
+ "num_return_sequences": 1,
70
+ "output_attentions": false,
71
+ "output_hidden_states": false,
72
+ "output_scores": false,
73
+ "pad_token_id": 0,
74
+ "prefix": null,
75
+ "pretraining_tp": 1,
76
+ "problem_type": null,
77
+ "pruned_heads": {},
78
+ "remove_invalid_values": false,
79
+ "repetition_penalty": 1.0,
80
+ "return_dict": true,
81
+ "return_dict_in_generate": false,
82
+ "rms_norm_eps": 1e-06,
83
+ "rope_scaling": null,
84
+ "sep_token_id": null,
85
+ "suppress_tokens": null,
86
+ "task_specific_params": null,
87
+ "temperature": 1.0,
88
+ "tf_legacy_loss": false,
89
+ "tie_encoder_decoder": false,
90
+ "tie_word_embeddings": false,
91
+ "tokenizer_class": null,
92
+ "top_k": 50,
93
+ "top_p": 1.0,
94
+ "torch_dtype": "float16",
95
+ "torchscript": false,
96
+ "transformers_version": "4.32.0",
97
+ "typical_p": 1.0,
98
+ "use_bfloat16": false,
99
+ "use_cache": false,
100
+ "vocab_size": 49954
101
+ },
102
+ "tie_word_embeddings": false,
103
+ "torch_dtype": "bfloat16",
104
+ "transformers_version": null,
105
+ "use_backbone_lora": 0,
106
+ "use_cache": false,
107
+ "use_decoder_only_language_model": true,
108
+ "use_qllama_lora": 0,
109
+ "vision_config": {
110
+ "_name_or_path": "",
111
+ "add_cross_attention": false,
112
+ "architectures": null,
113
+ "attention_dropout": 0.0,
114
+ "bad_words_ids": null,
115
+ "begin_suppress_tokens": null,
116
+ "bos_token_id": null,
117
+ "chunk_size_feed_forward": 0,
118
+ "cross_attention_hidden_size": null,
119
+ "decoder_start_token_id": null,
120
+ "diversity_penalty": 0.0,
121
+ "do_sample": false,
122
+ "drop_path_rate": 0.0,
123
+ "dropout": 0.0,
124
+ "early_stopping": false,
125
+ "encoder_no_repeat_ngram_size": 0,
126
+ "eos_token_id": null,
127
+ "exponential_decay_length_penalty": null,
128
+ "finetuning_task": null,
129
+ "forced_bos_token_id": null,
130
+ "forced_eos_token_id": null,
131
+ "hidden_act": "gelu",
132
+ "hidden_size": 3200,
133
+ "id2label": {
134
+ "0": "LABEL_0",
135
+ "1": "LABEL_1"
136
+ },
137
+ "image_size": 224,
138
+ "initializer_factor": 0.1,
139
+ "initializer_range": 1e-10,
140
+ "intermediate_size": 12800,
141
+ "is_decoder": false,
142
+ "is_encoder_decoder": false,
143
+ "label2id": {
144
+ "LABEL_0": 0,
145
+ "LABEL_1": 1
146
+ },
147
+ "layer_norm_eps": 1e-06,
148
+ "length_penalty": 1.0,
149
+ "max_length": 20,
150
+ "min_length": 0,
151
+ "model_type": "intern_vit_6b",
152
+ "no_repeat_ngram_size": 0,
153
+ "num_attention_heads": 25,
154
+ "num_beam_groups": 1,
155
+ "num_beams": 1,
156
+ "num_channels": 3,
157
+ "num_hidden_layers": 48,
158
+ "num_return_sequences": 1,
159
+ "output_attentions": false,
160
+ "output_hidden_states": false,
161
+ "output_scores": false,
162
+ "pad_token_id": null,
163
+ "patch_size": 14,
164
+ "prefix": null,
165
+ "problem_type": null,
166
+ "pruned_heads": {},
167
+ "qk_normalization": true,
168
+ "qkv_bias": false,
169
+ "remove_invalid_values": false,
170
+ "repetition_penalty": 1.0,
171
+ "return_dict": true,
172
+ "return_dict_in_generate": false,
173
+ "sep_token_id": null,
174
+ "suppress_tokens": null,
175
+ "task_specific_params": null,
176
+ "temperature": 1.0,
177
+ "tf_legacy_loss": false,
178
+ "tie_encoder_decoder": false,
179
+ "tie_word_embeddings": true,
180
+ "tokenizer_class": null,
181
+ "top_k": 50,
182
+ "top_p": 1.0,
183
+ "torch_dtype": null,
184
+ "torchscript": false,
185
+ "transformers_version": "4.32.0",
186
+ "typical_p": 1.0,
187
+ "use_bfloat16": false,
188
+ "use_flash_attn": true
189
+ }
190
+ }
configuration_intern_vit.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2023 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import os
7
+ from typing import Union
8
+
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.utils import logging
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+
15
+ class InternVisionConfig(PretrainedConfig):
16
+ r"""
17
+ This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
18
+ instantiate a vision encoder according to the specified arguments, defining the model architecture.
19
+
20
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
21
+ documentation from [`PretrainedConfig`] for more information.
22
+
23
+ Args:
24
+ num_channels (`int`, *optional*, defaults to 3):
25
+ Number of color channels in the input images (e.g., 3 for RGB).
26
+ patch_size (`int`, *optional*, defaults to 14):
27
+ The size (resolution) of each patch.
28
+ image_size (`int`, *optional*, defaults to 224):
29
+ The size (resolution) of each image.
30
+ qkv_bias (`bool`, *optional*, defaults to `False`):
31
+ Whether to add a bias to the queries and values in the self-attention layers.
32
+ hidden_size (`int`, *optional*, defaults to 3200):
33
+ Dimensionality of the encoder layers and the pooler layer.
34
+ num_attention_heads (`int`, *optional*, defaults to 25):
35
+ Number of attention heads for each attention layer in the Transformer encoder.
36
+ intermediate_size (`int`, *optional*, defaults to 12800):
37
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
38
+ qk_normalization (`bool`, *optional*, defaults to `True`):
39
+ Whether to normalize the queries and keys in the self-attention layers.
40
+ num_hidden_layers (`int`, *optional*, defaults to 48):
41
+ Number of hidden layers in the Transformer encoder.
42
+ use_flash_attn (`bool`, *optional*, defaults to `True`):
43
+ Whether to use flash attention mechanism.
44
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
45
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
46
+ `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
47
+ layer_norm_eps (`float`, *optional*, defaults to 1e-6):
48
+ The epsilon used by the layer normalization layers.
49
+ dropout (`float`, *optional*, defaults to 0.0):
50
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
51
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
52
+ Dropout rate for stochastic depth.
53
+ attention_dropout (`float`, *optional*, defaults to 0.0):
54
+ The dropout ratio for the attention probabilities.
55
+ initializer_range (`float`, *optional*, defaults to 0.02):
56
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
57
+ initializer_factor (`float`, *optional*, defaults to 0.1):
58
+ A factor for layer scale.
59
+ """
60
+
61
+ model_type = 'intern_vit_6b'
62
+
63
+ def __init__(
64
+ self,
65
+ num_channels=3,
66
+ patch_size=14,
67
+ image_size=224,
68
+ qkv_bias=False,
69
+ hidden_size=3200,
70
+ num_attention_heads=25,
71
+ intermediate_size=12800,
72
+ qk_normalization=True,
73
+ num_hidden_layers=48,
74
+ use_flash_attn=True,
75
+ hidden_act='gelu',
76
+ layer_norm_eps=1e-6,
77
+ dropout=0.0,
78
+ drop_path_rate=0.0,
79
+ attention_dropout=0.0,
80
+ initializer_range=0.02,
81
+ initializer_factor=0.1,
82
+ **kwargs,
83
+ ):
84
+ super().__init__(**kwargs)
85
+
86
+ self.hidden_size = hidden_size
87
+ self.intermediate_size = intermediate_size
88
+ self.dropout = dropout
89
+ self.drop_path_rate = drop_path_rate
90
+ self.num_hidden_layers = num_hidden_layers
91
+ self.num_attention_heads = num_attention_heads
92
+ self.num_channels = num_channels
93
+ self.patch_size = patch_size
94
+ self.image_size = image_size
95
+ self.initializer_range = initializer_range
96
+ self.initializer_factor = initializer_factor
97
+ self.attention_dropout = attention_dropout
98
+ self.layer_norm_eps = layer_norm_eps
99
+ self.hidden_act = hidden_act
100
+ self.qkv_bias = qkv_bias
101
+ self.qk_normalization = qk_normalization
102
+ self.use_flash_attn = use_flash_attn
103
+
104
+ @classmethod
105
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
106
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
107
+
108
+ if 'vision_config' in config_dict:
109
+ config_dict = config_dict['vision_config']
110
+
111
+ if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
112
+ logger.warning(
113
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
114
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
115
+ )
116
+
117
+ return cls.from_dict(config_dict, **kwargs)
configuration_internvl.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2023 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import copy
7
+
8
+ from transformers import LlamaConfig
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.utils import logging
11
+
12
+ from .configuration_intern_vit import InternVisionConfig
13
+
14
+ logger = logging.get_logger(__name__)
15
+
16
+
17
+ class InternVLConfig(PretrainedConfig):
18
+ r"""
19
+ [`InternVLConfig`] is the configuration class to store the configuration of a
20
+ [`InternVLModel`]. It is used to instantiate a InternVLModel according to the specified
21
+ arguments, defining the InternViT-6B and QLLaMA configs. Instantiating a configuration with
22
+ the defaults will yield a similar configuration to that of the InternVL architecture.
23
+
24
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
25
+ documentation from [`PretrainedConfig`] for more information.
26
+
27
+ Args:
28
+ vision_config (`dict`, *optional*):
29
+ Dictionary of configuration options used to initialize [`InternVisionConfig`].
30
+ qllama_config (`dict`, *optional*):
31
+ Dictionary of configuration options used to initialize [`LLaMAConfig`].
32
+ clip_embed_dim (`int`, *optional*, defaults to 768):
33
+ Size of the embeddings from the CLIP model.
34
+ attn_pool_num_heads (`int`, *optional*, defaults to 16):
35
+ Number of attention heads used in the attention pooling layers.
36
+ num_query_token (`int`, *optional*, defaults to 96):
37
+ Number of query tokens used in the transformer.
38
+ label_smoothing (`float`, *optional*, defaults to 0.0):
39
+ The amount of label smoothing to apply.
40
+ cross_attention_frequency (`int`, *optional*, defaults to 2):
41
+ The frequency of cross-attention layers in the model.
42
+ use_backbone_lora (`int`, *optional*, defaults to 0):
43
+ If non-zero, indicates the use of LoRA in the backbone of the model.
44
+ use_qllama_lora (`int`, *optional*, defaults to 0):
45
+ If non-zero, indicates the use of LoRA in the QLLaMA of the model.
46
+ force_image_size (`int` or `None`, *optional*):
47
+ If not None, forces the model to use this specific image size.
48
+ initializer_range (`float`, *optional*, defaults to 0.02):
49
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
50
+ kwargs (*optional*):
51
+ Dictionary of additional keyword arguments.
52
+ """
53
+
54
+ model_type = 'internvl'
55
+ is_composition = True
56
+
57
+ def __init__(
58
+ self,
59
+ vision_config=None,
60
+ qllama_config=None,
61
+ clip_embed_dim=768,
62
+ attn_pool_num_heads=16,
63
+ num_query_token=96,
64
+ label_smoothing=0.0,
65
+ cross_attention_frequency=2,
66
+ use_backbone_lora=0,
67
+ use_qllama_lora=0,
68
+ force_image_size=None,
69
+ initializer_range=0.02,
70
+ **kwargs):
71
+ super().__init__(**kwargs)
72
+
73
+ if vision_config is None:
74
+ vision_config = {}
75
+ logger.info('vision_config is None. initializing the InternVisionConfig with default values.')
76
+
77
+ if qllama_config is None:
78
+ qllama_config = {}
79
+ logger.info(
80
+ 'qllama_config is None. Initializing the InternTextConfig config with default values (`LlamaConfig`).')
81
+
82
+ self.vision_config = InternVisionConfig(**vision_config)
83
+ self.qllama_config = LlamaConfig(**qllama_config)
84
+ self.qllama_config.num_query_token = num_query_token
85
+ self.qllama_config.cross_attention_frequency = cross_attention_frequency
86
+ self.hidden_size = self.qllama_config.hidden_size
87
+
88
+ self.clip_embed_dim = clip_embed_dim
89
+ self.attn_pool_num_heads = attn_pool_num_heads
90
+ self.num_query_token = num_query_token
91
+ self.label_smoothing = label_smoothing
92
+ self.use_backbone_lora = use_backbone_lora
93
+ self.use_qllama_lora = use_qllama_lora
94
+ self.force_image_size = force_image_size
95
+ self.initializer_range = initializer_range
96
+
97
+ def to_dict(self):
98
+ """
99
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
100
+
101
+ Returns:
102
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
103
+ """
104
+ output = copy.deepcopy(self.__dict__)
105
+ output['vision_config'] = self.vision_config.to_dict()
106
+ output['qllama_config'] = self.qllama_config.to_dict()
107
+ output['model_type'] = self.__class__.model_type
108
+ return output
flash_attention.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/Dao-AILab/flash-attention/blob/v0.2.8/flash_attn/flash_attention.py
2
+ import torch
3
+ import torch.nn as nn
4
+ from einops import rearrange
5
+
6
+ try: # v1
7
+ from flash_attn.flash_attn_interface import \
8
+ flash_attn_unpadded_qkvpacked_func
9
+ except: # v2
10
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
11
+
12
+ from flash_attn.bert_padding import pad_input, unpad_input
13
+
14
+
15
+ class FlashAttention(nn.Module):
16
+ """Implement the scaled dot product attention with softmax.
17
+ Arguments
18
+ ---------
19
+ softmax_scale: The temperature to use for the softmax attention.
20
+ (default: 1/sqrt(d_keys) where d_keys is computed at
21
+ runtime)
22
+ attention_dropout: The dropout rate to apply to the attention
23
+ (default: 0.0)
24
+ """
25
+
26
+ def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
27
+ super().__init__()
28
+ self.softmax_scale = softmax_scale
29
+ self.dropout_p = attention_dropout
30
+
31
+ def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
32
+ max_s=None, need_weights=False):
33
+ """Implements the multihead softmax attention.
34
+ Arguments
35
+ ---------
36
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
37
+ if unpadded: (nnz, 3, h, d)
38
+ key_padding_mask: a bool tensor of shape (B, S)
39
+ """
40
+ assert not need_weights
41
+ assert qkv.dtype in [torch.float16, torch.bfloat16]
42
+ assert qkv.is_cuda
43
+
44
+ if cu_seqlens is None:
45
+ batch_size = qkv.shape[0]
46
+ seqlen = qkv.shape[1]
47
+ if key_padding_mask is None:
48
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
49
+ max_s = seqlen
50
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
51
+ device=qkv.device)
52
+ output = flash_attn_unpadded_qkvpacked_func(
53
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
54
+ softmax_scale=self.softmax_scale, causal=causal
55
+ )
56
+ output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
57
+ else:
58
+ nheads = qkv.shape[-2]
59
+ x = rearrange(qkv, 'b s three h d -> b s (three h d)')
60
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
61
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
62
+ output_unpad = flash_attn_unpadded_qkvpacked_func(
63
+ x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
64
+ softmax_scale=self.softmax_scale, causal=causal
65
+ )
66
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
67
+ indices, batch_size, seqlen),
68
+ 'b s (h d) -> b s h d', h=nheads)
69
+ else:
70
+ assert max_s is not None
71
+ output = flash_attn_unpadded_qkvpacked_func(
72
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
73
+ softmax_scale=self.softmax_scale, causal=causal
74
+ )
75
+
76
+ return output, None
modeling_intern_vit.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2023 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from typing import Optional, Tuple, Union
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import torch.utils.checkpoint
11
+ from einops import rearrange
12
+ from timm.models.layers import DropPath
13
+ from torch import nn
14
+ from transformers.activations import ACT2FN
15
+ from transformers.modeling_outputs import (BaseModelOutput,
16
+ BaseModelOutputWithPooling)
17
+ from transformers.modeling_utils import PreTrainedModel
18
+ from transformers.utils import logging
19
+
20
+ from .configuration_intern_vit import InternVisionConfig
21
+
22
+ try:
23
+ from .flash_attention import FlashAttention
24
+ has_flash_attn = True
25
+ except:
26
+ print('FlashAttention is not installed.')
27
+ has_flash_attn = False
28
+
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ class InternRMSNorm(nn.Module):
34
+ def __init__(self, hidden_size, eps=1e-6):
35
+ super().__init__()
36
+ self.weight = nn.Parameter(torch.ones(hidden_size))
37
+ self.variance_epsilon = eps
38
+
39
+ def forward(self, hidden_states):
40
+ input_dtype = hidden_states.dtype
41
+ hidden_states = hidden_states.to(torch.float32)
42
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
43
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
44
+ return self.weight * hidden_states.to(input_dtype)
45
+
46
+
47
+ try:
48
+ from apex.normalization import FusedRMSNorm
49
+
50
+ InternRMSNorm = FusedRMSNorm # noqa
51
+
52
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
53
+ except ImportError:
54
+ # using the normal InternRMSNorm
55
+ pass
56
+ except Exception:
57
+ logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
58
+ pass
59
+
60
+
61
+ class InternVisionEmbeddings(nn.Module):
62
+ def __init__(self, config: InternVisionConfig):
63
+ super().__init__()
64
+ self.config = config
65
+ self.embed_dim = config.hidden_size
66
+ self.image_size = config.image_size
67
+ self.patch_size = config.patch_size
68
+
69
+ self.class_embedding = nn.Parameter(
70
+ torch.randn(1, 1, self.embed_dim),
71
+ )
72
+
73
+ self.patch_embedding = nn.Conv2d(
74
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
75
+ )
76
+
77
+ self.num_patches = (self.image_size // self.patch_size) ** 2
78
+ self.num_positions = self.num_patches + 1
79
+
80
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
81
+
82
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
83
+ batch_size = pixel_values.shape[0]
84
+ target_dtype = self.patch_embedding.weight.dtype
85
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
86
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
87
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
88
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
89
+ embeddings = embeddings + self.position_embedding.to(target_dtype)
90
+ return embeddings
91
+
92
+
93
+ class InternAttention(nn.Module):
94
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
95
+
96
+ def __init__(self, config: InternVisionConfig):
97
+ super().__init__()
98
+ self.config = config
99
+ self.embed_dim = config.hidden_size
100
+ self.num_heads = config.num_attention_heads
101
+ self.use_flash_attn = config.use_flash_attn and has_flash_attn
102
+ if config.use_flash_attn and not has_flash_attn:
103
+ print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
104
+ self.head_dim = self.embed_dim // self.num_heads
105
+ if self.head_dim * self.num_heads != self.embed_dim:
106
+ raise ValueError(
107
+ f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
108
+ f' {self.num_heads}).'
109
+ )
110
+
111
+ self.scale = self.head_dim ** -0.5
112
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
113
+ self.attn_drop = nn.Dropout(config.attention_dropout)
114
+ self.proj_drop = nn.Dropout(config.dropout)
115
+
116
+ self.qk_normalization = config.qk_normalization
117
+
118
+ if self.qk_normalization:
119
+ self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
120
+ self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
121
+
122
+ if self.use_flash_attn:
123
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
124
+ self.proj = nn.Linear(self.embed_dim, self.embed_dim)
125
+
126
+ def _naive_attn(self, x):
127
+ B, N, C = x.shape
128
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
129
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
130
+
131
+ if self.qk_normalization:
132
+ B_, H_, N_, D_ = q.shape
133
+ q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
134
+ k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
135
+
136
+ attn = ((q * self.scale) @ k.transpose(-2, -1))
137
+ attn = attn.softmax(dim=-1)
138
+ attn = self.attn_drop(attn)
139
+
140
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
141
+ x = self.proj(x)
142
+ x = self.proj_drop(x)
143
+ return x
144
+
145
+ def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
146
+ qkv = self.qkv(x)
147
+ qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
148
+
149
+ if self.qk_normalization:
150
+ q, k, v = qkv.unbind(2)
151
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
152
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
153
+ qkv = torch.stack([q, k, v], dim=2)
154
+
155
+ context, _ = self.inner_attn(
156
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
157
+ )
158
+ outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
159
+ outs = self.proj_drop(outs)
160
+ return outs
161
+
162
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
163
+ x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
164
+ return x
165
+
166
+
167
+ class InternMLP(nn.Module):
168
+ def __init__(self, config: InternVisionConfig):
169
+ super().__init__()
170
+ self.config = config
171
+ self.act = ACT2FN[config.hidden_act]
172
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
173
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
174
+
175
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
176
+ hidden_states = self.fc1(hidden_states)
177
+ hidden_states = self.act(hidden_states)
178
+ hidden_states = self.fc2(hidden_states)
179
+ return hidden_states
180
+
181
+
182
+ class InternVisionEncoderLayer(nn.Module):
183
+ def __init__(self, config: InternVisionConfig, drop_path_rate: float):
184
+ super().__init__()
185
+ self.embed_dim = config.hidden_size
186
+ self.intermediate_size = config.intermediate_size
187
+
188
+ self.attn = InternAttention(config)
189
+ self.mlp = InternMLP(config)
190
+ self.norm1 = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
191
+ self.norm2 = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
192
+
193
+ self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
194
+ self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
195
+ self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
196
+ self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
197
+
198
+ def forward(
199
+ self,
200
+ hidden_states: torch.Tensor,
201
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
202
+ """
203
+ Args:
204
+ hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
205
+ """
206
+ hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
207
+
208
+ hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
209
+
210
+ return hidden_states
211
+
212
+
213
+ class InternVisionEncoder(nn.Module):
214
+ """
215
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
216
+ [`InternEncoderLayer`].
217
+
218
+ Args:
219
+ config (`InternConfig`):
220
+ The corresponding vision configuration for the `InternEncoder`.
221
+ """
222
+
223
+ def __init__(self, config: InternVisionConfig):
224
+ super().__init__()
225
+ self.config = config
226
+ # stochastic depth decay rule
227
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
228
+ self.layers = nn.ModuleList([
229
+ InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
230
+ self.gradient_checkpointing = True
231
+
232
+ def forward(
233
+ self,
234
+ inputs_embeds,
235
+ output_hidden_states: Optional[bool] = None,
236
+ return_dict: Optional[bool] = None,
237
+ ) -> Union[Tuple, BaseModelOutput]:
238
+ r"""
239
+ Args:
240
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
241
+ Embedded representation of the inputs. Should be float, not int tokens.
242
+ output_hidden_states (`bool`, *optional*):
243
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
244
+ for more detail.
245
+ return_dict (`bool`, *optional*):
246
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
247
+ """
248
+ output_hidden_states = (
249
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
250
+ )
251
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
252
+
253
+ encoder_states = () if output_hidden_states else None
254
+ hidden_states = inputs_embeds
255
+
256
+ for idx, encoder_layer in enumerate(self.layers):
257
+ if output_hidden_states:
258
+ encoder_states = encoder_states + (hidden_states,)
259
+ if self.gradient_checkpointing and self.training:
260
+ layer_outputs = torch.utils.checkpoint.checkpoint(
261
+ encoder_layer,
262
+ hidden_states)
263
+ else:
264
+ layer_outputs = encoder_layer(
265
+ hidden_states,
266
+ )
267
+ hidden_states = layer_outputs
268
+
269
+ if output_hidden_states:
270
+ encoder_states = encoder_states + (hidden_states,)
271
+
272
+ if not return_dict:
273
+ return tuple(v for v in [hidden_states, encoder_states] if v is not None)
274
+ return BaseModelOutput(
275
+ last_hidden_state=hidden_states, hidden_states=encoder_states
276
+ )
277
+
278
+
279
+ class InternVisionModel(PreTrainedModel):
280
+ main_input_name = 'pixel_values'
281
+ config_class = InternVisionConfig
282
+
283
+ def __init__(self, config: InternVisionConfig):
284
+ super().__init__(config)
285
+ self.config = config
286
+
287
+ self.embeddings = InternVisionEmbeddings(config)
288
+ self.encoder = InternVisionEncoder(config)
289
+
290
+ def resize_pos_embeddings(self, old_size, new_size, patch_size):
291
+ pos_emb = self.embeddings.position_embedding
292
+ _, num_positions, embed_dim = pos_emb.shape
293
+ cls_emb = pos_emb[:, :1, :]
294
+ pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
295
+ pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
296
+ pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
297
+ pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
298
+ self.embeddings.position_embedding = nn.Parameter(pos_emb)
299
+ logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
300
+
301
+ def get_input_embeddings(self):
302
+ return self.embeddings
303
+
304
+ def forward(
305
+ self,
306
+ pixel_values: Optional[torch.FloatTensor] = None,
307
+ output_hidden_states: Optional[bool] = None,
308
+ return_dict: Optional[bool] = None,
309
+ pixel_embeds: Optional[torch.FloatTensor] = None,
310
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
311
+ output_hidden_states = (
312
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
313
+ )
314
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
315
+
316
+ if pixel_values is None and pixel_embeds is None:
317
+ raise ValueError('You have to specify pixel_values or pixel_embeds')
318
+
319
+ if pixel_embeds is not None:
320
+ hidden_states = pixel_embeds
321
+ else:
322
+ if len(pixel_values.shape) == 4:
323
+ hidden_states = self.embeddings(pixel_values)
324
+ else:
325
+ raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
326
+ encoder_outputs = self.encoder(
327
+ inputs_embeds=hidden_states,
328
+ output_hidden_states=output_hidden_states,
329
+ return_dict=return_dict,
330
+ )
331
+ last_hidden_state = encoder_outputs.last_hidden_state
332
+ pooled_output = last_hidden_state[:, 0, :]
333
+
334
+ if not return_dict:
335
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
336
+
337
+ return BaseModelOutputWithPooling(
338
+ last_hidden_state=last_hidden_state,
339
+ pooler_output=pooled_output,
340
+ hidden_states=encoder_outputs.hidden_states,
341
+ attentions=encoder_outputs.attentions,
342
+ )
modeling_internvl.py ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2023 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from functools import partial
7
+ from typing import Optional
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn.functional as F
12
+ import torch.utils.checkpoint
13
+ from peft import LoraConfig, get_peft_model
14
+ from timm.models.layers import DropPath
15
+ from torch import nn
16
+ from transformers import GenerationConfig
17
+ from transformers.modeling_utils import PreTrainedModel
18
+ from transformers.utils import logging
19
+
20
+ from .configuration_internvl import InternVLConfig
21
+ from .modeling_intern_vit import (InternVisionEmbeddings, InternVisionEncoder,
22
+ InternVisionModel)
23
+ from .modeling_qllama import LlamaForCausalLM, _expand_mask, _make_causal_mask
24
+
25
+ try:
26
+ from .flash_attention import FlashAttention # v1/v2
27
+ except:
28
+ print('FlashAttention is not installed.')
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ class InternVLPreTrainedModel(PreTrainedModel):
34
+ """
35
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
36
+ models.
37
+ """
38
+
39
+ config_class = InternVLConfig
40
+ base_model_prefix = 'internvl'
41
+ supports_gradient_checkpointing = True
42
+ _keys_to_ignore_on_load_missing = [
43
+ r'position_ids',
44
+ ]
45
+ _no_split_modules = ['InternAttention', 'LlamaDecoderLayer', 'LlamaForCausalLM']
46
+ _skip_keys_device_placement = 'past_key_values'
47
+ _keep_in_fp32_modules = ['wo']
48
+
49
+ def _init_weights(self, module):
50
+ """Initialize the weights"""
51
+ factor = self.config.initializer_range
52
+ if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
53
+ module.weight.data.normal_(mean=0.0, std=factor)
54
+ if hasattr(module, 'bias') and module.bias is not None:
55
+ module.bias.data.zero_()
56
+ if isinstance(module, InternVisionEmbeddings):
57
+ if hasattr(self.config, 'vision_config'):
58
+ factor = self.config.vision_config.initializer_range
59
+ nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
60
+ nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
61
+ elif isinstance(module, nn.LayerNorm):
62
+ module.bias.data.zero_()
63
+ module.weight.data.fill_(1.0)
64
+ elif isinstance(module, nn.Linear) and module.bias is not None:
65
+ module.bias.data.zero_()
66
+
67
+ def _set_gradient_checkpointing(self, module, value=False):
68
+ if isinstance(module, InternVisionModel):
69
+ module.gradient_checkpointing = value
70
+ if isinstance(module, InternVisionEncoder):
71
+ module.gradient_checkpointing = value
72
+
73
+
74
+ class CrossAttention(nn.Module):
75
+ def __init__(
76
+ self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
77
+ proj_drop=0., attn_head_dim=None, out_dim=None):
78
+ super().__init__()
79
+ if out_dim is None:
80
+ out_dim = dim
81
+ self.num_heads = num_heads
82
+ head_dim = dim // num_heads
83
+ if attn_head_dim is not None:
84
+ head_dim = attn_head_dim
85
+ all_head_dim = head_dim * self.num_heads
86
+ self.scale = qk_scale or head_dim ** -0.5
87
+ assert all_head_dim == dim
88
+
89
+ self.q = nn.Linear(dim, all_head_dim, bias=False)
90
+ self.k = nn.Linear(dim, all_head_dim, bias=False)
91
+ self.v = nn.Linear(dim, all_head_dim, bias=False)
92
+
93
+ if qkv_bias:
94
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
95
+ self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
96
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
97
+ else:
98
+ self.q_bias = None
99
+ self.k_bias = None
100
+ self.v_bias = None
101
+
102
+ self.attn_drop = nn.Dropout(attn_drop)
103
+ self.proj = nn.Linear(all_head_dim, out_dim)
104
+ self.proj_drop = nn.Dropout(proj_drop)
105
+
106
+ def forward(self, x, k=None, v=None):
107
+ B, N, C = x.shape
108
+ N_k = k.shape[1]
109
+ N_v = v.shape[1]
110
+
111
+ q_bias, k_bias, v_bias = None, None, None
112
+ if self.q_bias is not None:
113
+ q_bias = self.q_bias
114
+ k_bias = self.k_bias
115
+ v_bias = self.v_bias
116
+
117
+ q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
118
+ q = q.reshape(B, N, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) # (B, N_head, N_q, dim)
119
+
120
+ k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
121
+ k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)
122
+
123
+ v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
124
+ v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)
125
+
126
+ q = q * self.scale
127
+ attn = (q @ k.transpose(-2, -1)) # (B, N_head, N_q, N_k)
128
+
129
+ attn = attn.softmax(dim=-1)
130
+ attn = self.attn_drop(attn)
131
+
132
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
133
+ x = self.proj(x)
134
+ x = self.proj_drop(x)
135
+
136
+ return x
137
+
138
+
139
+ class AttentiveBlock(nn.Module):
140
+
141
+ def __init__(self, dim, num_heads, qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
142
+ drop_path=0., norm_layer=nn.LayerNorm, attn_head_dim=None, out_dim=None):
143
+ super().__init__()
144
+
145
+ self.norm1_q = norm_layer(dim)
146
+ self.norm1_k = norm_layer(dim)
147
+ self.norm1_v = norm_layer(dim)
148
+ self.cross_attn = CrossAttention(
149
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop,
150
+ proj_drop=drop, attn_head_dim=attn_head_dim, out_dim=out_dim)
151
+
152
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
153
+
154
+ def forward(self, x_q, x_kv, pos_q, pos_k, bool_masked_pos, rel_pos_bias=None):
155
+ x_q = self.norm1_q(x_q + pos_q)
156
+ x_k = self.norm1_k(x_kv + pos_k)
157
+ x_v = self.norm1_v(x_kv)
158
+ x = self.cross_attn(x_q, k=x_k, v=x_v)
159
+
160
+ return x
161
+
162
+
163
+ class AttentionPoolingBlock(AttentiveBlock):
164
+
165
+ def forward(self, x):
166
+ x_q = x.mean(1, keepdim=True)
167
+ x_kv, pos_q, pos_k = x, 0, 0
168
+ x = super().forward(x_q, x_kv, pos_q, pos_k, bool_masked_pos=None, rel_pos_bias=None)
169
+ x = x.squeeze(1)
170
+ return x
171
+
172
+
173
+ class InternVLModel(InternVLPreTrainedModel):
174
+ config_class = InternVLConfig
175
+ main_input_name = 'pixel_values'
176
+
177
+ def __init__(self, config: InternVLConfig):
178
+ super().__init__(config)
179
+
180
+ text_hidden_size = config.qllama_config.hidden_size
181
+ vision_hidden_size = config.vision_config.hidden_size
182
+ clip_embed_dim = config.clip_embed_dim
183
+ attn_pool_num_heads = config.attn_pool_num_heads
184
+ config.qllama_config.num_query_token = config.num_query_token
185
+ self.num_query_token = config.num_query_token
186
+ self.label_smoothing = config.label_smoothing
187
+
188
+ self.vision_model = InternVisionModel(config.vision_config) # frozen
189
+ self.qllama = LlamaForCausalLM(config.qllama_config) # frozen
190
+ self.query_tokens = nn.Parameter( # trainable
191
+ torch.zeros(1, config.num_query_token, text_hidden_size)
192
+ )
193
+
194
+ self.text_projection = nn.Parameter(torch.empty(text_hidden_size, clip_embed_dim)) # frozen
195
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) # trainable
196
+ self.clip_projector = AttentionPoolingBlock( # frozen
197
+ dim=vision_hidden_size, num_heads=attn_pool_num_heads, qkv_bias=True, qk_scale=None,
198
+ drop=0., attn_drop=0., norm_layer=partial(nn.LayerNorm, eps=1e-5), out_dim=clip_embed_dim)
199
+ self.clip_projector2 = AttentionPoolingBlock( # trainable
200
+ dim=text_hidden_size, num_heads=attn_pool_num_heads, qkv_bias=True, qk_scale=None,
201
+ drop=0., attn_drop=0., norm_layer=partial(nn.LayerNorm, eps=1e-5), out_dim=clip_embed_dim)
202
+ self.itm_head = nn.Linear(text_hidden_size, 2) # trainable
203
+ self.gradient_checkpointing = True
204
+
205
+ # Initialize weights and apply final processing
206
+ # self.post_init()
207
+
208
+ if config.use_backbone_lora:
209
+ self.wrap_backbone_lora(r=config.use_backbone_lora)
210
+ if config.use_qllama_lora:
211
+ self.wrap_qllama_lora(r=config.use_qllama_lora)
212
+ if config.force_image_size:
213
+ self.vision_model.resize_pos_embeddings(
214
+ old_size=config.vision_config.image_size,
215
+ new_size=config.force_image_size,
216
+ patch_size=config.vision_config.patch_size
217
+ )
218
+
219
+ def wrap_backbone_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
220
+ lora_config = LoraConfig(
221
+ r=r,
222
+ target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2'],
223
+ lora_alpha=lora_alpha,
224
+ lora_dropout=lora_dropout,
225
+ )
226
+ self.vision_model = get_peft_model(self.vision_model, lora_config)
227
+ self.vision_model.print_trainable_parameters()
228
+
229
+ def wrap_qllama_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
230
+ lora_config = LoraConfig(
231
+ r=r,
232
+ target_modules=['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj', 'self_attn.o_proj',
233
+ 'mlp.gate_proj', 'mlp.down_proj', 'mlp.up_proj'],
234
+ lora_alpha=lora_alpha,
235
+ lora_dropout=lora_dropout,
236
+ )
237
+ self.qllama = get_peft_model(self.qllama, lora_config)
238
+ self.qllama.print_trainable_parameters()
239
+
240
+ def get_input_embeddings(self):
241
+ return self.qllama.get_input_embeddings()
242
+
243
+ def set_input_embeddings(self, value):
244
+ self.qllama.set_input_embeddings(value)
245
+
246
+ def set_output_embeddings(self, new_embeddings):
247
+ self.qllama.set_output_embeddings(new_embeddings)
248
+
249
+ def get_output_embeddings(self) -> nn.Module:
250
+ return self.qllama.get_output_embeddings()
251
+
252
+ @torch.no_grad()
253
+ def generate(
254
+ self,
255
+ pixel_values: torch.FloatTensor,
256
+ input_ids: torch.FloatTensor,
257
+ attention_mask: torch.LongTensor,
258
+ generation_config: Optional[GenerationConfig] = None,
259
+ output_hidden_states: Optional[bool] = None,
260
+ return_dict: Optional[bool] = None,
261
+ **generate_kwargs,
262
+ ) -> torch.LongTensor:
263
+
264
+ vision_outputs = self.vision_model(
265
+ pixel_values=pixel_values,
266
+ output_hidden_states=output_hidden_states,
267
+ return_dict=return_dict)
268
+ image_embeds = vision_outputs[0]
269
+
270
+ batch_size = image_embeds.shape[0]
271
+ input_embeds = self.get_input_embeddings()(input_ids)
272
+ query_tokens = self.query_tokens.repeat(batch_size, 1, 1)
273
+ input_embeds = torch.cat([query_tokens, input_embeds], dim=1)
274
+ image_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
275
+ attention_mask = torch.cat([image_attention_mask, attention_mask], dim=1)
276
+
277
+ outputs = self.qllama.generate(
278
+ inputs_embeds=input_embeds,
279
+ attention_mask=attention_mask,
280
+ vision_hidden_states=image_embeds,
281
+ generation_config=generation_config,
282
+ use_zero_attention_mask=True,
283
+ **generate_kwargs,
284
+ )
285
+
286
+ return outputs
287
+
288
+ def get_text_features(
289
+ self,
290
+ input_ids: torch.Tensor,
291
+ attention_mask: torch.Tensor,
292
+ output_attentions: Optional[bool] = None,
293
+ output_hidden_states: Optional[bool] = None,
294
+ return_dict: Optional[bool] = None,
295
+ ):
296
+ r"""
297
+ Returns:
298
+ text_outputs (`CausalLMOutputWithPast`, or `tuple(torch.FloatTensor)` if `return_dict=False`):
299
+ The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that
300
+ contains the language model logits, the past key values and the hidden states if
301
+ `output_hidden_states=True`.
302
+ ```"""
303
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
304
+ output_hidden_states = (
305
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
306
+ )
307
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
308
+
309
+ input_embeds = self.get_input_embeddings()(input_ids)
310
+ attention_mask = _expand_mask(attention_mask, input_embeds.dtype).to(
311
+ input_embeds.device) # [bsz, 1, tgt_seq_len, src_seq_len]
312
+ attention_mask += _make_causal_mask(
313
+ (attention_mask.shape[0], attention_mask.shape[2]),
314
+ input_embeds.dtype,
315
+ device=input_embeds.device
316
+ )
317
+ if type(self.qllama.model) == LlamaForCausalLM:
318
+ outputs = self.qllama.model.model.forward_train(
319
+ inputs_embeds=input_embeds,
320
+ vision_hidden_states=None,
321
+ attention_mask=attention_mask,
322
+ output_attentions=output_attentions,
323
+ output_hidden_states=output_hidden_states,
324
+ return_dict=return_dict,
325
+ ).last_hidden_state
326
+ else:
327
+ outputs = self.qllama.model.forward_train(
328
+ inputs_embeds=input_embeds,
329
+ vision_hidden_states=None,
330
+ attention_mask=attention_mask,
331
+ output_attentions=output_attentions,
332
+ output_hidden_states=output_hidden_states,
333
+ return_dict=return_dict,
334
+ ).last_hidden_state
335
+ return outputs
336
+
337
+ def get_image_features(
338
+ self,
339
+ pixel_values: torch.FloatTensor,
340
+ output_attentions: Optional[bool] = None,
341
+ output_hidden_states: Optional[bool] = None,
342
+ return_dict: Optional[bool] = None,
343
+ ):
344
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
345
+ output_hidden_states = (
346
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
347
+ )
348
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
349
+
350
+ vision_outputs = self.vision_model(
351
+ pixel_values=pixel_values,
352
+ output_hidden_states=output_hidden_states,
353
+ return_dict=return_dict)
354
+ image_embeds = vision_outputs[0]
355
+ backbone_embeds = image_embeds
356
+
357
+ batch_size = image_embeds.shape[0]
358
+ input_embeds = self.query_tokens.repeat(batch_size, 1, 1)
359
+
360
+ attention_mask = torch.ones(input_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
361
+ attention_mask = _expand_mask(attention_mask, input_embeds.dtype).to(
362
+ input_embeds.device) # [bsz, 1, tgt_seq_len, src_seq_len]
363
+ if type(self.qllama.model) == LlamaForCausalLM:
364
+ outputs = self.qllama.model.model.forward_train(
365
+ inputs_embeds=input_embeds,
366
+ vision_hidden_states=image_embeds,
367
+ attention_mask=attention_mask,
368
+ output_attentions=output_attentions,
369
+ output_hidden_states=output_hidden_states,
370
+ return_dict=return_dict,
371
+ ).last_hidden_state
372
+ else:
373
+ outputs = self.qllama.model.forward_train(
374
+ inputs_embeds=input_embeds,
375
+ vision_hidden_states=image_embeds,
376
+ attention_mask=attention_mask,
377
+ output_attentions=output_attentions,
378
+ output_hidden_states=output_hidden_states,
379
+ return_dict=return_dict,
380
+ ).last_hidden_state
381
+ return backbone_embeds, outputs
382
+
383
+ def encode_image(self, image, mode):
384
+ if mode == 'InternVL-C':
385
+ vision_outputs = self.vision_model(
386
+ pixel_values=image,
387
+ output_hidden_states=False,
388
+ return_dict=True)
389
+ image_embeds = vision_outputs[0]
390
+ image_embeds = self.clip_projector(image_embeds)
391
+ elif mode == 'InternVL-G':
392
+ backbone_embeds, image_embeds = self.get_image_features(
393
+ pixel_values=image,
394
+ output_hidden_states=False,
395
+ return_dict=True,
396
+ )
397
+ backbone_embeds = self.clip_projector(backbone_embeds)
398
+ image_embeds = self.clip_projector2(image_embeds)
399
+ # ensemble
400
+ backbone_embeds = backbone_embeds / backbone_embeds.norm(dim=1, keepdim=True)
401
+ image_embeds = image_embeds / image_embeds.norm(dim=1, keepdim=True)
402
+ image_embeds = image_embeds + backbone_embeds
403
+ else:
404
+ raise NotImplementedError
405
+ return image_embeds
406
+
407
+ def encode_text(self, text):
408
+ attention_mask = text > 0
409
+ text_embeds = self.get_text_features(
410
+ input_ids=text,
411
+ attention_mask=attention_mask,
412
+ output_attentions=False,
413
+ output_hidden_states=False,
414
+ return_dict=True,
415
+ )
416
+ text_embeds = text_embeds[torch.arange(text_embeds.shape[0]), attention_mask.sum(1) - 1]
417
+ text_embeds = text_embeds @ self.text_projection
418
+ return text_embeds
419
+
420
+ def forward(self, image, text, mode='InternVL-C'):
421
+ assert mode in ['InternVL-C', 'InternVL-G'], 'mode must be InternVL-C or InternVL-G'
422
+ image_features = self.encode_image(image, mode)
423
+ text_features = self.encode_text(text)
424
+
425
+ # normalized features
426
+ image_features = image_features / image_features.norm(dim=1, keepdim=True)
427
+ text_features = text_features / text_features.norm(dim=1, keepdim=True)
428
+
429
+ # cosine similarity as logits
430
+ logit_scale = self.logit_scale.exp()
431
+ logits_per_image = logit_scale * image_features @ text_features.t()
432
+ logits_per_text = logits_per_image.t()
433
+
434
+ return logits_per_image, logits_per_text
435
+
436
+
437
+ class InternVL_C(InternVLModel):
438
+
439
+ def encode_image(self, image):
440
+ vision_outputs = self.vision_model(
441
+ pixel_values=image,
442
+ output_hidden_states=False,
443
+ return_dict=True)
444
+ image_embeds = vision_outputs[0]
445
+ image_embeds = self.clip_projector(image_embeds)
446
+ return image_embeds
447
+
448
+ def encode_text(self, text):
449
+ attention_mask = text > 0
450
+ text_embeds = self.get_text_features(
451
+ input_ids=text,
452
+ attention_mask=attention_mask,
453
+ output_attentions=False,
454
+ output_hidden_states=False,
455
+ return_dict=True,
456
+ )
457
+ text_embeds = text_embeds[torch.arange(text_embeds.shape[0]), attention_mask.sum(1) - 1]
458
+ text_embeds = text_embeds @ self.text_projection
459
+ return text_embeds
460
+
461
+ def forward(self, image, text):
462
+ image_features = self.encode_image(image)
463
+ text_features = self.encode_text(text)
464
+
465
+ # normalized features
466
+ image_features = image_features / image_features.norm(dim=1, keepdim=True)
467
+ text_features = text_features / text_features.norm(dim=1, keepdim=True)
468
+
469
+ # cosine similarity as logits
470
+ logit_scale = self.logit_scale.exp()
471
+ logits_per_image = logit_scale * image_features @ text_features.t()
472
+ logits_per_text = logits_per_image.t()
473
+
474
+ return logits_per_image, logits_per_text
475
+
476
+
477
+ class InternVL_G(InternVLModel):
478
+
479
+ def encode_image(self, image):
480
+ backbone_embeds, image_embeds = self.get_image_features(
481
+ pixel_values=image,
482
+ output_hidden_states=False,
483
+ return_dict=True,
484
+ )
485
+ backbone_embeds = self.clip_projector(backbone_embeds)
486
+ image_embeds = self.clip_projector2(image_embeds)
487
+ # ensemble
488
+ backbone_embeds = backbone_embeds / backbone_embeds.norm(dim=1, keepdim=True)
489
+ image_embeds = image_embeds / image_embeds.norm(dim=1, keepdim=True)
490
+ image_embeds = image_embeds + backbone_embeds
491
+ return image_embeds
492
+
493
+ def encode_text(self, text):
494
+ attention_mask = text > 0
495
+ text_embeds = self.get_text_features(
496
+ input_ids=text,
497
+ attention_mask=attention_mask,
498
+ output_attentions=False,
499
+ output_hidden_states=False,
500
+ return_dict=True,
501
+ )
502
+ text_embeds = text_embeds[torch.arange(text_embeds.shape[0]), attention_mask.sum(1) - 1]
503
+ text_embeds = text_embeds @ self.text_projection
504
+ return text_embeds
505
+
506
+ def forward(self, image, text):
507
+ image_features = self.encode_image(image)
508
+ text_features = self.encode_text(text)
509
+
510
+ # normalized features
511
+ image_features = image_features / image_features.norm(dim=1, keepdim=True)
512
+ text_features = text_features / text_features.norm(dim=1, keepdim=True)
513
+
514
+ # cosine similarity as logits
515
+ logit_scale = self.logit_scale.exp()
516
+ logits_per_image = logit_scale * image_features @ text_features.t()
517
+ logits_per_text = logits_per_image.t()
518
+
519
+ return logits_per_image, logits_per_text
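
For reference, a minimal usage sketch of the contrastive interface defined above. It is a sketch under assumptions, not the official inference script: `ckpt_dir` is a placeholder for a local copy of this checkpoint, the `summarize:` prefix and `max_length=80` padding are illustrative tokenizer settings, and the random `pixel_values` stand in for real images preprocessed with the 224x224 CLIP-style transform.

```python
# Minimal sketch, assuming a CUDA device and that modeling_internvl.py is importable.
import torch
from transformers import LlamaTokenizer
from modeling_internvl import InternVL_C  # import path is an assumption

ckpt_dir = 'path/to/this/checkpoint'  # placeholder

model = InternVL_C.from_pretrained(ckpt_dir, torch_dtype=torch.float16).cuda().eval()
tokenizer = LlamaTokenizer.from_pretrained(ckpt_dir)
tokenizer.pad_token = ' '        # assumption: pad with a space token
tokenizer.add_eos_token = True

texts = ['summarize:a photo of a cat', 'summarize:a photo of a dog']
input_ids = tokenizer(texts, return_tensors='pt', max_length=80,
                      truncation=True, padding='max_length').input_ids.cuda()

# Stand-in for two real images (3x224x224, bicubic-resized and ImageNet-normalized in practice).
pixel_values = torch.randn(2, 3, 224, 224, dtype=torch.float16).cuda()

with torch.no_grad():
    logits_per_image, logits_per_text = model(pixel_values, input_ids)
probs = logits_per_image.softmax(dim=-1)  # image-to-text matching probabilities
```

`InternVL_G` exposes the same `forward(image, text)` interface but, as its `encode_image` above shows, ensembles the normalized backbone embedding with the QLLaMA-refined embedding before scoring.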
modeling_qllama.py ADDED
@@ -0,0 +1,1073 @@
1
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
4
+ # and OPT implementations in this library. It has been modified from its
5
+ # original forms to accommodate minor architectural differences compared
6
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+ """ PyTorch QLLaMA model."""
20
+ import math
21
+ from typing import List, Optional, Tuple, Union
22
+
23
+ import torch
24
+ import torch.utils.checkpoint
25
+ from torch import nn
26
+ from torch.nn import CrossEntropyLoss
27
+ from transformers import LlamaConfig
28
+ from transformers.activations import ACT2FN
29
+ from transformers.modeling_outputs import (BaseModelOutputWithPast,
30
+ CausalLMOutputWithPast)
31
+ from transformers.modeling_utils import PreTrainedModel
32
+ from transformers.utils import (add_start_docstrings,
33
+ add_start_docstrings_to_model_forward, logging,
34
+ replace_return_docstrings)
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+ _CONFIG_FOR_DOC = 'LlamaConfig'
39
+
40
+
41
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
42
+ def _make_causal_mask(
43
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
44
+ ):
45
+ """
46
+ Make causal mask used for bi-directional self-attention.
47
+ """
48
+ bsz, tgt_len = input_ids_shape
49
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
50
+ mask_cond = torch.arange(mask.size(-1), device=device)
51
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
52
+ mask = mask.to(dtype)
53
+
54
+ if past_key_values_length > 0:
55
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
56
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
57
+
58
+
59
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
60
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
61
+ """
62
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
63
+ """
64
+ bsz, src_len = mask.size()
65
+ tgt_len = tgt_len if tgt_len is not None else src_len
66
+
67
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
68
+
69
+ inverted_mask = 1.0 - expanded_mask
70
+
71
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
72
+
73
+
74
+ class LlamaRMSNorm(nn.Module):
75
+ def __init__(self, hidden_size, eps=1e-6):
76
+ """
77
+ LlamaRMSNorm is equivalent to T5LayerNorm
78
+ """
79
+ super().__init__()
80
+ self.weight = nn.Parameter(torch.ones(hidden_size))
81
+ self.variance_epsilon = eps
82
+
83
+ def forward(self, hidden_states):
84
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
85
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
86
+
87
+ # convert into half-precision if necessary
88
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
89
+ hidden_states = hidden_states.to(self.weight.dtype)
90
+
91
+ return self.weight * hidden_states
92
+
93
+
94
+ try:
95
+ from functools import partial
96
+
97
+ from apex.normalization import FusedRMSNorm
98
+
99
+ LlamaRMSNorm = partial(FusedRMSNorm, eps=1e-6) # noqa
100
+ print('Discovered apex.normalization.FusedRMSNorm - will use it instead of LlamaRMSNorm')
101
+ except ImportError:
102
+ # using the normal LlamaRMSNorm
103
+ pass
104
+ except Exception:
105
+ print('Discovered apex but it failed to load; falling back to LlamaRMSNorm')
106
+ pass
107
+
108
+
109
+ class LlamaRotaryEmbedding(torch.nn.Module):
110
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
111
+ super().__init__()
112
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
113
+ self.register_buffer('inv_freq', inv_freq)
114
+
115
+ # Build here to make `torch.jit.trace` work.
116
+ self.max_seq_len_cached = max_position_embeddings
117
+ t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
118
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
119
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
120
+ emb = torch.cat((freqs, freqs), dim=-1)
121
+ self.register_buffer('cos_cached', emb.cos()[None, None, :, :], persistent=False)
122
+ self.register_buffer('sin_cached', emb.sin()[None, None, :, :], persistent=False)
123
+
124
+ def forward(self, x, seq_len=None):
125
+ # x: [bs, num_attention_heads, seq_len, head_size]
126
+ # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
127
+ if seq_len > self.max_seq_len_cached:
128
+ self.max_seq_len_cached = seq_len
129
+ t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
130
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
131
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
132
+ emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
133
+ self.register_buffer('cos_cached', emb.cos()[None, None, :, :], persistent=False)
134
+ self.register_buffer('sin_cached', emb.sin()[None, None, :, :], persistent=False)
135
+ return (
136
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
137
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
138
+ )
139
+
140
+
141
+ class FixedLlamaRotaryEmbedding(torch.nn.Module):
142
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
143
+ super().__init__()
144
+
145
+ self.dim = dim
146
+ self.max_position_embeddings = max_position_embeddings
147
+ self.base = base
148
+ self.inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
149
+
150
+ # Build here to make `torch.jit.trace` work.
151
+ self._set_cos_sin_cache(
152
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
153
+ )
154
+
155
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
156
+ self.max_seq_len_cached = seq_len
157
+ t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32)
158
+
159
+ freqs = torch.outer(t, self.inv_freq)
160
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
161
+ emb = torch.cat((freqs, freqs), dim=-1)
162
+ self.register_buffer('cos_cached', emb.cos()[None, None, :, :], persistent=False)
163
+ self.register_buffer('sin_cached', emb.sin()[None, None, :, :], persistent=False)
164
+
165
+ def forward(self, x, seq_len=None):
166
+ # x: [bs, num_attention_heads, seq_len, head_size]
167
+ if seq_len > self.max_seq_len_cached:
168
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
169
+
170
+ return (
171
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
172
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
173
+ )
174
+
175
+
176
+ LlamaRotaryEmbedding = FixedLlamaRotaryEmbedding
177
+
178
+
179
+ def rotate_half(x):
180
+ """Rotates half the hidden dims of the input."""
181
+ x1 = x[..., : x.shape[-1] // 2]
182
+ x2 = x[..., x.shape[-1] // 2:]
183
+ return torch.cat((-x2, x1), dim=-1)
184
+
185
+
186
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
187
+ gather_indices = position_ids[:, None, :, None] # [bs, 1, seq_len, 1]
188
+ gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3])
189
+ cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
190
+ sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
191
+ q_embed = (q * cos) + (rotate_half(q) * sin)
192
+ k_embed = (k * cos) + (rotate_half(k) * sin)
193
+ return q_embed, k_embed
194
+
195
+
196
+ class LlamaMLP(nn.Module):
197
+ def __init__(
198
+ self,
199
+ hidden_size: int,
200
+ intermediate_size: int,
201
+ hidden_act: str,
202
+ ):
203
+ super().__init__()
204
+ self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
205
+ self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
206
+ self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
207
+ self.act_fn = ACT2FN[hidden_act]
208
+
209
+ def forward(self, x):
210
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
211
+
212
+
213
+ class LlamaAttention(nn.Module):
214
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
215
+
216
+ def __init__(self, config: LlamaConfig):
217
+ super().__init__()
218
+ self.config = config
219
+ self.hidden_size = config.hidden_size
220
+ self.num_heads = config.num_attention_heads
221
+ self.head_dim = self.hidden_size // self.num_heads
222
+ self.max_position_embeddings = config.max_position_embeddings
223
+
224
+ if (self.head_dim * self.num_heads) != self.hidden_size:
225
+ raise ValueError(
226
+ f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}'
227
+ f' and `num_heads`: {self.num_heads}).'
228
+ )
229
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
230
+ self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
231
+ self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
232
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
233
+ self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
234
+
235
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
236
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
237
+
238
+ def forward(
239
+ self,
240
+ hidden_states: torch.Tensor,
241
+ attention_mask: Optional[torch.Tensor] = None,
242
+ position_ids: Optional[torch.LongTensor] = None,
243
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
244
+ output_attentions: bool = False,
245
+ use_cache: bool = False,
246
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
247
+ bsz, q_len, _ = hidden_states.size()
248
+
249
+ query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
250
+ key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
251
+ value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
252
+
253
+ kv_seq_len = key_states.shape[-2]
254
+ if past_key_value is not None:
255
+ kv_seq_len += past_key_value[0].shape[-2]
256
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
257
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
258
+ # [bsz, nh, t, hd]
259
+
260
+ if past_key_value is not None:
261
+ # reuse k, v, self_attention
262
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
263
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
264
+
265
+ past_key_value = (key_states, value_states) if use_cache else None
266
+
267
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
268
+
269
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
270
+ raise ValueError(
271
+ f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is'
272
+ f' {attn_weights.size()}'
273
+ )
274
+
275
+ if attention_mask is not None:
276
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
277
+ raise ValueError(
278
+ f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
279
+ )
280
+ attn_weights = attn_weights + attention_mask
281
+ attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
282
+
283
+ # upcast attention to fp32
284
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
285
+ attn_output = torch.matmul(attn_weights, value_states)
286
+
287
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
288
+ raise ValueError(
289
+ f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
290
+ f' {attn_output.size()}'
291
+ )
292
+
293
+ attn_output = attn_output.transpose(1, 2)
294
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
295
+
296
+ attn_output = self.o_proj(attn_output)
297
+
298
+ if not output_attentions:
299
+ attn_weights = None
300
+
301
+ return attn_output, attn_weights, past_key_value
302
+
303
+
304
+ class LlamaCrossAttention(nn.Module):
305
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
306
+
307
+ def __init__(self, config: LlamaConfig):
308
+ super().__init__()
309
+ self.config = config
310
+ self.hidden_size = config.hidden_size
311
+ self.num_heads = config.num_attention_heads
312
+ self.head_dim = self.hidden_size // self.num_heads
313
+ self.max_position_embeddings = config.max_position_embeddings
314
+ self.vision_hidden_size = 3200
315
+
316
+ if (self.head_dim * self.num_heads) != self.hidden_size:
317
+ raise ValueError(
318
+ f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}'
319
+ f' and `num_heads`: {self.num_heads}).'
320
+ )
321
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
322
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
323
+ self.norm1 = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
324
+
325
+ self.k_proj = nn.Linear(self.vision_hidden_size, self.num_heads * self.head_dim, bias=False)
326
+ self.v_proj = nn.Linear(self.vision_hidden_size, self.num_heads * self.head_dim, bias=False)
327
+ self.norm2 = LlamaRMSNorm(self.vision_hidden_size, eps=config.rms_norm_eps)
328
+
329
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
330
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
331
+
332
+ def forward(
333
+ self,
334
+ hidden_states: torch.Tensor,
335
+ vision_hidden_states: torch.Tensor,
336
+ repeat_time: int = 1,
337
+ attention_mask: Optional[torch.Tensor] = None,
338
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
339
+ output_attentions: bool = False,
340
+ use_cache: bool = False,
341
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
342
+ hidden_states = self.norm1(hidden_states)
343
+
344
+ bsz, q_len, _ = hidden_states.size()
345
+
346
+ query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
347
+
348
+ vision_hidden_states = self.norm2(vision_hidden_states)
349
+
350
+ bs_v, kv_len, _ = vision_hidden_states.size()
351
+
352
+ key_states = self.k_proj(vision_hidden_states).view(
353
+ bs_v, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
354
+ value_states = self.v_proj(vision_hidden_states).view(
355
+ bs_v, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
356
+
357
+ key_states = key_states.repeat(repeat_time, 1, 1, 1)
358
+ value_states = value_states.repeat(repeat_time, 1, 1, 1)
359
+
360
+ kv_seq_len = key_states.shape[-2]
361
+ if past_key_value is not None:
362
+ kv_seq_len += past_key_value[0].shape[-2]
363
+
364
+ if past_key_value is not None:
365
+ # reuse k, v, self_attention
366
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
367
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
368
+
369
+ past_key_value = (key_states, value_states) if use_cache else None
370
+
371
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
372
+
373
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
374
+ raise ValueError(
375
+ f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is'
376
+ f' {attn_weights.size()}'
377
+ )
378
+
379
+ if attention_mask is not None:
380
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
381
+ raise ValueError(
382
+ f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
383
+ )
384
+ attn_weights = attn_weights + attention_mask
385
+ attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
386
+
387
+ # upcast attention to fp32
388
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
389
+ attn_output = torch.matmul(attn_weights, value_states)
390
+
391
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
392
+ raise ValueError(
393
+ f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
394
+ f' {attn_output.size()}'
395
+ )
396
+
397
+ attn_output = attn_output.transpose(1, 2)
398
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
399
+
400
+ attn_output = self.o_proj(attn_output)
401
+
402
+ if not output_attentions:
403
+ attn_weights = None
404
+
405
+ return attn_output, attn_weights, past_key_value
406
+
407
+
408
+ class LlamaDecoderLayer(nn.Module):
409
+ def __init__(self, config: LlamaConfig, use_cross_attn: bool):
410
+ super().__init__()
411
+ self.hidden_size = config.hidden_size
412
+ self.self_attn = LlamaAttention(config=config)
413
+ self.cross_attn = LlamaCrossAttention(config=config) if use_cross_attn else None
414
+ self.mlp = LlamaMLP(
415
+ hidden_size=self.hidden_size,
416
+ intermediate_size=config.intermediate_size,
417
+ hidden_act=config.hidden_act,
418
+ )
419
+ self.num_query_token = 96
420
+ self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
421
+ self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
422
+
423
+ def forward(
424
+ self,
425
+ hidden_states: torch.Tensor,
426
+ vision_hidden_states: torch.Tensor,
427
+ attention_mask: Optional[torch.Tensor] = None,
428
+ position_ids: Optional[torch.LongTensor] = None,
429
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
430
+ output_attentions: Optional[bool] = False,
431
+ use_cache: Optional[bool] = False,
432
+ repeat_time: int = 1,
433
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
434
+ """
435
+ Args:
436
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
437
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
438
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
439
+ output_attentions (`bool`, *optional*):
440
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
441
+ returned tensors for more detail.
442
+ use_cache (`bool`, *optional*):
443
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
444
+ (see `past_key_values`).
445
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
446
+ """
447
+
448
+ residual = hidden_states
449
+
450
+ hidden_states = self.input_layernorm(hidden_states)
451
+
452
+ # Self Attention
453
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
454
+ hidden_states=hidden_states,
455
+ attention_mask=attention_mask,
456
+ position_ids=position_ids,
457
+ past_key_value=past_key_value,
458
+ output_attentions=output_attentions,
459
+ use_cache=use_cache,
460
+ )
461
+ hidden_states = residual + hidden_states
462
+
463
+ # when using the generate function with the KV cache, the sequence length of hidden_states is 1,
464
+ # so cross attention should not be applied
465
+ if self.cross_attn is not None and hidden_states.size(1) >= self.num_query_token \
466
+ and vision_hidden_states is not None:
467
+ query_feats = hidden_states[:, :self.num_query_token, :]
468
+ text_feats = hidden_states[:, self.num_query_token:, :]
469
+ residual = query_feats
470
+ query_feats, _, _ = self.cross_attn(
471
+ hidden_states=query_feats,
472
+ vision_hidden_states=vision_hidden_states,
473
+ attention_mask=None, # not use attention mask in cross attention
474
+ past_key_value=past_key_value,
475
+ output_attentions=output_attentions,
476
+ use_cache=use_cache,
477
+ repeat_time=repeat_time,
478
+ )
479
+ query_feats = residual + query_feats
480
+ hidden_states = torch.cat([query_feats, text_feats], dim=1)
481
+
482
+ # Fully Connected
483
+ residual = hidden_states
484
+ hidden_states = self.post_attention_layernorm(hidden_states)
485
+ hidden_states = self.mlp(hidden_states)
486
+ hidden_states = residual + hidden_states
487
+
488
+ outputs = (hidden_states,)
489
+
490
+ if output_attentions:
491
+ outputs += (self_attn_weights,)
492
+
493
+ if use_cache:
494
+ outputs += (present_key_value,)
495
+
496
+ return outputs
497
+
498
+
499
+ LLAMA_START_DOCSTRING = r"""
500
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
501
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
502
+ etc.)
503
+
504
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
505
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
506
+ and behavior.
507
+
508
+ Parameters:
509
+ config ([`LlamaConfig`]):
510
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
511
+ load the weights associated with the model, only the configuration. Check out the
512
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
513
+ """
514
+
515
+
516
+ @add_start_docstrings(
517
+ 'The bare LLaMA Model outputting raw hidden-states without any specific head on top.',
518
+ LLAMA_START_DOCSTRING,
519
+ )
520
+ class LlamaPreTrainedModel(PreTrainedModel):
521
+ config_class = LlamaConfig
522
+ base_model_prefix = 'model'
523
+ supports_gradient_checkpointing = True
524
+ _no_split_modules = ['LlamaDecoderLayer']
525
+ _keys_to_ignore_on_load_unexpected = [r'decoder\.version']
526
+
527
+ def _init_weights(self, module):
528
+ std = self.config.initializer_range
529
+ if isinstance(module, nn.Linear):
530
+ module.weight.data.normal_(mean=0.0, std=std)
531
+ if module.bias is not None:
532
+ module.bias.data.zero_()
533
+ elif isinstance(module, nn.Embedding):
534
+ module.weight.data.normal_(mean=0.0, std=std)
535
+ if module.padding_idx is not None:
536
+ module.weight.data[module.padding_idx].zero_()
537
+
538
+ def _set_gradient_checkpointing(self, module, value=False):
539
+ if isinstance(module, LlamaModel):
540
+ module.gradient_checkpointing = value
541
+ if isinstance(module, LlamaDecoderLayer):
542
+ module.gradient_checkpointing = value
543
+
544
+
545
+ LLAMA_INPUTS_DOCSTRING = r"""
546
+ Args:
547
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
548
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
549
+ it.
550
+
551
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
552
+ [`PreTrainedTokenizer.__call__`] for details.
553
+
554
+ [What are input IDs?](../glossary#input-ids)
555
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
556
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
557
+
558
+ - 1 for tokens that are **not masked**,
559
+ - 0 for tokens that are **masked**.
560
+
561
+ [What are attention masks?](../glossary#attention-mask)
562
+
563
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
564
+ [`PreTrainedTokenizer.__call__`] for details.
565
+
566
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
567
+ `past_key_values`).
568
+
569
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
570
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
571
+ information on the default strategy.
572
+
573
+ - 1 indicates the head is **not masked**,
574
+ - 0 indicates the head is **masked**.
575
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
576
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
577
+ config.n_positions - 1]`.
578
+
579
+ [What are position IDs?](../glossary#position-ids)
580
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
581
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
582
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
583
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
584
+
585
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
586
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
587
+
588
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
589
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
590
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
591
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
592
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
593
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
594
+ model's internal embedding lookup matrix.
595
+ use_cache (`bool`, *optional*):
596
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
597
+ `past_key_values`).
598
+ output_attentions (`bool`, *optional*):
599
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
600
+ tensors for more detail.
601
+ output_hidden_states (`bool`, *optional*):
602
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
603
+ more detail.
604
+ return_dict (`bool`, *optional*):
605
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
606
+ """
607
+
608
+
609
+ @add_start_docstrings(
610
+ 'The bare LLaMA Model outputting raw hidden-states without any specific head on top.',
611
+ LLAMA_START_DOCSTRING,
612
+ )
613
+ class LlamaModel(LlamaPreTrainedModel):
614
+ """
615
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
616
+
617
+ Args:
618
+ config: LlamaConfig
619
+ """
620
+
621
+ def __init__(self, config: LlamaConfig):
622
+ super().__init__(config)
623
+ self.padding_idx = config.pad_token_id
624
+ self.vocab_size = config.vocab_size
625
+ self.cross_attention_frequency = config.cross_attention_frequency
626
+ self.num_query_token = config.num_query_token
627
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
628
+ use_cross_attn = [idx % self.cross_attention_frequency == 0 for idx in range(config.num_hidden_layers)]
629
+ self.layers = nn.ModuleList(
630
+ [LlamaDecoderLayer(config, use_cross_attn[idx]) for idx in range(config.num_hidden_layers)])
631
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
632
+ self.gradient_checkpointing = False
633
+ # Initialize weights and apply final processing
634
+ # self.post_init()
635
+
636
+ def get_input_embeddings(self):
637
+ return self.embed_tokens
638
+
639
+ def set_input_embeddings(self, value):
640
+ self.embed_tokens = value
641
+
642
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
643
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
644
+ # create causal mask
645
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
646
+ combined_attention_mask = None
647
+ if input_shape[-1] > 1:
648
+ combined_attention_mask = _make_causal_mask(
649
+ input_shape,
650
+ inputs_embeds.dtype,
651
+ device=inputs_embeds.device,
652
+ past_key_values_length=past_key_values_length,
653
+ )
654
+
655
+ if attention_mask is not None:
656
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
657
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
658
+ inputs_embeds.device
659
+ )
660
+ combined_attention_mask = (
661
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
662
+ )
663
+
664
+ return combined_attention_mask
665
+
666
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
667
+ def forward(
668
+ self,
669
+ input_ids: torch.LongTensor = None,
670
+ attention_mask: Optional[torch.Tensor] = None,
671
+ position_ids: Optional[torch.LongTensor] = None,
672
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
673
+ inputs_embeds: Optional[torch.FloatTensor] = None,
674
+ vision_hidden_states: Optional[torch.FloatTensor] = None,
675
+ repeat_time: Optional[int] = 1,
676
+ use_cache: Optional[bool] = None,
677
+ output_attentions: Optional[bool] = None,
678
+ output_hidden_states: Optional[bool] = None,
679
+ use_zero_attention_mask: Optional[bool] = None,
680
+ return_dict: Optional[bool] = None,
681
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
682
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
683
+ output_hidden_states = (
684
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
685
+ )
686
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
687
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
688
+
689
+ # retrieve input_ids and inputs_embeds
690
+ if input_ids is not None and inputs_embeds is not None:
691
+ raise ValueError('You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time')
692
+ elif input_ids is not None:
693
+ batch_size, seq_length = input_ids.shape
694
+ elif inputs_embeds is not None:
695
+ batch_size, seq_length, _ = inputs_embeds.shape
696
+ else:
697
+ raise ValueError('You have to specify either decoder_input_ids or decoder_inputs_embeds')
698
+ seq_length_with_past = seq_length
699
+ past_key_values_length = 0
700
+
701
+ if past_key_values is not None:
702
+ past_key_values_length = past_key_values[0][0].shape[2]
703
+ seq_length_with_past = seq_length_with_past + past_key_values_length
704
+
705
+ if position_ids is None:
706
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
707
+ position_ids = torch.arange(
708
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
709
+ )
710
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
711
+ else:
712
+ position_ids = position_ids.view(-1, seq_length).long()
713
+
714
+ if inputs_embeds is None:
715
+ inputs_embeds = self.embed_tokens(input_ids)
716
+ # embed positions
717
+ if attention_mask is None:
718
+ attention_mask = torch.ones(
719
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
720
+ )
721
+ attention_mask = self._prepare_decoder_attention_mask(
722
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
723
+ )
724
+ if use_zero_attention_mask:
725
+ attention_mask[:, :, :self.num_query_token, :self.num_query_token] = 0
726
+
727
+ hidden_states = inputs_embeds
728
+
729
+ if self.gradient_checkpointing and self.training:
730
+ if use_cache:
731
+ logger.warning_once(
732
+ '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
733
+ )
734
+ use_cache = False
735
+
736
+ # decoder layers
737
+ all_hidden_states = () if output_hidden_states else None
738
+ all_self_attns = () if output_attentions else None
739
+ next_decoder_cache = () if use_cache else None
740
+
741
+ for idx, decoder_layer in enumerate(self.layers):
742
+ if output_hidden_states:
743
+ all_hidden_states += (hidden_states,)
744
+
745
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
746
+
747
+ layer_outputs = decoder_layer(
748
+ hidden_states,
749
+ vision_hidden_states,
750
+ attention_mask=attention_mask,
751
+ position_ids=position_ids,
752
+ past_key_value=past_key_value,
753
+ output_attentions=output_attentions,
754
+ use_cache=use_cache,
755
+ repeat_time=repeat_time,
756
+ )
757
+
758
+ hidden_states = layer_outputs[0]
759
+
760
+ if use_cache:
761
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
762
+
763
+ if output_attentions:
764
+ all_self_attns += (layer_outputs[1],)
765
+
766
+ hidden_states = self.norm(hidden_states)
767
+
768
+ # add hidden states from the last decoder layer
769
+ if output_hidden_states:
770
+ all_hidden_states += (hidden_states,)
771
+
772
+ next_cache = next_decoder_cache if use_cache else None
773
+ if not return_dict:
774
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
775
+ return BaseModelOutputWithPast(
776
+ last_hidden_state=hidden_states,
777
+ past_key_values=next_cache,
778
+ hidden_states=all_hidden_states,
779
+ attentions=all_self_attns,
780
+ )
781
+
782
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
783
+ def forward_train(
784
+ self,
785
+ input_ids: torch.LongTensor = None,
786
+ attention_mask: Optional[torch.Tensor] = None,
787
+ position_ids: Optional[torch.LongTensor] = None,
788
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
789
+ inputs_embeds: Optional[torch.FloatTensor] = None,
790
+ vision_hidden_states: Optional[torch.FloatTensor] = None,
791
+ repeat_time: Optional[int] = 1,
792
+ use_cache: Optional[bool] = None,
793
+ output_attentions: Optional[bool] = None,
794
+ output_hidden_states: Optional[bool] = None,
795
+ return_dict: Optional[bool] = None,
796
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
797
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
798
+ output_hidden_states = (
799
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
800
+ )
801
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
802
+
803
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
804
+
805
+ # retrieve input_ids and inputs_embeds
806
+ if input_ids is not None and inputs_embeds is not None:
807
+ raise ValueError('You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time')
808
+ elif input_ids is not None:
809
+ batch_size, seq_length = input_ids.shape
810
+ elif inputs_embeds is not None:
811
+ batch_size, seq_length, _ = inputs_embeds.shape
812
+ else:
813
+ raise ValueError('You have to specify either decoder_input_ids or decoder_inputs_embeds')
814
+
815
+ seq_length_with_past = seq_length
816
+ past_key_values_length = 0
817
+
818
+ if past_key_values is not None:
819
+ past_key_values_length = past_key_values[0][0].shape[2]
820
+ seq_length_with_past = seq_length_with_past + past_key_values_length
821
+
822
+ if position_ids is None:
823
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
824
+ position_ids = torch.arange(
825
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
826
+ )
827
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
828
+ else:
829
+ position_ids = position_ids.view(-1, seq_length).long()
830
+
831
+ if inputs_embeds is None:
832
+ inputs_embeds = self.embed_tokens(input_ids)
833
+ # embed positions
834
+ # if attention_mask is None:
835
+ # attention_mask = torch.ones(
836
+ # (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
837
+ # )
838
+ # attention_mask = self._prepare_decoder_attention_mask(
839
+ # attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
840
+ # )
841
+ hidden_states = inputs_embeds
842
+
843
+ if self.gradient_checkpointing and self.training:
844
+ if use_cache:
845
+ logger.warning_once(
846
+ '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
847
+ )
848
+ use_cache = False
849
+
850
+ # decoder layers
851
+ all_hidden_states = () if output_hidden_states else None
852
+ all_self_attns = () if output_attentions else None
853
+ next_decoder_cache = () if use_cache else None
854
+
855
+ for idx, decoder_layer in enumerate(self.layers):
856
+ if output_hidden_states:
857
+ all_hidden_states += (hidden_states,)
858
+
859
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
860
+
861
+ if self.gradient_checkpointing and self.training:
862
+
863
+ def create_custom_forward(module):
864
+ def custom_forward(*inputs):
865
+ # None for past_key_value
866
+ return module(*inputs, output_attentions, None, repeat_time)
867
+
868
+ return custom_forward
869
+
870
+ layer_outputs = torch.utils.checkpoint.checkpoint(
871
+ create_custom_forward(decoder_layer),
872
+ hidden_states,
873
+ vision_hidden_states,
874
+ attention_mask,
875
+ position_ids,
876
+ None,
877
+ )
878
+ else:
879
+ layer_outputs = decoder_layer(
880
+ hidden_states,
881
+ vision_hidden_states,
882
+ attention_mask=attention_mask,
883
+ position_ids=position_ids,
884
+ past_key_value=past_key_value,
885
+ output_attentions=output_attentions,
886
+ use_cache=use_cache,
887
+ repeat_time=repeat_time,
888
+ )
889
+
890
+ hidden_states = layer_outputs[0]
891
+
892
+ if use_cache:
893
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
894
+
895
+ if output_attentions:
896
+ all_self_attns += (layer_outputs[1],)
897
+
898
+ hidden_states = self.norm(hidden_states)
899
+
900
+ # add hidden states from the last decoder layer
901
+ if output_hidden_states:
902
+ all_hidden_states += (hidden_states,)
903
+
904
+ next_cache = next_decoder_cache if use_cache else None
905
+ if not return_dict:
906
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
907
+ return BaseModelOutputWithPast(
908
+ last_hidden_state=hidden_states,
909
+ past_key_values=next_cache,
910
+ hidden_states=all_hidden_states,
911
+ attentions=all_self_attns,
912
+ )
913
+
914
+
915
+ class LlamaForCausalLM(LlamaPreTrainedModel):
916
+ def __init__(self, config):
917
+ super().__init__(config)
918
+ self.model = LlamaModel(config)
919
+
920
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
921
+
922
+ # Initialize weights and apply final processing
923
+ # self.post_init()
924
+
925
+ def get_input_embeddings(self):
926
+ return self.model.embed_tokens
927
+
928
+ def set_input_embeddings(self, value):
929
+ self.model.embed_tokens = value
930
+
931
+ def get_output_embeddings(self):
932
+ return self.lm_head
933
+
934
+ def set_output_embeddings(self, new_embeddings):
935
+ self.lm_head = new_embeddings
936
+
937
+ def set_decoder(self, decoder):
938
+ self.model = decoder
939
+
940
+ def get_decoder(self):
941
+ return self.model
942
+
943
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
944
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
945
+ def forward(
946
+ self,
947
+ input_ids: torch.LongTensor = None,
948
+ attention_mask: Optional[torch.Tensor] = None,
949
+ position_ids: Optional[torch.LongTensor] = None,
950
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
951
+ inputs_embeds: Optional[torch.FloatTensor] = None,
952
+ vision_hidden_states: Optional[torch.FloatTensor] = None,
953
+ labels: Optional[torch.LongTensor] = None,
954
+ use_cache: Optional[bool] = None,
955
+ output_attentions: Optional[bool] = None,
956
+ output_hidden_states: Optional[bool] = None,
957
+ use_zero_attention_mask: Optional[bool] = None,
958
+ return_dict: Optional[bool] = None,
959
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
960
+ r"""
961
+ Args:
962
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
963
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
964
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
965
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
966
+
967
+ Returns:
968
+
969
+ Example:
970
+
971
+ ```python
972
+ >>> from transformers import AutoTokenizer, LlamaForCausalLM
973
+
974
+ >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
975
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
976
+
977
+ >>> prompt = "Hey, are you consciours? Can you talk to me?"
978
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
979
+
980
+ >>> # Generate
981
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
982
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
983
+ "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
984
+ ```"""
985
+
986
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
987
+ output_hidden_states = (
988
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
989
+ )
990
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
991
+
992
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
993
+ outputs = self.model(
994
+ input_ids=input_ids,
995
+ attention_mask=attention_mask,
996
+ position_ids=position_ids,
997
+ past_key_values=past_key_values,
998
+ inputs_embeds=inputs_embeds,
999
+ vision_hidden_states=vision_hidden_states,
1000
+ use_cache=use_cache,
1001
+ output_attentions=output_attentions,
1002
+ output_hidden_states=output_hidden_states,
1003
+ return_dict=return_dict,
1004
+ use_zero_attention_mask=use_zero_attention_mask,
1005
+ )
1006
+
1007
+ hidden_states = outputs[0]
1008
+ logits = self.lm_head(hidden_states)
1009
+
1010
+ loss = None
1011
+ if labels is not None:
1012
+ # Shift so that tokens < n predict n
1013
+ shift_logits = logits[..., :-1, :].contiguous()
1014
+ shift_labels = labels[..., 1:].contiguous()
1015
+ # Flatten the tokens
1016
+ loss_fct = CrossEntropyLoss()
1017
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1018
+ shift_labels = shift_labels.view(-1)
1019
+ # Enable model parallelism
1020
+ shift_labels = shift_labels.to(shift_logits.device)
1021
+ loss = loss_fct(shift_logits, shift_labels)
1022
+
1023
+ if not return_dict:
1024
+ output = (logits,) + outputs[1:]
1025
+ return (loss,) + output if loss is not None else output
1026
+
1027
+ return CausalLMOutputWithPast(
1028
+ loss=loss,
1029
+ logits=logits,
1030
+ past_key_values=outputs.past_key_values,
1031
+ hidden_states=outputs.hidden_states,
1032
+ attentions=outputs.attentions,
1033
+ )
1034
+
1035
+ def prepare_inputs_for_generation(
1036
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None,
1037
+ vision_hidden_states=None, use_zero_attention_mask=None, **kwargs
1038
+ ):
1039
+ if past_key_values:
1040
+ input_ids = input_ids[:, -1:]
1041
+
1042
+ position_ids = kwargs.get('position_ids', None)
1043
+ if attention_mask is not None and position_ids is None:
1044
+ # create position_ids on the fly for batch generation
1045
+ position_ids = attention_mask.long().cumsum(-1) - 1
1046
+ position_ids.masked_fill_(attention_mask == 0, 1)
1047
+ if past_key_values:
1048
+ position_ids = position_ids[:, -1].unsqueeze(-1)
1049
+
1050
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1051
+ if inputs_embeds is not None and past_key_values is None:
1052
+ model_inputs = {'inputs_embeds': inputs_embeds}
1053
+ else:
1054
+ model_inputs = {'input_ids': input_ids}
1055
+
1056
+ model_inputs.update(
1057
+ {
1058
+ 'position_ids': position_ids,
1059
+ 'past_key_values': past_key_values,
1060
+ 'use_cache': kwargs.get('use_cache'),
1061
+ 'attention_mask': attention_mask,
1062
+ 'vision_hidden_states': vision_hidden_states,
1063
+ 'use_zero_attention_mask': use_zero_attention_mask,
1064
+ }
1065
+ )
1066
+ return model_inputs
1067
+
1068
+ @staticmethod
1069
+ def _reorder_cache(past_key_values, beam_idx):
1070
+ reordered_past = ()
1071
+ for layer_past in past_key_values:
1072
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
1073
+ return reordered_past
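
To make the cross-attention wiring above concrete, here is a minimal sketch of driving the QLLaMA decoder with query tokens and vision features. It is illustrative only: the toy config values, random tensors, and import path are assumptions; the two values that do follow the code above are the 96 query slots hard-coded in `LlamaDecoderLayer` and the 3200-dim vision features expected by `LlamaCrossAttention`.

```python
# Minimal sketch with a toy config; assumes modeling_qllama.py is importable.
import torch
from transformers import LlamaConfig
from modeling_qllama import LlamaForCausalLM  # import path is an assumption

config = LlamaConfig(
    vocab_size=1000, hidden_size=256, intermediate_size=512,
    num_hidden_layers=4, num_attention_heads=8,
    max_position_embeddings=256, pad_token_id=0,
    cross_attention_frequency=2,  # a cross-attention block every 2 decoder layers
    num_query_token=96,           # matches the 96 query slots hard-coded above
)
model = LlamaForCausalLM(config).eval()

batch, num_query, text_len = 2, 96, 16
input_ids = torch.randint(0, config.vocab_size, (batch, num_query + text_len))
# Vision features must have a last dimension of 3200 (hard-coded in LlamaCrossAttention).
vision_hidden_states = torch.randn(batch, 257, 3200)

with torch.no_grad():
    out = model(input_ids=input_ids, vision_hidden_states=vision_hidden_states)
print(out.logits.shape)  # torch.Size([2, 112, 1000])
```

Because the decoder layer only applies cross attention when the sequence is at least `num_query_token` long, the query tokens must occupy the first 96 positions of the input, with any text tokens appended after them.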
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "crop_size": 224,
3
+ "do_center_crop": true,
4
+ "do_normalize": true,
5
+ "do_resize": true,
6
+ "feature_extractor_type": "CLIPFeatureExtractor",
7
+ "image_mean": [
8
+ 0.485,
9
+ 0.456,
10
+ 0.406
11
+ ],
12
+ "image_std": [
13
+ 0.229,
14
+ 0.224,
15
+ 0.225
16
+ ],
17
+ "resample": 3,
18
+ "size": 224
19
+ }
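
This config reproduces the CLIP-style preprocessing used at evaluation time: bicubic resize (resample 3) to 224, center crop to 224, and ImageNet mean/std normalization. A minimal sketch of loading it with the standard image processor (the local path is a placeholder):

```python
# Loads preprocessor_config.json above and applies the 224x224 CLIP-style pipeline.
from PIL import Image
from transformers import CLIPImageProcessor

ckpt_dir = 'path/to/this/checkpoint'  # placeholder
processor = CLIPImageProcessor.from_pretrained(ckpt_dir)

image = Image.new('RGB', (640, 480))  # stand-in for a real image
pixel_values = processor(images=image, return_tensors='pt').pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```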
pytorch_model-00001-of-00003.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a49179b5853f94174604ee23142631020f08ce464e45a29b6238a624addb407
3
+ size 9928107961
pytorch_model-00002-of-00003.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e83b1754a4c3449848ecb7cb00ffddd80e34f2fd5cf8c4a7edea473ea9a7aac4
3
+ size 9980526335
pytorch_model-00003-of-00003.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d03c3fec7174e3543f0be009f8ab4765fcd7f1837bf74ec16976c40297aa2e63
3
+ size 7761682141
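
The three git-lfs pointers above, together with the weight index that follows, form one sharded PyTorch checkpoint of roughly 27.7 GB; `from_pretrained` resolves the shards through the index automatically. For manual inspection, a short sketch (the local path is a placeholder) of looking up which shard holds a parameter and loading just that shard:

```python
# Look up a parameter's shard via pytorch_model.bin.index.json, then load that shard only.
import json
import torch

ckpt_dir = 'path/to/this/checkpoint'  # placeholder
with open(f'{ckpt_dir}/pytorch_model.bin.index.json') as f:
    index = json.load(f)

shard_file = index['weight_map']['logit_scale']  # -> 'pytorch_model-00001-of-00003.bin'
state_dict = torch.load(f'{ckpt_dir}/{shard_file}', map_location='cpu')
print(state_dict['logit_scale'])
```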
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,1055 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 27669951494
4
+ },
5
+ "weight_map": {
6
+ "clip_projector.cross_attn.k.weight": "pytorch_model-00003-of-00003.bin",
7
+ "clip_projector.cross_attn.k_bias": "pytorch_model-00003-of-00003.bin",
8
+ "clip_projector.cross_attn.proj.bias": "pytorch_model-00003-of-00003.bin",
9
+ "clip_projector.cross_attn.proj.weight": "pytorch_model-00003-of-00003.bin",
10
+ "clip_projector.cross_attn.q.weight": "pytorch_model-00003-of-00003.bin",
11
+ "clip_projector.cross_attn.q_bias": "pytorch_model-00003-of-00003.bin",
12
+ "clip_projector.cross_attn.v.weight": "pytorch_model-00003-of-00003.bin",
13
+ "clip_projector.cross_attn.v_bias": "pytorch_model-00003-of-00003.bin",
14
+ "clip_projector.norm1_k.bias": "pytorch_model-00003-of-00003.bin",
15
+ "clip_projector.norm1_k.weight": "pytorch_model-00003-of-00003.bin",
16
+ "clip_projector.norm1_q.bias": "pytorch_model-00003-of-00003.bin",
17
+ "clip_projector.norm1_q.weight": "pytorch_model-00003-of-00003.bin",
18
+ "clip_projector.norm1_v.bias": "pytorch_model-00003-of-00003.bin",
19
+ "clip_projector.norm1_v.weight": "pytorch_model-00003-of-00003.bin",
20
+ "clip_projector2.cross_attn.k.weight": "pytorch_model-00003-of-00003.bin",
21
+ "clip_projector2.cross_attn.k_bias": "pytorch_model-00003-of-00003.bin",
22
+ "clip_projector2.cross_attn.proj.bias": "pytorch_model-00003-of-00003.bin",
23
+ "clip_projector2.cross_attn.proj.weight": "pytorch_model-00003-of-00003.bin",
24
+ "clip_projector2.cross_attn.q.weight": "pytorch_model-00003-of-00003.bin",
25
+ "clip_projector2.cross_attn.q_bias": "pytorch_model-00003-of-00003.bin",
26
+ "clip_projector2.cross_attn.v.weight": "pytorch_model-00003-of-00003.bin",
27
+ "clip_projector2.cross_attn.v_bias": "pytorch_model-00003-of-00003.bin",
28
+ "clip_projector2.norm1_k.bias": "pytorch_model-00003-of-00003.bin",
29
+ "clip_projector2.norm1_k.weight": "pytorch_model-00003-of-00003.bin",
30
+ "clip_projector2.norm1_q.bias": "pytorch_model-00003-of-00003.bin",
31
+ "clip_projector2.norm1_q.weight": "pytorch_model-00003-of-00003.bin",
32
+ "clip_projector2.norm1_v.bias": "pytorch_model-00003-of-00003.bin",
33
+ "clip_projector2.norm1_v.weight": "pytorch_model-00003-of-00003.bin",
34
+ "itm_head.bias": "pytorch_model-00003-of-00003.bin",
35
+ "itm_head.weight": "pytorch_model-00003-of-00003.bin",
36
+ "logit_scale": "pytorch_model-00001-of-00003.bin",
37
+ "qllama.lm_head.weight": "pytorch_model-00003-of-00003.bin",
38
+ "qllama.model.embed_tokens.weight": "pytorch_model-00002-of-00003.bin",
39
+ "qllama.model.layers.0.cross_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
40
+ "qllama.model.layers.0.cross_attn.norm1.weight": "pytorch_model-00002-of-00003.bin",
41
+ "qllama.model.layers.0.cross_attn.norm2.weight": "pytorch_model-00002-of-00003.bin",
42
+ "qllama.model.layers.0.cross_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
43
+ "qllama.model.layers.0.cross_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
44
+ "qllama.model.layers.0.cross_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
45
+ "qllama.model.layers.0.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
46
+ "qllama.model.layers.0.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
47
+ "qllama.model.layers.0.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
48
+ "qllama.model.layers.0.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
49
+ "qllama.model.layers.0.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
50
+ "qllama.model.layers.0.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
51
+ "qllama.model.layers.0.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
52
+ "qllama.model.layers.0.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
53
+ "qllama.model.layers.0.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
54
+ "qllama.model.layers.1.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
55
+ "qllama.model.layers.1.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
56
+ "qllama.model.layers.1.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
57
+ "qllama.model.layers.1.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
58
+ "qllama.model.layers.1.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
59
+ "qllama.model.layers.1.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
60
+ "qllama.model.layers.1.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
61
+ "qllama.model.layers.1.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
62
+ "qllama.model.layers.1.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
63
+ "qllama.model.layers.10.cross_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
64
+ "qllama.model.layers.10.cross_attn.norm1.weight": "pytorch_model-00002-of-00003.bin",
65
+ "qllama.model.layers.10.cross_attn.norm2.weight": "pytorch_model-00002-of-00003.bin",
66
+ "qllama.model.layers.10.cross_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
67
+ "qllama.model.layers.10.cross_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
68
+ "qllama.model.layers.10.cross_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
69
+ "qllama.model.layers.10.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
70
+ "qllama.model.layers.10.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
71
+ "qllama.model.layers.10.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
72
+ "qllama.model.layers.10.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
73
+ "qllama.model.layers.10.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
74
+ "qllama.model.layers.10.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
75
+ "qllama.model.layers.10.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
76
+ "qllama.model.layers.10.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
77
+ "qllama.model.layers.10.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
78
+ "qllama.model.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
79
+ "qllama.model.layers.11.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
80
+ "qllama.model.layers.11.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
81
+ "qllama.model.layers.11.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
82
+ "qllama.model.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
83
+ "qllama.model.layers.11.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
84
+ "qllama.model.layers.11.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
85
+ "qllama.model.layers.11.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
86
+ "qllama.model.layers.11.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
87
+ "qllama.model.layers.12.cross_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
88
+ "qllama.model.layers.12.cross_attn.norm1.weight": "pytorch_model-00002-of-00003.bin",
89
+ "qllama.model.layers.12.cross_attn.norm2.weight": "pytorch_model-00002-of-00003.bin",
90
+ "qllama.model.layers.12.cross_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
91
+ "qllama.model.layers.12.cross_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
92
+ "qllama.model.layers.12.cross_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
93
+ "qllama.model.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
94
+ "qllama.model.layers.12.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
95
+ "qllama.model.layers.12.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
96
+ "qllama.model.layers.12.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
97
+ "qllama.model.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
98
+ "qllama.model.layers.12.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
99
+ "qllama.model.layers.12.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
100
+ "qllama.model.layers.12.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
101
+ "qllama.model.layers.12.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
102
+ "qllama.model.layers.13.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
103
+ "qllama.model.layers.13.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
104
+ "qllama.model.layers.13.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
105
+ "qllama.model.layers.13.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
106
+ "qllama.model.layers.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
107
+ "qllama.model.layers.13.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
108
+ "qllama.model.layers.13.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
109
+ "qllama.model.layers.13.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
110
+ "qllama.model.layers.13.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
111
+ "qllama.model.layers.14.cross_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
112
+ "qllama.model.layers.14.cross_attn.norm1.weight": "pytorch_model-00002-of-00003.bin",
113
+ "qllama.model.layers.14.cross_attn.norm2.weight": "pytorch_model-00002-of-00003.bin",
114
+ "qllama.model.layers.14.cross_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
115
+ "qllama.model.layers.14.cross_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
116
+ "qllama.model.layers.14.cross_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
117
+ "qllama.model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
118
+ "qllama.model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
119
+ "qllama.model.layers.14.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
120
+ "qllama.model.layers.14.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
121
+ "qllama.model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
122
+ "qllama.model.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
123
+ "qllama.model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
124
+ "qllama.model.layers.14.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
125
+ "qllama.model.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
126
+ "qllama.model.layers.15.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
127
+ "qllama.model.layers.15.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
128
+ "qllama.model.layers.15.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
129
+ "qllama.model.layers.15.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
130
+ "qllama.model.layers.15.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
131
+ "qllama.model.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
132
+ "qllama.model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
133
+ "qllama.model.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
134
+ "qllama.model.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
135
+ "qllama.model.layers.16.cross_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
136
+ "qllama.model.layers.16.cross_attn.norm1.weight": "pytorch_model-00002-of-00003.bin",
137
+ "qllama.model.layers.16.cross_attn.norm2.weight": "pytorch_model-00002-of-00003.bin",
138
+ "qllama.model.layers.16.cross_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
139
+ "qllama.model.layers.16.cross_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
140
+ "qllama.model.layers.16.cross_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
141
+ "qllama.model.layers.16.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
142
+ "qllama.model.layers.16.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
143
+ "qllama.model.layers.16.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
144
+ "qllama.model.layers.16.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
145
+ "qllama.model.layers.16.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
146
+ "qllama.model.layers.16.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
147
+ "qllama.model.layers.16.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
148
+ "qllama.model.layers.16.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
149
+ "qllama.model.layers.16.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
150
+ "qllama.model.layers.17.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
151
+ "qllama.model.layers.17.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
152
+ "qllama.model.layers.17.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
153
+ "qllama.model.layers.17.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
154
+ "qllama.model.layers.17.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
155
+ "qllama.model.layers.17.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
156
+ "qllama.model.layers.17.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
157
+ "qllama.model.layers.17.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
158
+ "qllama.model.layers.17.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
159
+ "qllama.model.layers.18.cross_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
160
+ "qllama.model.layers.18.cross_attn.norm1.weight": "pytorch_model-00003-of-00003.bin",
161
+ "qllama.model.layers.18.cross_attn.norm2.weight": "pytorch_model-00003-of-00003.bin",
162
+ "qllama.model.layers.18.cross_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
163
+ "qllama.model.layers.18.cross_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
164
+ "qllama.model.layers.18.cross_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
165
+ "qllama.model.layers.18.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
166
+ "qllama.model.layers.18.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
167
+ "qllama.model.layers.18.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
168
+ "qllama.model.layers.18.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
169
+ "qllama.model.layers.18.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
170
+ "qllama.model.layers.18.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
171
+ "qllama.model.layers.18.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
172
+ "qllama.model.layers.18.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
173
+ "qllama.model.layers.18.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
174
+ "qllama.model.layers.19.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
175
+ "qllama.model.layers.19.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
176
+ "qllama.model.layers.19.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
177
+ "qllama.model.layers.19.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
178
+ "qllama.model.layers.19.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
179
+ "qllama.model.layers.19.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
180
+ "qllama.model.layers.19.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
181
+ "qllama.model.layers.19.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
182
+ "qllama.model.layers.19.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
183
+ "qllama.model.layers.2.cross_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
184
+ "qllama.model.layers.2.cross_attn.norm1.weight": "pytorch_model-00002-of-00003.bin",
185
+ "qllama.model.layers.2.cross_attn.norm2.weight": "pytorch_model-00002-of-00003.bin",
186
+ "qllama.model.layers.2.cross_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
187
+ "qllama.model.layers.2.cross_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
188
+ "qllama.model.layers.2.cross_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
189
+ "qllama.model.layers.2.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
190
+ "qllama.model.layers.2.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
191
+ "qllama.model.layers.2.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
192
+ "qllama.model.layers.2.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
193
+ "qllama.model.layers.2.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
194
+ "qllama.model.layers.2.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
195
+ "qllama.model.layers.2.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
196
+ "qllama.model.layers.2.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
197
+ "qllama.model.layers.2.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
198
+ "qllama.model.layers.20.cross_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
199
+ "qllama.model.layers.20.cross_attn.norm1.weight": "pytorch_model-00003-of-00003.bin",
200
+ "qllama.model.layers.20.cross_attn.norm2.weight": "pytorch_model-00003-of-00003.bin",
201
+ "qllama.model.layers.20.cross_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
202
+ "qllama.model.layers.20.cross_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
203
+ "qllama.model.layers.20.cross_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
204
+ "qllama.model.layers.20.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
205
+ "qllama.model.layers.20.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
206
+ "qllama.model.layers.20.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
207
+ "qllama.model.layers.20.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
208
+ "qllama.model.layers.20.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
209
+ "qllama.model.layers.20.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
210
+ "qllama.model.layers.20.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
211
+ "qllama.model.layers.20.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
212
+ "qllama.model.layers.20.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
213
+ "qllama.model.layers.21.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
214
+ "qllama.model.layers.21.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
215
+ "qllama.model.layers.21.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
216
+ "qllama.model.layers.21.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
217
+ "qllama.model.layers.21.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
218
+ "qllama.model.layers.21.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
219
+ "qllama.model.layers.21.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
220
+ "qllama.model.layers.21.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
221
+ "qllama.model.layers.21.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
222
+ "qllama.model.layers.22.cross_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
223
+ "qllama.model.layers.22.cross_attn.norm1.weight": "pytorch_model-00003-of-00003.bin",
224
+ "qllama.model.layers.22.cross_attn.norm2.weight": "pytorch_model-00003-of-00003.bin",
225
+ "qllama.model.layers.22.cross_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
226
+ "qllama.model.layers.22.cross_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
227
+ "qllama.model.layers.22.cross_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
228
+ "qllama.model.layers.22.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
229
+ "qllama.model.layers.22.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
230
+ "qllama.model.layers.22.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
231
+ "qllama.model.layers.22.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
232
+ "qllama.model.layers.22.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
233
+ "qllama.model.layers.22.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
234
+ "qllama.model.layers.22.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
235
+ "qllama.model.layers.22.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
236
+ "qllama.model.layers.22.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
237
+ "qllama.model.layers.23.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
238
+ "qllama.model.layers.23.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
239
+ "qllama.model.layers.23.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
240
+ "qllama.model.layers.23.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
241
+ "qllama.model.layers.23.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
242
+ "qllama.model.layers.23.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
243
+ "qllama.model.layers.23.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
244
+ "qllama.model.layers.23.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
245
+ "qllama.model.layers.23.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
246
+ "qllama.model.layers.24.cross_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
247
+ "qllama.model.layers.24.cross_attn.norm1.weight": "pytorch_model-00003-of-00003.bin",
248
+ "qllama.model.layers.24.cross_attn.norm2.weight": "pytorch_model-00003-of-00003.bin",
249
+ "qllama.model.layers.24.cross_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
250
+ "qllama.model.layers.24.cross_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
251
+ "qllama.model.layers.24.cross_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
252
+ "qllama.model.layers.24.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
253
+ "qllama.model.layers.24.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
254
+ "qllama.model.layers.24.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
255
+ "qllama.model.layers.24.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
256
+ "qllama.model.layers.24.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
257
+ "qllama.model.layers.24.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
258
+ "qllama.model.layers.24.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
259
+ "qllama.model.layers.24.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
260
+ "qllama.model.layers.24.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
261
+ "qllama.model.layers.25.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
262
+ "qllama.model.layers.25.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
263
+ "qllama.model.layers.25.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
264
+ "qllama.model.layers.25.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
265
+ "qllama.model.layers.25.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
266
+ "qllama.model.layers.25.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
267
+ "qllama.model.layers.25.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
268
+ "qllama.model.layers.25.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
269
+ "qllama.model.layers.25.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
270
+ "qllama.model.layers.26.cross_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
271
+ "qllama.model.layers.26.cross_attn.norm1.weight": "pytorch_model-00003-of-00003.bin",
272
+ "qllama.model.layers.26.cross_attn.norm2.weight": "pytorch_model-00003-of-00003.bin",
273
+ "qllama.model.layers.26.cross_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
274
+ "qllama.model.layers.26.cross_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
275
+ "qllama.model.layers.26.cross_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
276
+ "qllama.model.layers.26.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
277
+ "qllama.model.layers.26.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
278
+ "qllama.model.layers.26.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
279
+ "qllama.model.layers.26.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
280
+ "qllama.model.layers.26.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
281
+ "qllama.model.layers.26.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
282
+ "qllama.model.layers.26.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
283
+ "qllama.model.layers.26.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
284
+ "qllama.model.layers.26.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
285
+ "qllama.model.layers.27.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
286
+ "qllama.model.layers.27.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
287
+ "qllama.model.layers.27.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
288
+ "qllama.model.layers.27.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
289
+ "qllama.model.layers.27.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
290
+ "qllama.model.layers.27.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
291
+ "qllama.model.layers.27.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
292
+ "qllama.model.layers.27.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
293
+ "qllama.model.layers.27.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
294
+ "qllama.model.layers.28.cross_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
295
+ "qllama.model.layers.28.cross_attn.norm1.weight": "pytorch_model-00003-of-00003.bin",
296
+ "qllama.model.layers.28.cross_attn.norm2.weight": "pytorch_model-00003-of-00003.bin",
297
+ "qllama.model.layers.28.cross_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
298
+ "qllama.model.layers.28.cross_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
299
+ "qllama.model.layers.28.cross_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
300
+ "qllama.model.layers.28.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
301
+ "qllama.model.layers.28.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
302
+ "qllama.model.layers.28.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
303
+ "qllama.model.layers.28.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
304
+ "qllama.model.layers.28.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
305
+ "qllama.model.layers.28.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
306
+ "qllama.model.layers.28.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
307
+ "qllama.model.layers.28.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
308
+ "qllama.model.layers.28.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
309
+ "qllama.model.layers.29.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
310
+ "qllama.model.layers.29.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
311
+ "qllama.model.layers.29.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
312
+ "qllama.model.layers.29.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
313
+ "qllama.model.layers.29.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
314
+ "qllama.model.layers.29.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
315
+ "qllama.model.layers.29.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
316
+ "qllama.model.layers.29.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
317
+ "qllama.model.layers.29.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
318
+ "qllama.model.layers.3.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
319
+ "qllama.model.layers.3.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
320
+ "qllama.model.layers.3.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
321
+ "qllama.model.layers.3.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
322
+ "qllama.model.layers.3.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
323
+ "qllama.model.layers.3.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
324
+ "qllama.model.layers.3.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
325
+ "qllama.model.layers.3.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
326
+ "qllama.model.layers.3.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
327
+ "qllama.model.layers.30.cross_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
328
+ "qllama.model.layers.30.cross_attn.norm1.weight": "pytorch_model-00003-of-00003.bin",
329
+ "qllama.model.layers.30.cross_attn.norm2.weight": "pytorch_model-00003-of-00003.bin",
330
+ "qllama.model.layers.30.cross_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
331
+ "qllama.model.layers.30.cross_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
332
+ "qllama.model.layers.30.cross_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
333
+ "qllama.model.layers.30.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
334
+ "qllama.model.layers.30.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
335
+ "qllama.model.layers.30.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
336
+ "qllama.model.layers.30.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
337
+ "qllama.model.layers.30.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
338
+ "qllama.model.layers.30.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
339
+ "qllama.model.layers.30.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
340
+ "qllama.model.layers.30.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
341
+ "qllama.model.layers.30.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
342
+ "qllama.model.layers.31.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
343
+ "qllama.model.layers.31.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
344
+ "qllama.model.layers.31.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
345
+ "qllama.model.layers.31.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
346
+ "qllama.model.layers.31.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
347
+ "qllama.model.layers.31.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
348
+ "qllama.model.layers.31.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
349
+ "qllama.model.layers.31.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
350
+ "qllama.model.layers.31.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
351
+ "qllama.model.layers.4.cross_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
352
+ "qllama.model.layers.4.cross_attn.norm1.weight": "pytorch_model-00002-of-00003.bin",
353
+ "qllama.model.layers.4.cross_attn.norm2.weight": "pytorch_model-00002-of-00003.bin",
354
+ "qllama.model.layers.4.cross_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
355
+ "qllama.model.layers.4.cross_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
356
+ "qllama.model.layers.4.cross_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
357
+ "qllama.model.layers.4.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
358
+ "qllama.model.layers.4.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
359
+ "qllama.model.layers.4.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
360
+ "qllama.model.layers.4.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
361
+ "qllama.model.layers.4.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
362
+ "qllama.model.layers.4.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
363
+ "qllama.model.layers.4.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
364
+ "qllama.model.layers.4.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
365
+ "qllama.model.layers.4.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
366
+ "qllama.model.layers.5.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
367
+ "qllama.model.layers.5.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
368
+ "qllama.model.layers.5.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
369
+ "qllama.model.layers.5.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
370
+ "qllama.model.layers.5.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
371
+ "qllama.model.layers.5.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
372
+ "qllama.model.layers.5.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
373
+ "qllama.model.layers.5.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
374
+ "qllama.model.layers.5.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
375
+ "qllama.model.layers.6.cross_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
376
+ "qllama.model.layers.6.cross_attn.norm1.weight": "pytorch_model-00002-of-00003.bin",
377
+ "qllama.model.layers.6.cross_attn.norm2.weight": "pytorch_model-00002-of-00003.bin",
378
+ "qllama.model.layers.6.cross_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
379
+ "qllama.model.layers.6.cross_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
380
+ "qllama.model.layers.6.cross_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
381
+ "qllama.model.layers.6.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
382
+ "qllama.model.layers.6.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
383
+ "qllama.model.layers.6.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
384
+ "qllama.model.layers.6.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
385
+ "qllama.model.layers.6.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
386
+ "qllama.model.layers.6.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
387
+ "qllama.model.layers.6.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
388
+ "qllama.model.layers.6.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
389
+ "qllama.model.layers.6.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
390
+ "qllama.model.layers.7.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
391
+ "qllama.model.layers.7.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
392
+ "qllama.model.layers.7.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
393
+ "qllama.model.layers.7.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
394
+ "qllama.model.layers.7.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
395
+ "qllama.model.layers.7.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
396
+ "qllama.model.layers.7.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
397
+ "qllama.model.layers.7.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
398
+ "qllama.model.layers.7.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
399
+ "qllama.model.layers.8.cross_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
400
+ "qllama.model.layers.8.cross_attn.norm1.weight": "pytorch_model-00002-of-00003.bin",
401
+ "qllama.model.layers.8.cross_attn.norm2.weight": "pytorch_model-00002-of-00003.bin",
402
+ "qllama.model.layers.8.cross_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
403
+ "qllama.model.layers.8.cross_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
404
+ "qllama.model.layers.8.cross_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
405
+ "qllama.model.layers.8.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
406
+ "qllama.model.layers.8.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
407
+ "qllama.model.layers.8.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
408
+ "qllama.model.layers.8.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
409
+ "qllama.model.layers.8.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
410
+ "qllama.model.layers.8.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
411
+ "qllama.model.layers.8.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
412
+ "qllama.model.layers.8.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
413
+ "qllama.model.layers.8.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
414
+ "qllama.model.layers.9.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
415
+ "qllama.model.layers.9.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
416
+ "qllama.model.layers.9.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
417
+ "qllama.model.layers.9.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
418
+ "qllama.model.layers.9.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
419
+ "qllama.model.layers.9.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
420
+ "qllama.model.layers.9.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
421
+ "qllama.model.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
422
+ "qllama.model.layers.9.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
423
+ "qllama.model.norm.weight": "pytorch_model-00003-of-00003.bin",
424
+ "query_tokens": "pytorch_model-00001-of-00003.bin",
425
+ "text_projection": "pytorch_model-00001-of-00003.bin",
426
+ "vision_model.embeddings.class_embedding": "pytorch_model-00001-of-00003.bin",
427
+ "vision_model.embeddings.patch_embedding.bias": "pytorch_model-00001-of-00003.bin",
428
+ "vision_model.embeddings.patch_embedding.weight": "pytorch_model-00001-of-00003.bin",
429
+ "vision_model.embeddings.position_embedding": "pytorch_model-00001-of-00003.bin",
430
+ "vision_model.encoder.layers.0.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
431
+ "vision_model.encoder.layers.0.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
432
+ "vision_model.encoder.layers.0.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
433
+ "vision_model.encoder.layers.0.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
434
+ "vision_model.encoder.layers.0.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
435
+ "vision_model.encoder.layers.0.ls1": "pytorch_model-00001-of-00003.bin",
436
+ "vision_model.encoder.layers.0.ls2": "pytorch_model-00001-of-00003.bin",
437
+ "vision_model.encoder.layers.0.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
438
+ "vision_model.encoder.layers.0.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
439
+ "vision_model.encoder.layers.0.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
440
+ "vision_model.encoder.layers.0.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
441
+ "vision_model.encoder.layers.0.norm1.weight": "pytorch_model-00001-of-00003.bin",
442
+ "vision_model.encoder.layers.0.norm2.weight": "pytorch_model-00001-of-00003.bin",
443
+ "vision_model.encoder.layers.1.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
444
+ "vision_model.encoder.layers.1.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
445
+ "vision_model.encoder.layers.1.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
446
+ "vision_model.encoder.layers.1.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
447
+ "vision_model.encoder.layers.1.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
448
+ "vision_model.encoder.layers.1.ls1": "pytorch_model-00001-of-00003.bin",
449
+ "vision_model.encoder.layers.1.ls2": "pytorch_model-00001-of-00003.bin",
450
+ "vision_model.encoder.layers.1.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
451
+ "vision_model.encoder.layers.1.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
452
+ "vision_model.encoder.layers.1.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
453
+ "vision_model.encoder.layers.1.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
454
+ "vision_model.encoder.layers.1.norm1.weight": "pytorch_model-00001-of-00003.bin",
455
+ "vision_model.encoder.layers.1.norm2.weight": "pytorch_model-00001-of-00003.bin",
456
+ "vision_model.encoder.layers.10.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
457
+ "vision_model.encoder.layers.10.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
458
+ "vision_model.encoder.layers.10.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
459
+ "vision_model.encoder.layers.10.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
460
+ "vision_model.encoder.layers.10.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
461
+ "vision_model.encoder.layers.10.ls1": "pytorch_model-00001-of-00003.bin",
462
+ "vision_model.encoder.layers.10.ls2": "pytorch_model-00001-of-00003.bin",
463
+ "vision_model.encoder.layers.10.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
464
+ "vision_model.encoder.layers.10.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
465
+ "vision_model.encoder.layers.10.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
466
+ "vision_model.encoder.layers.10.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
467
+ "vision_model.encoder.layers.10.norm1.weight": "pytorch_model-00001-of-00003.bin",
468
+ "vision_model.encoder.layers.10.norm2.weight": "pytorch_model-00001-of-00003.bin",
469
+ "vision_model.encoder.layers.11.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
470
+ "vision_model.encoder.layers.11.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
471
+ "vision_model.encoder.layers.11.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
472
+ "vision_model.encoder.layers.11.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
473
+ "vision_model.encoder.layers.11.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
474
+ "vision_model.encoder.layers.11.ls1": "pytorch_model-00001-of-00003.bin",
475
+ "vision_model.encoder.layers.11.ls2": "pytorch_model-00001-of-00003.bin",
476
+ "vision_model.encoder.layers.11.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
477
+ "vision_model.encoder.layers.11.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
478
+ "vision_model.encoder.layers.11.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
479
+ "vision_model.encoder.layers.11.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
480
+ "vision_model.encoder.layers.11.norm1.weight": "pytorch_model-00001-of-00003.bin",
481
+ "vision_model.encoder.layers.11.norm2.weight": "pytorch_model-00001-of-00003.bin",
482
+ "vision_model.encoder.layers.12.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
483
+ "vision_model.encoder.layers.12.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
484
+ "vision_model.encoder.layers.12.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
485
+ "vision_model.encoder.layers.12.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
486
+ "vision_model.encoder.layers.12.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
487
+ "vision_model.encoder.layers.12.ls1": "pytorch_model-00001-of-00003.bin",
488
+ "vision_model.encoder.layers.12.ls2": "pytorch_model-00001-of-00003.bin",
489
+ "vision_model.encoder.layers.12.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
490
+ "vision_model.encoder.layers.12.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
491
+ "vision_model.encoder.layers.12.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
492
+ "vision_model.encoder.layers.12.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
493
+ "vision_model.encoder.layers.12.norm1.weight": "pytorch_model-00001-of-00003.bin",
494
+ "vision_model.encoder.layers.12.norm2.weight": "pytorch_model-00001-of-00003.bin",
495
+ "vision_model.encoder.layers.13.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
496
+ "vision_model.encoder.layers.13.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
497
+ "vision_model.encoder.layers.13.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
498
+ "vision_model.encoder.layers.13.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
499
+ "vision_model.encoder.layers.13.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
500
+ "vision_model.encoder.layers.13.ls1": "pytorch_model-00001-of-00003.bin",
501
+ "vision_model.encoder.layers.13.ls2": "pytorch_model-00001-of-00003.bin",
502
+ "vision_model.encoder.layers.13.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
503
+ "vision_model.encoder.layers.13.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
504
+ "vision_model.encoder.layers.13.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
505
+ "vision_model.encoder.layers.13.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
506
+ "vision_model.encoder.layers.13.norm1.weight": "pytorch_model-00001-of-00003.bin",
507
+ "vision_model.encoder.layers.13.norm2.weight": "pytorch_model-00001-of-00003.bin",
508
+ "vision_model.encoder.layers.14.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
509
+ "vision_model.encoder.layers.14.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
510
+ "vision_model.encoder.layers.14.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
511
+ "vision_model.encoder.layers.14.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
512
+ "vision_model.encoder.layers.14.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
513
+ "vision_model.encoder.layers.14.ls1": "pytorch_model-00001-of-00003.bin",
514
+ "vision_model.encoder.layers.14.ls2": "pytorch_model-00001-of-00003.bin",
515
+ "vision_model.encoder.layers.14.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
516
+ "vision_model.encoder.layers.14.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
517
+ "vision_model.encoder.layers.14.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
518
+ "vision_model.encoder.layers.14.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
519
+ "vision_model.encoder.layers.14.norm1.weight": "pytorch_model-00001-of-00003.bin",
520
+ "vision_model.encoder.layers.14.norm2.weight": "pytorch_model-00001-of-00003.bin",
521
+ "vision_model.encoder.layers.15.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
522
+ "vision_model.encoder.layers.15.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
523
+ "vision_model.encoder.layers.15.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
524
+ "vision_model.encoder.layers.15.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
525
+ "vision_model.encoder.layers.15.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
526
+ "vision_model.encoder.layers.15.ls1": "pytorch_model-00001-of-00003.bin",
527
+ "vision_model.encoder.layers.15.ls2": "pytorch_model-00001-of-00003.bin",
528
+ "vision_model.encoder.layers.15.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
529
+ "vision_model.encoder.layers.15.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
530
+ "vision_model.encoder.layers.15.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
531
+ "vision_model.encoder.layers.15.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
532
+ "vision_model.encoder.layers.15.norm1.weight": "pytorch_model-00001-of-00003.bin",
533
+ "vision_model.encoder.layers.15.norm2.weight": "pytorch_model-00001-of-00003.bin",
534
+ "vision_model.encoder.layers.16.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
535
+ "vision_model.encoder.layers.16.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
536
+ "vision_model.encoder.layers.16.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
537
+ "vision_model.encoder.layers.16.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
538
+ "vision_model.encoder.layers.16.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
539
+ "vision_model.encoder.layers.16.ls1": "pytorch_model-00001-of-00003.bin",
540
+ "vision_model.encoder.layers.16.ls2": "pytorch_model-00001-of-00003.bin",
541
+ "vision_model.encoder.layers.16.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
542
+ "vision_model.encoder.layers.16.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
543
+ "vision_model.encoder.layers.16.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
544
+ "vision_model.encoder.layers.16.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
545
+ "vision_model.encoder.layers.16.norm1.weight": "pytorch_model-00001-of-00003.bin",
546
+ "vision_model.encoder.layers.16.norm2.weight": "pytorch_model-00001-of-00003.bin",
547
+ "vision_model.encoder.layers.17.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
548
+ "vision_model.encoder.layers.17.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
549
+ "vision_model.encoder.layers.17.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
550
+ "vision_model.encoder.layers.17.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
551
+ "vision_model.encoder.layers.17.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
552
+ "vision_model.encoder.layers.17.ls1": "pytorch_model-00001-of-00003.bin",
553
+ "vision_model.encoder.layers.17.ls2": "pytorch_model-00001-of-00003.bin",
554
+ "vision_model.encoder.layers.17.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
555
+ "vision_model.encoder.layers.17.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
556
+ "vision_model.encoder.layers.17.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
557
+ "vision_model.encoder.layers.17.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
558
+ "vision_model.encoder.layers.17.norm1.weight": "pytorch_model-00001-of-00003.bin",
559
+ "vision_model.encoder.layers.17.norm2.weight": "pytorch_model-00001-of-00003.bin",
560
+ "vision_model.encoder.layers.18.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
561
+ "vision_model.encoder.layers.18.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
562
+ "vision_model.encoder.layers.18.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
563
+ "vision_model.encoder.layers.18.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
564
+ "vision_model.encoder.layers.18.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
565
+ "vision_model.encoder.layers.18.ls1": "pytorch_model-00001-of-00003.bin",
566
+ "vision_model.encoder.layers.18.ls2": "pytorch_model-00001-of-00003.bin",
567
+ "vision_model.encoder.layers.18.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
568
+ "vision_model.encoder.layers.18.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
569
+ "vision_model.encoder.layers.18.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
570
+ "vision_model.encoder.layers.18.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
571
+ "vision_model.encoder.layers.18.norm1.weight": "pytorch_model-00001-of-00003.bin",
572
+ "vision_model.encoder.layers.18.norm2.weight": "pytorch_model-00001-of-00003.bin",
573
+ "vision_model.encoder.layers.19.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
574
+ "vision_model.encoder.layers.19.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
575
+ "vision_model.encoder.layers.19.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
576
+ "vision_model.encoder.layers.19.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
577
+ "vision_model.encoder.layers.19.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
578
+ "vision_model.encoder.layers.19.ls1": "pytorch_model-00001-of-00003.bin",
579
+ "vision_model.encoder.layers.19.ls2": "pytorch_model-00001-of-00003.bin",
580
+ "vision_model.encoder.layers.19.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
581
+ "vision_model.encoder.layers.19.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
582
+ "vision_model.encoder.layers.19.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
583
+ "vision_model.encoder.layers.19.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
584
+ "vision_model.encoder.layers.19.norm1.weight": "pytorch_model-00001-of-00003.bin",
585
+ "vision_model.encoder.layers.19.norm2.weight": "pytorch_model-00001-of-00003.bin",
586
+ "vision_model.encoder.layers.2.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
587
+ "vision_model.encoder.layers.2.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
588
+ "vision_model.encoder.layers.2.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
589
+ "vision_model.encoder.layers.2.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
590
+ "vision_model.encoder.layers.2.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
591
+ "vision_model.encoder.layers.2.ls1": "pytorch_model-00001-of-00003.bin",
592
+ "vision_model.encoder.layers.2.ls2": "pytorch_model-00001-of-00003.bin",
593
+ "vision_model.encoder.layers.2.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
594
+ "vision_model.encoder.layers.2.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
595
+ "vision_model.encoder.layers.2.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
596
+ "vision_model.encoder.layers.2.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
597
+ "vision_model.encoder.layers.2.norm1.weight": "pytorch_model-00001-of-00003.bin",
598
+ "vision_model.encoder.layers.2.norm2.weight": "pytorch_model-00001-of-00003.bin",
599
+ "vision_model.encoder.layers.20.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
600
+ "vision_model.encoder.layers.20.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
601
+ "vision_model.encoder.layers.20.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
602
+ "vision_model.encoder.layers.20.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
603
+ "vision_model.encoder.layers.20.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
604
+ "vision_model.encoder.layers.20.ls1": "pytorch_model-00001-of-00003.bin",
605
+ "vision_model.encoder.layers.20.ls2": "pytorch_model-00001-of-00003.bin",
606
+ "vision_model.encoder.layers.20.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
607
+ "vision_model.encoder.layers.20.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
608
+ "vision_model.encoder.layers.20.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
609
+ "vision_model.encoder.layers.20.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
610
+ "vision_model.encoder.layers.20.norm1.weight": "pytorch_model-00001-of-00003.bin",
611
+ "vision_model.encoder.layers.20.norm2.weight": "pytorch_model-00001-of-00003.bin",
612
+ "vision_model.encoder.layers.21.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
613
+ "vision_model.encoder.layers.21.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
614
+ "vision_model.encoder.layers.21.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
615
+ "vision_model.encoder.layers.21.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
616
+ "vision_model.encoder.layers.21.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
617
+ "vision_model.encoder.layers.21.ls1": "pytorch_model-00001-of-00003.bin",
618
+ "vision_model.encoder.layers.21.ls2": "pytorch_model-00001-of-00003.bin",
619
+ "vision_model.encoder.layers.21.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
620
+ "vision_model.encoder.layers.21.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
621
+ "vision_model.encoder.layers.21.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
622
+ "vision_model.encoder.layers.21.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
623
+ "vision_model.encoder.layers.21.norm1.weight": "pytorch_model-00001-of-00003.bin",
624
+ "vision_model.encoder.layers.21.norm2.weight": "pytorch_model-00001-of-00003.bin",
625
+ "vision_model.encoder.layers.22.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
626
+ "vision_model.encoder.layers.22.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
627
+ "vision_model.encoder.layers.22.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
628
+ "vision_model.encoder.layers.22.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
629
+ "vision_model.encoder.layers.22.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
630
+ "vision_model.encoder.layers.22.ls1": "pytorch_model-00001-of-00003.bin",
631
+ "vision_model.encoder.layers.22.ls2": "pytorch_model-00001-of-00003.bin",
632
+ "vision_model.encoder.layers.22.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
633
+ "vision_model.encoder.layers.22.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
634
+ "vision_model.encoder.layers.22.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
635
+ "vision_model.encoder.layers.22.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
636
+ "vision_model.encoder.layers.22.norm1.weight": "pytorch_model-00001-of-00003.bin",
637
+ "vision_model.encoder.layers.22.norm2.weight": "pytorch_model-00001-of-00003.bin",
638
+ "vision_model.encoder.layers.23.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
639
+ "vision_model.encoder.layers.23.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
640
+ "vision_model.encoder.layers.23.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
641
+ "vision_model.encoder.layers.23.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
642
+ "vision_model.encoder.layers.23.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
643
+ "vision_model.encoder.layers.23.ls1": "pytorch_model-00001-of-00003.bin",
644
+ "vision_model.encoder.layers.23.ls2": "pytorch_model-00001-of-00003.bin",
645
+ "vision_model.encoder.layers.23.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
646
+ "vision_model.encoder.layers.23.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
647
+ "vision_model.encoder.layers.23.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
648
+ "vision_model.encoder.layers.23.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
649
+ "vision_model.encoder.layers.23.norm1.weight": "pytorch_model-00001-of-00003.bin",
650
+ "vision_model.encoder.layers.23.norm2.weight": "pytorch_model-00001-of-00003.bin",
651
+ "vision_model.encoder.layers.24.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
652
+ "vision_model.encoder.layers.24.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
653
+ "vision_model.encoder.layers.24.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
654
+ "vision_model.encoder.layers.24.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
655
+ "vision_model.encoder.layers.24.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
656
+ "vision_model.encoder.layers.24.ls1": "pytorch_model-00001-of-00003.bin",
657
+ "vision_model.encoder.layers.24.ls2": "pytorch_model-00001-of-00003.bin",
658
+ "vision_model.encoder.layers.24.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
659
+ "vision_model.encoder.layers.24.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
660
+ "vision_model.encoder.layers.24.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
661
+ "vision_model.encoder.layers.24.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
662
+ "vision_model.encoder.layers.24.norm1.weight": "pytorch_model-00001-of-00003.bin",
663
+ "vision_model.encoder.layers.24.norm2.weight": "pytorch_model-00001-of-00003.bin",
664
+ "vision_model.encoder.layers.25.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
665
+ "vision_model.encoder.layers.25.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
666
+ "vision_model.encoder.layers.25.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
667
+ "vision_model.encoder.layers.25.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
668
+ "vision_model.encoder.layers.25.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
669
+ "vision_model.encoder.layers.25.ls1": "pytorch_model-00001-of-00003.bin",
670
+ "vision_model.encoder.layers.25.ls2": "pytorch_model-00001-of-00003.bin",
671
+ "vision_model.encoder.layers.25.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
672
+ "vision_model.encoder.layers.25.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
673
+ "vision_model.encoder.layers.25.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
674
+ "vision_model.encoder.layers.25.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
675
+ "vision_model.encoder.layers.25.norm1.weight": "pytorch_model-00001-of-00003.bin",
676
+ "vision_model.encoder.layers.25.norm2.weight": "pytorch_model-00001-of-00003.bin",
677
+ "vision_model.encoder.layers.26.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
678
+ "vision_model.encoder.layers.26.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
679
+ "vision_model.encoder.layers.26.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
680
+ "vision_model.encoder.layers.26.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
681
+ "vision_model.encoder.layers.26.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
682
+ "vision_model.encoder.layers.26.ls1": "pytorch_model-00001-of-00003.bin",
683
+ "vision_model.encoder.layers.26.ls2": "pytorch_model-00001-of-00003.bin",
684
+ "vision_model.encoder.layers.26.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
685
+ "vision_model.encoder.layers.26.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
686
+ "vision_model.encoder.layers.26.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
687
+ "vision_model.encoder.layers.26.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
688
+ "vision_model.encoder.layers.26.norm1.weight": "pytorch_model-00001-of-00003.bin",
689
+ "vision_model.encoder.layers.26.norm2.weight": "pytorch_model-00001-of-00003.bin",
690
+ "vision_model.encoder.layers.27.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
691
+ "vision_model.encoder.layers.27.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
692
+ "vision_model.encoder.layers.27.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
693
+ "vision_model.encoder.layers.27.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
694
+ "vision_model.encoder.layers.27.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
695
+ "vision_model.encoder.layers.27.ls1": "pytorch_model-00001-of-00003.bin",
696
+ "vision_model.encoder.layers.27.ls2": "pytorch_model-00001-of-00003.bin",
697
+ "vision_model.encoder.layers.27.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
698
+ "vision_model.encoder.layers.27.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
699
+ "vision_model.encoder.layers.27.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
700
+ "vision_model.encoder.layers.27.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
701
+ "vision_model.encoder.layers.27.norm1.weight": "pytorch_model-00001-of-00003.bin",
702
+ "vision_model.encoder.layers.27.norm2.weight": "pytorch_model-00001-of-00003.bin",
703
+ "vision_model.encoder.layers.28.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
704
+ "vision_model.encoder.layers.28.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
705
+ "vision_model.encoder.layers.28.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
706
+ "vision_model.encoder.layers.28.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
707
+ "vision_model.encoder.layers.28.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
708
+ "vision_model.encoder.layers.28.ls1": "pytorch_model-00001-of-00003.bin",
709
+ "vision_model.encoder.layers.28.ls2": "pytorch_model-00001-of-00003.bin",
710
+ "vision_model.encoder.layers.28.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
711
+ "vision_model.encoder.layers.28.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
712
+ "vision_model.encoder.layers.28.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
713
+ "vision_model.encoder.layers.28.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
714
+ "vision_model.encoder.layers.28.norm1.weight": "pytorch_model-00001-of-00003.bin",
715
+ "vision_model.encoder.layers.28.norm2.weight": "pytorch_model-00001-of-00003.bin",
716
+ "vision_model.encoder.layers.29.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
717
+ "vision_model.encoder.layers.29.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
718
+ "vision_model.encoder.layers.29.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
719
+ "vision_model.encoder.layers.29.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
720
+ "vision_model.encoder.layers.29.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
721
+ "vision_model.encoder.layers.29.ls1": "pytorch_model-00001-of-00003.bin",
722
+ "vision_model.encoder.layers.29.ls2": "pytorch_model-00001-of-00003.bin",
723
+ "vision_model.encoder.layers.29.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
724
+ "vision_model.encoder.layers.29.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
725
+ "vision_model.encoder.layers.29.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
726
+ "vision_model.encoder.layers.29.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
727
+ "vision_model.encoder.layers.29.norm1.weight": "pytorch_model-00001-of-00003.bin",
728
+ "vision_model.encoder.layers.29.norm2.weight": "pytorch_model-00001-of-00003.bin",
729
+ "vision_model.encoder.layers.3.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
730
+ "vision_model.encoder.layers.3.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
731
+ "vision_model.encoder.layers.3.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
732
+ "vision_model.encoder.layers.3.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
733
+ "vision_model.encoder.layers.3.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
734
+ "vision_model.encoder.layers.3.ls1": "pytorch_model-00001-of-00003.bin",
735
+ "vision_model.encoder.layers.3.ls2": "pytorch_model-00001-of-00003.bin",
736
+ "vision_model.encoder.layers.3.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
737
+ "vision_model.encoder.layers.3.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
738
+ "vision_model.encoder.layers.3.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
739
+ "vision_model.encoder.layers.3.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
740
+ "vision_model.encoder.layers.3.norm1.weight": "pytorch_model-00001-of-00003.bin",
741
+ "vision_model.encoder.layers.3.norm2.weight": "pytorch_model-00001-of-00003.bin",
742
+ "vision_model.encoder.layers.30.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
743
+ "vision_model.encoder.layers.30.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
744
+ "vision_model.encoder.layers.30.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
745
+ "vision_model.encoder.layers.30.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
746
+ "vision_model.encoder.layers.30.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
747
+ "vision_model.encoder.layers.30.ls1": "pytorch_model-00001-of-00003.bin",
748
+ "vision_model.encoder.layers.30.ls2": "pytorch_model-00001-of-00003.bin",
749
+ "vision_model.encoder.layers.30.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
750
+ "vision_model.encoder.layers.30.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
751
+ "vision_model.encoder.layers.30.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
752
+ "vision_model.encoder.layers.30.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
753
+ "vision_model.encoder.layers.30.norm1.weight": "pytorch_model-00001-of-00003.bin",
754
+ "vision_model.encoder.layers.30.norm2.weight": "pytorch_model-00001-of-00003.bin",
755
+ "vision_model.encoder.layers.31.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
756
+ "vision_model.encoder.layers.31.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
757
+ "vision_model.encoder.layers.31.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
758
+ "vision_model.encoder.layers.31.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
759
+ "vision_model.encoder.layers.31.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
760
+ "vision_model.encoder.layers.31.ls1": "pytorch_model-00001-of-00003.bin",
761
+ "vision_model.encoder.layers.31.ls2": "pytorch_model-00001-of-00003.bin",
762
+ "vision_model.encoder.layers.31.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
763
+ "vision_model.encoder.layers.31.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
764
+ "vision_model.encoder.layers.31.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
765
+ "vision_model.encoder.layers.31.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
766
+ "vision_model.encoder.layers.31.norm1.weight": "pytorch_model-00001-of-00003.bin",
767
+ "vision_model.encoder.layers.31.norm2.weight": "pytorch_model-00001-of-00003.bin",
768
+ "vision_model.encoder.layers.32.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
769
+ "vision_model.encoder.layers.32.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
770
+ "vision_model.encoder.layers.32.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
771
+ "vision_model.encoder.layers.32.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
772
+ "vision_model.encoder.layers.32.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
773
+ "vision_model.encoder.layers.32.ls1": "pytorch_model-00001-of-00003.bin",
774
+ "vision_model.encoder.layers.32.ls2": "pytorch_model-00001-of-00003.bin",
775
+ "vision_model.encoder.layers.32.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
776
+ "vision_model.encoder.layers.32.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
777
+ "vision_model.encoder.layers.32.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
778
+ "vision_model.encoder.layers.32.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
779
+ "vision_model.encoder.layers.32.norm1.weight": "pytorch_model-00001-of-00003.bin",
780
+ "vision_model.encoder.layers.32.norm2.weight": "pytorch_model-00001-of-00003.bin",
781
+ "vision_model.encoder.layers.33.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
782
+ "vision_model.encoder.layers.33.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
783
+ "vision_model.encoder.layers.33.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
784
+ "vision_model.encoder.layers.33.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
785
+ "vision_model.encoder.layers.33.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
786
+ "vision_model.encoder.layers.33.ls1": "pytorch_model-00001-of-00003.bin",
787
+ "vision_model.encoder.layers.33.ls2": "pytorch_model-00001-of-00003.bin",
788
+ "vision_model.encoder.layers.33.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
789
+ "vision_model.encoder.layers.33.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
790
+ "vision_model.encoder.layers.33.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
791
+ "vision_model.encoder.layers.33.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
792
+ "vision_model.encoder.layers.33.norm1.weight": "pytorch_model-00001-of-00003.bin",
793
+ "vision_model.encoder.layers.33.norm2.weight": "pytorch_model-00001-of-00003.bin",
794
+ "vision_model.encoder.layers.34.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
795
+ "vision_model.encoder.layers.34.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
796
+ "vision_model.encoder.layers.34.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
797
+ "vision_model.encoder.layers.34.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
798
+ "vision_model.encoder.layers.34.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
799
+ "vision_model.encoder.layers.34.ls1": "pytorch_model-00001-of-00003.bin",
800
+ "vision_model.encoder.layers.34.ls2": "pytorch_model-00001-of-00003.bin",
801
+ "vision_model.encoder.layers.34.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
802
+ "vision_model.encoder.layers.34.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
803
+ "vision_model.encoder.layers.34.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
804
+ "vision_model.encoder.layers.34.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
805
+ "vision_model.encoder.layers.34.norm1.weight": "pytorch_model-00001-of-00003.bin",
806
+ "vision_model.encoder.layers.34.norm2.weight": "pytorch_model-00001-of-00003.bin",
807
+ "vision_model.encoder.layers.35.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
808
+ "vision_model.encoder.layers.35.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
809
+ "vision_model.encoder.layers.35.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
810
+ "vision_model.encoder.layers.35.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
811
+ "vision_model.encoder.layers.35.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
812
+ "vision_model.encoder.layers.35.ls1": "pytorch_model-00001-of-00003.bin",
813
+ "vision_model.encoder.layers.35.ls2": "pytorch_model-00001-of-00003.bin",
814
+ "vision_model.encoder.layers.35.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
815
+ "vision_model.encoder.layers.35.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
816
+ "vision_model.encoder.layers.35.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
817
+ "vision_model.encoder.layers.35.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
818
+ "vision_model.encoder.layers.35.norm1.weight": "pytorch_model-00001-of-00003.bin",
819
+ "vision_model.encoder.layers.35.norm2.weight": "pytorch_model-00001-of-00003.bin",
820
+ "vision_model.encoder.layers.36.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
821
+ "vision_model.encoder.layers.36.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
822
+ "vision_model.encoder.layers.36.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
823
+ "vision_model.encoder.layers.36.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
824
+ "vision_model.encoder.layers.36.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
825
+ "vision_model.encoder.layers.36.ls1": "pytorch_model-00001-of-00003.bin",
826
+ "vision_model.encoder.layers.36.ls2": "pytorch_model-00001-of-00003.bin",
827
+ "vision_model.encoder.layers.36.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
828
+ "vision_model.encoder.layers.36.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
829
+ "vision_model.encoder.layers.36.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
830
+ "vision_model.encoder.layers.36.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
831
+ "vision_model.encoder.layers.36.norm1.weight": "pytorch_model-00001-of-00003.bin",
832
+ "vision_model.encoder.layers.36.norm2.weight": "pytorch_model-00001-of-00003.bin",
833
+ "vision_model.encoder.layers.37.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
834
+ "vision_model.encoder.layers.37.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
835
+ "vision_model.encoder.layers.37.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
836
+ "vision_model.encoder.layers.37.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
837
+ "vision_model.encoder.layers.37.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
838
+ "vision_model.encoder.layers.37.ls1": "pytorch_model-00001-of-00003.bin",
839
+ "vision_model.encoder.layers.37.ls2": "pytorch_model-00001-of-00003.bin",
840
+ "vision_model.encoder.layers.37.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
841
+ "vision_model.encoder.layers.37.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
842
+ "vision_model.encoder.layers.37.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
843
+ "vision_model.encoder.layers.37.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
844
+ "vision_model.encoder.layers.37.norm1.weight": "pytorch_model-00001-of-00003.bin",
845
+ "vision_model.encoder.layers.37.norm2.weight": "pytorch_model-00001-of-00003.bin",
846
+ "vision_model.encoder.layers.38.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
847
+ "vision_model.encoder.layers.38.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
848
+ "vision_model.encoder.layers.38.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
849
+ "vision_model.encoder.layers.38.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
850
+ "vision_model.encoder.layers.38.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
851
+ "vision_model.encoder.layers.38.ls1": "pytorch_model-00001-of-00003.bin",
852
+ "vision_model.encoder.layers.38.ls2": "pytorch_model-00001-of-00003.bin",
853
+ "vision_model.encoder.layers.38.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
854
+ "vision_model.encoder.layers.38.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
855
+ "vision_model.encoder.layers.38.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
856
+ "vision_model.encoder.layers.38.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
857
+ "vision_model.encoder.layers.38.norm1.weight": "pytorch_model-00001-of-00003.bin",
858
+ "vision_model.encoder.layers.38.norm2.weight": "pytorch_model-00001-of-00003.bin",
859
+ "vision_model.encoder.layers.39.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
860
+ "vision_model.encoder.layers.39.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
861
+ "vision_model.encoder.layers.39.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
862
+ "vision_model.encoder.layers.39.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
863
+ "vision_model.encoder.layers.39.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
864
+ "vision_model.encoder.layers.39.ls1": "pytorch_model-00001-of-00003.bin",
865
+ "vision_model.encoder.layers.39.ls2": "pytorch_model-00001-of-00003.bin",
866
+ "vision_model.encoder.layers.39.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
867
+ "vision_model.encoder.layers.39.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
868
+ "vision_model.encoder.layers.39.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
869
+ "vision_model.encoder.layers.39.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
870
+ "vision_model.encoder.layers.39.norm1.weight": "pytorch_model-00001-of-00003.bin",
871
+ "vision_model.encoder.layers.39.norm2.weight": "pytorch_model-00001-of-00003.bin",
872
+ "vision_model.encoder.layers.4.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
873
+ "vision_model.encoder.layers.4.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
874
+ "vision_model.encoder.layers.4.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
875
+ "vision_model.encoder.layers.4.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
876
+ "vision_model.encoder.layers.4.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
877
+ "vision_model.encoder.layers.4.ls1": "pytorch_model-00001-of-00003.bin",
878
+ "vision_model.encoder.layers.4.ls2": "pytorch_model-00001-of-00003.bin",
879
+ "vision_model.encoder.layers.4.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
880
+ "vision_model.encoder.layers.4.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
881
+ "vision_model.encoder.layers.4.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
882
+ "vision_model.encoder.layers.4.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
883
+ "vision_model.encoder.layers.4.norm1.weight": "pytorch_model-00001-of-00003.bin",
884
+ "vision_model.encoder.layers.4.norm2.weight": "pytorch_model-00001-of-00003.bin",
885
+ "vision_model.encoder.layers.40.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
886
+ "vision_model.encoder.layers.40.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
887
+ "vision_model.encoder.layers.40.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
888
+ "vision_model.encoder.layers.40.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
889
+ "vision_model.encoder.layers.40.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
890
+ "vision_model.encoder.layers.40.ls1": "pytorch_model-00001-of-00003.bin",
891
+ "vision_model.encoder.layers.40.ls2": "pytorch_model-00001-of-00003.bin",
892
+ "vision_model.encoder.layers.40.mlp.fc1.bias": "pytorch_model-00002-of-00003.bin",
893
+ "vision_model.encoder.layers.40.mlp.fc1.weight": "pytorch_model-00002-of-00003.bin",
894
+ "vision_model.encoder.layers.40.mlp.fc2.bias": "pytorch_model-00002-of-00003.bin",
895
+ "vision_model.encoder.layers.40.mlp.fc2.weight": "pytorch_model-00002-of-00003.bin",
896
+ "vision_model.encoder.layers.40.norm1.weight": "pytorch_model-00002-of-00003.bin",
897
+ "vision_model.encoder.layers.40.norm2.weight": "pytorch_model-00002-of-00003.bin",
898
+ "vision_model.encoder.layers.41.attn.k_norm.weight": "pytorch_model-00002-of-00003.bin",
899
+ "vision_model.encoder.layers.41.attn.proj.bias": "pytorch_model-00002-of-00003.bin",
900
+ "vision_model.encoder.layers.41.attn.proj.weight": "pytorch_model-00002-of-00003.bin",
901
+ "vision_model.encoder.layers.41.attn.q_norm.weight": "pytorch_model-00002-of-00003.bin",
902
+ "vision_model.encoder.layers.41.attn.qkv.weight": "pytorch_model-00002-of-00003.bin",
903
+ "vision_model.encoder.layers.41.ls1": "pytorch_model-00002-of-00003.bin",
904
+ "vision_model.encoder.layers.41.ls2": "pytorch_model-00002-of-00003.bin",
905
+ "vision_model.encoder.layers.41.mlp.fc1.bias": "pytorch_model-00002-of-00003.bin",
906
+ "vision_model.encoder.layers.41.mlp.fc1.weight": "pytorch_model-00002-of-00003.bin",
907
+ "vision_model.encoder.layers.41.mlp.fc2.bias": "pytorch_model-00002-of-00003.bin",
908
+ "vision_model.encoder.layers.41.mlp.fc2.weight": "pytorch_model-00002-of-00003.bin",
909
+ "vision_model.encoder.layers.41.norm1.weight": "pytorch_model-00002-of-00003.bin",
910
+ "vision_model.encoder.layers.41.norm2.weight": "pytorch_model-00002-of-00003.bin",
911
+ "vision_model.encoder.layers.42.attn.k_norm.weight": "pytorch_model-00002-of-00003.bin",
912
+ "vision_model.encoder.layers.42.attn.proj.bias": "pytorch_model-00002-of-00003.bin",
913
+ "vision_model.encoder.layers.42.attn.proj.weight": "pytorch_model-00002-of-00003.bin",
914
+ "vision_model.encoder.layers.42.attn.q_norm.weight": "pytorch_model-00002-of-00003.bin",
915
+ "vision_model.encoder.layers.42.attn.qkv.weight": "pytorch_model-00002-of-00003.bin",
916
+ "vision_model.encoder.layers.42.ls1": "pytorch_model-00002-of-00003.bin",
917
+ "vision_model.encoder.layers.42.ls2": "pytorch_model-00002-of-00003.bin",
918
+ "vision_model.encoder.layers.42.mlp.fc1.bias": "pytorch_model-00002-of-00003.bin",
919
+ "vision_model.encoder.layers.42.mlp.fc1.weight": "pytorch_model-00002-of-00003.bin",
920
+ "vision_model.encoder.layers.42.mlp.fc2.bias": "pytorch_model-00002-of-00003.bin",
921
+ "vision_model.encoder.layers.42.mlp.fc2.weight": "pytorch_model-00002-of-00003.bin",
922
+ "vision_model.encoder.layers.42.norm1.weight": "pytorch_model-00002-of-00003.bin",
923
+ "vision_model.encoder.layers.42.norm2.weight": "pytorch_model-00002-of-00003.bin",
924
+ "vision_model.encoder.layers.43.attn.k_norm.weight": "pytorch_model-00002-of-00003.bin",
925
+ "vision_model.encoder.layers.43.attn.proj.bias": "pytorch_model-00002-of-00003.bin",
926
+ "vision_model.encoder.layers.43.attn.proj.weight": "pytorch_model-00002-of-00003.bin",
927
+ "vision_model.encoder.layers.43.attn.q_norm.weight": "pytorch_model-00002-of-00003.bin",
928
+ "vision_model.encoder.layers.43.attn.qkv.weight": "pytorch_model-00002-of-00003.bin",
929
+ "vision_model.encoder.layers.43.ls1": "pytorch_model-00002-of-00003.bin",
930
+ "vision_model.encoder.layers.43.ls2": "pytorch_model-00002-of-00003.bin",
931
+ "vision_model.encoder.layers.43.mlp.fc1.bias": "pytorch_model-00002-of-00003.bin",
932
+ "vision_model.encoder.layers.43.mlp.fc1.weight": "pytorch_model-00002-of-00003.bin",
933
+ "vision_model.encoder.layers.43.mlp.fc2.bias": "pytorch_model-00002-of-00003.bin",
934
+ "vision_model.encoder.layers.43.mlp.fc2.weight": "pytorch_model-00002-of-00003.bin",
935
+ "vision_model.encoder.layers.43.norm1.weight": "pytorch_model-00002-of-00003.bin",
936
+ "vision_model.encoder.layers.43.norm2.weight": "pytorch_model-00002-of-00003.bin",
937
+ "vision_model.encoder.layers.44.attn.k_norm.weight": "pytorch_model-00002-of-00003.bin",
938
+ "vision_model.encoder.layers.44.attn.proj.bias": "pytorch_model-00002-of-00003.bin",
939
+ "vision_model.encoder.layers.44.attn.proj.weight": "pytorch_model-00002-of-00003.bin",
940
+ "vision_model.encoder.layers.44.attn.q_norm.weight": "pytorch_model-00002-of-00003.bin",
941
+ "vision_model.encoder.layers.44.attn.qkv.weight": "pytorch_model-00002-of-00003.bin",
942
+ "vision_model.encoder.layers.44.ls1": "pytorch_model-00002-of-00003.bin",
943
+ "vision_model.encoder.layers.44.ls2": "pytorch_model-00002-of-00003.bin",
944
+ "vision_model.encoder.layers.44.mlp.fc1.bias": "pytorch_model-00002-of-00003.bin",
945
+ "vision_model.encoder.layers.44.mlp.fc1.weight": "pytorch_model-00002-of-00003.bin",
946
+ "vision_model.encoder.layers.44.mlp.fc2.bias": "pytorch_model-00002-of-00003.bin",
947
+ "vision_model.encoder.layers.44.mlp.fc2.weight": "pytorch_model-00002-of-00003.bin",
948
+ "vision_model.encoder.layers.44.norm1.weight": "pytorch_model-00002-of-00003.bin",
949
+ "vision_model.encoder.layers.44.norm2.weight": "pytorch_model-00002-of-00003.bin",
950
+ "vision_model.encoder.layers.45.attn.k_norm.weight": "pytorch_model-00002-of-00003.bin",
951
+ "vision_model.encoder.layers.45.attn.proj.bias": "pytorch_model-00002-of-00003.bin",
952
+ "vision_model.encoder.layers.45.attn.proj.weight": "pytorch_model-00002-of-00003.bin",
953
+ "vision_model.encoder.layers.45.attn.q_norm.weight": "pytorch_model-00002-of-00003.bin",
954
+ "vision_model.encoder.layers.45.attn.qkv.weight": "pytorch_model-00002-of-00003.bin",
955
+ "vision_model.encoder.layers.45.ls1": "pytorch_model-00002-of-00003.bin",
956
+ "vision_model.encoder.layers.45.ls2": "pytorch_model-00002-of-00003.bin",
957
+ "vision_model.encoder.layers.45.mlp.fc1.bias": "pytorch_model-00002-of-00003.bin",
958
+ "vision_model.encoder.layers.45.mlp.fc1.weight": "pytorch_model-00002-of-00003.bin",
959
+ "vision_model.encoder.layers.45.mlp.fc2.bias": "pytorch_model-00002-of-00003.bin",
960
+ "vision_model.encoder.layers.45.mlp.fc2.weight": "pytorch_model-00002-of-00003.bin",
961
+ "vision_model.encoder.layers.45.norm1.weight": "pytorch_model-00002-of-00003.bin",
962
+ "vision_model.encoder.layers.45.norm2.weight": "pytorch_model-00002-of-00003.bin",
963
+ "vision_model.encoder.layers.46.attn.k_norm.weight": "pytorch_model-00002-of-00003.bin",
964
+ "vision_model.encoder.layers.46.attn.proj.bias": "pytorch_model-00002-of-00003.bin",
965
+ "vision_model.encoder.layers.46.attn.proj.weight": "pytorch_model-00002-of-00003.bin",
966
+ "vision_model.encoder.layers.46.attn.q_norm.weight": "pytorch_model-00002-of-00003.bin",
967
+ "vision_model.encoder.layers.46.attn.qkv.weight": "pytorch_model-00002-of-00003.bin",
968
+ "vision_model.encoder.layers.46.ls1": "pytorch_model-00002-of-00003.bin",
969
+ "vision_model.encoder.layers.46.ls2": "pytorch_model-00002-of-00003.bin",
970
+ "vision_model.encoder.layers.46.mlp.fc1.bias": "pytorch_model-00002-of-00003.bin",
971
+ "vision_model.encoder.layers.46.mlp.fc1.weight": "pytorch_model-00002-of-00003.bin",
972
+ "vision_model.encoder.layers.46.mlp.fc2.bias": "pytorch_model-00002-of-00003.bin",
973
+ "vision_model.encoder.layers.46.mlp.fc2.weight": "pytorch_model-00002-of-00003.bin",
974
+ "vision_model.encoder.layers.46.norm1.weight": "pytorch_model-00002-of-00003.bin",
975
+ "vision_model.encoder.layers.46.norm2.weight": "pytorch_model-00002-of-00003.bin",
976
+ "vision_model.encoder.layers.47.attn.k_norm.weight": "pytorch_model-00002-of-00003.bin",
977
+ "vision_model.encoder.layers.47.attn.proj.bias": "pytorch_model-00002-of-00003.bin",
978
+ "vision_model.encoder.layers.47.attn.proj.weight": "pytorch_model-00002-of-00003.bin",
979
+ "vision_model.encoder.layers.47.attn.q_norm.weight": "pytorch_model-00002-of-00003.bin",
980
+ "vision_model.encoder.layers.47.attn.qkv.weight": "pytorch_model-00002-of-00003.bin",
981
+ "vision_model.encoder.layers.47.ls1": "pytorch_model-00002-of-00003.bin",
982
+ "vision_model.encoder.layers.47.ls2": "pytorch_model-00002-of-00003.bin",
983
+ "vision_model.encoder.layers.47.mlp.fc1.bias": "pytorch_model-00002-of-00003.bin",
984
+ "vision_model.encoder.layers.47.mlp.fc1.weight": "pytorch_model-00002-of-00003.bin",
985
+ "vision_model.encoder.layers.47.mlp.fc2.bias": "pytorch_model-00002-of-00003.bin",
986
+ "vision_model.encoder.layers.47.mlp.fc2.weight": "pytorch_model-00002-of-00003.bin",
987
+ "vision_model.encoder.layers.47.norm1.weight": "pytorch_model-00002-of-00003.bin",
988
+ "vision_model.encoder.layers.47.norm2.weight": "pytorch_model-00002-of-00003.bin",
989
+ "vision_model.encoder.layers.5.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
990
+ "vision_model.encoder.layers.5.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
991
+ "vision_model.encoder.layers.5.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
992
+ "vision_model.encoder.layers.5.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
993
+ "vision_model.encoder.layers.5.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
994
+ "vision_model.encoder.layers.5.ls1": "pytorch_model-00001-of-00003.bin",
995
+ "vision_model.encoder.layers.5.ls2": "pytorch_model-00001-of-00003.bin",
996
+ "vision_model.encoder.layers.5.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
997
+ "vision_model.encoder.layers.5.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
998
+ "vision_model.encoder.layers.5.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
999
+ "vision_model.encoder.layers.5.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
1000
+ "vision_model.encoder.layers.5.norm1.weight": "pytorch_model-00001-of-00003.bin",
1001
+ "vision_model.encoder.layers.5.norm2.weight": "pytorch_model-00001-of-00003.bin",
1002
+ "vision_model.encoder.layers.6.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
1003
+ "vision_model.encoder.layers.6.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
1004
+ "vision_model.encoder.layers.6.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
1005
+ "vision_model.encoder.layers.6.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
1006
+ "vision_model.encoder.layers.6.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
1007
+ "vision_model.encoder.layers.6.ls1": "pytorch_model-00001-of-00003.bin",
1008
+ "vision_model.encoder.layers.6.ls2": "pytorch_model-00001-of-00003.bin",
1009
+ "vision_model.encoder.layers.6.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
1010
+ "vision_model.encoder.layers.6.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
1011
+ "vision_model.encoder.layers.6.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
1012
+ "vision_model.encoder.layers.6.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
1013
+ "vision_model.encoder.layers.6.norm1.weight": "pytorch_model-00001-of-00003.bin",
1014
+ "vision_model.encoder.layers.6.norm2.weight": "pytorch_model-00001-of-00003.bin",
1015
+ "vision_model.encoder.layers.7.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
1016
+ "vision_model.encoder.layers.7.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
1017
+ "vision_model.encoder.layers.7.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
1018
+ "vision_model.encoder.layers.7.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
1019
+ "vision_model.encoder.layers.7.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
1020
+ "vision_model.encoder.layers.7.ls1": "pytorch_model-00001-of-00003.bin",
1021
+ "vision_model.encoder.layers.7.ls2": "pytorch_model-00001-of-00003.bin",
1022
+ "vision_model.encoder.layers.7.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
1023
+ "vision_model.encoder.layers.7.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
1024
+ "vision_model.encoder.layers.7.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
1025
+ "vision_model.encoder.layers.7.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
1026
+ "vision_model.encoder.layers.7.norm1.weight": "pytorch_model-00001-of-00003.bin",
1027
+ "vision_model.encoder.layers.7.norm2.weight": "pytorch_model-00001-of-00003.bin",
1028
+ "vision_model.encoder.layers.8.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
1029
+ "vision_model.encoder.layers.8.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
1030
+ "vision_model.encoder.layers.8.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
1031
+ "vision_model.encoder.layers.8.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
1032
+ "vision_model.encoder.layers.8.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
1033
+ "vision_model.encoder.layers.8.ls1": "pytorch_model-00001-of-00003.bin",
1034
+ "vision_model.encoder.layers.8.ls2": "pytorch_model-00001-of-00003.bin",
1035
+ "vision_model.encoder.layers.8.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
1036
+ "vision_model.encoder.layers.8.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
1037
+ "vision_model.encoder.layers.8.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
1038
+ "vision_model.encoder.layers.8.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
1039
+ "vision_model.encoder.layers.8.norm1.weight": "pytorch_model-00001-of-00003.bin",
1040
+ "vision_model.encoder.layers.8.norm2.weight": "pytorch_model-00001-of-00003.bin",
1041
+ "vision_model.encoder.layers.9.attn.k_norm.weight": "pytorch_model-00001-of-00003.bin",
1042
+ "vision_model.encoder.layers.9.attn.proj.bias": "pytorch_model-00001-of-00003.bin",
1043
+ "vision_model.encoder.layers.9.attn.proj.weight": "pytorch_model-00001-of-00003.bin",
1044
+ "vision_model.encoder.layers.9.attn.q_norm.weight": "pytorch_model-00001-of-00003.bin",
1045
+ "vision_model.encoder.layers.9.attn.qkv.weight": "pytorch_model-00001-of-00003.bin",
1046
+ "vision_model.encoder.layers.9.ls1": "pytorch_model-00001-of-00003.bin",
1047
+ "vision_model.encoder.layers.9.ls2": "pytorch_model-00001-of-00003.bin",
1048
+ "vision_model.encoder.layers.9.mlp.fc1.bias": "pytorch_model-00001-of-00003.bin",
1049
+ "vision_model.encoder.layers.9.mlp.fc1.weight": "pytorch_model-00001-of-00003.bin",
1050
+ "vision_model.encoder.layers.9.mlp.fc2.bias": "pytorch_model-00001-of-00003.bin",
1051
+ "vision_model.encoder.layers.9.mlp.fc2.weight": "pytorch_model-00001-of-00003.bin",
1052
+ "vision_model.encoder.layers.9.norm1.weight": "pytorch_model-00001-of-00003.bin",
1053
+ "vision_model.encoder.layers.9.norm2.weight": "pytorch_model-00001-of-00003.bin"
1054
+ }
1055
+ }
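The `weight_map` above is the sharded-checkpoint index (conventionally `pytorch_model.bin.index.json`): it maps every parameter name to the one of the three shard files that stores it, which is how `from_pretrained` knows which files to read. Below is a minimal inspection sketch, assuming the repository snapshot has already been downloaded locally; the directory name is a placeholder, not something taken from this commit.

```python
import json
import os

import torch

ckpt_dir = './internvl_snapshot'  # placeholder path to a local copy of this repo

# The index maps each parameter name to the shard file that stores it.
with open(os.path.join(ckpt_dir, 'pytorch_model.bin.index.json')) as f:
    weight_map = json.load(f)['weight_map']

# Example: per the entries above, layer 40's first MLP weight sits in the second shard.
param_name = 'vision_model.encoder.layers.40.mlp.fc1.weight'
shard_file = weight_map[param_name]  # 'pytorch_model-00002-of-00003.bin'
shard = torch.load(os.path.join(ckpt_dir, shard_file), map_location='cpu')
print(param_name, tuple(shard[param_name].shape))
```

Normal loading via `from_pretrained` consumes this index transparently; reading it by hand is only useful for inspecting or partially loading individual shards.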
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "bos_token": "<s>",
+ "eos_token": "</s>",
+ "pad_token": "[PAD]",
+ "unk_token": "<unk>"
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2d967e855b1213a439df6c8ce2791f869c84b4f3b6cfacf22b86440b8192a2f8
+ size 757972
tokenizer_config.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": true,
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": null,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": null,
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "use_default_system_prompt": true,
+ "use_fast": false
+ }
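Together, the three tokenizer files above are enough to reconstruct the text side of the model: `tokenizer.model` is the SentencePiece model (stored as a Git LFS pointer), `special_tokens_map.json` names the special tokens, and `tokenizer_config.json` selects the slow `LlamaTokenizer` with BOS/EOS appending enabled. A minimal loading sketch, again assuming a local copy of this repository at a placeholder path:

```python
from transformers import AutoTokenizer

ckpt_dir = './internvl_snapshot'  # placeholder path to a local copy of this repo

# tokenizer_config.json sets "tokenizer_class": "LlamaTokenizer" and "use_fast": false,
# with add_bos_token / add_eos_token enabled, so <s> and </s> wrap every encoded text.
tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, use_fast=False)

ids = tokenizer('a photo of a red bicycle').input_ids   # plain Python list of ids
print(tokenizer.convert_ids_to_tokens(ids))             # starts with '<s>', ends with '</s>'
```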