Add print statements

Files changed:
- modeling_cogvlm.py +10 -12
- visual.py +12 -3

modeling_cogvlm.py (CHANGED)

@@ -296,8 +296,8 @@ class CogVLMDecoderLayer(nn.Module):

         hidden_states = self.input_layernorm(hidden_states)

-        if print_values:
-            print("Hidden states before self attention:", hidden_states[0,:3,:3])
+        # if print_values:
+        #     print("Hidden states before self attention:", hidden_states[0,:3,:3])

         # Self Attention
         hidden_states, self_attn_weights, present_key_value = self.self_attn(
@@ -310,8 +310,8 @@ class CogVLMDecoderLayer(nn.Module):
             use_cache=use_cache,
         )

-        if print_values:
-            print("Hidden states after self attention:", hidden_states[0,:3,:3])
+        # if print_values:
+        #     print("Hidden states after self attention:", hidden_states[0,:3,:3])

         hidden_states = residual + hidden_states

@@ -464,12 +464,12 @@ class CogVLMModel(CogVLMPreTrainedModel):
                 repo_type="dataset",
             )

-            print("First values of text embeddings:", inputs_embeds[0, :3, :3])
-            print("First values of images_features:", images_features[0, :3])
+            # print("First values of text embeddings:", inputs_embeds[0, :3, :3])
+            # print("First values of images_features:", images_features[0, :3])

             inputs_embeds = inputs_embeds.index_put([token_type_ids == VISION_TOKEN_TYPE], images_features)

-            print("First values of inputs_embeds after index_put:", inputs_embeds[0, :3, :3])
+            # print("First values of inputs_embeds after index_put:", inputs_embeds[0, :3, :3])

         else: # single-modality
             if token_type_ids is None:
@@ -542,8 +542,6 @@ class CogVLMModel(CogVLMPreTrainedModel):
         else:
             position_ids = position_ids.view(-1, seq_length).long()

-        print("Input ids:", input_ids)
-
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
         # embed positions
@@ -578,9 +576,9 @@ class CogVLMModel(CogVLMPreTrainedModel):
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)

-            if idx in [0, 1, 2]:
-                print(f"Hidden states before layer {idx}", hidden_states[0,:3,:3])
-                print(f"Mean of hidden states before layer {idx}", hidden_states.mean())
+            # if idx in [0, 1, 2]:
+            #     print(f"Hidden states before layer {idx}", hidden_states[0,:3,:3])
+            #     print(f"Mean of hidden states before layer {idx}", hidden_states.mean())

             past_key_value = past_key_values[idx] if past_key_values is not None else None
             layer_outputs = decoder_layer(
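For context, the unchanged index_put call in the hunk above is what merges the vision features into the text embedding sequence: with a boolean mask as the index, it behaves like masked assignment. A minimal, self-contained sketch with toy shapes and values (not taken from the model, only the constant name VISION_TOKEN_TYPE and the call pattern come from the code above):

import torch

VISION_TOKEN_TYPE = 1  # same constant name as in modeling_cogvlm.py

# Toy setup: batch of 1, sequence of 6 tokens, hidden size 4.
inputs_embeds = torch.zeros(1, 6, 4)                 # "text" embeddings, zeros for visibility
token_type_ids = torch.tensor([[0, 1, 1, 1, 0, 0]])  # positions 1-3 are vision tokens
images_features = torch.ones(3, 4)                   # one feature row per vision token

# index_put with a boolean mask scatters each row of images_features into the masked
# positions, equivalent to inputs_embeds[token_type_ids == VISION_TOKEN_TYPE] = images_features.
merged = inputs_embeds.index_put([token_type_ids == VISION_TOKEN_TYPE], images_features)
print(merged[0, :, 0])  # tensor([0., 1., 1., 1., 0., 0.])
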
visual.py (CHANGED)

@@ -74,9 +74,18 @@ class TransformerLayer(nn.Module):
         self.mlp = MLP(config)
         self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

-    def forward(self, hidden_states):
+    def forward(self, hidden_states, print_values=False):
         attention_input = hidden_states
-        attention_output = self.input_layernorm(self.attention(attention_input))
+
+        if print_values:
+            print("Hidden states before attention:", attention_input[0, :3, :3])
+
+        attention_output = self.attention(attention_input)
+
+        if print_values:
+            print("Hidden states before attention:", attention_input[0, :3, :3])
+
+        attention_output = self.input_layernorm(attention_output)
         hidden_states = attention_input + attention_output
         mlp_input = hidden_states
         mlp_output = self.post_attention_layernorm(self.mlp(mlp_input))
@@ -105,7 +114,7 @@ class Transformer(nn.Module):
         )

         for idx, layer_module in enumerate(self.layers):
-            hidden_states = layer_module(hidden_states)
+            hidden_states = layer_module(hidden_states, print_values=idx==0)

         print("Shape of hidden states after CLIP:", hidden_states.shape)
         torch.save(hidden_states, "hidden_states_after_clip.pt")
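The torch.save call above dumps the vision tower's hidden states to hidden_states_after_clip.pt, which makes it possible to diff the values against another run offline rather than eyeballing printed slices. A minimal comparison sketch, assuming a second dump named reference_hidden_states_after_clip.pt exists (that filename is hypothetical, not part of this commit):

import torch

ours = torch.load("hidden_states_after_clip.pt")
reference = torch.load("reference_hidden_states_after_clip.pt")  # hypothetical second dump

print("shapes:", ours.shape, reference.shape)
print("max abs diff:", (ours - reference).abs().max().item())
print("allclose (atol=1e-4):", torch.allclose(ours, reference, atol=1e-4))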