Upload e2.5 + instruct
Files changed:
- modeling_megatron_gpt.py  +44 -20
- pytorch_model-00001-of-00002.bin  +1 -1
- pytorch_model-00002-of-00002.bin  +1 -1
modeling_megatron_gpt.py  CHANGED

@@ -20,6 +20,7 @@
 
 """ PyTorch MegatronGPT model."""
 
+from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 
 import torch
@@ -42,7 +43,12 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
-
+# try to load using a relative path, but if it fails try loading it directly
+try:
+    from .configuration_megatron_gpt import MegatronGPTConfig
+except:
+    from configuration_megatron_gpt import MegatronGPTConfig
+
 
 def get_activation(act):
     if act in ["gelu", "geglu", "fast-geglu"]:
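Note: the dual import covers the two ways this file is used. When the checkpoint is loaded through transformers with trust_remote_code, the modeling file is imported as part of a dynamically created package and the relative import works; when the file is executed on its own, the absolute import is the fallback. A minimal loading sketch, assuming a checkpoint repo that ships these files; the repo id below is a placeholder, not the actual model path:

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo_id = "org/megatron-gpt-checkpoint"  # hypothetical repo id

# trust_remote_code=True lets transformers import modeling_megatron_gpt.py from the
# repo as a package, the case the relative import above is meant to cover.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id)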
@@ -57,6 +63,10 @@ logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "MegatronGPTConfig"
 
+@dataclass
+class CausalLMOutputWithPastAndEncoding(CausalLMOutputWithPast):
+    encoding_states: Optional[torch.FloatTensor] = None
+
 class MegatronGPTPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -187,7 +197,7 @@ class MegatronGPTAttention(nn.Module):
         # Compute token offset for rotary embeddings (when decoding)
         seq_len = key.shape[-2]
         if has_layer_past:
-            seq_len
+            seq_len = seq_len + layer_past[0].shape[-2]
         cos, sin = self.rotary_emb(value, seq_len=seq_len)
         query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
         query = torch.cat((query, query_pass), dim=-1)
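Note: when decoding with a KV cache, only the new token's query/key are computed, but the rotary cos/sin table must cover past plus current positions so the new token is rotated at its true offset. A standalone sketch of that bookkeeping (not the repo's classes):

import torch

def rotary_seq_len(key: torch.Tensor, layer_past=None) -> int:
    # key: [batch, heads, q_len, head_dim]; layer_past[0]: cached keys of
    # shape [batch, heads, past_len, head_dim]
    seq_len = key.shape[-2]
    if layer_past is not None:
        seq_len = seq_len + layer_past[0].shape[-2]
    return seq_len

# e.g. decoding one new token with 16 cached positions -> rotary table of length 17
key = torch.randn(1, 8, 1, 64)
past = (torch.randn(1, 8, 16, 64), torch.randn(1, 8, 16, 64))
assert rotary_seq_len(key, past) == 17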
@@ -420,8 +430,8 @@ class MegatronGPTMLP(nn.Module):
 class MegatronGPTLayer(nn.Module):
     def __init__(self, config, layer_idx):
         super().__init__()
-        self.input_layernorm =
-        self.post_attention_layernorm =
+        self.input_layernorm = MegatronGPTLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
+        self.post_attention_layernorm = MegatronGPTLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
         self.post_attention_dropout = nn.Dropout(config.hidden_dropout)
         self.post_mlp_dropout = nn.Dropout(config.hidden_dropout)
         self.self_attention = MegatronGPTAttention(config)
@@ -466,23 +476,36 @@ class MegatronGPTLayer(nn.Module):
 
         return outputs
 
-class
+class MegatronGPTLayerNorm(torch.nn.LayerNorm):
     def __init__(self, normalization, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
-
-
-
-
-
-
-
-
+        normalization = normalization.lower()
+        assert normalization in ['layernorm', 'layernorm1p', 'rmsnorm']
+        if normalization == 'rmsnorm':
+            torch.nn.Module.__init__(self)
+            self.weight = nn.Parameter(torch.ones(normalized_shape))
+            self.variance_epsilon = eps
+        else:
+            super().__init__(
+                normalized_shape=normalized_shape,
+                eps=eps,
+                elementwise_affine=elementwise_affine,
+                device=device,
+                dtype=dtype,
+            )
         self.normalization = normalization
 
     def forward(self, x):
-
-
-        x
-
+        if self.normalization == 'rmsnorm':
+            input_dtype = x.dtype
+            x = x.to(torch.float32)
+            variance = x.pow(2).mean(-1, keepdim=True)
+            x = x * torch.rsqrt(variance + self.variance_epsilon)
+            return self.weight * x.to(input_dtype)
+        else:
+            weight_bias = 1 if self.normalization == 'layernorm1p' else 0
+            return torch.nn.functional.layer_norm(
+                x, self.normalized_shape, self.weight + weight_bias, self.bias, self.eps
+            )
 
 
 
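Note: the new MegatronGPTLayerNorm dispatches on the config's normalization mode: 'rmsnorm' skips mean-centering and bias and scales by the reciprocal RMS, 'layernorm1p' stores its weight offset by -1 and adds 1 back at apply time, and 'layernorm' is standard LayerNorm. A quick numeric sketch of the three modes; the forward logic is re-implemented inline here rather than imported from the repo file:

import torch
import torch.nn.functional as F

x = torch.randn(2, 5, 16, dtype=torch.float64)
shape, eps = (16,), 1e-5
weight = torch.randn(16, dtype=torch.float64)
bias = torch.randn(16, dtype=torch.float64)

# 'layernorm': standard F.layer_norm with the stored weight and bias.
ln = F.layer_norm(x, shape, weight, bias, eps)

# 'layernorm1p': the stored weight is shifted by +1 at apply time, so it behaves like
# a standard LayerNorm whose effective weight is (weight + 1).
ln1p = F.layer_norm(x, shape, weight + 1, bias, eps)
mu = x.mean(-1, keepdim=True)
var = ((x - mu) ** 2).mean(-1, keepdim=True)
assert torch.allclose(ln1p, (x - mu) / torch.sqrt(var + eps) * (weight + 1) + bias)

# 'rmsnorm': no mean subtraction and no bias; scale by the reciprocal RMS, then by weight.
rms = weight * (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps))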
@@ -551,7 +574,7 @@ class MegatronGPTModel(MegatronGPTPreTrainedModel):
         self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size)
         self.emb_dropout = nn.Dropout(config.hidden_dropout)
         self.layers = nn.ModuleList([MegatronGPTLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
-        self.final_layernorm =
+        self.final_layernorm = MegatronGPTLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
 
         self.gradient_checkpointing = False
 
@@ -748,7 +771,7 @@ class MegatronGPTForCausalLM(MegatronGPTPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple,
+    ) -> Union[Tuple, CausalLMOutputWithPastAndEncoding]:
         r"""
         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
@@ -804,12 +827,13 @@ class MegatronGPTForCausalLM(MegatronGPTPreTrainedModel):
             output = (lm_logits,) + outputs[1:]
             return ((lm_loss,) + output) if lm_loss is not None else output
 
-        return
+        return CausalLMOutputWithPastAndEncoding(
             loss=lm_loss,
             logits=lm_logits,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
+            encoding_states=hidden_states
         )
 
     def prepare_inputs_for_generation(
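Note: with this change the causal-LM forward also hands back the final hidden states that feed the LM head, under the new encoding_states field, so callers can pull token encodings from the same pass that produces logits. A hypothetical usage sketch; the repo id is a placeholder and the model is assumed to load with trust_remote_code as above:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "org/megatron-gpt-checkpoint"  # hypothetical
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("Megatron-style decoder", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)

logits = out.logits              # [batch, seq, vocab], unchanged behaviour
encodings = out.encoding_states  # [batch, seq, hidden], final hidden states fed to the LM head
pooled = encodings[:, -1]        # e.g. last-token encoding as a sequence embedding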
pytorch_model-00001-of-00002.bin  CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:69fceaa3477ed790a9f3506f717a58db7f328ffed532d468bdd82098f3433dce
 size 9970836963
pytorch_model-00002-of-00002.bin  CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7a95c1cd54a63f3ba01e2283528f431948cd2014426efc1e08403cdf99bf3084
 size 950158711
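Note: the .bin entries are git-lfs pointer files, so this commit swaps which weight shards the hub serves (new oid and unchanged size). An optional integrity-check sketch against the pointer above; the local path is an assumption about where the shard was downloaded:

import hashlib
from pathlib import Path

shard = Path("pytorch_model-00002-of-00002.bin")  # assumed local download
expected_oid = "7a95c1cd54a63f3ba01e2283528f431948cd2014426efc1e08403cdf99bf3084"
expected_size = 950158711

digest = hashlib.sha256()
with shard.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

assert shard.stat().st_size == expected_size
assert digest.hexdigest() == expected_oid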