# NOTE(review): indentation was reconstructed from the repeating `type:` / `args:`
# pattern after the original nesting was lost; confirm against the upstream config.
arch:
  type: TransformerLMHeadModel
  args:
    transformer_config:
      type: TransformerDecoderOnlyModel
      args:
        embed_config:
          type: TransformerEmbeddingBlock
          args:
            token_embed_config:
              type: TokenEmbedding
              args:
                n_embed: 2048
                n_vocab: 92544
            pos_embed_config: null
            type_embed_config: null
            ln_config: null
            p_drop_embed: 0.0
            concat_strategy: id_first
        decoder_config:
          type: TransformerDecoderBlock
          args:
            attn_config:
              type: LlamaAttention
              args:
                n_embed: 2048
                n_pos: 32768
                n_head: 16
                n_key_value_head: 8
                head_size: 128
                p_drop_attn: 0.0
                p_drop_resid: 0.0
                bias_attn: false
                bias_proj: false
                cross_attn: false
                scale_dot_product: true
                scale_layer_wise: false
                layer_idx: null
                rope_config:
                  type: RotaryPositionEmbedding
                  args:
                    head_size: 128
                    n_pos: 32768
                    base: 1000000
                    scaling_type: null
                    scaling_factor: null
            mlp_config:
              type: LlamaMLP
              args:
                n_embed: 2048
                n_inner: 8192
                act_fn_config:
                  type: SiLUActivation
                  args: {}
            ln_config:
              type: LlamaRMSNorm
              args:
                n_embed: 2048
                ln_eps: 1.0e-05
            n_embed: 2048
            post_norm: false
            add_cross_attn: false
        n_embed: 2048
        n_layer: 24
        n_head: 16
        ln_config:
          type: LlamaRMSNorm
          args:
            n_embed: 2048
            ln_eps: 1.0e-05
        perform_linear_bias: false
        attn_window_size_loop_unit: null
    lm_head_config:
      type: TransformerLMHead
      args:
        n_vocab: 92544
        n_embed: 2048
        perform_transform: false
        act_fn_config: null
        ln_config: null
|