{ "architectures": [ "JambaForCausalLM" ], "attention_dropout": 0.0, "attn_hidden_size": -1, "attn_implementation": "flex", "attn_layer_offset": 100, "attn_layer_period": 1, "attn_reuse_every_i_layer": -1, "bos_token_id": 1, "calc_logits_for_entire_prompt": false, "compact_gating": false, "compute_attn_mat": false, "conv_dim": { "0": 3200, "1": 3200, "2": 3200, "3": 3200, "4": 3200, "5": 3200, "6": 3200, "7": 3200, "8": 3200, "9": 3200, "10": 3200, "11": 3200, "12": 3200, "13": 3200, "14": 3200, "15": 3200, "16": 3200, "17": 3200, "18": 3200, "19": 3200, "20": 3200, "21": 3200, "22": 3200, "23": 3200, "24": 3200, "25": 3200, "26": 3200, "27": 3200, "28": 3200, "29": 3200, "30": 3200, "31": 3200 }, "dense_public_ffn_structure": false, "double_v_dim": false, "enable_mod": false, "eos_token_id": 2, "expert_layer_offset": 1000, "expert_layer_period": 1000, "ffn_reuse_every_i_layer": -1, "ffn_sharing_config": null, "fully_parallel_jamba": false, "fused_multihead_config": { "conv_attn": null, "diverse_head_merge_op": "mean_wo_gate", "expand_v": true, "qkv_mode": "orig", "share_qk": false }, "global_attn_idx": [ 0, 15, 31 ], "gradient_checkpoint_layer": null, "hidden_act": "silu", "hidden_size": 1600, "hybrid_block_indices": [], "hybrid_decoder_layer": "mamba", "initializer_range": 0.02, "intermediate_size": 5504, "kq_head_dim": -1, "kq_norm": "none", "kv_reuse_every_i_layer": -1, "kv_reuse_group": [ [ 1, 2 ], [ 3, 4 ], [ 5, 6 ], [ 7, 8 ], [ 9, 10 ], [ 11, 12 ], [ 13, 14 ], [ 16, 17, 18 ], [ 19, 20 ], [ 21, 22 ], [ 23, 24 ], [ 25, 26 ], [ 27, 28 ], [ 29, 30 ] ], "kv_weight_reuse": false, "layer_type": [ "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h" ], "local_expand_ratio": 1, "local_global_dual_branch": false, "local_global_dual_branch_merge_op": "mean", "lookback_mode": "", "macro_arch": "", "mamba2_headdim": 64, "mamba_conv_bias": true, "mamba_d_conv": 4, "mamba_d_state": 16, "mamba_dt_rank": 100, "mamba_expand": 2, "mamba_inner_layernorms": true, "mamba_latent_size": null, "mamba_multihead_config": null, "mamba_proj_bias": false, "mamba_reuse_every_i_layer": -1, "max_position_embeddings": 2176, "memory_tokens_interspersed_every": 0, "mlp_hidden_act": "silu", "mod_topk": 2, "model_type": "jamba", "moe_config": null, "num_attention_heads": 25, "num_attn_per_ffn": 3, "num_experts": 1, "num_experts_per_tok": 1, "num_ffn": 1, "num_hidden_layers": 32, "num_key_value_heads": 5, "num_mamba": 1, "num_memory_tokens": 128, "output_router_logits": false, "pad_token_id": 0, "public_ffn_structure": false, "reduce_attn_ratio": 0.5, "reduce_method": "mean", "rms_norm_eps": 1e-06, "rope": true, "rope_theta": 10000.0, "rope_type": null, "router_aux_loss_coef": 0.001, "save_input_output": false, "seq_length": 2048, "sequential_jamba": false, "share_kv": false, "shared_module_attn": "", "shared_module_mamba": "", "sliding_window": 1024, "sliding_window_size": null, "swa_full_head": false, "tie_word_embeddings": true, "torch_dtype": "bfloat16", "transformers_version": "4.44.1", "use_cache": false, "use_mamba2": false, "use_mamba_kernels": true, "v_head_dim": 128, "visual_attn": false, "vocab_size": 32001 }