{
  "_name_": "Judge-GPT2",
  "_name_or_path": "Wonder-Griffin/JudgeLLM2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "batch_size": 32,
  "bias": true,
  "block_size": 512,
  "bos_token_id": 50256,
  "dim_feedforward": 3072,
  "dropout": 0.1,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "ff_expansion_factor": 4,
  "hidden_act": "gelu",
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "inference_mode": true,
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "label_smoothing": 0.1,
  "layer_norm_epsilon": 1e-05,
  "learning_rate": 0.0003,
  "log_interval": 100,
  "max_grad_norm": 1.0,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 512,
  "output_dir": "C:/Users/wonde/output",
  "pretrained_weights": "Wonder-Griffin/Judge-GPT2",
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_heads": {
    "classifier_head": {
      "params": {
        "num_labels": 5
      },
      "type": "JudgeClassifier"
    },
    "lm_head": {
      "params": {
        "vocab_size": 50257
      },
      "type": "JudgeCasualLMHead"
    },
    "qa_head": {
      "params": {
        "num_labels": 2
      },
      "type": "JudgeWithQA"
    }
  },
  "task_specific_params": {
    "question-answering": {
      "max_answer_length": 100
    },
    "sequence-classification": {
      "eval_steps": 500
    },
    "text-generation": {
      "do_sample": true,
      "max_length": 100
    }
  },
  "tokenizer": {
    "params": {
      "vocab_size": 50257
    },
    "type": "AutoTokenizer"
  },
  "torch_dtype": "float32",
  "total_steps": 10000,
  "transformers_version": "4.43.3",
  "use_cache": true,
  "vocab_size": 30522,
  "warmup_steps": 1000,
  "weight_decay": 0.01
}