# Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Union import bitsandbytes as bnb import torch from accelerate import Accelerator, DistributedDataParallelKwargs from datasets import load_dataset from peft import LoraConfig, prepare_model_for_kbit_training from torch.utils.data import DataLoader from tqdm import tqdm from transformers import ( AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, PreTrainedTokenizerBase, pipeline, ) import wandb # from transformers.trainer_utils import get_last_checkpoint from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed from trl.core import LengthSampler from trl.models.modeling_value_adapter import AutoModelForCausalLMWithValueAdapter # import copy # from torch_ema import ExponentialMovingAverage # from transforers import pipeline tqdm.pandas() @dataclass class ScriptArguments: """ The name of the Casual LM model we wish to fine with PPO """ model_name: Optional[str] = field(default="", metadata={"help": "the model name"}) reward_adapter_name: Optional[str] = field(default="", metadata={"help": "the reward model name"}) # tokenizer_name: Optional[str] = field(default=None, metadata={"help": "the tokenizer name"}) dataset_name: Optional[str] = field( default="CarperAI/openai_summarize_tldr", metadata={"help": "the dataset name"} ) train_split: Optional[str] = field( default="train", metadata={"help": "the dataset split to evaluate on; default to 'none' (no evaluation)"} ) eval_split: Optional[str] = field( default="train", metadata={"help": "the dataset split to evaluate on; default to 'none' (no evaluation)"} ) log_with: Optional[str] = field(default="wandb", metadata={"help": "use 'wandb' to log with wandb"}) learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"}) mini_batch_size: Optional[int] = field(default=1, metadata={"help": "the PPO minibatch size"}) batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"}) ppo_epochs: Optional[int] = field(default=4, metadata={"help": "the number of ppo epochs"}) gradient_accumulation_steps: Optional[int] = field( default=4, metadata={"help": "the number of gradient accumulation steps"} ) adafactor: Optional[bool] = field(default=False, metadata={"help": "whether to use the adafactor optimizer"}) early_stopping: Optional[bool] = field(default=False, metadata={"help": "whether to early stop"}) target_kl: Optional[float] = field(default=0.1, metadata={"help": "kl target for early stopping"}) reward_baseline: Optional[float] = field( default=0.0, metadata={"help": "a baseline value that is subtracted from the reward"}, ) batched_gen: Optional[bool] = field(default=False, metadata={"help": "whether to use the batched text gen"}) save_steps: Optional[int] = field(default=1000, metadata={"help": "the number of steps to save at"}) save_strategy: Optional[str] = field(default="steps") output_dir: Optional[str] = field(default="runs/", metadata={"help": "n steps to save the model"}) seed: Optional[int] = field(default=0, metadata={"help": "the seed"}) steps: Optional[int] = field(default=20000, metadata={"help": "number of epochs"}) init_kl_coef: Optional[float] = field( default=0.2, metadata={"help": "Initial KL penalty coefficient (used for adaptive and linear control)"}, ) adap_kl_ctrl: Optional[bool] = field(default=True, metadata={"help": "Use adaptive KL control, otherwise linear"}) value_adapter: Optional[bool] = field(default=False) separate_reward_model: Optional[str] = field(default=None, metadata={"help": "the reward model name"}) # Generation output_min_length: Optional[int] = field(default=24, metadata={"help": "the batch size"}) output_max_length: Optional[int] = field(default=48, metadata={"help": "the batch size"}) input_max_length: Optional[int] = field(default=512, metadata={"help": "maximum length for generation"}) # Quantization load_in_8bit: Optional[bool] = field(default=False, metadata={"help": "load the model in 8 bits precision"}) load_in_4bit: Optional[bool] = field(default=False, metadata={"help": "load the model in 4 bits precision"}) bf16: Optional[bool] = field( default=False, metadata={ "help": "This essentially cuts the training time in half if you want to sacrifice a little precision and have a supported GPU." }, ) fp16: Optional[bool] = field( default=False, metadata={ "help": "This essentially cuts the training time in half if you want to sacrifice a little precision and have a supported GPU." }, ) # LoRA use_lora: Optional[bool] = field( default=True, ) lora_alpha: Optional[float] = field(default=32, metadata={"help": "the lora alpha parameter"}) lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"}) lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"}) lora_all_linear: Optional[bool] = field(default=False, metadata={"help": "lora adapter on all linear layers"}) # Gold Model eval_steps: Optional[int] = field(default=None) gold_model_name: Optional[str] = field(default=None, metadata={"help": "the reward model name"}) gold_in_8bit: Optional[bool] = field(default=False, metadata={"help": "gold the model in 8 bits precision"}) gold_in_4bit: Optional[bool] = field(default=False, metadata={"help": "gold the model in 4 bits precision"}) gold_bf16: Optional[bool] = field( default=False, ) gold_fp16: Optional[bool] = field( default=False, ) gold_eval_greedy: Optional[bool] = field(default=True) # # EMA stuff # ema_decay: Optional[float] = field(default=0.995, metadata={"help": "the ema decay rate"}) # reset_freq: Optional[int] = field(default=None, metadata={"help": "reset every n epochs"}) input_ids_input: Optional[bool] = field( default=False, ) strip_prompt: Optional[bool] = field( default=False, ) just_eval: Optional[bool] = field(default=False) @dataclass class PromptCollator: tokenizer: PreTrainedTokenizerBase padding: Union[bool, str] = True max_prompt_length: Optional[int] = None prompt_field: str = "prompt" return_tensors: str = "pt" def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: prompts = [feat[self.prompt_field] for feat in features] original_side = self.tokenizer.padding_side self.tokenizer.padding_side = "left" tokenized_batch = self.tokenizer( prompts, truncation=True, padding=True, max_length=self.max_prompt_length, return_tensors=self.return_tensors, ) tokenized_batch["prompt"] = prompts self.tokenizer.padding_side = original_side return tokenized_batch def find_all_linear_names(args, model): cls = bnb.nn.Linear4bit if args.load_in_4bit else (bnb.nn.Linear8bitLt if args.load_in_8bit else torch.nn.Linear) lora_module_names = set() for name, module in model.named_modules(): if isinstance(module, cls): names = name.split(".") lora_module_names.add(names[0] if len(names) == 1 else names[-1]) if "lm_head" in lora_module_names: # needed for 16-bit lora_module_names.remove("lm_head") if "score" in lora_module_names: # needed for 16-bit lora_module_names.remove("score") return list(lora_module_names) def create_and_prepare_model(args): if args.load_in_8bit and args.load_in_4bit: raise ValueError("You can't load the model in 8 bits and 4 bits at the same time") elif args.load_in_8bit or args.load_in_4bit: quantization_config = BitsAndBytesConfig(load_in_8bit=args.load_in_8bit, load_in_4bit=args.load_in_4bit) device_map = {"": Accelerator().local_process_index} else: device_map = None quantization_config = None if args.bf16: torch_dtype = torch.bfloat16 # elif args.fp16: # torch_dtype = torch.float16 else: torch_dtype = torch.float32 if script_args.value_adapter: model_cls = AutoModelForCausalLMWithValueAdapter else: model_cls = AutoModelForCausalLMWithValueHead if args.use_lora: # we add `score` to the list of modules to save to # correctly save the score head. # if args.pretrained_adapter is not None: # model = PeftModel.from_pretrained(model, args.pretrained_adapter) # else: if args.lora_all_linear: # hardcoded pythia # target_modules = find_all_linear_names(args, model) target_modules = ["dense_h_to_4h", "dense_4h_to_h", "query_key_value", "dense"] else: target_modules = None peft_config = LoraConfig( r=args.lora_r, lora_alpha=args.lora_alpha, lora_dropout=args.lora_dropout, bias="none", task_type="CAUSAL_LM", target_modules=target_modules, modules_to_save=["score"], ) # model = get_peft_model(model, peft_config) # TODO check # modules_to_save = ["score"] # for key, _ in model.named_modules(): # target_module_found = any(key.endswith(target_key) for target_key in modules_to_save) # if target_module_found: # model.get_submodule(key + ".original_module").requires_grad_(False) # # if torch_dtype == torch.bfloat16: # for name, module in model.named_modules(): # if isinstance(module, LoraLayer): # module = module.to(torch_dtype) # if "norm" in name: # module = module.to(torch.float32) # if "score" in name or "embed_tokens" in name: # if hasattr(module, "weight") and module.weight.dtype == torch.float32: # module = module.to(torch_dtype) else: peft_config = None model = model_cls.from_pretrained( args.model_name, quantization_config=quantization_config, device_map=device_map, torch_dtype=torch_dtype, peft_config=peft_config, reward_adapter=script_args.reward_adapter_name, ) # if script_args.ignore_bias_buffers: # torch distributed hack if quantization_config is not None: model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=args.gradient_checkpointing) args.gradient_checkpointing = False model.config.torch_dtype = torch_dtype # model.config.use_cache = not args.gradient_checkpointing tokenizer = AutoTokenizer.from_pretrained(script_args.model_name) if getattr(tokenizer, "pad_token", None) is None: tokenizer.pad_token = tokenizer.eos_token if getattr(model.config, "pad_token_id", None) is None: model.config.pad_token_id = model.config.eos_token_id model.eval() return model, tokenizer def create_and_prepare_dataset(args, tokenizer, split, num_proc=2): dataset = load_dataset(args.dataset_name, split=split) def strip_prompt(examples): examples["prompt"] = [prompt.strip() for prompt in examples["prompt"]] return examples if args.strip_prompt: dataset = dataset.map(strip_prompt, batched=True) dataset = dataset.rename_column("prompt", "query") original_columns = dataset.column_names original_columns.remove("query") dataset = dataset.map( tokenizer, batched=True, num_proc=num_proc, input_columns="query", remove_columns=original_columns, fn_kwargs=dict(truncation=True, max_length=args.input_max_length), ) dataset.set_format("torch") return dataset def collator(data): return dict((key, [d[key] for d in data]) for key in data[0]) def decode_and_encode(output_token_ids: List[torch.Tensor], tokenizer, max_length, de_and_retokenize=True): if de_and_retokenize: texts = [q + r for q, r in zip(batch["query"], batch["response"])] output_encoding = tokenizer( texts, padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False, max_length=max_length, ).to(ppo_trainer.accelerator.device) else: default_padding_side = tokenizer.padding_side tokenizer.padding_side = "left" full_response_mask = [torch.ones_like(element) for element in output_token_ids] full_response_encoding = {"input_ids": output_token_ids, "attention_mask": full_response_mask} output_encoding = tokenizer.pad( full_response_encoding, padding=True, max_length=max_length, return_tensors="pt", ) tokenizer.padding_side = default_padding_side return output_encoding def create_and_prepare_gold_model(script_args, accelerator): if script_args.gold_in_8bit or script_args.gold_in_4bit: gold_quantization_config = BitsAndBytesConfig( load_in_8bit=script_args.gold_in_8bit, load_in_4bit=script_args.gold_in_4bit ) gold_device_map = {"": accelerator.local_process_index} else: gold_device_map = None gold_quantization_config = None if script_args.gold_bf16: torch_dtype = torch.bfloat16 elif script_args.gold_fp16: torch_dtype = torch.float16 else: torch_dtype = torch.float32 gold_model = AutoModelForSequenceClassification.from_pretrained( script_args.gold_model_name, quantization_config=gold_quantization_config, torch_dtype=torch_dtype, device_map=gold_device_map, ) if getattr(gold_model.config, "pad_token_id", None) is None: gold_model.config.pad_token_id = gold_model.config.eos_token_id gold_model = accelerator.prepare(gold_model) gold_model.eval() return gold_model def create_and_prepare_eval(args, tokenizer, accelerator): dataset = load_dataset(args.dataset_name, split=args.eval_split) def strip_prompt(examples): examples["prompt"] = [prompt.strip() for prompt in examples["prompt"]] return examples if args.strip_prompt: dataset = dataset.map(strip_prompt, batched=True) # data_collator = PromptCollator( # tokenizer, # max_prompt_length=args.input_max_length, # prompt_field="prompt", # ) dataloader = DataLoader(dataset, batch_size=args.batch_size) return accelerator.prepare(dataloader) def get_batch_samples( accelerator, model, tokenizer, input_ids, attention_mask, return_ids=False, generation_config=None ): policy_output = model.generate( input_ids=input_ids, attention_mask=attention_mask, generation_config=generation_config, ) # if self.ref_model is None: with accelerator.unwrap_model(model).disable_adapter(): reference_output = model.generate( input_ids=input_ids, attention_mask=attention_mask, generation_config=generation_config, ) # else: # reference_output = self.ref_model.generate( # **inputs, # generation_config=self.generation_config, # ) policy_output = pad_to_length(policy_output, self.max_length, tokenizer.pad_token_id) policy_output_decoded = tokenizer.batch_decode(policy_output, skip_special_tokens=True) reference_output = pad_to_length(reference_output, self.max_length, tokenizer.pad_token_id) reference_output_decoded = tokenizer.batch_decode(reference_output, skip_special_tokens=True) if return_ids: return policy_output_decoded, reference_output_decoded, policy_output else: return policy_output_decoded, reference_output_decoded def gold_eval(dataloader, model, gold_model, accelerator, epoch, log_n_samples_during_eval=0): samples_to_log = [] gold_reward_sum = 0.0 total_samples = 0 greedy_generation_kwargs = { "min_length": -1, "top_p": 1.0, "do_sample": False, "pad_token_id": tokenizer.pad_token_id, "eos_token_id": tokenizer.eos_token_id, "max_new_tokens": script_args.output_max_length, } for batch in tqdm( dataloader, disable=not ppo_trainer.accelerator.is_local_main_process, desc="Gold Eval", ): import pdb pdb.set_trace() full_response_tensors = ppo_trainer.generate( batch["input_ids"], return_prompt=True, **greedy_generation_kwargs, ) response_tensors = [] for question, full_response in zip(question_tensors, full_response_tensors): response_tensors.append(full_response[len(question) :]) batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True) texts = [q + r for q, r in zip(batch["prompt"], batch["response"])] import pdb pdb.set_trace() policy_output = tokenizer( texts, padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False ).to(ppo_trainer.accelerator.device) # gold reward with torch.no_grad(): gold_rewards = gold_model( input_ids=policy_output["input_ids"], attention_mask=policy_output["attention_mask"] )[0] gold_rewards = accelerator.gather_for_metrics(gold_rewards) if accelerator.is_local_main_process(): gold_reward_sum += gold_rewards.sum().item() total_samples += gold_rewards.size(0) for i, (prompt, resp) in enumerate(zip(batch["prompt"], batch["response"])): if len(samples_to_log) < log_n_samples_during_eval: samples_to_log.append([prompt, resp]) else: break if accelerator.is_local_main_process(): print(f"gold reward mean {gold_reward_sum / total_samples}") gold_log = { "eval/gold_rewards_mean": gold_reward_sum / total_samples, } gold_log["epoch"] = epoch if samples_to_log: gold_log["game_log"] = ( wandb.Table( columns=["Prompt", "Policy", "Ref Model"], rows=samples_to_log, ), ) accelerator.log(gold_log) return gold_reward_sum / total_samples, samples_to_log if __name__ == "__main__": parser = HfArgumentParser(ScriptArguments) script_args: ScriptArguments = parser.parse_args_into_dataclasses()[0] config = PPOConfig( steps=script_args.steps, model_name=script_args.model_name, learning_rate=script_args.learning_rate, log_with=script_args.log_with, batch_size=script_args.batch_size, mini_batch_size=script_args.mini_batch_size, gradient_accumulation_steps=script_args.gradient_accumulation_steps, optimize_cuda_cache=True, early_stopping=script_args.early_stopping, target_kl=script_args.target_kl, ppo_epochs=script_args.ppo_epochs, seed=script_args.seed, init_kl_coef=script_args.init_kl_coef, adap_kl_ctrl=script_args.adap_kl_ctrl, accelerator_kwargs={"kwargs_handlers": [DistributedDataParallelKwargs(find_unused_parameters=False)]}, ) # set seed before initializing value head for deterministic eval set_seed(config.seed) model, tokenizer = create_and_prepare_model(script_args) train_dataset = create_and_prepare_dataset(script_args, tokenizer, script_args.train_split) # We then build the PPOTrainer, passing the model, the reference model, the tokenizer ppo_trainer = PPOTrainer( config, model, ref_model=None, tokenizer=tokenizer, dataset=train_dataset, data_collator=collator, ) # Gold Model Eval if script_args.gold_model_name is not None: gold_model = create_and_prepare_gold_model(script_args, ppo_trainer.accelerator) eval_dataloader = create_and_prepare_eval(script_args, tokenizer, ppo_trainer.accelerator) if script_args.just_eval: gold_eval( eval_dataloader, ppo_trainer.model, gold_model, ppo_trainer.accelerator, epoch=0, log_n_samples_during_eval=0, ) exit() if script_args.separate_reward_model: device = ppo_trainer.accelerator.device if ppo_trainer.accelerator.num_processes == 1: device = 0 if torch.cuda.is_available() else "cpu" # to avoid a ` pipeline` bug sentiment_pipe = pipeline( "sentiment-analysis", model=script_args.separate_reward_model, device_map={"": Accelerator().local_process_index}, model_kwargs={"load_in_8bit": True}, tokenizer=tokenizer, return_token_type_ids=False, ) sent_kwargs = { "return_all_scores": True, "function_to_apply": "none", "batch_size": 16, "truncation": True, } # We then define the arguments to pass to the `generate` function. These arguments # are passed to the `generate` function of the PPOTrainer, which is a wrapper around # the `generate` function of the trained model. generation_kwargs = { "min_length": -1, "top_k": 0.0, "top_p": 1.0, "do_sample": True, "pad_token_id": tokenizer.pad_token_id, "eos_token_id": tokenizer.eos_token_id, } output_length_sampler = LengthSampler(script_args.output_min_length, script_args.output_max_length) for epoch, batch in tqdm( enumerate(ppo_trainer.dataloader), total=config.total_ppo_epochs, disable=not ppo_trainer.accelerator.is_local_main_process, ): if epoch >= config.total_ppo_epochs: break question_tensors = batch["input_ids"] full_response_tensors = ppo_trainer.generate( question_tensors, return_prompt=True, length_sampler=output_length_sampler, **generation_kwargs, ) response_tensors = [] for question, full_response in zip(question_tensors, full_response_tensors): response_tensors.append(full_response[len(question) :]) batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True) # policy_output_encoding = create_encoding_from_output() # # Compute sentiment score # if script_args.input_ids_input: # max_length = script_args.input_max_length + script_args.output_max_length # default_padding_side = tokenizer.padding_side # tokenizer.padding_side = "left" # full_response_mask = [torch.ones_like(element) for element in full_response_tensors] # full_response_encoding = {"input_ids": full_response_tensors, "attention_mask": full_response_mask} # policy_output = tokenizer.pad( # full_response_encoding, # padding="max_length", # max_length=max_length, # return_tensors="pt", # ) # tokenizer.padding_side = default_padding_side # else: texts = [q + r for q, r in zip(batch["query"], batch["response"])] policy_output = tokenizer( texts, padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False ).to(ppo_trainer.accelerator.device) # if script_args.separate_reward_model: # pipe_outputs = sentiment_pipe(texts, **sent_kwargs) # raw_rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs] # else: raw_rewards = ppo_trainer.compute_reward_model_score(**policy_output) rewards = [(raw_rewards[i] - script_args.reward_baseline) for i in range(len(raw_rewards))] # Run PPO step if not script_args.just_eval: stats = ppo_trainer.step(question_tensors, response_tensors, rewards) else: stats = {} if script_args.eval_steps is not None and epoch % script_args.eval_steps == 0: if script_args.gold_eval_greedy: greedy_generation_kwargs = { "min_length": -1, "top_p": 1.0, "do_sample": False, "pad_token_id": tokenizer.pad_token_id, "eos_token_id": tokenizer.eos_token_id, "max_new_tokens": script_args.output_max_length, } greedy_output = ppo_trainer.generate( question_tensors, return_prompt=True, **greedy_generation_kwargs, ) max_length = script_args.input_max_length + script_args.output_max_length policy_output = tokenizer.batch_decode(greedy_output, skip_special_tokens=True) with torch.no_grad(): gold_rewards = gold_model(**policy_output)[0] else: gold_rewards = None stats["epoch"] = epoch ppo_trainer.log_stats(stats, batch, rewards, gold_rewards) # ppo_trainer.accelerator.print(stats) if script_args.save_strategy != "no" and epoch > 0 and epoch % script_args.save_steps == 0: ppo_trainer.save_pretrained(script_args.output_dir + f"step_{epoch}")