open-moe-llm-leaderboard / src /backend /huggingface_generate_until.py
future-xy
formatting code
d6d7ec6
raw
history blame
2.16 kB
from typing import List, Literal, Optional, Tuple, Union
import torch
import transformers
from lm_eval.models.huggingface import HFLM
from lm_eval.api.registry import register_model
@register_model("hf-chat")
class HFLMwithChatTemplate(HFLM):
def __init__(self, use_chat_template=True, **kwargs):
super().__init__(**kwargs)
self.use_chat_template = use_chat_template
def tok_batch_encode(
self,
strings: List[str],
padding_side: str = "left",
left_truncate_len: int = None,
truncation: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
if self.use_chat_template:
try:
updated_strings = []
for input_string in strings:
messages = [
{"role": "user", "content": f"{input_string}"},
]
updated_string = self.tokenizer.apply_chat_template(messages, tokenize=False)
updated_strings.append(updated_string)
strings = updated_strings[:]
except:
print(f"failed to update input string with chat template: {self._model}")
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
self.tokenizer.padding_side = padding_side
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = False
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
add_special_tokens = True
encoding = self.tokenizer(
strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
add_special_tokens=add_special_tokens,
)
if left_truncate_len:
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
encoding["attention_mask"] = encoding["attention_mask"][:, -left_truncate_len:]
self.tokenizer.padding_side = old_padding_side
return encoding["input_ids"], encoding["attention_mask"]