from typing import Dict, List, Any import torch from accelerate import Accelerator from transformers import AutoTokenizer, AutoModelForCausalLM import numpy as np def softmax(x): z = x - max(x) numerator = np.exp(z) denominator = np.sum(numerator) softmax = numerator/denominator return softmax class EndpointHandler(): def __init__(self, path=""): self.accelerator = Accelerator() self.device = self.accelerator.device self.model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, device_map="auto") self.model = self.accelerator.prepare(self.model) self.tokenizer = AutoTokenizer.from_pretrained(path) self.options_tokens = [self.tokenizer.encode(choice)[-1] for choice in ["A", "B", "C", "D"]] def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: """ data args: inputs (:obj: `str` | `PIL.Image` | `np.array`) kwargss Return: A :obj:`list` | `dict`: will be serialized and returned """ with torch.no_grad(): prompt = data.pop("prompt") inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device) input_size = inputs['input_ids'].size(1) input_ids = inputs["input_ids"].to(self.device) outputs = self.model(**inputs) last_token_logits = outputs.logits[:, -1, :] options_tokens_logits = last_token_logits[:, self.options_tokens].detach().cpu().numpy() conf = softmax(options_tokens_logits[0]) pred = np.argmax(options_tokens_logits[0]) return [{"pred": pred, "conf":conf}]