Encountering KeyError: 'mistral' on GPUs for Inference

#148
by cratnoc - opened

I'm trying to run inference on the mistral-7b-instruct-v0.2 model using Ray Serve and FastAPI. My serve script is below, and I'm getting the following error.

_call_func_or_gen
    result = callable(*args, **kwargs)
  File "/serve_app/ray_serve_mistral.py", line 48, in __init__
    self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
  File "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 434, in from_pretrained
    config, kwargs = AutoConfig.from_pretrained(
  File "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py", line 829, in from_pretrained
    config_class = CONFIG_MAPPING[config_dict["model_type"]]
  File "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py", line 536, in __getitem__
    raise KeyError(key)
KeyError: 'mistral'

I'm installing the pip dependencies with `RUN pip install -U --no-cache-dir requests torch transformers accelerate uvicorn fastapi`. The versions are as follows:

Python Dependencies:

- `transformers` version: 4.35.0
- Platform: Linux-5.10.219-208.866.amzn2.x86_64-x86_64-with-glibc2.31
- Python version: 3.10.13
- Huggingface_hub version: 0.17.3
- Safetensors version: 0.4.1
- Accelerate version: 0.20.3
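
Since the traceback shows the lookup failing at `CONFIG_MAPPING[config_dict["model_type"]]` in `configuration_auto.py`, one way to rule out an environment mismatch is to check, from inside the same image the Serve replicas run in, which transformers installation actually gets imported and whether it registers the `mistral` model type. A minimal diagnostic sketch (the checks and comments are mine, not part of the original script):

import transformers
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

print(transformers.__version__)     # expected 4.35.0, per the environment listed above
print(transformers.__file__)        # shows which installation is actually being imported
print("mistral" in CONFIG_MAPPING)  # False here would explain the KeyError: 'mistral'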

Serving Script:

import os
import time
from typing import List
import ray
from ray import serve
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import torch
app = FastAPI()
MODEL_NAME = os.getenv("MODEL_NAME", "mistralai/Mistral-7B-Instruct-v0.2")
HF_TOKEN = os.getenv('HUGGING_FACE_HUB_TOKEN')
# Deployment settings for the API ingress using Ray Serve
@serve.deployment(name="mistral-deployment-2", num_replicas=1, route_prefix="/mistral")
@serve.ingress(app)
class APIIngress:
    # Constructor to initialize the API with a model handle
    def __init__(self, mistral_model_handle) -> None:
        self.handle = mistral_model_handle
    # Define a GET endpoint for inference
    @app.get("/infer")
    async def infer(self, request: str):
        # Asynchronously perform inference using the provided sentence and return the result
        result = await self.handle.infer.remote(request)
        return result


@serve.deployment(
    name="mistral-7b",
    ray_actor_options={"num_gpus": 1},
    autoscaling_config={
        "min_replicas": 1,
        "max_replicas": 5,
        "target_num_ongoing_requests_per_replica": 10,
    }
)
class MistralModel:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        login(token=HF_TOKEN)
        self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    def infer(self, request: str):
        text = "[INST]" + request + "[/INST]"
        inputs = self.tokenizer.encode(text, return_tensors="pt").to(self.device)
        # device_map="auto" already places the model weights, so no extra .to(self.device) call is needed
        with torch.inference_mode():
            generated_sequence = self.model.generate(inputs, max_new_tokens=512)
        return [self.tokenizer.decode(seq) for seq in generated_sequence]
# Bind the model to the API ingress to enable endpoint functionality
entrypoint = APIIngress.bind(MistralModel.bind())
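
For reference, here is a minimal client-side call to exercise the endpoint once the graph is deployed. It is only a sketch: the `serve run ray_serve_mistral:entrypoint` launch command and the default Serve HTTP port 8000 are assumptions, while the `/mistral` route prefix and `/infer` path come from the script above.

import requests

# The route is the deployment's route_prefix ("/mistral") plus the FastAPI path ("/infer");
# "request" is sent as a query parameter because the endpoint declares it as a plain str.
resp = requests.get(
    "http://127.0.0.1:8000/mistral/infer",
    params={"request": "What is Ray Serve?"},
)
print(resp.status_code, resp.json())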

`transformers` version: 4.35.0

The latest version of transformers is now 4.43.1. Can you try upgrading to a later version?
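
For example, after rebuilding the image with a newer pin (e.g. `transformers>=4.43.1`), the failing call from the traceback can be re-run directly as a sanity check. This is a sketch and assumes the repo is accessible with the same HUGGING_FACE_HUB_TOKEN handling as in the script above:

import transformers
from transformers import AutoConfig

print(transformers.__version__)  # should now report 4.43.1 or later
# The same call that previously raised KeyError: 'mistral' in configuration_auto.py.
config = AutoConfig.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
print(config.model_type)         # prints "mistral" once the upgrade has taken effect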
