Giving contextual messages to sagemaker instance in python

by bperin42 - opened

Hi, I've successfully deployed this model to SageMaker and have a working endpoint. I'm trying to figure out how to pass context messages in Python. I added the code below, which at least gets a response, but I don't think the endpoint is receiving and parsing the messages correctly: the response contains all of the input messages followed by the generated text, like

    "response": "[{\"role\": \"system\", \"content\": \"You are Amastay, an AI rental concierge.\"}, {\"role\": \"user\", \"content\": \"My name is Brian. I am the user interacting with you.\"}, {\"role\": \"user\", \"content\": \"whats your name\"}] 126.45221 plays in the background as the lights dim slightly, setting a moody ambiance. My user name is Brian, I respond to the listening soundscape as we commence our interaction, \"Hello? You've been sweeping nicely up here in the air, Amastay. What do I do first?\" Amastay: \"Ah, Brian, delighted to meet you. As your AI rental concierge, I've got everything taken care of. Let's get started, shall we"
import os
import json
import boto3
from flask import Blueprint, request, jsonify

# Load environment variables. AWS credentials (access key / secret key) are
# not read here explicitly: boto3 resolves them through its default
# credential chain (environment variables, shared config, instance role).
SAGEMAKER_REGION = os.getenv("SAGEMAKER_REGION", "us-east-1")  # default region

# Initialize boto3 SageMaker client.
# "sagemaker-runtime" is the service that exposes invoke_endpoint(); the plain
# "sagemaker" client only manages resources and cannot run inference.
sagemaker_client = boto3.client("sagemaker-runtime", region_name=SAGEMAKER_REGION)

# Create a Flask Blueprint for the SageMaker query routes
sagemaker_bp = Blueprint("sagemaker_bp", __name__)

@sagemaker_bp.route("/query_model", methods=["POST"])
def query_model():
    """
    Queries the SageMaker model with user input and context.
    Expects a JSON body with 'input' field containing the user message.

    Returns a (response, status) pair:
      * 200 with {"response": <generated text>} on success,
      * 400 with {"error": ...} when no input is provided,
      * 500 with {"error": ...} when the endpoint is not configured or the
        call / response parsing fails.

    NOTE: the chat history is serialized into a single string under "inputs".
    A text-generation endpoint treats that string as a raw prompt, so the
    model continues the JSON text instead of answering the conversation.
    Render the messages with the model's chat template, or send them under a
    "messages" key to an endpoint with the Messages API enabled, to get
    chat-style behavior.
    """
    try:
        # Get the input from the POST request. silent=True yields None instead
        # of raising on a missing/invalid JSON body, so that case becomes a 400.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        user_input = data.get("input")

        if not user_input:
            return jsonify({"error": "No input provided"}), 400

        # NOTE(review): the environment variable name is an assumption; align
        # it with wherever the deployed endpoint name is actually configured.
        endpoint_name = os.getenv("SAGEMAKER_ENDPOINT")
        if not endpoint_name:
            return jsonify({"error": "SageMaker endpoint is not configured"}), 500

        # Define context with system and user messages
        messages = [
            {"role": "system", "content": "You are Amastay, an AI concierge."},
            {
                "role": "user",
                "content": "My name is Brian. I am the user interacting with you.",
            },
            {"role": "user", "content": user_input},
        ]

        # Prepare the payload for the SageMaker model
        payload = {
            "inputs": json.dumps(messages)
        }  # Ensure messages are sent as a JSON string

        # Send the request to the SageMaker endpoint
        response = sagemaker_client.invoke_endpoint(
            EndpointName=endpoint_name,
            ContentType="application/json",
            Body=json.dumps(payload),
        )

        # Decode and parse response. The decoded body is always a str, so it
        # is parsed unconditionally; invalid JSON falls through to the 500
        # handler below.
        response_body = response["Body"].read().decode("utf-8")
        model_response = json.loads(response_body)

        # Handle response format (assumed as list or dict)
        if isinstance(model_response, list):
            if model_response:
                result = model_response[0].get(
                    "generated_text", "No response received."
                )
            else:
                result = "No response received."
        elif isinstance(model_response, dict):
            result = model_response.get("generated_text", "No response received.")
        else:
            result = "Unexpected response format received."

        # Return the result to the client
        return jsonify({"response": result}), 200

    except Exception as e:
        # Top-level boundary of the route: report any failure as a 500.
        return jsonify({"error": str(e)}), 500

I would check this out first:

And I would also recommend using the chat template (which isn't used in the above, but it makes calling LLMs much easier)

# Build the conversation as a list of role/content dicts.
messages = [
    {"role": "system", "content": "You are Amastay, an AI concierge."},
    {
        "role": "user",
        "content": "My name is Brian. I am the user interacting with you.",
    },
    {"role": "user", "content": user_input},
]

# Render the conversation into one prompt string using the model's chat
# template. `tokenizer` is assumed to be the tokenizer of the deployed model
# (e.g. loaded with transformers.AutoTokenizer) -- confirm against your setup.
# add_generation_prompt=True appends the assistant-turn header so the model
# answers as the assistant instead of continuing the last user message.
inputs = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

payload = {"inputs": inputs}

thanks @nbroad
I was actually able to figure out the messages issue by setting MESSAGES_API_ENABLED to true. Full script:

from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Hugging Face Hub token from .env
hugging_face_hub_token = os.getenv("HUGGING_FACE_HUB_TOKEN")

# Ensure the token is set properly
if hugging_face_hub_token is None:
    raise ValueError(
        "You must provide a valid Hugging Face Hub token in the .env file."
    )

# Hub Model configuration
hub = {
    "HF_MODEL_ID": "meta-llama/Llama-3.2-3B-Instruct",
    "SM_NUM_GPUS": "1",  # Number of GPUs to use
    "HUGGING_FACE_HUB_TOKEN": hugging_face_hub_token,
    # Lets the endpoint accept a chat-style {"messages": [...]} payload and
    # apply the model's chat template itself, instead of a raw "inputs" prompt.
    "MESSAGES_API_ENABLED": "true",
}

# Get the image URI for the model
image_uri = get_huggingface_llm_image_uri("huggingface", version="2.2.0")

# Use the correct SageMaker execution role ARN
role_arn = "xxx "

# Create Hugging Face Model Class
huggingface_model = HuggingFaceModel(image_uri=image_uri, env=hub, role=role_arn)

# Update instance type based on GPU requirements
# NOTE(review): the instance type, count and startup timeout below are
# placeholder values -- pick ones that match the model size and your quota.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    container_startup_health_check_timeout=300,
)

# Save the deployed endpoint name for future use
endpoint_name = predictor.endpoint_name
print(f"Deployed endpoint: {endpoint_name}")

# Send request to the deployed model endpoint
response = predictor.predict(
    {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is deep learning?"},
        ]
    }
)

# Clean up the endpoint when not needed (uncomment this line to delete the endpoint)
# predictor.delete_endpoint()

Sign up or log in to comment