The ML.ENERGY Colosseum (#22)

Co-authored-by: AmberLJC <[email protected]>
- .gitignore +9 -0
- README.md +7 -2
- app.py +288 -15
- Dockerfile → deployment/benchmark.Dockerfile +0 -0
- deployment/controller-container.sh +11 -0
- deployment/controller.Dockerfile +30 -0
- deployment/docker-compose-0.yaml +74 -0
- deployment/docker-compose-1.yaml +40 -0
- docs/colosseum_bottom.md +14 -0
- docs/colosseum_top.md +8 -0
- LEADERBOARD.md → docs/leaderboard.md +2 -2
- requirements.txt +1 -2
- setup.py +21 -0
- spitfight/__init__.py +0 -0
- spitfight/colosseum/__init__.py +0 -0
- spitfight/colosseum/client.py +106 -0
- spitfight/colosseum/common.py +35 -0
- spitfight/colosseum/controller/__init__.py +0 -0
- spitfight/colosseum/controller/controller.py +266 -0
- spitfight/colosseum/controller/router.py +125 -0
- spitfight/colosseum/controller/worker.py +151 -0
- spitfight/log.py +76 -0
- spitfight/prompt.py +69 -0
- spitfight/utils.py +305 -0
.gitignore
CHANGED
@@ -7,3 +7,12 @@
 # Editor
 pyrightconfig.json
 .idea
+
+# Python
+*.egg-info
+**/__pycache__
+build/
+
+# Data files
+*.log
+pegasus/consumed.yaml
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: "⚡"
 python_version: "3.9"
 app_file: "app.py"
 sdk: "gradio"
-sdk_version: "3.
+sdk_version: "3.39.0"
 pinned: true
 tags: ["energy", "leaderboard"]
 colorFrom: "black"
@@ -22,7 +22,12 @@ How much energy do LLMs consume?
 This README focuses on explaining how to run the benchmark yourself.
 The actual leaderboard is here: https://ml.energy/leaderboard.
 
-##
+## Colosseum
+
+We instrumented [Hugging Face TGI](https://github.com/huggingface/text-generation-inference) so that it measures and returns GPU energy consumption.
+Then, our [controller](/spitfight/colosseum/controller) server receives user prompts from the [Gradio app](/app.py), selects two models randomly, and streams model responses back with energy consumption.
+
+## Setup for benchmarking
 
 ### Model weights
 
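To make that flow concrete, here is a minimal sketch (not part of the commit) of one Colosseum round driven through the ControllerClient added in spitfight/colosseum/client.py below. The controller address and prompt are placeholders; app.py itself reads the address from the COLOSSEUM_CONTROLLER_ADDR environment variable.

# Sketch only: one Colosseum round, driven directly from Python.
from spitfight.colosseum.client import ControllerClient

# Placeholder address; the deployment script maps the controller to port 7778.
client = ControllerClient(controller_addr="localhost:7778", timeout=15).fork()

# Stream both anonymous models' responses to the same prompt.
response_a = "".join(client.prompt(prompt="Say hi in one sentence.", index=0))
response_b = "".join(client.prompt(prompt="Say hi in one sentence.", index=1))

# Vote that model A's response was better; the controller then reveals
# model names and per-response GPU energy consumption.
vote = client.response_vote(victory_index=0)
print(vote.model_names, vote.energy_consumptions)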
app.py
CHANGED
@@ -5,6 +5,9 @@ import yaml
 import requests
 import itertools
 import contextlib
+import argparse
+import os
+from typing import Literal
 from dateutil import parser, tz
 
 import numpy as np
@@ -13,9 +16,10 @@ import pandas as pd
 import plotly.io as pio
 import plotly.express as px
 from pandas.api.types import is_numeric_dtype, is_float_dtype
-
 pio.templates.default = "plotly_white"
 
+from spitfight.colosseum.client import ControllerClient
+
 
 class TableManager:
     def __init__(self, data_dir: str) -> None:
@@ -215,7 +219,6 @@
 
         return fig, width, height, ""
 
-
 # The global instance of the TableManager should only be used when
 # initializing components in the Gradio interface. If the global instance
 # is mutated while handling user sessions, the change will be reflected
@@ -280,7 +283,7 @@ function format_model_link() {{
 """
 
 # Custom CSS.
-
+custom_css = """
 /* Make ML.ENERGY look like a clickable logo. */
 .text-logo {
     color: #23d175 !important;
@@ -311,6 +314,14 @@ table th:first-child {
 .tab-nav > button {
     font-size: 18px !important;
 }
+
+/* Color texts. */
+.green-text {
+    color: #23d175 !important;
+}
+.red-text {
+    color: #ff3860 !important;
+}
 """
 
 intro_text = """
@@ -324,13 +335,262 @@ including the ARC Challenge (reasoning), HellaSwag (common sense), and TruthfulQA
 Every benchmark is limited in some sense -- Before you interpret the results, please take a look at the <b>Limitations</b> section there, too.</p>
 """
 
-
-
+# The app will not start without a controller address set.
+controller_addr = os.environ["COLOSSEUM_CONTROLLER_ADDR"]
+global_controller_client = ControllerClient(controller_addr=controller_addr, timeout=15)
+
+ANONYMOUS_MODEL_TEXT = "## Anonymous 🤫"
+
+# Colosseum helper functions.
+def enable_interact():
+    return [gr.update(interactive=True)] * 2
+
+def disable_interact():
+    return [gr.update(interactive=False)] * 2
+
+def consumed_less_energy_message(energy_a, energy_b):
+    """Return a message that indicates that the user chose the model that consumed less energy.
+
+    By default report in "%f %" but if the difference is larger than 2 times, report in "%f X".
+    """
+    less_energy = min(energy_a, energy_b)
+    more_energy = max(energy_a, energy_b)
+    factor = less_energy / more_energy
+    if factor <= 0.5:
+        message = f"<h2>That response also <span class='green-text'>consumed {1/factor:.1f}X less energy</span>!</h2>"
+    else:
+        message = f"<h2>That response also <span class='green-text'>consumed {100 - factor * 100:.1f}% less energy</span>!</h2>"
+    return message
+
+def consumed_more_energy_message(energy_a, energy_b):
+    """Return a message that indicates that the user chose the model that consumed more energy.
+
+    By default report in "%f %" but if the difference is larger than 2 times, report in "%f X".
+    """
+    less_energy = min(energy_a, energy_b)
+    more_energy = max(energy_a, energy_b)
+    factor = more_energy / less_energy
+    if factor >= 2.0:
+        message = f"<h2>That response <span class='red-text'>consumed {factor:.1f}x more energy</span>.</h2>"
+    else:
+        message = f"<h2>That response <span class='red-text'>consumed {factor * 100 - 100:.1f}% more energy</span>.</h2>"
+    return message
+
+# Colosseum event handlers
+def add_prompt_disable_submit(prompt, history_a, history_b):
+    """Add the user's prompt to the two model's history and disable the submit button."""
+    client = global_controller_client.fork()
+    return [
+        gr.Textbox.update(value=" ", interactive=False),
+        gr.Button.update(interactive=False),
+        history_a + [[prompt, ""]],
+        history_b + [[prompt, ""]],
+        client,
+    ]
+
+def generate_responses(client: ControllerClient, history_a, history_b):
+    """Generate responses for the two models."""
+    for resp_a, resp_b in itertools.zip_longest(
+        client.prompt(prompt=history_a[-1][0], index=0),
+        client.prompt(prompt=history_b[-1][0], index=1),
+    ):
+        if resp_a is not None:
+            history_a[-1][1] += resp_a
+        if resp_b is not None:
+            history_b[-1][1] += resp_b
+        yield [history_a, history_b]
+
+def make_resp_vote_func(victory_index: Literal[0, 1]):
+    """Return a function that will be called when the user clicks on response preference vote buttons."""
+    def resp_vote_func(client: ControllerClient):
+        vote_response = client.response_vote(victory_index=victory_index)
+        model_name_a, model_name_b = map(lambda n: f"## {n}", vote_response.model_names)
+        energy_a, energy_b = vote_response.energy_consumptions
+        # User liked the model that also consumed less energy.
+        if (victory_index == 0 and energy_a <= energy_b) or (victory_index == 1 and energy_a >= energy_b):
+            energy_message = consumed_less_energy_message(energy_a, energy_b)
+            return [
+                # Disable response vote buttons
+                gr.Button.update(interactive=False), gr.Button.update(interactive=False),
+                # Reveal model names
+                gr.Markdown.update(model_name_a), gr.Markdown.update(model_name_b),
+                # Display energy consumption comparison message
+                gr.Markdown.update(energy_message, visible=True),
+                # Keep energy vote buttons hidden
+                gr.Button.update(visible=False, interactive=False), gr.Button.update(visible=False, interactive=False),
+                # Enable reset button
+                gr.Button.update(visible=True, interactive=True),
+            ]
+        # User liked the model that consumed more energy.
+        else:
+            energy_message = consumed_more_energy_message(energy_a, energy_b)
+            return [
+                # Disable response vote buttons
+                gr.Button.update(interactive=False), gr.Button.update(interactive=False),
+                # Leave model names hidden
+                gr.Markdown.update(ANONYMOUS_MODEL_TEXT), gr.Markdown.update(ANONYMOUS_MODEL_TEXT),
+                # Display energy consumption comparison message
+                gr.Markdown.update(energy_message, visible=True),
+                # Reveal and enable energy vote buttons
+                gr.Button.update(visible=True, interactive=True), gr.Button.update(visible=True, interactive=True),
+                # Keep the reset button disabled
+                gr.Button.update(visible=False, interactive=False),
+            ]
+    return resp_vote_func
+
+def make_energy_vote_func(is_worth: bool):
+    """Return a function that will be called when the user clicks on energy vote buttons."""
+    def energy_vote_func(client: ControllerClient, energy_message: str):
+        vote_response = client.energy_vote(is_worth=is_worth)
+        model_name_a, model_name_b = map(lambda n: f"## {n}", vote_response.model_names)
+        return [
+            # Reveal model names
+            gr.Markdown.update(model_name_a), gr.Markdown.update(model_name_b),
+            # Disable energy vote buttons
+            gr.Button.update(interactive=False), gr.Button.update(interactive=False),
+            # Enable reset button
+            gr.Button.update(interactive=True, visible=True),
+            # Append to the energy comparison message
+            energy_message[:-5] + (" Fair enough.</h2>" if is_worth else " Wasn't worth it.</h2>"),
+        ]
+    return energy_vote_func
+
+def play_again():
+    return [
+        # Clear chatbot history
+        None, None,
+        # Turn on prompt textbox and submit button
+        gr.Textbox.update(value="", interactive=True), gr.Button.update(interactive=True),
+        # Mask model names
+        gr.Markdown.update(ANONYMOUS_MODEL_TEXT),
+        gr.Markdown.update(ANONYMOUS_MODEL_TEXT),
+        # Hide energy vote buttons and message
+        gr.Button.update(visible=False), gr.Button.update(visible=False), gr.Markdown.update(visible=False),
+        # Disable reset button
+        gr.Button.update(interactive=False, visible=False),
+    ]
+
+focus_prompt_input_js = """
+function() {
+    for (let textarea of document.getElementsByTagName("textarea")) {
+        if (textarea.hasAttribute("autofocus")) {
+            textarea.focus();
+            return;
+        }
+    }
+}
+"""
+
+with gr.Blocks(css=custom_css) as block:
     tbm = gr.State(global_tbm)  # type: ignore
     with gr.Box():
         gr.HTML("<h1><a href='https://ml.energy' class='text-logo'>ML.ENERGY</a> Leaderboard</h1>")
 
     with gr.Tabs():
+        # Tab: Colosseum.
+        with gr.TabItem("Colosseum ⚔️️"):
+            gr.Markdown(open("docs/colosseum_top.md").read())
+
+            with gr.Group():
+                with gr.Row():
+                    prompt_input = gr.Textbox(
+                        show_label=False,
+                        placeholder="Type your prompt and press ENTER",
+                        autofocus=True,
+                        container=False,
+                        scale=20,
+                        elem_id="prompt-textarea",
+                    )
+                    prompt_submit_btn = gr.Button(
+                        value="⚔️️ Fight!",
+                        elem_classes=["btn-submit"],
+                        min_width=60,
+                        scale=1,
+                    )
+
+            with gr.Row():
+                masked_model_names = []
+                chatbots = []
+                resp_vote_btn_list: list[gr.component.Component] = []
+                with gr.Column():
+                    with gr.Row():
+                        masked_model_names.append(gr.Markdown(ANONYMOUS_MODEL_TEXT))
+                    with gr.Row():
+                        chatbots.append(gr.Chatbot(label="Model A", elem_id="chatbot", height=600))
+                    with gr.Row():
+                        left_resp_vote_btn = gr.Button(value="👈 Model A is better", interactive=False)
+                        resp_vote_btn_list.append(left_resp_vote_btn)
+
+                with gr.Column():
+                    with gr.Row():
+                        masked_model_names.append(gr.Markdown(ANONYMOUS_MODEL_TEXT))
+                    with gr.Row():
+                        chatbots.append(gr.Chatbot(label="Model B", elem_id="chatbot", height=600))
+                    with gr.Row():
+                        right_resp_vote_btn = gr.Button(value="👉 Model B is better", interactive=False)
+                        resp_vote_btn_list.append(right_resp_vote_btn)
+
+            with gr.Row():
+                energy_comparison_message = gr.HTML(visible=False)
+
+            with gr.Row():
+                worth_energy_vote_btn = gr.Button(value="The better response was worth the extra energy.", visible=False)
+                notworth_energy_vote_btn = gr.Button(value="Not really worth it.", visible=False)
+                energy_vote_btn_list: list[gr.component.Component] = [worth_energy_vote_btn, notworth_energy_vote_btn]
+
+            with gr.Row():
+                play_again_btn = gr.Button("Play again!", visible=False)
+
+            gr.Markdown(open("docs/colosseum_bottom.md").read())
+
+            controller_client = gr.State()
+
+
+            (prompt_input
+                .submit(add_prompt_disable_submit, [prompt_input, *chatbots], [prompt_input, prompt_submit_btn, *chatbots, controller_client], queue=False)
+                .then(generate_responses, [controller_client, *chatbots], [*chatbots], queue=True)
+                .then(enable_interact, None, resp_vote_btn_list, queue=False))
+            (prompt_submit_btn
+                .click(add_prompt_disable_submit, [prompt_input, *chatbots], [prompt_input, prompt_submit_btn, *chatbots, controller_client], queue=False)
+                .then(generate_responses, [controller_client, *chatbots], [*chatbots], queue=True)
+                .then(enable_interact, None, resp_vote_btn_list, queue=False))
+
+            left_resp_vote_btn.click(
+                make_resp_vote_func(victory_index=0),
+                [controller_client],
+                [*resp_vote_btn_list, *masked_model_names, energy_comparison_message, *energy_vote_btn_list, play_again_btn],
+                queue=False,
+            )
+            right_resp_vote_btn.click(
+                make_resp_vote_func(victory_index=1),
+                [controller_client],
+                [*resp_vote_btn_list, *masked_model_names, energy_comparison_message, *energy_vote_btn_list, play_again_btn],
+                queue=False,
+            )
+
+            worth_energy_vote_btn.click(
+                make_energy_vote_func(is_worth=True),
+                [controller_client, energy_comparison_message],
+                [*masked_model_names, *energy_vote_btn_list, play_again_btn, energy_comparison_message],
+                queue=False,
+            )
+            notworth_energy_vote_btn.click(
+                make_energy_vote_func(is_worth=False),
+                [controller_client, energy_comparison_message],
+                [*masked_model_names, *energy_vote_btn_list, play_again_btn, energy_comparison_message],
+                queue=False,
+            )
+
+            (play_again_btn
+                .click(
+                    play_again,
+                    None,
+                    [*chatbots, prompt_input, prompt_submit_btn, *masked_model_names, *energy_vote_btn_list, energy_comparison_message, play_again_btn],
+                    queue=False,
+                )
+                .then(None, _js=focus_prompt_input_js, queue=False))
+
+
         # Tab: Leaderboard.
         with gr.Tab("Leaderboard"):
             with gr.Box():
@@ -340,7 +600,7 @@ with block:
             with gr.Row():
                 with gr.Box():
                     gr.Markdown("### Benchmark results to show")
-                    checkboxes = []
+                    checkboxes: list[gr.CheckboxGroup] = []
                    for key, choices in global_tbm.schema.items():
                        # Specifying `value` makes everything checked by default.
                        checkboxes.append(gr.CheckboxGroup(choices=choices, value=choices[:1], label=key))
@@ -349,10 +609,10 @@
             with gr.Row():
                 dataframe = gr.Dataframe(type="pandas", elem_id="tab-leaderboard")
                 # Make sure the models have clickable links.
-                dataframe.change(None, None, None, _js=dataframe_update_js)
+                dataframe.change(None, None, None, _js=dataframe_update_js, queue=False)
                 # Table automatically updates when users check or uncheck any checkbox.
                 for checkbox in checkboxes:
-                    checkbox.change(TableManager.set_filter_get_df, inputs=[tbm, *checkboxes], outputs=dataframe)
+                    checkbox.change(TableManager.set_filter_get_df, inputs=[tbm, *checkboxes], outputs=dataframe, queue=False)
 
             # Block: Allow users to add new columns.
             with gr.Box():
@@ -381,21 +641,25 @@
                     TableManager.add_column,
                     inputs=[tbm, colname_input, formula_input],
                     outputs=[dataframe, add_col_message],
+                    queue=False,
                 )
                 formula_input.submit(
                     TableManager.add_column,
                     inputs=[tbm, colname_input, formula_input],
                     outputs=[dataframe, add_col_message],
+                    queue=False,
                 )
                 add_col_btn.click(
                     TableManager.add_column,
                     inputs=[tbm, colname_input, formula_input],
                     outputs=[dataframe, add_col_message],
+                    queue=False,
                 )
                 clear_input_btn.click(
                     lambda: (None, None, None),
                     inputs=None,
                     outputs=[colname_input, formula_input, add_col_message],
+                    queue=False,
                 )
 
             # Block: Allow users to plot 2D and 3D scatter plots.
@@ -425,42 +689,51 @@
                 )[0])  # type: ignore
                 with gr.Row():
                     plot_message = gr.HTML("")
-                add_col_btn.click(TableManager.update_dropdown, inputs=tbm, outputs=axis_dropdowns)  # type: ignore
+                add_col_btn.click(TableManager.update_dropdown, inputs=tbm, outputs=axis_dropdowns, queue=False)  # type: ignore
                 plot_width_input.submit(
                     TableManager.plot_scatter,
                     inputs=[tbm, plot_width_input, plot_height_input, *axis_dropdowns],
                     outputs=[plot, plot_width_input, plot_height_input, plot_message],
+                    queue=False,
                 )
                 plot_height_input.submit(
                     TableManager.plot_scatter,
                     inputs=[tbm, plot_width_input, plot_height_input, *axis_dropdowns],
                     outputs=[plot, plot_width_input, plot_height_input, plot_message],
+                    queue=False,
                )
                plot_btn.click(
                    TableManager.plot_scatter,
                    inputs=[tbm, plot_width_input, plot_height_input, *axis_dropdowns],
                    outputs=[plot, plot_width_input, plot_height_input, plot_message],
+                    queue=False,
                )
                clear_plot_btn.click(
                    lambda: (None,) * 7,
                    None,
                    outputs=[*axis_dropdowns, plot, plot_width_input, plot_height_input, plot_message],
+                    queue=False,
                )
 
             # Block: Leaderboard date.
             with gr.Row():
                 gr.HTML(f"<h3 style='color: gray'>Last updated: {current_date}</h3>")
 
-        # Tab: Online demo.
-        with gr.Tab("Online demo (Coming in August!)"):
-            gr.Markdown("# Online demo with real time energy measurements\n\nComing soon in August!")
-
         # Tab: About page.
         with gr.Tab("About"):
             # Read in LEADERBOARD.md
-            gr.Markdown(open("
+            gr.Markdown(open("docs/leaderboard.md").read())
 
 # Load the table on page load.
 block.load(lambda: global_tbm.set_filter_get_df(), outputs=dataframe)
 
-
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--share", action="store_true", help="Specify if sharing is enabled")
+    parser.add_argument("--concurrency", type=int, default=10)
+
+    args = parser.parse_args()
+    block.queue(
+        concurrency_count=args.concurrency, status_update_rate=10, api_open=False
+    ).launch(share=args.share, show_error=True)
Dockerfile → deployment/benchmark.Dockerfile
RENAMED
File without changes
deployment/controller-container.sh
ADDED
@@ -0,0 +1,11 @@
#!/usr/bin/env bash

docker run \
  --name controller \
  --net leaderboard \
  -v $HOME/workspace/leaderboard:/workspace/leaderboard \
  -v $HOME/workspace/text-generation-inference/deployment:/workspace/text-generation-inference/deployment:ro \
  -v /data/leaderboard/colosseum-controller-logs:/logs \
  -p 7778:8000 \
  -e LOG_DIR=/logs \
  mlenergy/colosseum-controller:latest
deployment/controller.Dockerfile
ADDED
@@ -0,0 +1,30 @@
FROM ubuntu:22.04

# Basic installs
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ='America/Detroit'
RUN apt-get update -qq \
    && apt-get -y --no-install-recommends install \
       tzdata software-properties-common wget git \
    && apt-get clean all \
    && rm -r /var/lib/apt/lists/* \
    && ln -fs /usr/share/zoneinfo/America/Detroit /etc/localtime \
    && dpkg-reconfigure -f noninteractive tzdata

# Install Miniconda3 23.3.1
ENV PATH="/root/.local/miniconda3/bin:$PATH"
RUN mkdir -p /root/.local \
    && wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-py39_23.3.1-0-Linux-x86_64.sh -b -p /root/.local/miniconda3 \
    && rm -f Miniconda3-py39_23.3.1-0-Linux-x86_64.sh \
    && ln -sf /root/.local/miniconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh

# Install spitfight
ADD . /workspace/leaderboard
RUN cd /workspace/leaderboard \
    && pip install -e .[colosseum-controller]

WORKDIR /workspace/leaderboard

CMD ["python", "spitfight/colosseum/controller/router.py"]
deployment/docker-compose-0.yaml
ADDED
@@ -0,0 +1,74 @@
services:
  Falcon-7B:
    container_name: worker0
    image: mlenergy/tgi:latest
    command: ["--model-id", "tiiuae/falcon-7b-instruct", "--num-shard", "1", "--otlp-endpoint", "http://jaeger:4317"]
    shm_size: 1g
    networks:
      - leaderboard
    volumes:
      - /data/leaderboard/tgi-data:/data
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0"]
              capabilities: [gpu]
  Llama2-7B:
    container_name: worker1
    image: mlenergy/tgi:latest
    command: ["--model-id", "/weights/metaai/Llama-2-7b-chat-hf", "--num-shard", "1", "--otlp-endpoint", "http://jaeger:4317"]
    shm_size: 1g
    networks:
      - leaderboard
    volumes:
      - /data/leaderboard/tgi-data:/data
      - /data/leaderboard/weights:/weights
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["1"]
              capabilities: [gpu]
  FastChat-T5-3B:
    container_name: worker2
    image: mlenergy/tgi:latest
    command: ["--model-id", "lmsys/fastchat-t5-3b-v1.0", "--num-shard", "1", "--otlp-endpoint", "http://jaeger:4317"]
    environment:
      PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python
    shm_size: 1g
    networks:
      - leaderboard
    volumes:
      - /data/leaderboard/tgi-data:/data
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["2"]
              capabilities: [gpu]
  Llama2-13B:
    container_name: worker3
    image: mlenergy/tgi:latest
    command: ["--model-id", "/weights/metaai/Llama-2-13b-chat-hf", "--num-shard", "1", "--otlp-endpoint", "http://jaeger:4317"]
    shm_size: 1g
    networks:
      - leaderboard
    volumes:
      - /data/leaderboard/tgi-data:/data
      - /data/leaderboard/weights:/weights
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["3"]
              capabilities: [gpu]

networks:
  leaderboard:
    name: leaderboard
    external: true
deployment/docker-compose-1.yaml
ADDED
@@ -0,0 +1,40 @@
services:
  Llama2-70B-INT8:
    container_name: worker4
    image: mlenergy/tgi:latest
    command: ["--model-id", "meta-llama/Llama-2-70b-chat-hf", "--num-shard", "2", "--otlp-endpoint", "http://jaeger:4317", "--quantize", "bitsandbytes"]
    shm_size: 1g
    environment:
      HUGGING_FACE_HUB_TOKEN: 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
    networks:
      - leaderboard
    volumes:
      - /data/leaderboard/tgi-data:/data
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0", "1"]
              capabilities: [gpu]
  Falcon-40B:
    container_name: worker5
    image: mlenergy/tgi:latest
    command: ["--model-id", "tiiuae/falcon-40b-instruct", "--num-shard", "2", "--otlp-endpoint", "http://jaeger:4317"]
    shm_size: 1g
    networks:
      - leaderboard
    volumes:
      - /data/leaderboard/tgi-data:/data
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["2", "3"]
              capabilities: [gpu]

networks:
  leaderboard:
    name: leaderboard
    external: true
docs/colosseum_bottom.md
ADDED
@@ -0,0 +1,14 @@
### Terms of use

By using our service, you agree to these Terms of Use and accept that the Service provides an approximate estimation of model inference energy usage for research purposes only. We are not liable for any damages or loss incurred by you or any third party arising from the use of the Service. It may generate offensive content and offers limited safety measures, thus should not be used for any illegal, harmful, violent, racist, or sexual purposes. The service collects user dialogue data and voting results. We reserve the right to distribute the dataset in the future.

### Technical details

- We allow models to generate only up to 512 new tokens. Due to this, some responses may be cut off in the middle.
- Tokens are sampled from the model output with `temperature` 1.0, `repetition_penalty` 1.0, `top_k` 50, and `top_p` 0.95.
- Large models (>= 30B) run on two NVIDIA A40 GPUs with tensor parallelism, whereas other models run on one NVIDIA A40 GPU. We directly measure the energy consumption of these GPUs.

### Contact

Please direct general questions and issues related to the Colosseum to our GitHub repository's [discussion board](https://github.com/ml-energy/leaderboard/discussions).
You can find the ML.ENERGY initiative members in [our homepage](https://ml.energy#members).
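For reference, a minimal sketch (not part of the commit) of how the sampling parameters listed above map onto the controller's GenerationConfig, which is defined in spitfight/colosseum/controller/controller.py later in this diff; the values mirror the ControllerConfig defaults in router.py.

# Sketch only: the generation settings described in colosseum_bottom.md,
# expressed as the controller's GenerationConfig pydantic model.
from spitfight.colosseum.controller.controller import GenerationConfig

generation_config = GenerationConfig(
    max_new_tokens=512,      # responses are cut off after 512 new tokens
    do_sample=True,
    temperature=1.0,
    repetition_penalty=1.0,
    top_k=50,
    top_p=0.95,
)
# The controller unpacks this config into TGI's generate_stream() call.
print(generation_config.dict())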
docs/colosseum_top.md
ADDED
@@ -0,0 +1,8 @@
> Enter the ML.ENERGY Colosseum, where language models duel with intellect, and your judgment tips the scales of victory.

### Rules of the Colosseum

- As the spectator, you'll decide the fates of two anonymous language models -- our gladiators.
- Your role is twofold: First, you vote for the model that delivered the best response to your prompt.
- Next, mighty [Zeus](https://ml.energy/zeus) will reveal which language model consumed more energy. Evaluate if its performance justified the energy consumption.
- Only after you cast votes will the models' identities be unveiled.
LEADERBOARD.md → docs/leaderboard.md
RENAMED
@@ -3,7 +3,7 @@
 The code for the leaderboard, backing data, and scripts for benchmarking are all open-source in our [repository](https://github.com/ml-energy/leaderboard).
 We'll see you at the [Discussion board](https://github.com/ml-energy/leaderboard/discussions), where you can ask questions, suggest improvement ideas, or just discuss leaderboard results!
 
-## Columns
+## Leaderboard Columns
 
 - `gpu`: NVIDIA GPU model name.
 - `task`: Name of the task. See *Tasks* below for details.
@@ -113,7 +113,7 @@
 
 This leaderboard is a research preview intended for non-commercial use only.
 Model weights were taken as is from the Hugging Face Hub if available and are subject to their licenses.
-The use of
+The use of Llama weights are subject to their [license](https://github.com/facebookresearch/llama/blob/main/LICENSE).
 Please direct inquiries/reports of potential violation to Jae-Won Chung.
 
 ## Acknowledgements
requirements.txt
CHANGED
@@ -1,2 +1 @@
-
-gradio==3.35.2
+.[app]
setup.py
ADDED
@@ -0,0 +1,21 @@
from setuptools import setup, find_packages

extras_require = {
    "colosseum-controller": [
        "fastapi",
        "fschat==0.2.23",
        "text_generation @ git+https://github.com/ml-energy/text_generation_energy@master",
    ],
    "app": ["plotly==5.15.0", "gradio==3.39.0", "pydantic==1.10.9"],
    "benchmark": ["zeus-ml", "fschat==0.2.23", "tyro", "rich"],
}

extras_require["all"] = list(set(sum(extras_require.values(), [])))

setup(
    name="spitfight",
    version="0.0.1",
    url="https://github.com/ml-energy/leaderboard",
    packages=find_packages("."),
    extras_require=extras_require,
)
spitfight/__init__.py
ADDED
File without changes
spitfight/colosseum/__init__.py
ADDED
File without changes
spitfight/colosseum/client.py
ADDED
@@ -0,0 +1,106 @@
from __future__ import annotations

import json
import unittest
import contextlib
from uuid import uuid4, UUID
from copy import deepcopy
from typing import Generator, Literal

import requests
import gradio as gr

from spitfight.colosseum.common import (
    COLOSSEUM_PROMPT_ROUTE,
    COLOSSEUM_RESP_VOTE_ROUTE,
    COLOSSEUM_ENERGY_VOTE_ROUTE,
    PromptRequest,
    ResponseVoteRequest,
    ResponseVoteResponse,
    EnergyVoteRequest,
    EnergyVoteResponse,
)


class ControllerClient:
    """Client for the Colosseum controller, to be used by Gradio."""

    def __init__(self, controller_addr: str, timeout: int = 15, request_id: UUID | None = None) -> None:
        """Initialize the controller client."""
        self.controller_addr = controller_addr
        self.timeout = timeout
        self.request_id = str(request_id) or str(uuid4())

    def fork(self) -> ControllerClient:
        """Return a copy of the client with a new request ID."""
        return ControllerClient(
            controller_addr=self.controller_addr,
            timeout=self.timeout,
            request_id=uuid4(),
        )

    def prompt(self, prompt: str, index: Literal[0, 1]) -> Generator[str, None, None]:
        """Generate the response of the `index`th model with the prompt."""
        prompt_request = PromptRequest(request_id=self.request_id, prompt=prompt, model_index=index)
        with _catch_requests_exceptions():
            resp = requests.post(
                f"http://{self.controller_addr}{COLOSSEUM_PROMPT_ROUTE}",
                json=prompt_request.dict(),
                stream=True,
                timeout=self.timeout,
            )
        _check_response(resp)
        # XXX: Why can't the server just yield `text + "\n"` and here we just iter_lines?
        for chunk in resp.iter_lines(decode_unicode=False, delimiter=b"\0"):
            if chunk:
                yield json.loads(chunk.decode("utf-8"))

    def response_vote(self, victory_index: Literal[0, 1]) -> ResponseVoteResponse:
        """Notify the controller of the user's vote for the response."""
        response_vote_request = ResponseVoteRequest(request_id=self.request_id, victory_index=victory_index)
        with _catch_requests_exceptions():
            resp = requests.post(
                f"http://{self.controller_addr}{COLOSSEUM_RESP_VOTE_ROUTE}",
                json=response_vote_request.dict(),
            )
        _check_response(resp)
        return ResponseVoteResponse(**resp.json())

    def energy_vote(self, is_worth: bool) -> EnergyVoteResponse:
        """Notify the controller of the user's vote for energy."""
        energy_vote_request = EnergyVoteRequest(request_id=self.request_id, is_worth=is_worth)
        with _catch_requests_exceptions():
            resp = requests.post(
                f"http://{self.controller_addr}{COLOSSEUM_ENERGY_VOTE_ROUTE}",
                json=energy_vote_request.dict(),
            )
        _check_response(resp)
        return EnergyVoteResponse(**resp.json())


@contextlib.contextmanager
def _catch_requests_exceptions():
    """Catch requests exceptions and raise gr.Error instead."""
    try:
        yield
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        raise gr.Error("Failed to connect to our backend server. Please try again later.")


def _check_response(response: requests.Response) -> None:
    if 400 <= response.status_code < 500:
        raise gr.Error(response.json()["detail"])
    elif response.status_code >= 500:
        raise gr.Error("Failed to talk to our backend server. Please try again later.")


class TestControllerClient(unittest.TestCase):
    def test_new_uuid_on_deepcopy(self):
        client = ControllerClient("http://localhost:8000")
        clients = [client.fork() for _ in range(50)]
        request_ids = [client.request_id for client in clients]
        assert len(set(request_ids)) == len(request_ids)


if __name__ == "__main__":
    unittest.main()
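For clarity, a small sketch (not part of the commit) of the streaming wire format the client above expects: the controller yields JSON-encoded text chunks separated by NUL bytes, and `prompt()` splits on the same delimiter and decodes each chunk.

# Sketch only: round-trips chunks through the NUL-delimited JSON framing
# used between the controller and ControllerClient.prompt().
import json

def encode_chunk(text: str) -> bytes:
    # Mirrors what the controller yields for each generated piece of text.
    return json.dumps(text).encode() + b"\0"

stream = encode_chunk("Hello") + encode_chunk(" world")
chunks = [json.loads(c.decode("utf-8")) for c in stream.split(b"\0") if c]
assert chunks == ["Hello", " world"]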
spitfight/colosseum/common.py
ADDED
@@ -0,0 +1,35 @@
from __future__ import annotations

from typing import Literal

from pydantic import BaseModel

COLOSSEUM_PROMPT_ROUTE = "/prompt"
COLOSSEUM_RESP_VOTE_ROUTE = "/response_vote"
COLOSSEUM_ENERGY_VOTE_ROUTE = "/energy_vote"
COLOSSEUM_HEALTH_ROUTE = "/health"


class PromptRequest(BaseModel):
    request_id: str
    prompt: str
    model_index: Literal[0, 1]


class ResponseVoteRequest(BaseModel):
    request_id: str
    victory_index: Literal[0, 1]


class ResponseVoteResponse(BaseModel):
    model_names: list[str]
    energy_consumptions: list[float]


class EnergyVoteRequest(BaseModel):
    request_id: str
    is_worth: bool


class EnergyVoteResponse(BaseModel):
    model_names: list[str]
spitfight/colosseum/controller/__init__.py
ADDED
File without changes
spitfight/colosseum/controller/controller.py
ADDED
@@ -0,0 +1,266 @@
from __future__ import annotations

import json
import asyncio
from datetime import datetime
from typing import AsyncGenerator, Literal, Optional, TYPE_CHECKING

import aiohttp
from pytz import timezone
from pydantic import BaseModel, Field

from spitfight.log import get_logger
from spitfight.utils import BoundedExpiringDict, TokenGenerationBuffer, create_task
from spitfight.colosseum.controller.worker import WorkerService
from spitfight.prompt import get_system_prompt, apply_model_characteristics

if TYPE_CHECKING:
    from spitfight.colosseum.controller.router import ControllerConfig

controller_logger = get_logger(__name__)
request_logger = get_logger("colosseum_requests")


def now() -> datetime:
    return datetime.now(tz=timezone("US/Eastern"))


# Internal states
# The two "chose_*" stages are both the result of voting on a response.
# A normal user will sequentially go through either
# "prompted" -> "chose_less_energy_response", or
# "prompted" -> "chose_more_energy_response" -> "voted_energy"
UserStage = Literal[
    "prompted",
    "chose_less_energy_response",
    "chose_more_energy_response",
    "voted_energy",
]


class RequestState(BaseModel):
    """Models the state of a Colosseum play.

    This model is also serialized as is and logged.
    """
    request_id: str
    prompt: str
    model_names: list[str]
    responses: list[str] = ["EMPTY", "EMPTY"]
    energy_consumptions: list[float] = [0.0, 0.0]
    response_victory_index: Optional[Literal[0, 1]] = None
    extra_energy_was_worth: Optional[bool] = None

    # The time when the user's stage changed.
    timestamp: datetime = Field(default_factory=now)
    # The user's current stage.
    user_stage: UserStage = "prompted"
    # When the user is not going through the aforementioned stages,
    # the user's stage transition is recorded here.
    abnormal_stage_change: list[tuple[UserStage, UserStage]] = []

    def set_response_and_energy(self, model_index: Literal[0, 1], response: str, energy_consumption: float) -> None:
        self.timestamp = now()
        self.energy_consumptions[model_index] = energy_consumption
        self.responses[model_index] = response

    def set_response_vote(self, victory_index: Literal[0, 1]) -> None:
        self.timestamp = now()

        # Next stage depends on the user's vote.
        energy_a, energy_b = self.energy_consumptions
        if (victory_index == 0 and energy_a <= energy_b) or (victory_index == 1 and energy_a >= energy_b):
            next_stage = "chose_less_energy_response"
        else:
            next_stage = "chose_more_energy_response"

        # Detect abnormal stage change.
        if self.user_stage != "prompted":
            self.abnormal_stage_change.append((self.user_stage, next_stage))

        self.user_stage = next_stage
        self.response_victory_index = victory_index

    def set_energy_vote(self, is_worth: bool) -> None:
        self.timestamp = now()

        # Detect abnormal stage change.
        if self.user_stage != "chose_more_energy_response":
            self.abnormal_stage_change.append((self.user_stage, "voted_energy"))

        self.user_stage = "voted_energy"
        self.extra_energy_was_worth = is_worth


class GenerationConfig(BaseModel):
    """Configuration for generation of prompts."""
    max_new_tokens: int
    do_sample: bool
    temperature: float
    repetition_penalty: float
    top_k: int
    top_p: float


class Controller:
    def __init__(
        self,
        background_task_interval: int,
        max_num_req_states: int,
        req_state_expiration_time: int,
        worker_service: WorkerService,
        generation_config: GenerationConfig,
    ):
        self.request_states: BoundedExpiringDict[str, RequestState] = \
            BoundedExpiringDict(max_num_req_states, req_state_expiration_time)
        self.worker_service = worker_service

        self.generation_config = generation_config

        self.background_task_handle = create_task(
            self._background_task(background_task_interval),
        )

    def shutdown(self) -> None:
        """Shutdown the controller."""
        self.background_task_handle.cancel()

    async def _background_task(self, heartbeat_interval: int) -> None:
        """Periodically check if dead workers are alive again and do request state GC."""
        while True:
            await asyncio.sleep(heartbeat_interval)

            await self.worker_service.check_workers()

            prev_num_req_states = len(self.request_states)
            self.request_states.cleanup()
            controller_logger.info(
                "Request state garbage collection done: Removed %d requests",
                prev_num_req_states - len(self.request_states),
            )

    def response_vote(self, request_id: str, victory_index: Literal[0, 1]) -> RequestState | None:
        """Record the user's response vote and return the new state."""
        if (state := self.request_states.get(request_id)) is not None:
            state.set_response_vote(victory_index)
            # Pop the state from the dict if the user has voted on energy.
            if state.user_stage == "chose_less_energy_response":
                self.request_states.pop(request_id)
            request_logger.info(state.json())
            return state
        return None

    def energy_vote(self, request_id: str, is_worth: bool) -> RequestState | None:
        """Record the user's energy vote and return the new state."""
        # Pop the state from the dict, since this is the last step in any case.
        if (state := self.request_states.pop(request_id)) is not None:
            state.set_energy_vote(is_worth)
            request_logger.info(state.json())
            return state
        return None

    async def prompt(
        self,
        request_id: str,
        prompt: str,
        model_index: Literal[0, 1],
    ) -> AsyncGenerator[bytes, None]:
        # This method is called twice for the same request, once for each model.
        # If it's the first time this method is called, assign models to the request.
        if request_id not in self.request_states:
            workers = self.worker_service.choose_two()
            model_names = [worker.model_name for worker in workers]
            self.request_states[request_id] = RequestState(
                request_id=request_id,
                prompt=prompt,
                model_names=model_names,
            )
        request_state = self.request_states[request_id]
        model_name = request_state.model_names[model_index]
        try:
            worker = self.worker_service.get_worker(model_name)
        except KeyError:
            controller_logger.error("Worker %s not found.", model_name)
            raise
        except RuntimeError:
            controller_logger.error("Worker %s is dead.", model_name)
            raise
        prompt, stop_str, stop_token_ids = apply_model_characteristics(
            system_prompt=get_system_prompt("chat"),
            prompt=prompt,
            model_name=worker.model_id,
        )

        # Request the model worker to stream the response to the user's prompt.
        response = ""
        energy = 0.0
        client = worker.get_client()
        buffer = TokenGenerationBuffer(stop_str=stop_str)
        try:
            async for resp in client.generate_stream(
                prompt=prompt,
                stop_sequences=[stop_str] if stop_str is not None else None,
                **self.generation_config.dict(),
            ):
                # Even special tokens consume energy when they're generated.
                energy += resp.token.energy

                # Stop tokens usually don't overlap with (human-readable) stop sequences.
                # if resp.token.special or resp.token.id in stop_token_ids:
                if resp.token.id in stop_token_ids:
                    # If the buffer is not empty (i.e., we had partial stop_str matches),
                    # just yield it to the user.
                    if (chunk := buffer.token_buffer):
                        response += chunk
                        yield json.dumps(chunk).encode() + b"\0"
                    break

                # Skip special tokens.
                if resp.token.special:
                    continue

                # The buffer automatically handles `stop_str` partial and full matches.
                buffer.append(resp.token.text)
                if (chunk := buffer.pop()) is not None:
                    response += chunk
                    yield json.dumps(chunk).encode() + b"\0"
                elif buffer.matched_stop_str:
                    break
        except aiohttp.ClientConnectorError:
            worker.status = "down"
            controller_logger.error(
                "Problem talking to %s. Aborting and setting worker status to down",
                repr(worker),
            )
            raise
        except Exception:
            yield json.dumps(buffer.token_buffer).encode() + b"\0"
            raise
        finally:
            request_state.set_response_and_energy(model_index, response, energy)
            request_logger.info(request_state.json())


CONTROLLER: Controller | None = None

def init_global_controller(config: ControllerConfig) -> None:
    global CONTROLLER
    CONTROLLER = Controller(
        background_task_interval=config.background_task_interval,
        max_num_req_states=config.max_num_req_states,
        req_state_expiration_time=config.req_state_expiration_time,
        worker_service=WorkerService(config.compose_files),
        generation_config=GenerationConfig(
            max_new_tokens=config.max_new_tokens,
            do_sample=config.do_sample,
            temperature=config.temperature,
            repetition_penalty=config.repetition_penalty,
            top_k=config.top_k,
            top_p=config.top_p,
        ),
    )

def get_global_controller() -> Controller:
    global CONTROLLER
    assert CONTROLLER is not None
    return CONTROLLER
spitfight/colosseum/controller/router.py
ADDED
@@ -0,0 +1,125 @@
import os
import json

import uvicorn
from pydantic import BaseSettings
from fastapi import FastAPI, Depends
from fastapi.responses import StreamingResponse
from fastapi.exceptions import HTTPException
from text_generation.errors import OverloadedError, UnknownError, ValidationError

from spitfight.log import get_logger, init_queued_root_logger, shutdown_queued_root_loggers
from spitfight.colosseum.common import (
    COLOSSEUM_PROMPT_ROUTE,
    COLOSSEUM_RESP_VOTE_ROUTE,
    COLOSSEUM_ENERGY_VOTE_ROUTE,
    COLOSSEUM_HEALTH_ROUTE,
    PromptRequest,
    ResponseVoteRequest,
    ResponseVoteResponse,
    EnergyVoteRequest,
    EnergyVoteResponse,
)
from spitfight.colosseum.controller.controller import (
    Controller,
    init_global_controller,
    get_global_controller,
)
from spitfight.utils import prepend_generator


class ControllerConfig(BaseSettings):
    """Controller settings automatically loaded from environment variables."""
    # Controller
    background_task_interval: int = 300
    max_num_req_states: int = 10000
    req_state_expiration_time: int = 600
    compose_files: list[str] = ["deployment/docker-compose-0.yaml", "deployment/docker-compose-1.yaml"]

    # Logging
    log_dir: str = "/logs"
    controller_log_file: str = "controller.log"
    request_log_file: str = "requests.log"
    uvicorn_log_file: str = "uvicorn.log"

    # Generation
    max_new_tokens: int = 512
    do_sample: bool = True
    temperature: float = 1.0
    repetition_penalty: float = 1.0
    top_k: int = 50
    top_p: float = 0.95


app = FastAPI()
settings = ControllerConfig()
logger = get_logger("spitfight.colosseum.controller.router")

@app.on_event("startup")
async def startup_event():
    init_queued_root_logger("uvicorn", os.path.join(settings.log_dir, settings.uvicorn_log_file))
    init_queued_root_logger("spitfight.colosseum.controller", os.path.join(settings.log_dir, settings.controller_log_file))
    init_queued_root_logger("colosseum_requests", os.path.join(settings.log_dir, settings.request_log_file))
    init_global_controller(settings)

@app.on_event("shutdown")
async def shutdown_event():
    get_global_controller().shutdown()
    shutdown_queued_root_loggers()

@app.post(COLOSSEUM_PROMPT_ROUTE)
async def prompt(
    request: PromptRequest,
    controller: Controller = Depends(get_global_controller),
):
    generator = controller.prompt(request.request_id, request.prompt, request.model_index)

    # First try to get the first token in order to catch TGI errors.
    try:
        first_token = await generator.__anext__()
    except OverloadedError:
        name = controller.request_states[request.request_id].model_names[request.model_index]
        logger.warning("Model %s is overloaded. Failed request: %s", name, repr(request))
        raise HTTPException(status_code=429, detail="Model overloaded. Please try again later.")
    except ValidationError as e:
        logger.info("TGI returned validation error: %s. Failed request: %s", str(e), repr(request))
        raise HTTPException(status_code=422, detail=str(e))
    except StopAsyncIteration:
        logger.info("TGI returned empty response. Failed request: %s", repr(request))
        return StreamingResponse(
            iter([json.dumps("*The model generated an empty response.*").encode() + b"\0"]),
        )
    except UnknownError as e:
        logger.error("TGI returned unknown error: %s. Failed request: %s", str(e), repr(request))
        raise HTTPException(status_code=500, detail=str(e))

    return StreamingResponse(prepend_generator(first_token, generator))

@app.post(COLOSSEUM_RESP_VOTE_ROUTE, response_model=ResponseVoteResponse)
async def response_vote(
    request: ResponseVoteRequest,
    controller: Controller = Depends(get_global_controller),
):
    if (state := controller.response_vote(request.request_id, request.victory_index)) is None:
        raise HTTPException(status_code=410, detail="Colosseum battle session timeout expired.")
    return ResponseVoteResponse(
        energy_consumptions=state.energy_consumptions,
        model_names=state.model_names,
    )

@app.post(COLOSSEUM_ENERGY_VOTE_ROUTE, response_model=EnergyVoteResponse)
async def energy_vote(
    request: EnergyVoteRequest,
    controller: Controller = Depends(get_global_controller),
):
    if (state := controller.energy_vote(request.request_id, request.is_worth)) is None:
        raise HTTPException(status_code=410, detail="Colosseum battle session timeout expired.")
    return EnergyVoteResponse(model_names=state.model_names)

@app.get(COLOSSEUM_HEALTH_ROUTE)
async def health():
    return "OK"


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", log_config=None)
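Because ControllerConfig extends pydantic's BaseSettings, every field can be overridden with a same-named environment variable (matched case-insensitively). A minimal sketch of that behavior follows; the values are illustrative, not deployment defaults.

# Sketch: overriding ControllerConfig fields via environment variables (illustrative values).
import os
os.environ["MAX_NEW_TOKENS"] = "256"           # overrides ControllerConfig.max_new_tokens
os.environ["LOG_DIR"] = "/tmp/colosseum-logs"  # overrides ControllerConfig.log_dir

from spitfight.colosseum.controller.router import ControllerConfig
config = ControllerConfig()
assert config.max_new_tokens == 256 and config.log_dir == "/tmp/colosseum-logs"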
spitfight/colosseum/controller/worker.py
ADDED
@@ -0,0 +1,151 @@
import yaml
import random
import asyncio
from typing import Literal
from functools import cached_property

import httpx
from pydantic import BaseModel
from text_generation import AsyncClient

from spitfight.log import get_logger

logger = get_logger(__name__)


class Worker(BaseModel):
    """A worker that serves a model."""
    # Worker's container name, since we're using Overlay networks.
    hostname: str
    # For TGI, this would always be 80.
    port: int
    # User-friendly model name, e.g. "metaai/llama2-13b-chat".
    model_name: str
    # Hugging Face model ID, e.g. "metaai/Llama-2-13b-chat-hf".
    model_id: str
    # Whether the model worker container is good.
    status: Literal["up", "down"]

    class Config:
        keep_untouched = (cached_property,)

    @cached_property
    def url(self) -> str:
        return f"http://{self.hostname}:{self.port}"

    def get_client(self) -> AsyncClient:
        return AsyncClient(base_url=self.url)

    def audit(self) -> None:
        """Make sure the worker is running and information is as expected.

        Assumed to be called on app startup when workers are initialized.
        This method will just raise `ValueError`s if the audit fails in order to
        prevent the controller from starting if anything is wrong.
        """
        try:
            response = httpx.get(self.url + "/info")
        except (httpx.ConnectError, httpx.TimeoutException) as e:
            raise ValueError(f"Could not connect to {self!r}: {e!r}")
        if response.status_code != 200:
            raise ValueError(f"Could not get /info from {self!r}.")
        info = response.json()
        if info["model_id"] != self.model_id:
            raise ValueError(f"Model name mismatch: {info['model_id']} != {self.model_id}")
        self.status = "up"
        logger.info("%s is up.", repr(self))

    async def check_status(self) -> None:
        """Check worker status and update `self.status` accordingly."""
        async with httpx.AsyncClient() as client:
            try:
                response = await client.get(self.url + "/info")
            except (httpx.ConnectError, httpx.TimeoutException) as e:
                self.status = "down"
                logger.warning("%s is down: %s", repr(self), repr(e))
                return
        if response.status_code != 200:
            self.status = "down"
            logger.warning("GET /info from %s returned %s.", repr(self), response.json())
            return
        info = response.json()
        if info["model_id"] != self.model_id:
            self.status = "down"
            logger.warning(
                "Model name mismatch for %s: %s != %s",
                repr(self),
                info["model_id"],
                self.model_id,
            )
            return
        logger.info("%s is up.", repr(self))
        self.status = "up"


class WorkerService:
    """A service that manages model serving workers.

    Worker objects are only created once and shared across the
    entire application. In particular, changing the status of a worker
    immediately takes effect on the result of `choose_two`.

    Attributes:
        workers (list[Worker]): The list of workers.
    """

    def __init__(self, compose_files: list[str]) -> None:
        """Initialize the worker service."""
        self.workers: list[Worker] = []
        worker_model_names = set()
        for compose_file in compose_files:
            spec = yaml.safe_load(open(compose_file))
            for model_name, service_spec in spec["services"].items():
                command = service_spec["command"]
                for i, cmd in enumerate(command):
                    if cmd == "--model-id":
                        model_id = command[i + 1]
                        break
                else:
                    raise ValueError(f"Could not find model ID in {command!r}")
                worker_model_names.add(model_name)
                worker = Worker(
                    hostname=service_spec["container_name"],
                    port=80,
                    model_name=model_name,
                    model_id=model_id,
                    status="down",
                )
                worker.audit()
                self.workers.append(worker)

        if len(worker_model_names) != len(self.workers):
            raise ValueError("Model names must be unique.")

    def get_worker(self, model_name: str) -> Worker:
        """Get a worker by model name."""
        for worker in self.workers:
            if worker.model_name == model_name:
                if worker.status == "down":
                    # This is an unfortunate case where, when the two models were chosen,
                    # the worker was up, but after that went down before the request
                    # completed. We'll just raise a 500 internal error and have the user
                    # try again. This won't be common.
                    raise RuntimeError(f"The worker with model name {model_name} is down.")
                return worker
        raise ValueError(f"Worker with model name {model_name} does not exist.")

    def choose_two(self) -> tuple[Worker, Worker]:
        """Choose two different workers.

        Good place to use the Strategy Pattern when we want to
        implement different strategies for choosing workers.
        """
        live_workers = [worker for worker in self.workers if worker.status == "up"]
        if len(live_workers) < 2:
            raise ValueError("Not enough live workers to choose from.")
        worker_a, worker_b = random.sample(live_workers, 2)
        return worker_a, worker_b

    async def check_workers(self) -> None:
        """Check the status of all workers."""
        await asyncio.gather(*[worker.check_status() for worker in self.workers])
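WorkerService discovers workers by parsing the Docker Compose files: each service needs a `container_name` (used as the hostname on the overlay network) and a `--model-id` argument in its `command`. The sketch below only illustrates the shape being parsed; the service and model names are made up, not the deployed ones.

# Sketch: the Compose structure WorkerService.__init__ expects (names are illustrative).
import yaml

spec = yaml.safe_load("""
services:
  example-model:                      # becomes Worker.model_name
    container_name: worker-example    # becomes Worker.hostname
    command: ["--model-id", "org/example-model-hf"]
""")
command = spec["services"]["example-model"]["command"]
model_id = command[command.index("--model-id") + 1]
assert model_id == "org/example-model-hf"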
spitfight/log.py
ADDED
@@ -0,0 +1,76 @@
from __future__ import annotations

import queue
import logging
from logging.handlers import QueueHandler, QueueListener

ROOT_LOGGER_NAMES: list[str | None] = []
ROOT_LOGGER_QUEUE_LISTENERS: list[QueueListener] = []


def init_queued_root_logger(
    name: str | None,
    filepath: str,
    level: int = logging.INFO,
) -> None:
    """Initialize a queue-based pseudo-root logger.

    The pseudo-root logger will aggregate log messages from children
    loggers under its namespace and send them to a queue. A QueueListener,
    running in a separate thread, will then process the messages in the
    queue and send them to the configured handlers.
    """
    global ROOT_LOGGER_NAMES, ROOT_LOGGER_QUEUE_LISTENERS

    # Make this function idempotent.
    if name in ROOT_LOGGER_NAMES:
        return

    logger = logging.getLogger(name)
    logger.setLevel(level)
    logger.propagate = False

    shared_queue = queue.SimpleQueue()
    queue_handler = QueueHandler(shared_queue)
    logger.addHandler(queue_handler)

    formatter = logging.Formatter(
        "[%(asctime)s] [%(levelname)s] [%(name)s](%(filename)s:%(lineno)d) %(message)s"
    )

    stderr_handler = logging.StreamHandler()
    stderr_handler.setLevel(level)
    stderr_handler.setFormatter(formatter)

    file_handler = logging.FileHandler(filepath, encoding="utf-8")
    file_handler.setLevel(level)
    file_handler.setFormatter(formatter)

    queue_listener = QueueListener(shared_queue, file_handler, stderr_handler)
    queue_listener.start()

    ROOT_LOGGER_NAMES.append(name)
    ROOT_LOGGER_QUEUE_LISTENERS.append(queue_listener)


def shutdown_queued_root_loggers() -> None:
    """Shut down all queue-based pseudo-root loggers.

    This is necessary to make sure all log messages are flushed
    before the application exits.
    """
    for queue_listener in ROOT_LOGGER_QUEUE_LISTENERS:
        queue_listener.stop()


def get_logger(name: str, level: int = logging.INFO) -> logging.Logger:
    """Set up a logger with the given name and level."""
    # Don't reconfigure existing loggers.
    if name in logging.Logger.manager.loggerDict:
        return logging.getLogger(name)

    logger = logging.getLogger(name)
    logger.setLevel(level)
    logger.propagate = True

    return logger
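A rough usage sketch of the module above (the log file path is illustrative): initialize the pseudo-root once, log through child loggers that propagate up to it, and stop the listeners on shutdown so queued records are flushed.

# Sketch: typical lifecycle of a queued pseudo-root logger.
from spitfight.log import init_queued_root_logger, get_logger, shutdown_queued_root_loggers

init_queued_root_logger("spitfight.colosseum.controller", "/tmp/controller.log")
logger = get_logger("spitfight.colosseum.controller.worker")  # child of the pseudo-root
logger.info("Worker is up.")
shutdown_queued_root_loggers()  # flush and stop the QueueListener threads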
spitfight/prompt.py
ADDED
@@ -0,0 +1,69 @@
"""An abstraction layer for prompting different models."""

from __future__ import annotations

import enum

from fastchat.model.model_adapter import get_conversation_template


class Task(enum.Enum):
    """Different system prompt styles."""

    CHAT = "chat"
    CHAT_CONCISE = "chat-concise"
    INSTRUCT = "instruct"
    INSTRUCT_CONCISE = "instruct-concise"


SYSTEM_PROMPTS = {
    Task.CHAT: (
        "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions. "
    ),
    Task.CHAT_CONCISE: (
        "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions. "
        "The assistant's answers are very concise. "
    ),
    Task.INSTRUCT: (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request. "
    ),
    Task.INSTRUCT_CONCISE: (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request. "
        "The response should be very concise. "
    ),
}

def get_system_prompt(task: Task | str) -> str:
    """Get the system prompt for a given task."""
    if isinstance(task, str):
        task = Task(task)
    return SYSTEM_PROMPTS[task]


def apply_model_characteristics(
    system_prompt: str,
    prompt: str,
    model_name: str,
) -> tuple[str, str | None, list[int]]:
    """Apply and return model-specific differences."""
    conv = get_conversation_template(model_name)

    if "llama-2" in model_name.lower():
        conv.system = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
    elif "stablelm" in model_name.lower():
        conv.system = f"""<|SYSTEM|># {system_prompt}\n"""
    else:
        conv.system = system_prompt
    conv.messages = []
    conv.offset = 0

    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], "")

    stop_str = None if conv.stop_str is None or not conv.stop_str else conv.stop_str

    return conv.get_prompt(), stop_str, (conv.stop_token_ids or [])
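A short sketch of how the two helpers above compose (the user prompt and model name are examples only, and fastchat must be installed): pick a system prompt style, then let the conversation template add the model-specific framing, stop string, and stop token IDs.

# Sketch: build a model-specific prompt from a task-style system prompt.
from spitfight.prompt import get_system_prompt, apply_model_characteristics

system_prompt = get_system_prompt("chat-concise")
prompt, stop_str, stop_token_ids = apply_model_characteristics(
    system_prompt=system_prompt,
    prompt="What is the capital of France?",        # example user prompt
    model_name="metaai/Llama-2-13b-chat-hf",        # example model name
)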
spitfight/utils.py
ADDED
@@ -0,0 +1,305 @@
from __future__ import annotations

import time
import heapq
import asyncio
import unittest
from typing import TypeVar, Generic, AsyncGenerator, Any, Coroutine

from fastapi.logger import logger

K = TypeVar('K')
V = TypeVar('V')


class BoundedExpiringDict(Generic[K, V]):
    def __init__(self, max_size: int, expiration_time: int) -> None:
        self.data_dict: dict[K, V] = {}
        self.timestamp_heap: list[tuple[float, K]] = []
        self.timeout = expiration_time

        # Without this, the controller is vulnerable to "user flood attacks,"
        # where someone can create a bunch of users by polling /request before
        # self.timeout expires and blow up memory.
        self.max_size = max_size

    def __getitem__(self, key: K) -> V:
        return self.data_dict[key]

    def __setitem__(self, key: K, value: V) -> None:
        if len(self.data_dict) >= self.max_size:
            self.cleanup()

        heapq.heappush(self.timestamp_heap, (time.monotonic(), key))
        self.data_dict[key] = value

    def __delitem__(self, key: K) -> None:
        # This is a bit inefficient, but it's not a common case operation.
        # We still need to do this to keep timestamp_heap in sync.
        del self.data_dict[key]
        for i, (_, existing_key) in enumerate(self.timestamp_heap):
            if existing_key == key:
                del self.timestamp_heap[i]
                break
        heapq.heapify(self.timestamp_heap)

    def __contains__(self, key: K) -> bool:
        return key in self.data_dict

    def __len__(self) -> int:
        return len(self.data_dict)

    def get(self, key: K, default: V | None = None) -> V | None:
        return self.data_dict.get(key, default)

    def pop(self, key: K, default: V | None = None) -> V | None:
        item = self.data_dict.pop(key, default)
        if item is not None:
            for i, (_, existing_key) in enumerate(self.timestamp_heap):
                if existing_key == key:
                    del self.timestamp_heap[i]
                    break
            heapq.heapify(self.timestamp_heap)
        return item

    def cleanup(self) -> None:
        now = time.monotonic()
        # After the while loop, the dictionary will be smaller than max_size
        # and all keys will have been accessed within the timeout.
        while (self.timestamp_heap and now - self.timestamp_heap[0][0] > self.timeout) or len(self.data_dict) > self.max_size:
            _, key = heapq.heappop(self.timestamp_heap)
            del self.data_dict[key]

        assert len(self.data_dict) == len(self.timestamp_heap)


T = TypeVar("T")


async def prepend_generator(
    first_item: T,
    generator: AsyncGenerator[T, None],
) -> AsyncGenerator[T, None]:
    """Prepend an item to an async generator."""
    yield first_item
    async for item in generator:
        yield item


def create_task(coroutine: Coroutine[Any, Any, T]) -> asyncio.Task[T]:
    """Create an `asyncio.Task` but ensure that exceptions are logged.

    Reference: https://quantlane.com/blog/ensure-asyncio-task-exceptions-get-logged/
    """
    loop = asyncio.get_running_loop()
    task = loop.create_task(coroutine)
    task.add_done_callback(_handle_task_exception)
    return task


def _handle_task_exception(task: asyncio.Task) -> None:
    """Print out the exception and traceback when a task dies with an exception."""
    try:
        task.result()
    except asyncio.CancelledError:
        # Cancellation should not be logged as an error.
        pass
    except Exception:  # pylint: disable=broad-except
        # `logger.exception` automatically handles exception and traceback info.
        logger.exception("Job task died with an exception!")


class TokenGenerationBuffer:
    """A constant sized buffer for tokens, used to handle stop sequences.

    Attributes:
        token_buffer (str): Internal buffer for tokens.
        matched_stop_str (bool): Whether the stop string has been seen. When this
            is True, generation should stop and `pop` will always return None.
    """
    def __init__(self, stop_str: str | None = None) -> None:
        """Initialize the buffer.

        If `stop_str` is None, the buffer will just return all tokens as they come.
        """
        self.stop_str = stop_str
        self.token_len_list = []
        self.token_buffer = ""
        self.matched_stop_str = False

    def append(self, text: str) -> None:
        """Append a token to the buffer."""
        if self.stop_str is not None:
            self.token_len_list.append(len(text))
        self.token_buffer += text

    def _pop_one(self) -> str:
        """Remove and return the first token in the buffer."""
        token_len = self.token_len_list.pop(0)
        token, self.token_buffer = self.token_buffer[:token_len], self.token_buffer[token_len:]
        return token

    def pop(self) -> str | None:
        """Try to pop a token from the buffer.

        Return value None means that there is nothing to yield for now.
        Repeated calls to this method will always just return None before more
        tokens are appended to the buffer.
        """
        # A short circuit for no stop string.
        if self.stop_str is None:
            return_buffer = self.token_buffer or None
            self.token_buffer = ""
            return return_buffer

        if self.matched_stop_str:
            return None

        # The token buffer matched the stop string. We're done generating.
        if self.stop_str == self.token_buffer:
            self.matched_stop_str = True
            return None

        # The tokens in the buffer could potentially be part of the stop string.
        # We'll stay put until we see more tokens. This also covers the case of
        # an empty token buffer.
        if self.stop_str.startswith(self.token_buffer):
            return None

        # We can return tokens from the beginning of the buffer until the buffer
        # is a prefix of the stop string.
        return_buffer = ""
        while self.token_buffer:
            return_buffer += self._pop_one()
            if self.stop_str == self.token_buffer:
                self.matched_stop_str = True
                break
            if self.stop_str.startswith(self.token_buffer):
                break

        return return_buffer or None


class TestTokenGenerationBuffer(unittest.TestCase):
    def test_basic1(self):
        buffer = TokenGenerationBuffer(stop_str="stop")

        buffer.append("hello")
        self.assertEqual(buffer.pop(), "hello")
        self.assertEqual(buffer.pop(), None)
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("world")
        self.assertEqual(buffer.pop(), "world")
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("stop")
        self.assertEqual(buffer.pop(), None)
        self.assertTrue(buffer.matched_stop_str)
        self.assertEqual(buffer.pop(), None)
        self.assertTrue(buffer.matched_stop_str)
        self.assertEqual(buffer.pop(), None)
        self.assertTrue(buffer.matched_stop_str)
        self.assertEqual(buffer.pop(), None)
        self.assertTrue(buffer.matched_stop_str)

    def test_basic2(self):
        buffer = TokenGenerationBuffer(stop_str="stop")

        buffer.append("hi")
        self.assertEqual(buffer.pop(), "hi")
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("stole")
        self.assertEqual(buffer.pop(), "stole")
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("sto")
        self.assertEqual(buffer.pop(), None)
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("ic")
        self.assertEqual(buffer.pop(), "stoic")
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("st")
        self.assertEqual(buffer.pop(), None)
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("opper")
        self.assertEqual(buffer.pop(), "stopper")
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("sto")
        self.assertEqual(buffer.pop(), None)
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("p")
        self.assertEqual(buffer.pop(), None)
        self.assertTrue(buffer.matched_stop_str)

    def test_falcon1(self):
        buffer = TokenGenerationBuffer(stop_str="\nUser")

        buffer.append("Hi")
        self.assertEqual(buffer.pop(), "Hi")
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("!")
        self.assertEqual(buffer.pop(), "!")
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("\n")
        self.assertEqual(buffer.pop(), None)
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("User")
        self.assertEqual(buffer.pop(), None)
        self.assertTrue(buffer.matched_stop_str)

    def test_falcon2(self):
        buffer = TokenGenerationBuffer(stop_str="\nUser")

        buffer.append("\n")
        self.assertEqual(buffer.pop(), None)
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("\n")
        self.assertEqual(buffer.pop(), "\n")
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("\n")
        self.assertEqual(buffer.pop(), "\n")
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("\n")
        self.assertEqual(buffer.pop(), "\n")
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("User")
        self.assertEqual(buffer.pop(), None)
        self.assertEqual(buffer.pop(), None)
        self.assertTrue(buffer.matched_stop_str)

    def test_no_stop_str(self):
        buffer = TokenGenerationBuffer(stop_str=None)

        buffer.append("hello")
        self.assertEqual(buffer.pop(), "hello")
        self.assertEqual(buffer.pop(), None)
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("world")
        self.assertEqual(buffer.pop(), "world")
        self.assertEqual(buffer.pop(), None)
        self.assertFalse(buffer.matched_stop_str)

        buffer.append("\n")
        self.assertEqual(buffer.pop(), "\n")
        self.assertEqual(buffer.pop(), None)
        self.assertFalse(buffer.matched_stop_str)


if __name__ == "__main__":
    unittest.main()
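The stop-string buffer is already exercised by the unit tests above. For the two smaller utilities, here is a brief usage sketch (keys and values are illustrative, and it assumes the spitfight package and its dependencies are installed).

# Sketch: BoundedExpiringDict evicts by age and size; prepend_generator re-attaches
# an item that was consumed while peeking at an async generator.
import asyncio
from spitfight.utils import BoundedExpiringDict, prepend_generator

states = BoundedExpiringDict(max_size=1000, expiration_time=600)
states["req-1"] = "pending"             # illustrative request state
states.cleanup()                        # evicts entries older than 600 seconds (none yet)
assert "req-1" in states

async def numbers():
    for i in range(1, 3):
        yield i

async def main():
    gen = numbers()
    first = await gen.__anext__()       # peek at the first item (as the router does)
    out = [x async for x in prepend_generator(first, gen)]
    assert out == [1, 2]

asyncio.run(main())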