""" It provides a platform for comparing the responses of two LLMs. """ import enum from uuid import uuid4 from firebase_admin import firestore import gradio as gr import lingua from db import db from leaderboard import build_leaderboard from leaderboard import SUPPORTED_LANGUAGES from model import check_models from model import supported_models from rate_limit import set_token import response from response import get_responses detector = lingua.LanguageDetectorBuilder.from_all_languages().build() class VoteOptions(enum.Enum): MODEL_A = "Model A is better" MODEL_B = "Model B is better" TIE = "Tie" def vote(vote_button, response_a, response_b, model_a_name, model_b_name, prompt, instruction, category, source_lang, target_lang): doc_id = uuid4().hex winner = VoteOptions(vote_button).name.lower() deactivated_buttons = [gr.Button(interactive=False) for _ in range(3)] outputs = deactivated_buttons + [gr.Row(visible=True)] doc = { "id": doc_id, "prompt": prompt, "instruction": instruction, "model_a": model_a_name, "model_b": model_b_name, "model_a_response": response_a, "model_b_response": response_b, "winner": winner, "timestamp": firestore.SERVER_TIMESTAMP } if category == response.Category.SUMMARIZE.value: language_a = detector.detect_language_of(response_a) language_b = detector.detect_language_of(response_b) # TODO(#37): Move DB operations to db.py. doc_ref = db.collection("arena-summarizations").document(doc_id) doc["model_a_response_language"] = language_a.name.lower() doc["model_b_response_language"] = language_b.name.lower() doc_ref.set(doc) return outputs if category == response.Category.TRANSLATE.value: if not source_lang or not target_lang: raise gr.Error("Please select source and target languages.") doc_ref = db.collection("arena-translations").document(doc_id) doc["source_language"] = source_lang.lower() doc["target_language"] = target_lang.lower() doc_ref.set(doc) return outputs raise gr.Error("Please select a response type.") # Removes the persistent orange border from the leaderboard, which # appears due to the 'generating' class when using the 'every' parameter. css = """ .leaderboard .generating { border: none; } """ with gr.Blocks(title="Yanolja Arena", css=css) as app: token = gr.Textbox(visible=False) set_token(app, token) with gr.Row(): gr.HTML("""

Yanolja Arena

Yanolja Arena helps find the best LLMs for summarizing and translating text. We compare two random models at a time and use an ELO rating system to score them.

This is an open-source project. Check it out on GitHub.

""") with gr.Accordion("How to Use", open=False): gr.Markdown(""" 1. **For Summaries:** - Enter the text you want summarized into the prompt box. 2. **For Translations:** - Choose the language you're translating from and to. - Enter the text you want translated into the prompt box. 3. **Voting:** - After you see both results, pick which one you think is better. """) with gr.Accordion("Available Models", open=False): gr.Markdown("\n".join([f"- {model.name}" for model in supported_models])) with gr.Row(): category_radio = gr.Radio( choices=[category.value for category in response.Category], value=response.Category.SUMMARIZE.value, label="Category", info="The chosen category determines the instruction sent to the LLMs.") source_language = gr.Dropdown( choices=SUPPORTED_LANGUAGES, value=lingua.Language.ENGLISH.name.capitalize(), label="Source language", info="Choose the source language for translation.", interactive=True, visible=False) target_language = gr.Dropdown( choices=SUPPORTED_LANGUAGES, value=lingua.Language.KOREAN.name.capitalize(), label="Target language", info="Choose the target language for translation.", interactive=True, visible=False) def update_language_visibility(category): visible = category == response.Category.TRANSLATE.value return { source_language: gr.Dropdown(visible=visible), target_language: gr.Dropdown(visible=visible) } category_radio.change(update_language_visibility, category_radio, [source_language, target_language]) model_names = [gr.State(None), gr.State(None)] response_boxes = [gr.State(None), gr.State(None)] prompt_textarea = gr.TextArea(label="Prompt", lines=4) submit = gr.Button() with gr.Group(): with gr.Row(): response_boxes[0] = gr.Textbox(label="Model A", interactive=False) response_boxes[1] = gr.Textbox(label="Model B", interactive=False) with gr.Row(visible=False) as model_name_row: model_names[0] = gr.Textbox(show_label=False) model_names[1] = gr.Textbox(show_label=False) with gr.Row(visible=False) as vote_row: option_a = gr.Button(VoteOptions.MODEL_A.value) option_b = gr.Button(VoteOptions.MODEL_B.value) tie = gr.Button(VoteOptions.TIE.value) instruction_state = gr.State("") # The following elements need to be reset when the user changes # the category, source language, or target language. ui_elements = [ response_boxes[0], response_boxes[1], model_names[0], model_names[1], instruction_state, model_name_row, vote_row ] def reset_ui(): return [gr.Textbox(value="") for _ in range(4) ] + [gr.State(""), gr.Row(visible=False), gr.Row(visible=False)] category_radio.change(fn=reset_ui, outputs=ui_elements) source_language.change(fn=reset_ui, outputs=ui_elements) target_language.change(fn=reset_ui, outputs=ui_elements) submit_event = submit.click( fn=lambda: [ gr.Radio(interactive=False), gr.Dropdown(interactive=False), gr.Dropdown(interactive=False), gr.Button(interactive=False), gr.Row(visible=False), gr.Row(visible=False), ] + [gr.Button(interactive=True) for _ in range(3)], outputs=[ category_radio, source_language, target_language, submit, vote_row, model_name_row, option_a, option_b, tie ]).then(fn=get_responses, inputs=[ prompt_textarea, category_radio, source_language, target_language, token ], outputs=response_boxes + model_names + [instruction_state]) submit_event.success(fn=lambda: gr.Row(visible=True), outputs=vote_row) submit_event.then( fn=lambda: [ gr.Radio(interactive=True), gr.Dropdown(interactive=True), gr.Dropdown(interactive=True), gr.Button(interactive=True) ], outputs=[category_radio, source_language, target_language, submit]) def deactivate_after_voting(option_button: gr.Button): option_button.click( fn=vote, inputs=[option_button] + response_boxes + model_names + [ prompt_textarea, instruction_state, category_radio, source_language, target_language ], outputs=[option_a, option_b, tie, model_name_row]).then( fn=lambda: [gr.Button(interactive=False) for _ in range(3)], outputs=[option_a, option_b, tie]) for option in [option_a, option_b, tie]: deactivate_after_voting(option) build_leaderboard() if __name__ == "__main__": check_models(supported_models) # We need to enable queue to use generators. app.queue(api_open=False) app.launch(debug=True, show_api=False)