# judge-arena / leaderboard.py
# kaikaidai's picture
# Create leaderboard.py
# 5267683 verified
from collections import defaultdict
from datetime import datetime, timezone
from typing import Dict, List
# Constants
# Rating a model holds until a vote involving it has been processed
# (used as the default value of the ratings table in get_leaderboard).
DEFAULT_ELO = 1200 # Starting ELO for new models
# Multiplier applied to (actual score - expected score) on every rating
# update; a larger value makes each individual vote move ratings further.
K_FACTOR = 32 # Standard chess K-factor
def get_leaderboard(model_data: Dict, voting_data: List, show_preliminary=True):
    """Build leaderboard rows by replaying pairwise votes through ELO updates.

    Votes are processed in the order they appear in ``voting_data``; every
    model starts at ``DEFAULT_ELO`` and each vote moves both participants by
    ``K_FACTOR * (actual score - expected score)``.

    Args:
        model_data: Mapping of model name to a metadata mapping that provides
            the keys ``"organization"`` and ``"license"``. Only models present
            here are rated and listed.
        voting_data: Iterable of vote records supporting ``.get`` with the
            keys ``"model_a"``, ``"model_b"`` and ``"winner"``. A winner of
            ``"A"`` or ``"B"`` is a win for that side; any other truthy value
            is scored as a draw. Votes with a missing/empty field or naming a
            model absent from ``model_data`` are skipped.
        show_preliminary: When False, models with fewer than 500 counted
            votes are left out of the result.

    Returns:
        A list of dicts with the keys ``"Model"``, ``"ELO Score"``,
        ``"95% CI"``, ``"# Votes"``, ``"Organization"`` and ``"License"``,
        ordered from highest to lowest rating.
    """
    ratings = defaultdict(lambda: DEFAULT_ELO)
    matches = defaultdict(int)

    for vote in voting_data:
        try:
            model_a = vote.get("model_a")
            model_b = vote.get("model_b")
            winner = vote.get("winner")

            # Ignore incomplete votes and votes about models that are no
            # longer part of model_data.
            if (
                not all([model_a, model_b, winner])
                or model_a not in model_data
                or model_b not in model_data
            ):
                continue

            matches[model_a] += 1
            matches[model_b] += 1

            elo_a = ratings[model_a]
            elo_b = ratings[model_b]

            # Expected scores from the logistic curve on a 400-point scale.
            expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
            expected_b = 1 - expected_a

            # Actual scores: win = 1, loss = 0, anything else = draw.
            score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
            score_b = 1 - score_a

            ratings[model_a] += K_FACTOR * (score_a - expected_a)
            ratings[model_b] += K_FACTOR * (score_b - expected_b)
        except Exception as e:
            # Best effort: one malformed record must not abort the whole
            # leaderboard, so report it and move on.
            print(f"Error processing vote: {e}")
            continue

    # Keep the untruncated rating next to each row so the final ordering is
    # decided by the real value, not by the integer shown to the user.
    ranked = []
    for model, info in model_data.items():
        votes = matches[model]

        # Skip models with < 500 votes if show_preliminary is False
        if not show_preliminary and votes < 500:
            continue

        elo = ratings[model]
        # Interval half-width shrinks with the square root of the vote count;
        # models without any votes report 0.
        ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
        row = {
            "Model": model,
            "ELO Score": f"{int(elo)}",
            "95% CI": f"±{int(ci)}",
            "# Votes": votes,
            "Organization": info["organization"],
            "License": info["license"],
        }
        ranked.append((elo, row))

    # Stable sort: models with exactly equal ratings keep model_data order.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return [row for _, row in ranked]
def get_leaderboard_stats(model_data: Dict, voting_data: List) -> str:
    """Render a markdown summary block with model count, vote count and time.

    The counts are simply the sizes of the two arguments. The timestamp is
    the current UTC time with the minutes always printed as ``00``.
    """
    # The format string emits only date and hour, so no explicit rounding of
    # minutes/seconds is needed before formatting.
    stamp = datetime.now(timezone.utc).strftime("%B %d, %Y at %H:00 UTC")
    vote_count = len(voting_data)
    model_count = len(model_data)

    # Leading and trailing empty entries reproduce the blank first line and
    # the final newline of the rendered block.
    pieces = [
        "",
        "### Leaderboard Stats",
        f"- **Total Models**: {model_count}",
        f"- **Total Votes**: {vote_count}",
        f"- **Last Updated**: {stamp}",
        "",
    ]
    return "\n".join(pieces)
def calculate_elo_change(rating_a: float, rating_b: float, winner: str) -> tuple[float, float]:
    """Return the ELO deltas ``(change_a, change_b)`` for a single match.

    ``winner`` equal to ``"A"`` or ``"B"`` is a win for that side; every other
    value is treated as a tie. The deltas are not applied to the ratings here.
    """
    # Probability-like expected result for each side (400-point logistic).
    win_chance_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    win_chance_b = 1 - win_chance_a

    # Actual result: 1 for a win, 0 for a loss, 0.5 each for a tie.
    result_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
    result_b = 1 - result_a

    return (
        K_FACTOR * (result_a - win_chance_a),
        K_FACTOR * (result_b - win_chance_b),
    )
def get_model_rankings(leaderboard: List[Dict]) -> Dict[str, int]:
    """Map each model name to its 1-based position in ``leaderboard``.

    Positions follow the order of the list as given; if a name occurs more
    than once, the position of its last occurrence is kept.
    """
    positions = {}
    for place, row in enumerate(leaderboard, start=1):
        positions[row["Model"]] = place
    return positions