from typing import Literal

from huggingface_hub import HfFileSystem, hf_hub_download

# Maps the short category keys used in the data files to display names.
KEY_TO_CATEGORY_NAME = {
    "full": "Overall",
    "coding": "Coding",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
}

# Maps each display name to a longer human-readable explanation.
CAT_NAME_TO_EXPLANATION = {
    "Overall": "Overall Questions",
    "Coding": "Coding: whether conversation contains code snippets",
    "Longer Query": "Longer Query (>= 500 tokens)",
    "English": "English Prompts",
    "Chinese": "Chinese Prompts",
    "French": "French Prompts",
    "Exclude Ties": "Exclude Ties and Bothbad",
    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
}

# License labels that are treated as proprietary.
PROPRIETARY_LICENSES = [
    "Proprietary",
]


def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """Download the most recent data file of a given type from a HF Space.

    Candidate files are the ``*.{file_type}`` entries at the top level of the
    Space. The "latest" one is the file whose name has the greatest suffix
    after the final underscore (e.g. ``results_20240520.pkl`` ->
    ``"20240520"``).

    NOTE(review): suffixes are compared as plain strings, so this presumably
    relies on a lexicographically sortable date format such as ``YYYYMMDD`` --
    confirm against the files actually published in the Space.

    Args:
        repo_id: The ID of the repository space.
        file_type: Extension of the data file to download; either ``"pkl"``
            or ``"csv"``.

    Returns:
        The local file path of the downloaded data file.

    Raises:
        IndexError: If the Space contains no file with the requested
            extension.
    """

    def extract_date(filename):
        # "spaces/org/name/prefix_20240520.pkl" -> "20240520"
        return filename.split("/")[-1].split(".")[0].split("_")[-1]

    fs = HfFileSystem()
    files = fs.glob(f"spaces/{repo_id}/*.{file_type}")
    if not files:
        # Same exception type the previous `sorted(...)[0]` produced, so
        # existing callers behave the same, but with an actionable message.
        raise IndexError(f"No '*.{file_type}' files found in space '{repo_id}'")

    # max() returns the first maximal element, matching the old
    # `sorted(..., reverse=True)[0]` (stable sort) without sorting everything.
    latest_file = max(files, key=extract_date)

    return hf_hub_download(
        repo_id=repo_id,
        filename=latest_file.split("/")[-1],
        repo_type="space",
    )


def get_constants(dfs):
    """Compute global Elo bounds and the largest per-month model count.

    Each DataFrame is read through its ``"rating"``, ``"Month-Year"`` and
    ``"License"`` columns.

    Args:
        dfs (dict): A dictionary containing DataFrames for different
            categories. Only the values are used.

    Returns:
        tuple: ``(min_elo_score, max_elo_score, upper_models_per_month)``:

        - min_elo_score: The smallest rounded rating across all DataFrames
          (``float("inf")`` if no rating is available).
        - max_elo_score: The largest rounded rating across all DataFrames
          (``float("-inf")`` if no rating is available).
        - upper_models_per_month (int): The largest number of non-null
          ratings in any (Month-Year, License) group across all DataFrames
          (``0`` if there are no groups).
    """
    min_elo_score = float("inf")
    max_elo_score = float("-inf")
    upper_models_per_month = 0

    for df in dfs.values():
        ratings = df["rating"]
        # The accumulator is always the first argument: a NaN candidate (e.g.
        # from an empty frame) never compares smaller/larger and is ignored.
        min_elo_score = min(min_elo_score, ratings.min().round())
        max_elo_score = max(max_elo_score, ratings.max().round())

        group_sizes = df.groupby(["Month-Year", "License"])["rating"].count()
        # With no groups, .max() would be NaN and int(NaN) raises; skip instead.
        if not group_sizes.empty:
            upper_models_per_month = max(
                upper_models_per_month, int(group_sizes.max())
            )

    return min_elo_score, max_elo_score, upper_models_per_month