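"""Gradio dashboard for the DIBT Multilingual Prompt Evaluation Project.

Connects to one Argilla instance per supported language (credentials are read
from environment variables), aggregates the submitted annotations, and renders
KPI charts, a per-language bar chart, and a contributor leaderboard with Altair
and Gradio. A background scheduler restarts the Space periodically so the
numbers stay up to date.
"""
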
import datetime
import os
from typing import Dict, Tuple
from uuid import UUID

import altair as alt
import argilla as rg
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from argilla.client.feedback.dataset.remote.dataset import RemoteFeedbackDataset
from argilla.feedback import FeedbackDataset
from huggingface_hub import restart_space

# Labels used for chart legends and titles
ANNOTATED = "Annotated"
NUMBER_ANNOTATED = "Total Annotations"
NUMBER_ANNOTATORS = "Total Annotators"
PENDING = "Pending Annotations"
NAME = "Username"
CATEGORY = "Category"

SUPPORTED_LANGUAGES = [
    "Russian",
    "Dutch",
    "Vietnamese",
    "Arabic",
    "Filipino",
    "German",
    "Swahili",
    "Malagasy",
    "Czech",
    # "Tamil",
    # "Telugu",
    "Hungarian",
]
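
# Each language above is expected to expose its Argilla connection details via
# <LANGUAGE>_API_URL, <LANGUAGE>_API_KEY, <LANGUAGE>_DATASET and
# <LANGUAGE>_WORKSPACE environment variables; fetch_data() below skips any
# language whose variables are missing.
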
def restart() -> None:
    """
    This function restarts the space where the dashboard is hosted.
    """
    # Update Space name with your Space information
    gr.Info("Restarting space at " + str(datetime.datetime.now()))
    restart_space(
        "DIBT/PromptTranslationMultilingualDashboard",
        token=os.getenv("HF_TOKEN"),
        # factory_reboot=True,
    )


def get_user_annotations_dictionary(
    dataset: FeedbackDataset | RemoteFeedbackDataset,
) -> Dict[str, int]:
    """
    This function returns a dictionary with the username as the key and the number of annotations as the value.

    Args:
        dataset: The dataset to be analyzed.

    Returns:
        A dictionary with the username as the key and the number of annotations as the value.
    """
    output = {}
    for record in dataset:
        for response in record.responses:
            if str(response.user_id) not in output:
                output[str(response.user_id)] = 1
            else:
                output[str(response.user_id)] += 1

    # Change the keys from the user id to the username
    for key in list(output.keys()):
        output[rg.User.from_id(UUID(key)).username] = output.pop(key)

    return output


def fetch_data() -> Tuple[Dict[str, int], Dict[str, dict]]:
    """
    This function fetches the data from all the datasets and stores the annotation information in two dictionaries.

    To do so, it looks for environment variables that follow this pattern:
    - SPANISH_API_URL
    - SPANISH_API_KEY
    - SPANISH_DATASET
    - SPANISH_WORKSPACE

    If the language name matches one of the languages in the SUPPORTED_LANGUAGES list, the function fetches
    the total number of annotations and the annotators for that language.

    Returns:
        Tuple[Dict[str, int], Dict[str, dict]]: A tuple with two dictionaries. The first one contains the total number
        of annotations for each language. The second one contains the annotators for each language.
    """
print(f"Starting to fetch data: {datetime.datetime.now()}") | |
# Obtain all the environment variables | |
environment_variables_languages = {} | |
for language in SUPPORTED_LANGUAGES: | |
print("Fetching data for: ", language) | |
if not os.getenv(f"{language.upper()}_API_URL"): | |
print(f"Missing environment variables for {language}") | |
continue | |
environment_variables_languages[language] = { | |
"api_url": os.getenv(f"{language.upper()}_API_URL"), | |
"api_key": os.getenv(f"{language.upper()}_API_KEY"), | |
"dataset_name": os.getenv(f"{language.upper()}_DATASET"), | |
"workspace_name": os.getenv(f"{language.upper()}_WORKSPACE"), | |
} | |
global annotations, annotators | |
annotations = {} | |
annotators = {} | |
# Connect to each space and obtain the total amount of annotations and annotators | |
for language, environment_variables in environment_variables_languages.items(): | |
rg.init( | |
api_url=environment_variables["api_url"], | |
api_key=environment_variables["api_key"], | |
) | |
# Obtain the dataset and see how many pending records are there | |
dataset = rg.FeedbackDataset.from_argilla( | |
environment_variables["dataset_name"], | |
workspace=environment_variables["workspace_name"], | |
) | |
# filtered_source_dataset = source_dataset.filter_by(response_status=["pending"]) | |
target_dataset = dataset.filter_by(response_status=["submitted"]) | |
annotations[language.lower()] = len(target_dataset) | |
annotators[language.lower()] = { | |
"annotators": get_user_annotations_dictionary(target_dataset) | |
} | |
# Print the current date and time | |
print(f"Data fetched: {datetime.datetime.now()}") | |
return annotations, annotators | |
def kpi_chart_total_annotations() -> alt.Chart:
    """
    This function returns a KPI chart with the total number of annotations.

    Returns:
        An altair chart with the KPI chart.
    """
    total_annotations = sum(annotations.values())

    # Single-row DataFrame holding the KPI value
    data = pd.DataFrame({"Category": [NUMBER_ANNOTATED], "Value": [total_annotations]})

    # Create Altair chart
    chart = (
        alt.Chart(data)
        .mark_text(fontSize=100, align="center", baseline="middle", color="#e68b39")
        .encode(text="Value:N")
        .properties(title=NUMBER_ANNOTATED, width=250, height=200)
    )
    return chart


def kpi_chart_total_annotators() -> alt.Chart:
    """
    This function returns a KPI chart with the total number of annotators.

    Returns:
        An altair chart with the KPI chart.
    """
    total_annotators = sum(len(value["annotators"]) for value in annotators.values())

    # Single-row DataFrame holding the KPI value
    data = pd.DataFrame({"Category": [NUMBER_ANNOTATORS], "Value": [total_annotators]})

    # Create Altair chart
    chart = (
        alt.Chart(data)
        .mark_text(fontSize=100, align="center", baseline="middle", color="#e68b39")
        .encode(text="Value:N")
        .properties(title=NUMBER_ANNOTATORS, width=250, height=200)
    )
    return chart


def render_hub_user_link(hub_id: str) -> str:
    """
    This function returns a link to the user's profile on Hugging Face.

    Args:
        hub_id: The user's id on Hugging Face.

    Returns:
        A string with the link to the user's profile on Hugging Face.
    """
    link = f"https://huggingface.co/{hub_id}"
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{hub_id}</a>'


def obtain_top_users(user_annotators_list: Dict[str, dict], N: int = 50) -> pd.DataFrame:
    """
    This function returns the top N users with the most annotations.

    Args:
        user_annotators_list: A dictionary mapping each language to its annotators and their annotation counts.
        N: The number of users to be returned. 50 by default.

    Returns:
        A pandas dataframe with the top N users with the most annotations.
    """
    user_id_annotations = {}
    for _, user_annotators in user_annotators_list.items():
        for user_id, number_annotations in user_annotators["annotators"].items():
            if user_id not in user_id_annotations:
                user_id_annotations[user_id] = number_annotations
            else:
                user_id_annotations[user_id] += number_annotations

    dataframe = pd.DataFrame(
        user_id_annotations.items(), columns=[NAME, NUMBER_ANNOTATED]
    )
    dataframe[NAME] = dataframe[NAME].apply(render_hub_user_link)
    dataframe = dataframe.sort_values(by=NUMBER_ANNOTATED, ascending=False)
    return dataframe.head(N)


def get_top(N: int = 50) -> pd.DataFrame:
    """
    This function returns the top N users with the most annotations.

    Args:
        N: The number of users to be returned. 50 by default.

    Returns:
        A pandas dataframe with the top N users with the most annotations.
    """
    return obtain_top_users(annotators, N=N)


def donut_chart_total() -> alt.Chart:
    """
    This function returns a donut chart with the progress of the total annotations in each language.

    Returns:
        An altair chart with the donut chart.
    """
    # Per-language totals from the global annotations dictionary
    annotated_records = list(annotations.values())
    languages = [language.capitalize() for language in annotations.keys()]

    # Prepare data for the donut chart
    source = pd.DataFrame(
        {
            "values": annotated_records,
            "category": languages,
            # "colors": ["#4682b4", "#e68c39"],  # Blue for Completed, Orange for Remaining
        }
    )

    base = alt.Chart(source).encode(
        theta=alt.Theta("values:Q", stack=True),
        radius=alt.Radius(
            "values", scale=alt.Scale(type="sqrt", zero=True, rangeMin=20)
        ),
        color=alt.Color(
            field="category",
            type="nominal",
            legend=alt.Legend(title=CATEGORY),
        ),
    )

    c1 = base.mark_arc(innerRadius=20, stroke="#fff")
    c2 = base.mark_text(radiusOffset=20).encode(text="values:Q")
    chart = c1 + c2
    return chart


def bar_chart_total() -> alt.Chart:
    """A bar chart with the progress of the total annotations in each language.

    Returns:
        An altair chart with the bar chart.
    """
    # Per-language totals from the global annotations dictionary
    annotated_records = list(annotations.values())
    languages = [language.capitalize() for language in annotations.keys()]

    # Prepare data for the bar chart
    source = pd.DataFrame(
        {
            "values": annotated_records,
            "category": languages,
        }
    )

    base = alt.Chart(source, width=300, height=200).encode(
        x=alt.X("values:Q", title="Translations"),
        y=alt.Y("category:N", title="Language"),
        text="values:Q",
        color=alt.Color("category:N", legend=None),
    )
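    # Vertical reference line at x=500, presumably marking the per-language
    # translation target (an assumption; the original code does not label it)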
    rule = alt.Chart(source).mark_rule(color="red").encode(x=alt.datum(500))
    return base.mark_bar() + base.mark_text(align="left", dx=2) + rule


def main() -> None:
    fetch_data()

    # Hide the orange border that Gradio draws around elements while they are loading
    css = """
    .generating {
        border: none;
    }
    """

    with gr.Blocks(css=css, delete_cache=(300, 300)) as demo:
        gr.Markdown(
            """
            # Translation Efforts Dashboard - Multilingual Prompt Evaluation Project

            This dashboard shows the progress made in each language of the Multilingual Prompt Evaluation Project. If you want to add a new language to this dashboard, please open an issue and we will contact you to obtain the necessary API keys and URLs to include your language.

            ## How to participate

            Participating is easy. Go to the Annotation Space of the language of your choice, log in or create a Hugging Face account, and you can start annotating right away.

            - [Spanish](https://somosnlp-dibt-prompt-translation-for-es.hf.space)
            - [Russian](https://dibt-russian-prompt-translation-for-russian.hf.space)
            - [Dutch](https://dibt-dutch-prompt-translation-for-dutch.hf.space)
            - [Vietnamese](https://ai-vietnam-prompt-translation-for-vie.hf.space)
            - [Arabic](https://2a2i-prompt-translation-for-arabic.hf.space)
            - [Filipino](https://dibt-filipino-prompt-translation-for-filipino.hf.space)
            - [German](https://huggingface.co/spaces/DIBT-German/prompt-translation-for-German)
            - [Swahili](https://dibt-swahili-prompt-translation-for-swahili.hf.space)
            - [Malagasy](https://dibt-malagasy-prompt-translation-for-malagasy.hf.space)
            - [Tamil](https://data-indica-prompt-translation-for-tamil.hf.space)
            - [Telugu](https://data-indica-prompt-translation-for-telugu.hf.space)
            - [Czech](https://dibt-czech-prompt-translation-for-czech.hf.space)
            - [Hungarian](https://dibt-hungarian-prompt-translation-for-hungarian.hf.space)
            """
        )
        gr.Markdown(
            """
            ## Annotations among Languages

            Here you can see the progress of the annotations among the different languages.
            """
        )
        with gr.Row():
            kpi_chart_annotations = gr.Plot(label="Plot")
            demo.load(
                kpi_chart_total_annotations,
                inputs=[],
                outputs=[kpi_chart_annotations],
            )

            bar_languages = gr.Plot(label="Plot")
            demo.load(
                bar_chart_total,
                inputs=[],
                outputs=[bar_languages],
            )
        gr.Markdown(
            """
            ## Hall of Fame

            Check out the users with the most contributions across the different translation efforts.
            """
        )
        with gr.Row():
            kpi_chart_annotators = gr.Plot(label="Plot")
            demo.load(
                kpi_chart_total_annotators,
                inputs=[],
                outputs=[kpi_chart_annotators],
            )

            top_df_plot = gr.Dataframe(
                headers=[NAME, NUMBER_ANNOTATED],
                datatype=[
                    "markdown",
                    "number",
                ],
                row_count=50,
                col_count=(2, "fixed"),
                interactive=False,
            )
            demo.load(get_top, None, [top_df_plot])

    # Refresh the data in the background: fetch_data() only runs at startup,
    # so the Space is restarted every 30 minutes to pull fresh numbers
    scheduler = BackgroundScheduler()
    _ = scheduler.add_job(restart, "interval", minutes=30)
    scheduler.start()

    # Launch the Gradio interface
    demo.launch()


if __name__ == "__main__":
    main()