"""Gradio dashboard of Hugging Face Hub datasets for a given framework/tag.

For the selected tag the app shows a per-author dataset count (Markdown) and a
plot of the cumulative number of datasets per month, optionally overlaid with
the month-over-month change in newly created datasets.
"""

from functools import lru_cache

import gradio as gr
import httpx
import pandas as pd
import plotly.express as px
from toolz import groupby

# Endpoint queried for the dataset listing; the tag is sent as the `filter`
# query parameter.
HUB_DATASETS_URL = "https://huggingface.co/api/datasets"
# Explicit timeout (seconds) for the listing request.
REQUEST_TIMEOUT_SECONDS = 30.0

# Suggested tags shown in the dropdown; users may also type a custom value.
choices = sorted(
    [
        "art",
        "biology",
        "code",
        "distilabel",
        "fiftyone",
        "legal",
        "medical",
        "sentence-transformers",
        "synthetic",
    ]
)


@lru_cache(maxsize=100)
def fetch_data(framework):
    """Fetch the datasets matching ``framework`` and group them by author.

    Results are memoised per ``framework`` for the lifetime of the process, so
    repeated selections do not hit the API again. The returned objects are
    shared between calls through the cache; callers must not mutate them.

    Args:
        framework: Tag/framework name used as the ``filter`` query parameter.

    Returns:
        A ``(data, grouped)`` tuple: ``data`` is the decoded JSON response and
        ``grouped`` maps each author to the list of their entries, ordered by
        descending number of datasets.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
        httpx.HTTPError: On transport failures (including timeouts).
    """
    # Pass the tag through `params` so user-typed values are URL-encoded
    # instead of being spliced raw into the query string.
    response = httpx.get(
        HUB_DATASETS_URL,
        params={"filter": framework},
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail loudly on an error response rather than trying to group an error
    # payload. Exceptions are not cached by lru_cache, so a retry refetches.
    response.raise_for_status()
    data = response.json()

    by_author = groupby(lambda entry: entry["author"], data)
    grouped = dict(
        sorted(by_author.items(), key=lambda item: len(item[1]), reverse=True)
    )
    return data, grouped


def generate_dashboard(data, grouped, framework):
    """Render the Markdown summary of datasets per author.

    Args:
        data: Raw dataset entries. Unused here; kept so the signature stays
            compatible with existing callers.
        grouped: Mapping of author -> list of that author's dataset entries,
            iterated in its existing order.
        framework: Tag/framework name shown in the heading.

    Returns:
        The Markdown text for the summary panel.
    """
    total_datasets = sum(len(entries) for entries in grouped.values())
    parts = [
        f"## Hugging Face datasets for {framework} \n\n",
        f"**Total number of datasets: {total_datasets}**\n\n",
        f"**Total number of authors: {len(grouped)}**\n\n",
        "### Datasets per Author\n\n",
    ]
    for author, entries in grouped.items():
        parts.append(f"- **Author:** [{author}](https://huggingface.co/{author})\n")
        # Two-space indent so the count renders as a nested bullet under the
        # author instead of as a sibling list item.
        parts.append(f"  - **Number of datasets:** {len(entries)}\n")
    return "".join(parts)


def _monthly_counts(data):
    """Aggregate dataset entries into per-month counts, excluding this month.

    Returns a DataFrame with the columns ``month`` (``YYYY-MM`` string),
    ``count``, ``cumulative_count`` and ``growth_rate`` (percentage change of
    ``count`` relative to the previous row). The frame is empty when ``data``
    is empty.
    """
    if not data:
        # Without any rows there is no "createdAt" column to parse.
        return pd.DataFrame(
            columns=["month", "count", "cumulative_count", "growth_rate"]
        )

    df = pd.DataFrame(data)
    df["createdAt"] = pd.to_datetime(df["createdAt"])
    df["month"] = df["createdAt"].dt.to_period("M").astype(str)

    # Exclude the current (still incomplete) month. Both sides are zero-padded
    # "YYYY-MM" strings, so string comparison orders them chronologically.
    current_month = pd.Period.now("M").strftime("%Y-%m")
    df = df[df["month"] < current_month]

    counts = df.groupby("month").size().reset_index(name="count")
    counts["cumulative_count"] = counts["count"].cumsum()
    # Change in the number of *newly created* datasets versus the previous
    # month (not the change of the cumulative total).
    counts["growth_rate"] = counts["count"].pct_change()
    return counts


def plot_datasets_growth(data, framework, show_growth_rate=True):
    """Build the dataset-growth figure for ``framework``.

    Args:
        data: Dataset entries, each carrying a ``createdAt`` timestamp. May be
            empty, in which case an empty plot is produced.
        framework: Tag/framework name used in the titles.
        show_growth_rate: When true, overlay the month-over-month growth rate
            on a secondary y-axis on the right.

    Returns:
        The Plotly figure.
    """
    df_counts = _monthly_counts(data)

    fig = px.line(df_counts, x="month", y="cumulative_count", title="Dataset Growth")
    fig.update_layout(
        xaxis_title="Month",
        # NOTE(review): `yaxis_title` and `yaxis=dict(title=...)` both target
        # the same property; presumably the later `yaxis` entry wins - confirm
        # and drop the redundant one.
        yaxis_title="Cumulative Number of Datasets",
        yaxis=dict(title=f"Cumulative Number of Datasets ({framework})"),
        legend=dict(
            title="", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        ),
    )

    if show_growth_rate:
        # Secondary axis overlaid on the primary one, formatted as percentages.
        fig.update_layout(
            yaxis2=dict(
                title="Month-over-Month Growth Rate",
                overlaying="y",
                side="right",
                tickformat=",.0%",
            )
        )
        fig.add_scatter(
            x=df_counts["month"],
            y=df_counts["growth_rate"],
            name="Growth Rate",
            yaxis="y2",
        )

    subtitle = "Cumulative number of datasets" + (
        " and month-over-month growth rate" if show_growth_rate else ""
    )
    fig.update_layout(
        title={
            "text": f"Dataset Growth for {framework} datasets",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        title_font=dict(size=24),
        annotations=[
            dict(
                x=0.5,
                y=0.85,
                xref="paper",
                yref="paper",
                text=subtitle,
                showarrow=False,
                font=dict(size=14),
            )
        ],
    )
    return fig


def update_dashboard(framework, show_growth_rate=True):
    """Gradio callback: refresh the plot and summary for ``framework``.

    Args:
        framework: Selected or typed tag. May be ``None``/empty when the
            dropdown is cleared or the checkbox is toggled before a selection.
        show_growth_rate: Whether to overlay the growth-rate trace.

    Returns:
        A ``(figure, markdown)`` tuple. ``figure`` is ``None`` when there is
        nothing to plot (no tag selected, or no datasets returned).
    """
    if not framework:
        # Nothing selected yet: do not query the API with an empty filter.
        return None, "Select a framework/tag to display the dashboard."

    data, grouped = fetch_data(framework)
    dashboard = generate_dashboard(data, grouped, framework)
    if not data:
        # No datasets for this tag: show the (zero-count) summary, no plot.
        return None, dashboard

    fig = plot_datasets_growth(data, framework, show_growth_rate)
    return fig, dashboard


with gr.Blocks() as demo:
    gr.Markdown("# Dataset frameworks/tags on the Hub")
    gr.Markdown(
        "This dashboard displays the number of datasets per author and the growth of datasets over time for a given framework/tag."
    )
    framework = gr.Dropdown(
        choices=choices,
        allow_custom_value=True,
        label="Select a framework/tag",
    )
    show_growth_rate = gr.Checkbox(True, label="Show growth rate")
    plot = gr.Plot(label="Growth of datasets over time")
    markdown = gr.Markdown(label="summary")
    # Both controls drive the same callback so either change refreshes the
    # plot and the summary together.
    framework.change(
        update_dashboard,
        inputs=[framework, show_growth_rate],
        outputs=[plot, markdown],
    )
    show_growth_rate.change(
        update_dashboard,
        inputs=[framework, show_growth_rate],
        outputs=[plot, markdown],
    )

demo.launch()