hynky (HF staff) committed
Commit 75448af
Parent: a6d926d

Refactor the code

app.py CHANGED
@@ -1,4 +1,8 @@
 from src.view.view import create_interface
 
-demo = create_interface()
-demo.launch()
+global demo
+demo = None
+
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch()
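
A behavioral note on the new entry point: `demo` now exists at module level but stays `None` unless `app.py` runs as a script, so importing the module no longer builds or launches the interface. A minimal sketch of what that implies (assumes the repository layout above):

```python
# Sketch: importing app no longer constructs the Gradio interface.
import app

assert app.demo is None  # only `python app.py` builds and launches demo
```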
src/logic/data_fetching.py CHANGED
@@ -1,5 +1,7 @@
+from functools import partial
 import os
 import json
+import re
 import tempfile
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor
@@ -9,30 +11,25 @@ from datatrove.utils.stats import MetricStatsDict
 import gradio as gr
 import tenacity
 
+from src.logic.graph_settings import Grouping
+
 def find_folders(base_folder: str, path: str) -> List[str]:
-    base_folder = get_datafolder(base_folder)
-    if not base_folder.exists(path):
+    base_folder_df = get_datafolder(base_folder)
+    if not base_folder_df.exists(path):
         return []
     return sorted(
         [
-            folder["name"]
-            for folder in base_folder.ls(path, detail=True)
-            if folder["type"] == "directory" and not folder["name"].rstrip("/") == path
+            folder
+            for folder, info in base_folder_df.find(path, maxdepth=1, withdirs=True, detail=True).items()
+            if info["type"] == "directory" and not (folder.rstrip("/") == path.rstrip("/"))
         ]
     )
 
-def find_metrics_folders(base_folder: str) -> List[str]:
-    base_data_df = get_datafolder(base_folder)
-    dirs = sorted(
-        folder
-        for folder, info in base_data_df.find("", detail=True, maxdepth=1, withdirs=True).items()
-        if info["type"] == "directory"
-    )
-    return sorted(list(set(dirs)))
-
 def fetch_datasets(base_folder: str):
-    datasets = sorted(find_metrics_folders(base_folder))
-    return datasets, gr.update(choices=datasets, value=None), fetch_groups(base_folder, datasets, None, "union")
+    datasets = sorted(find_folders(base_folder, ""))
+    if len(datasets) == 0:
+        raise ValueError("No datasets found")
+    return datasets
 
 def fetch_groups(base_folder: str, datasets: List[str], old_groups: str, type: str = "intersection"):
     if not datasets:
@@ -55,7 +52,7 @@ def fetch_groups(base_folder: str, datasets: List[str], old_groups: str, type: s
     if not value and len(new_choices) == 1:
         value = list(new_choices)[0]
 
-    return gr.update(choices=sorted(list(new_choices)), value=value)
+    return gr.Dropdown(choices=sorted(list(new_choices)), value=value)
 
 def fetch_metrics(base_folder: str, datasets: List[str], group: str, old_metrics: str, type: str = "intersection"):
     if not group:
@@ -79,7 +76,7 @@ def fetch_metrics(base_folder: str, datasets: List[str], group: str, old_metrics
     if not value and len(new_possibles_choices) == 1:
         value = list(new_possibles_choices)[0]
 
-    return gr.update(choices=sorted(list(new_possibles_choices)), value=value)
+    return gr.Dropdown(choices=sorted(list(new_possibles_choices)), value=value)
 
 def reverse_search(base_folder: str, possible_datasets: List[str], grouping: str, metric_name: str) -> str:
     with ThreadPoolExecutor() as executor:
@@ -91,7 +88,7 @@ def reverse_search(base_folder: str, possible_datasets: List[str], grouping: str
 
 def reverse_search_add(datasets: List[str], reverse_search_results: str) -> List[str]:
     datasets = datasets or []
-    return sorted(list(set(datasets + reverse_search_results.strip().split("\n"))))
+    return list(set(datasets + reverse_search_results.strip().split("\n")))
 
 def metric_exists(base_folder: str, path: str, metric_name: str, group_by: str) -> bool:
     base_folder = get_datafolder(base_folder)
@@ -105,4 +102,39 @@ def load_metrics(base_folder: str, path: str, metric_name: str, group_by: str) -
     return MetricStatsDict.from_dict(json_metric)
 
 def load_data(dataset_path: str, base_folder: str, grouping: str, metric_name: str) -> MetricStatsDict:
     return load_metrics(base_folder, dataset_path, metric_name, grouping)
+
+
+def fetch_graph_data(
+    base_folder: str,
+    datasets: List[str],
+    metric_name: str,
+    grouping: Grouping,
+    progress=gr.Progress(),
+):
+    if len(datasets) <= 0 or not metric_name or not grouping:
+        return None
+
+    with ThreadPoolExecutor() as pool:
+        data = list(
+            progress.tqdm(
+                pool.map(
+                    partial(load_data, base_folder=base_folder, metric_name=metric_name, grouping=grouping),
+                    datasets,
+                ),
+                total=len(datasets),
+                desc="Loading data...",
+            )
+        )
+
+    data = {path: result for path, result in zip(datasets, data)}
+    return data, None
+
+def update_datasets_with_regex(regex: str, selected_runs: List[str], all_runs: List[str]):
+    if not regex:
+        return []
+    new_dsts = {run for run in all_runs if re.search(regex, run)}
+    if not new_dsts:
+        return selected_runs
+    dst_union = new_dsts.union(selected_runs or [])
+    return sorted(list(dst_union))
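
A quick illustration of the new `update_datasets_with_regex` semantics, grounded in the function above (the dataset names are made up):

```python
# Sketch: how update_datasets_with_regex merges regex matches into the selection.
from src.logic.data_fetching import update_datasets_with_regex

all_runs = ["cc-2023-50/base", "cc-2023-50/filtered", "wiki/en"]  # hypothetical names
selected = ["wiki/en"]

# Matches are unioned with the current selection and sorted:
print(update_datasets_with_regex(r"cc-2023-50", selected, all_runs))
# -> ['cc-2023-50/base', 'cc-2023-50/filtered', 'wiki/en']

# An empty regex clears the selection; a regex with no matches leaves it unchanged:
print(update_datasets_with_regex("", selected, all_runs))           # -> []
print(update_datasets_with_regex(r"no-match", selected, all_runs))  # -> ['wiki/en']
```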
src/logic/data_processing.py CHANGED
@@ -1,3 +1,4 @@
+from datetime import datetime
 import json
 import re
 import heapq
@@ -7,6 +8,8 @@ from typing import Dict, Tuple, List, Literal
 import gradio as gr
 from datatrove.utils.stats import MetricStatsDict
 
+from src.logic.graph_settings import Grouping
+
 PARTITION_OPTIONS = Literal["Top", "Bottom", "Most frequent (n_docs)"]
 
 def prepare_for_non_grouped_plotting(metric: Dict[str, MetricStatsDict], normalization: bool, rounding: int) -> Dict[float, float]:
@@ -35,13 +38,14 @@ def prepare_for_group_plotting(metric: Dict[str, MetricStatsDict], top_k: int, d
     stds = [metric[key].standard_deviation for key in keys]
     return keys, means, stds
 
-def export_data(exported_data: Dict[str, MetricStatsDict], metric_name: str):
+def export_data(exported_data: Dict[str, MetricStatsDict], metric_name: str, grouping: Grouping):
     if not exported_data:
         return None
-    with tempfile.NamedTemporaryFile(mode="w", delete=False, prefix=metric_name, suffix=".json") as temp:
+
+    file_name = f"{metric_name}_{grouping}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
+    with open(file_name, 'w') as f:
         json.dump({
             name: sorted([{"value": key, **value} for key, value in dt.to_dict().items()], key=lambda x: x["value"])
             for name, dt in exported_data.items()
-        }, temp, indent=2)
-        temp_path = temp.name
-    return gr.update(visible=True, value=temp_path)
+        }, f, indent=2)
+    return gr.File(value=file_name, visible=True)
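
`export_data` now writes a timestamped JSON file into the working directory instead of a `NamedTemporaryFile`, and returns a `gr.File` update pointing at it. A sketch of the resulting filename (the inputs are hypothetical):

```python
# Sketch: the filename pattern produced by the reworked export_data.
from datetime import datetime

metric_name, grouping = "doc_length", "histogram"  # hypothetical inputs
file_name = f"{metric_name}_{grouping}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
print(file_name)  # e.g. doc_length_histogram_2024-07-01_12-00-00.json
```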
 
src/logic/graph_settings.py ADDED
@@ -0,0 +1,36 @@
+from typing import Literal
+import gradio as gr
+
+
+Grouping = Literal["histogram", "fqdn", "suffix", "summary"]
+
+def update_graph_options(grouping: Grouping):
+    """
+    Updates the visibility of the graph options based on the grouping type.
+    The returned updates apply, in order, to:
+    group_settings, histogram_settings, summary_settings
+    """
+    if grouping == "histogram":
+        return [
+            gr.TabItem(visible=False),
+            gr.TabItem(visible=True),
+            gr.TabItem(visible=False),
+        ]
+    elif grouping in ["fqdn", "suffix"]:
+        return [
+            gr.Column(visible=True),
+            gr.Column(visible=False),
+            gr.Column(visible=False),
+        ]
+    elif grouping == "summary":
+        return [
+            gr.Column(visible=False),
+            gr.Column(visible=False),
+            gr.Column(visible=True),
+        ]
+
+    return [
+        gr.Column(visible=False),
+        gr.Column(visible=False),
+        gr.Column(visible=False),
+    ]
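
The three updates returned by `update_graph_options` are applied positionally to the grouping, histogram, and summary settings containers (see the hookup in `metric_view_tab.py` below). A minimal wiring sketch, assuming three `gr.TabItem` targets as in the metric view tab:

```python
# Sketch: wiring update_graph_options to a grouping dropdown.
import gradio as gr
from src.logic.graph_settings import update_graph_options

with gr.Blocks() as demo:
    grouping = gr.Dropdown(choices=["histogram", "fqdn", "suffix", "summary"], label="Grouping")
    with gr.Tabs():
        with gr.TabItem("Grouping Settings") as group_settings:
            gr.Markdown("...")
        with gr.TabItem("Histogram Settings") as histogram_settings:
            gr.Markdown("...")
        with gr.TabItem("Summary Settings") as summary_settings:
            gr.Markdown("...")
    # The returned updates map positionally onto these three outputs.
    grouping.change(
        update_graph_options,
        inputs=[grouping],
        outputs=[group_settings, histogram_settings, summary_settings],
    )
```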
src/logic/plotting.py CHANGED
@@ -4,8 +4,11 @@ import plotly.graph_objects as go
 import numpy as np
 import gradio as gr
 from typing import Dict, List
-from .data_processing import prepare_for_non_grouped_plotting, prepare_for_group_plotting
-from .utils import set_alpha
+
+from src.logic.data_processing import PARTITION_OPTIONS, prepare_for_non_grouped_plotting, prepare_for_group_plotting
+from src.logic.graph_settings import Grouping
+from src.logic.utils import set_alpha
+from datatrove.utils.stats import MetricStatsDict
 
 def plot_scatter(
     data: Dict[str, Dict[float, float]],
@@ -55,14 +58,15 @@
     return fig
 
 def plot_bars(
-    data: Dict[str, List[Dict[str, float]]],
+    data: Dict[str, MetricStatsDict],
     metric_name: str,
     top_k: int,
-    direction: str,
+    direction: PARTITION_OPTIONS,
     regex: str | None,
     rounding: int,
     log_scale_x: bool,
     log_scale_y: bool,
+    show_stds: bool,
     progress: gr.Progress,
 ):
     fig = go.Figure()
@@ -77,7 +81,7 @@
         y=y,
         name=f"{name} Mean",
         marker=dict(color=set_alpha(px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)], 0.5)),
-        error_y=dict(type='data', array=stds, visible=True)
+        error_y=dict(type='data', array=stds, visible=show_stds)
     ))
 
     fig.update_layout(
@@ -94,14 +98,65 @@
 
     return fig
 
-def plot_data(data, metric_name, normalization, rounding, grouping, top_k, direction, regex, log_scale_x, log_scale_y,
-              cumsum, perc, progress=gr.Progress()):
-    if rounding is None or top_k is None:
-        return None
-    graph_fc = (
-        partial(plot_scatter, normalization=normalization, rounding=rounding, cumsum=cumsum, perc=perc)
-        if grouping == "histogram"
-        else partial(plot_bars, top_k=top_k, direction=direction, regex=regex, rounding=rounding)
-    )
-    return graph_fc(data=data, metric_name=metric_name, progress=progress, log_scale_x=log_scale_x,
-                    log_scale_y=log_scale_y)
+
+# Add any other necessary functions
+
+def plot_data(
+    metric_data: Dict[str, MetricStatsDict],
+    metric_name: str,
+    normalize: bool,
+    rounding: int,
+    grouping: Grouping,
+    top_n: int,
+    direction: PARTITION_OPTIONS,
+    group_regex: str,
+    log_scale_x: bool,
+    log_scale_y: bool,
+    cdf: bool,
+    perc: bool,
+    show_stds: bool,
+) -> tuple[go.Figure, gr.Row, str]:
+    if grouping == "histogram":
+        fig = plot_scatter(
+            metric_data,
+            metric_name,
+            log_scale_x,
+            log_scale_y,
+            normalize,
+            rounding,
+            cdf,
+            perc,
+            gr.Progress(),
+        )
+        min_max_hist_data = generate_min_max_hist_data(metric_data)
+        return fig, gr.Row.update(visible=True), min_max_hist_data
+    else:
+        fig = plot_bars(
+            metric_data,
+            metric_name,
+            top_n,
+            direction,
+            group_regex,
+            rounding,
+            log_scale_x,
+            log_scale_y,
+            show_stds,
+            gr.Progress(),
+        )
+        return fig, gr.Row.update(visible=True), ""
+
+def generate_min_max_hist_data(data: Dict[str, MetricStatsDict]) -> str:
+    runs_data = {
+        run: {
+            "min": min(map(float, dato.keys())),
+            "max": max(map(float, dato.keys())),
+        }
+        for run, dato in data.items()
+    }
+
+    runs_rows = [
+        f"| {run} | {values['min']:.4f} | {values['max']:.4f} |"
+        for run, values in runs_data.items()
+    ]
+    header = "| Run | Min | Max |\n|-----|-----|-----|\n"
+    return header + "\n".join(runs_rows)
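
`generate_min_max_hist_data` turns the loaded histograms into a small markdown table of per-run value ranges. Since it only reads the keys, plain dicts can stand in for `MetricStatsDict` to illustrate (the values below are made up):

```python
# Sketch: the markdown table generate_min_max_hist_data produces.
from src.logic.plotting import generate_min_max_hist_data

data = {"run_a": {"0.5": 10, "3.25": 2}, "run_b": {"1.0": 7}}  # hypothetical histograms
print(generate_min_max_hist_data(data))
# | Run | Min | Max |
# |-----|-----|-----|
# | run_a | 0.5000 | 3.2500 |
# | run_b | 1.0000 | 1.0000 |
```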
src/view/help_tab.py ADDED
@@ -0,0 +1,37 @@
+import gradio as gr
+
+def create_help_tab():
+    gr.Markdown(
+        label="Readme",
+        value="""
+
+# Dataset Metrics Explorer
+## Features:
+- View metrics for various datasets you computed using datatrove
+- Search for metrics across datasets
+
+## View metrics usage:
+1) Specify the Metrics location (the Stats block `output_folder`) and click "Fetch Datasets"
+2) Select the datasets you are interested in using the dropdown or the regex filter
+3) Specify the Grouping (histogram/summary/fqdn/suffix) and the Metric name
+4) Click "Render Metric", adjust the Graph settings, and see the result
+
+### Groupings:
+- **histogram**: Creates a line plot of values with their frequencies.
+    * normalize: Normalize the histogram to sum to 1
+    * CDF: Show the plot as a cumulative distribution function
+    * %: Show the plot as a percentage of the total
+- **fqdn/suffix**: Creates a bar plot of the avg. values of the metric per fully qualified domain name/domain suffix.
+    * k: the number of groups to show
+    * Top/Bottom/Most frequent (n_docs): Show the groups with the top/bottom k values or the most prevalent docs
+- **summary**: Shows the average value of the given metric for every dataset
+    * show_stds: Show the standard deviation from the mean for every dataset
+
+## Reverse search usage:
+To search for datasets containing a grouping and a certain metric, use the Reverse search section.
+Specify the search parameters and click "Search". The datasets found will be listed in the "Found datasets" textbox. You can modify the selection after the search by removing unwanted lines and clicking "Add to selection".
+
+## Note:
+The data might not be 100% representative, due to sampling and the optimistic merging of the metrics (fqdn/suffix).
+"""
+    )
src/view/metric_view_tab.py ADDED
@@ -0,0 +1,222 @@
+from datetime import datetime
+import tempfile
+from typing import Callable
+import gradio as gr
+from functools import partial
+import re
+import json
+
+from src.logic.data_fetching import fetch_datasets, fetch_graph_data, fetch_groups, fetch_metrics, update_datasets_with_regex
+from src.logic.data_processing import export_data
+from src.logic.graph_settings import update_graph_options
+from src.logic.plotting import plot_data
+
+def create_metric_view_tab(METRICS_LOCATION_DEFAULT: str, available_datasets: gr.State, selected_datasets: gr.State):
+    metric_data = gr.State([])
+
+    with gr.Row():
+        with gr.Column(scale=2):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    base_folder = gr.Textbox(
+                        label="Metrics Location",
+                        value=METRICS_LOCATION_DEFAULT,
+                    )
+                    datasets_fetch = gr.Button("Fetch Datasets")
+
+                with gr.Column(scale=1):
+                    regex_select = gr.Text(label="Regex filter", value=".*")
+                    regex_button = gr.Button("Search")
+            with gr.Row():
+                selected_datasets_dropdown = gr.Dropdown(
+                    choices=[],
+                    label="Datasets",
+                    multiselect=True,
+                    interactive=True,
+                )
+
+        with gr.Column(scale=1):
+            grouping_dropdown = gr.Dropdown(
+                choices=[],
+                label="Grouping",
+                multiselect=False,
+            )
+            metric_name_dropdown = gr.Dropdown(
+                choices=[],
+                label="Metric name",
+                multiselect=False,
+            )
+
+    render_button = gr.Button("Render Metric", variant="primary")
+
+    with gr.Tabs():
+        with gr.TabItem("Graph Settings"):
+            log_scale_x_checkbox = gr.Checkbox(
+                label="Log scale x",
+                value=False,
+            )
+            log_scale_y_checkbox = gr.Checkbox(
+                label="Log scale y",
+                value=False,
+            )
+            rounding = gr.Number(
+                label="Rounding",
+                value=2,
+            )
+
+        with gr.TabItem("Grouping Settings") as group_settings:
+            with gr.Row() as group_choices:
+                with gr.Column(scale=2):
+                    group_regex = gr.Text(
+                        label="Group Regex",
+                        value=None,
+                    )
+                with gr.Row():
+                    top_select = gr.Number(
+                        label="N Groups",
+                        value=100,
+                        interactive=True,
+                    )
+
+                    direction_checkbox = gr.Radio(
+                        label="Partition",
+                        choices=[
+                            "Top",
+                            "Bottom",
+                            "Most frequent (n_docs)",
+                        ],
+                        value="Most frequent (n_docs)",
+                    )
+
+        with gr.TabItem("Histogram Settings") as histogram_settings:
+            normalization_checkbox = gr.Checkbox(
+                label="Normalize",
+                value=True,
+                visible=False
+            )
+            cdf_checkbox = gr.Checkbox(
+                label="CDF",
+                value=False,
+            )
+            perc_checkbox = gr.Checkbox(
+                label="%",
+                value=False,
+            )
+
+        with gr.TabItem("Summary Settings") as summary_settings:
+            show_stds_checkbox = gr.Checkbox(
+                label="Show standard deviations",
+                value=False,
+            )
+
+    with gr.Row():
+        graph_output = gr.Plot(label="Graph")
+    with gr.Row(visible=False) as min_max_hist:
+        with gr.Column(scale=3):
+            min_max_hist_data = gr.Markdown()
+        with gr.Column(scale=1):
+            export_data_button = gr.Button("Export Data")
+            export_data_json = gr.File(visible=False)
+
+
+
+    def update_selected_datasets_dropdown(available_datasets, selected_datasets):
+        return gr.Dropdown(choices=available_datasets, value=sorted(selected_datasets))
+
+
+    datasets_fetch.click(
+        fn=fetch_datasets,
+        inputs=[base_folder],
+        outputs=[available_datasets],
+    )
+
+    available_datasets.change(
+        fn=update_selected_datasets_dropdown,
+        inputs=[available_datasets, selected_datasets],
+        outputs=selected_datasets_dropdown,
+    )
+
+    regex_button.click(
+        fn=update_datasets_with_regex,
+        inputs=[regex_select, selected_datasets, available_datasets],
+        outputs=selected_datasets,
+    )
+
+    def update_selected_datasets(selected_datasets_dropdown):
+        return selected_datasets_dropdown
+
+    selected_datasets_dropdown.change(
+        fn=update_selected_datasets,
+        inputs=[selected_datasets_dropdown],
+        outputs=selected_datasets,
+    )
+
+    selected_datasets.change(
+        fn=update_selected_datasets_dropdown,
+        inputs=[available_datasets, selected_datasets],
+        outputs=selected_datasets_dropdown,
+    )
+
+
+    selected_datasets.change(
+        fn=fetch_groups,
+        inputs=[base_folder, selected_datasets, grouping_dropdown],
+        outputs=grouping_dropdown,
+    )
+
+    grouping_dropdown.change(
+        fn=fetch_metrics,
+        inputs=[base_folder, selected_datasets, grouping_dropdown, metric_name_dropdown],
+        outputs=metric_name_dropdown,
+    )
+
+    render_button.click(
+        fn=fetch_graph_data,
+        inputs=[
+            base_folder,
+            selected_datasets,
+            metric_name_dropdown,
+            grouping_dropdown,
+        ],
+        # graph_output is also set to None so the progress bar appears on it
+        outputs=[metric_data, graph_output],
+    )
+
+
+    grouping_dropdown.change(
+        fn=update_graph_options,
+        inputs=[grouping_dropdown],
+        outputs=[group_settings, histogram_settings, summary_settings],
+    )
+
+
+    gr.on(
+        triggers=[normalization_checkbox.input, rounding.input, group_regex.input, direction_checkbox.input,
+                  top_select.input, log_scale_x_checkbox.input,
+                  log_scale_y_checkbox.input, cdf_checkbox.input, perc_checkbox.input, show_stds_checkbox.input, metric_data.change],
+        fn=plot_data,
+        inputs=[
+            metric_data,
+            metric_name_dropdown,
+            normalization_checkbox,
+            rounding,
+            grouping_dropdown,
+            top_select,
+            direction_checkbox,
+            group_regex,
+            log_scale_x_checkbox,
+            log_scale_y_checkbox,
+            cdf_checkbox,
+            perc_checkbox,
+            show_stds_checkbox
+        ],
+        outputs=[graph_output, min_max_hist, min_max_hist_data],
+    )
+
+    export_data_button.click(
+        fn=export_data,
+        inputs=[metric_data, metric_name_dropdown, grouping_dropdown],
+        outputs=[export_data_json],
+    )
+
+    return base_folder
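
The tab keeps the `selected_datasets` state and the visible dropdown in sync with a pair of `.change` handlers; a stripped-down sketch of that pattern (component names shortened, not the actual code):

```python
# Sketch: the state <-> dropdown sync used in create_metric_view_tab.
import gradio as gr

with gr.Blocks() as demo:
    selected = gr.State([])  # canonical selection other events read from
    dropdown = gr.Dropdown(choices=["a", "b"], multiselect=True, interactive=True)

    # UI edits flow into the state...
    dropdown.change(lambda v: v, inputs=dropdown, outputs=selected)
    # ...and state changes (e.g. from the regex filter) flow back into the UI.
    selected.change(lambda v: gr.Dropdown(value=sorted(v)), inputs=selected, outputs=dropdown)
```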
src/view/reverse_search_tab.py ADDED
@@ -0,0 +1,53 @@
+from functools import partial
+import gradio as gr
+
+from src.logic.data_fetching import fetch_groups, fetch_metrics, reverse_search, reverse_search_add
+
+def create_reverse_search_tab(base_folder: gr.Textbox, datasets_available: gr.State, datasets_selected: gr.State):
+    reverse_search_headline = gr.Markdown(value="# Reverse Metrics Search")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            reverse_grouping_dropdown = gr.Dropdown(
+                choices=[],
+                label="Grouping",
+                multiselect=False,
+            )
+            reverse_metric_name_dropdown = gr.Dropdown(
+                choices=[],
+                label="Metric Name",
+                multiselect=False,
+            )
+            reverse_search_button = gr.Button("Search")
+            reverse_search_add_button = gr.Button("Add to selection")
+
+        with gr.Column(scale=2):
+            reverse_search_results = gr.Textbox(
+                label="Found datasets",
+                lines=10,
+                placeholder="Found datasets containing the group/metric name. You can modify the selection after search by removing unwanted lines and clicking Add to selection"
+            )
+
+    datasets_available.change(
+        fn=partial(fetch_groups, type="union"),
+        inputs=[base_folder, datasets_available, reverse_grouping_dropdown],
+        outputs=[reverse_grouping_dropdown],
+    )
+
+    reverse_grouping_dropdown.select(
+        fn=partial(fetch_metrics, type="union"),
+        inputs=[base_folder, datasets_available, reverse_grouping_dropdown, reverse_metric_name_dropdown],
+        outputs=reverse_metric_name_dropdown,
+    )
+
+    reverse_search_button.click(
+        fn=reverse_search,
+        inputs=[base_folder, datasets_available, reverse_grouping_dropdown, reverse_metric_name_dropdown],
+        outputs=reverse_search_results,
+    )
+
+    reverse_search_add_button.click(
+        fn=reverse_search_add,
+        inputs=[datasets_selected, reverse_search_results],
+        outputs=datasets_selected,
+    )
src/view/view.py CHANGED
@@ -1,325 +1,28 @@
+import os
 import gradio as gr
-from src.logic.data_fetching import fetch_datasets, fetch_groups, fetch_metrics, load_data, reverse_search, reverse_search_add
-from src.logic.data_processing import export_data
-from src.logic.plotting import plot_data
-from src.logic.utils import get_desc
-from concurrent.futures import ThreadPoolExecutor
 from functools import partial
-import os
 import re
-
-
+from src.view.help_tab import create_help_tab
+from src.view.metric_view_tab import create_metric_view_tab
+from src.view.reverse_search_tab import create_reverse_search_tab
+from src.logic.data_fetching import fetch_datasets, fetch_groups, fetch_metrics, reverse_search, reverse_search_add
 
 METRICS_LOCATION_DEFAULT = os.getenv("METRICS_LOCATION_DEFAULT", "hf://datasets/HuggingFaceFW-Dev/summary-stats-files")
 
-def update_graph(
-    base_folder,
-    datasets,
-    metric_name,
-    grouping,
-    log_scale_x,
-    log_scale_y,
-    rounding,
-    normalization,
-    top_k,
-    direction,
-    regex,
-    cumsum,
-    perc,
-    progress=gr.Progress(),
-):
-    if len(datasets) <= 0 or not metric_name or not grouping:
-        return None
-
-    with ThreadPoolExecutor() as pool:
-        data = list(
-            progress.tqdm(
-                pool.map(
-                    partial(load_data, base_folder=base_folder, metric_name=metric_name, grouping=grouping),
-                    datasets,
-                ),
-                total=len(datasets),
-                desc="Loading data...",
-            )
-        )
-
-    data = {path: result for path, result in zip(datasets, data)}
-    return plot_data(data, metric_name, normalization, rounding, grouping, top_k, direction, regex, log_scale_x,
-                     log_scale_y, cumsum, perc, progress), data, export_data(data, metric_name), get_desc(data)
-
 def create_interface():
     with gr.Blocks() as demo:
-        datasets = gr.State([])
-        exported_data = gr.State([])
         metrics_headline = gr.Markdown(value="# Metrics Exploration")
+        available_datasets = gr.State([])
+        selected_datasets = gr.State([])
 
         with gr.Tabs():
-            with gr.TabItem("Help"):
-                gr.Markdown(
-                    label="Readme",
-                    value="""
-## How to use:
-1) Specify Metrics location (Stats block `output_folder` without the last path segment) and click "Fetch Datasets"
-2) Select datasets you are interested in using the dropdown or regex filter
-3) Specify Grouping (global average/value/fqdn/suffix) and Metric name
-4) Click "Render Metric"
-
-
-## Groupings:
-- **histogram**: Creates a line plot of values with their frequencies. If normalization is on, the frequencies sum to 1.
-    * normalize:
-- **(fqdn/suffix)**: Creates a bar plot of the avg. values of the metric for full qualifed domain name/suffix of domain.
-    * k: the number of groups to show
-    * Top/Bottom/Most frequent (n_docs): Groups with the top/bottom k values/most prevalant docs are shown
-- **none**: Shows the average value of given metric
-
-## Reverse search:
-To search for datasets containing a grouping and certain metric, use the Reverse search section.
-Specify the search parameters and click "Search". This will show you found datasets in the "Found datasets" textbox. You can modify the selection after search by removing unwanted lines and clicking "Add to selection".
-
-## Note:
-The data might not be 100% representative, due to the sampling and optimistic merging of the metrics (fqdn/suffix).
-""",
-                )
-
+            with gr.Tab("Help"):
+                create_help_tab()
+
             with gr.TabItem("Metric View"):
-                with gr.Row():
-                    with gr.Column(scale=2):
-                        with gr.Row():
-                            with gr.Column(scale=1):
-                                base_folder = gr.Textbox(
-                                    label="Metrics Location",
-                                    value=METRICS_LOCATION_DEFAULT,
-                                )
-                                datasets_refetch = gr.Button("Fetch Datasets")
-
-                            with gr.Column(scale=1):
-                                regex_select = gr.Text(label="Regex filter", value=".*")
-                                regex_button = gr.Button("Search")
-                        with gr.Row():
-                            datasets_selected = gr.Dropdown(
-                                choices=[],
-                                label="Datasets",
-                                multiselect=True,
-                            )
-
-                    with gr.Column(scale=1):
-                        grouping_dropdown = gr.Dropdown(
-                            choices=[],
-                            label="Grouping",
-                            multiselect=False,
-                        )
-                        metric_name_dropdown = gr.Dropdown(
-                            choices=[],
-                            label="Metric name",
-                            multiselect=False,
-                        )
-
-                render_button = gr.Button("Render Metric", variant="primary")
-
-                with gr.Tabs():
-                    with gr.TabItem("Graph Settings"):
-                        log_scale_x_checkbox = gr.Checkbox(
-                            label="Log scale x",
-                            value=False,
-                        )
-                        log_scale_y_checkbox = gr.Checkbox(
-                            label="Log scale y",
-                            value=False,
-                        )
-                        rounding = gr.Number(
-                            label="Rounding",
-                            value=2,
-                        )
-                        normalization_checkbox = gr.Checkbox(
-                            label="Normalize",
-                            value=True,
-                            visible=False
-                        )
-                        with gr.Row():
-                            export_data_json = gr.File(visible=False)
-
-                    with gr.TabItem("Grouping Settings"):
-                        with gr.Row(visible=False) as group_choices:
-                            with gr.Column(scale=2):
-                                group_regex = gr.Text(
-                                    label="Group Regex",
-                                    value=None,
-                                )
-                            with gr.Row():
-                                top_select = gr.Number(
-                                    label="N Groups",
-                                    value=100,
-                                    interactive=True,
-                                )
-
-                                direction_checkbox = gr.Radio(
-                                    label="Partition",
-                                    choices=[
-                                        "Top",
-                                        "Bottom",
-                                        "Most frequent (n_docs)",
-                                    ],
-                                    value="Most frequent (n_docs)",
-                                )
-
-                    with gr.TabItem("Histogram Settings") as histogram_settings:
-                        cdf_checkbox = gr.Checkbox(
-                            label="CDF",
-                            value=False,
-                        )
-                        perc_checkbox = gr.Checkbox(
-                            label="%",
-                            value=False,
-                        )
-                        with gr.Column(visible=False) as min_max_hist:
-                            min_max_hist_data = gr.Markdown()
-
-                with gr.Row():
-                    graph_output = gr.Plot(label="Graph")
+                base_folder = create_metric_view_tab(METRICS_LOCATION_DEFAULT, available_datasets, selected_datasets)
 
             with gr.TabItem("Reverse Metrics Search"):
-                reverse_search_headline = gr.Markdown(value="# Reverse Metrics Search")
-
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        reverse_grouping_dropdown = gr.Dropdown(
-                            choices=[],
-                            label="Grouping",
-                            multiselect=False,
-                        )
-                        reverse_metric_name_dropdown = gr.Dropdown(
-                            choices=[],
-                            label="Metric Name",
-                            multiselect=False,
-                        )
-                        reverse_search_button = gr.Button("Search")
-                        reverse_search_add_button = gr.Button("Add to selection")
-
-                    with gr.Column(scale=2):
-                        reverse_search_results = gr.Textbox(
-                            label="Found datasets",
-                            lines=10,
-                            placeholder="Found datasets containing the group/metric name. You can modify the selection after search by removing unwanted lines and clicking Add to selection"
-                        )
-
-        render_button.click(
-            fn=update_graph,
-            inputs=[
-                base_folder,
-                datasets_selected,
-                metric_name_dropdown,
-                grouping_dropdown,
-                log_scale_x_checkbox,
-                log_scale_y_checkbox,
-                rounding,
-                normalization_checkbox,
-                top_select,
-                direction_checkbox,
-                group_regex,
-                cdf_checkbox,
-                perc_checkbox
-            ],
-            outputs=[graph_output, exported_data, export_data_json, min_max_hist_data],
-        )
-
-        gr.on(
-            triggers=[normalization_checkbox.change, rounding.change, group_regex.change, direction_checkbox.change,
-                      top_select.change, log_scale_x_checkbox.change,
-                      log_scale_y_checkbox.change, cdf_checkbox.change, perc_checkbox.change],
-            fn=plot_data,
-            inputs=[
-                exported_data,
-                metric_name_dropdown,
-                normalization_checkbox,
-                rounding,
-                grouping_dropdown,
-                top_select,
-                direction_checkbox,
-                group_regex,
-                log_scale_x_checkbox,
-                log_scale_y_checkbox,
-                cdf_checkbox,
-                perc_checkbox
-            ],
-            outputs=[graph_output],
-        )
-
-        datasets_selected.change(
-            fn=fetch_groups,
-            inputs=[base_folder, datasets_selected, grouping_dropdown],
-            outputs=grouping_dropdown,
-        )
-
-        grouping_dropdown.change(
-            fn=fetch_metrics,
-            inputs=[base_folder, datasets_selected, grouping_dropdown, metric_name_dropdown],
-            outputs=metric_name_dropdown,
-        )
-
-        reverse_grouping_dropdown.select(
-            fn=partial(fetch_metrics, type="union"),
-            inputs=[base_folder, datasets, reverse_grouping_dropdown, reverse_metric_name_dropdown],
-            outputs=reverse_metric_name_dropdown,
-        )
-
-        reverse_search_button.click(
-            fn=reverse_search,
-            inputs=[base_folder, datasets, reverse_grouping_dropdown, reverse_metric_name_dropdown],
-            outputs=reverse_search_results,
-        )
-
-        reverse_search_add_button.click(
-            fn=reverse_search_add,
-            inputs=[datasets_selected, reverse_search_results],
-            outputs=datasets_selected,
-        )
-
-        datasets_refetch.click(
-            fn=fetch_datasets,
-            inputs=[base_folder],
-            outputs=[datasets, datasets_selected, reverse_grouping_dropdown],
-        )
-
-
-        def update_datasets_with_regex(regex, selected_runs, all_runs):
-            if not regex:
-                return
-            new_dsts = {run for run in all_runs if re.search(regex, run)}
-            if not new_dsts:
-                return gr.update(value=list(selected_runs))
-            dst_union = new_dsts.union(selected_runs or [])
-            return gr.update(value=sorted(list(dst_union)))
-
-
-        regex_button.click(
-            fn=update_datasets_with_regex,
-            inputs=[regex_select, datasets_selected, datasets],
-            outputs=datasets_selected,
-        )
-
-
-        def update_grouping_options(grouping):
-            if grouping == "histogram":
-                return {
-                    normalization_checkbox: gr.Column(visible=True),
-                    group_choices: gr.Column(visible=False),
-                    min_max_hist: gr.Column(visible=True),
-                    histogram_settings: gr.TabItem(visible=True),
-                }
-            else:
-                return {
-                    normalization_checkbox: gr.Column(visible=False),
-                    group_choices: gr.Column(visible=True),
-                    min_max_hist: gr.Column(visible=False),
-                    histogram_settings: gr.TabItem(visible=False),
-                }
-
-
-        grouping_dropdown.change(
-            fn=update_grouping_options,
-            inputs=[grouping_dropdown],
-            outputs=[normalization_checkbox, group_choices, min_max_hist, histogram_settings],
-        )
+                create_reverse_search_tab(base_folder, available_datasets, selected_datasets)
 
     return demo