Sasha committed on
Commit
4f31875
1 Parent(s): 74e9f8c

catching dataset-specific metrics

Files changed (1)
  1. app.py +62 -58
app.py CHANGED
@@ -31,6 +31,8 @@ tasks= ['classification', 'question answering', 'automatic speech recognition',
         'textual entailment', 'commonsense reasoning', 'summarization']
 metrics= ['matthews_correlation', 'perplexity', 'meteor', 'code_eval', 'super_glue', 'rouge', 'mauve', 'cer', 'accuracy', 'recall', 'bleurt', 'sari', 'precision', 'mean_iou', 'squad', 'mahalanobis', 'chrf', 'mae', 'squad_v2', 'seqeval', 'cuad', 'wiki_split', 'google_bleu', 'competition_math', 'pearsonr', 'xtreme_s', 'comet', 'gleu', 'spearmanr', 'f1', 'frugalscore', 'bertscore', 'indic_glue', 'mse', 'xnli', 'ter', 'coval', 'wer', 'bleu', 'glue', 'sacrebleu']
 
+dedicated_metric = False
+
 def find_task(dname):
     task = None
     dataset_builder = load_dataset_builder(dataset_name, dataset_config)
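The hunk above introduces a module-level `dedicated_metric` flag that defaults to `False` and is flipped later when the chosen dataset has its own metric. The sketch below is a minimal, self-contained illustration of the control flow this flag enables; the `metrics` list is abbreviated and `dataset_name` is hard-coded here, whereas in app.py both come from the UI and the full metric list above.

```python
# Minimal sketch of the dedicated_metric flag pattern (not the full app.py logic).
metrics = ['squad', 'glue', 'wer', 'cer', 'accuracy', 'f1']  # abbreviated stand-in for the full list
dataset_name = 'squad'                                       # hard-coded for illustration

dedicated_metric = False
if dataset_name in metrics:
    # The dataset shares its name with a metric (e.g. squad, glue),
    # so a dedicated metric exists and the generic suggestions can be skipped.
    dedicated_metric = True

if dedicated_metric == False:
    print("No dedicated metric; fall back to task- and label-based suggestions.")
else:
    print(f"Dedicated metric available: load_metric('{dataset_name}')")
```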
@@ -83,28 +85,29 @@ if dataset_name in metrics:
     code = ''' from datasets import load_metric
 metric = load_metric(\"'''+dataset_name+'''\")'''
     st.code(code, language='python')
+    dedicated_metric = True
 else:
     st.markdown("This dataset doesn't have a dedicated metric, but that's ok! :wink:")
     dedicated_metric = False
 
-st.markdown("### Task-Specific Metrics")
-
-task = find_task(dataset_name)
-
-if task is not None:
-    st.markdown("The task associated to it this dataset is: " + task.replace('-',' '))
-    if task == 'automatic-speech-recognition':
-        st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
-        st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)')
-        wer_code = '''from datasets import load_metric
-metric = load_metric("wer")'''
-        st.code(wer_code, language='python')
-        st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)')
-        cer_code = '''from datasets import load_metric
-metric = load_metric("cer")'''
-        st.code(cer_code, language='python')
-    else:
-        st.markdown("The task for this dataset doesn't have any dedicated metrics, but you can still use general ones! :cowboy_hat_face:")
+if dedicated_metric == False:
+    st.markdown("### Task-Specific Metrics")
+    task = find_task(dataset_name)
+
+    if task is not None:
+        st.markdown("The task associated to it this dataset is: " + task.replace('-',' '))
+        if task == 'automatic-speech-recognition':
+            st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
+            st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)')
+            wer_code = '''from datasets import load_metric
+metric = load_metric("wer")'''
+            st.code(wer_code, language='python')
+            st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)')
+            cer_code = '''from datasets import load_metric
+metric = load_metric("cer")'''
+            st.code(cer_code, language='python')
+        else:
+            st.markdown("The task for this dataset doesn't have any dedicated metrics, but you can still use general ones! :cowboy_hat_face:")
 
 
 #print(dataset_builder.info.task_templates)
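For reference, the WER/CER snippets that the now-gated `if dedicated_metric == False:` branch shows to users are meant to be used roughly as follows. This is a sketch assuming the same `datasets.load_metric` API that app.py relies on (newer setups would use the `evaluate` library instead); the prediction and reference strings are made up for illustration, and the WER metric typically requires the `jiwer` package.

```python
# Sketch: computing WER the way the snippet suggested by the app would be used downstream.
from datasets import load_metric  # app.py's API of choice; `evaluate.load("wer")` in newer setups

wer_metric = load_metric("wer")
predictions = ["the cat sat on the mat"]   # hypothetical ASR output
references  = ["the cat sat on a mat"]     # hypothetical ground-truth transcript

# 1 substitution over 6 reference words -> WER of about 0.17
print(wer_metric.compute(predictions=predictions, references=references))
```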
@@ -119,44 +122,45 @@ else:
 #print(dataset_name, dataset_config, dataset_split)
 
 #print(labels.head())
-if dataset_name in ['glue','super_glue', 'paws']:
-    dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
-else:
-    dataset = load_dataset(dataset_name, split=dataset_split)
-
-try:
-    num_classes = dataset_builder.info.features['label'].num_classes
-    labels = query("SELECT COUNT(*) from dataset GROUP BY label").to_pandas()
-    labels = labels.rename(columns={"count_star()": "count"})
-    labels.index = dataset_builder.info.features['label'].names
-    st.markdown("### Labelled Metrics")
-    st.markdown("This dataset has "+ str(dataset_builder.info.features['label'].num_classes) + " labels : " + ', '.join(dataset_builder.info.features['label'].names))
-    #TODO : figure out how to make a label plot
-    st.plotly_chart(px.pie(labels, values = "count", names = labels.index, width=800, height=400))
-    total = sum(c for c in labels['count'])
-    proportion = [c/total for c in labels['count']]
-    #proportion = [0.85, 0.15]
-    stdev_dataset= statistics.stdev(proportion)
-    if stdev_dataset <= balanced_stdev:
-        st.markdown("Since this dataset is well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
-        st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
-        accuracy_code = '''from datasets import load_metric
-metric = load_metric("accuracy")'''
-        st.code(accuracy_code, language='python')
-
+if dedicated_metric == False:
+    if dataset_name in ['glue','super_glue', 'paws', 'squad_es']:
+        dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
     else:
-        st.markdown("Since this dataset is not well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
-        st.markdown('[F1 Score](https://huggingface.co/metrics/f1)')
-        accuracy_code = '''from datasets import load_metric
-metric = load_metric("accuracy")'''
-        st.code(accuracy_code, language='python')
-        st.markdown('Since it takes into account both precision and recall, which works well to evaluate model performance on minority classes.')
-except:
-    if task != 'automatic-speech-recognition':
-        st.markdown("### Unsupervised Metrics")
-        st.markdown("Since this dataset doesn't have any labels, the metrics that you can use for evaluation are:")
-        st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
-        perplexity_code = '''from datasets import load_metric
-metric = load_metric("perplexity")'''
-        st.code(perplexity_code, language='python')
-        st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')
+        dataset = load_dataset(dataset_name, split=dataset_split)
+
+    try:
+        num_classes = dataset_builder.info.features['label'].num_classes
+        labels = query("SELECT COUNT(*) from dataset GROUP BY label").to_pandas()
+        labels = labels.rename(columns={"count_star()": "count"})
+        labels.index = dataset_builder.info.features['label'].names
+        st.markdown("### Labelled Metrics")
+        st.markdown("This dataset has "+ str(dataset_builder.info.features['label'].num_classes) + " labels : " + ', '.join(dataset_builder.info.features['label'].names))
+        #TODO : figure out how to make a label plot
+        st.plotly_chart(px.pie(labels, values = "count", names = labels.index, width=800, height=400))
+        total = sum(c for c in labels['count'])
+        proportion = [c/total for c in labels['count']]
+        #proportion = [0.85, 0.15]
+        stdev_dataset= statistics.stdev(proportion)
+        if stdev_dataset <= balanced_stdev:
+            st.markdown("Since this dataset is well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
+            st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
+            accuracy_code = '''from datasets import load_metric
+metric = load_metric("accuracy")'''
+            st.code(accuracy_code, language='python')
+
+        else:
+            st.markdown("Since this dataset is not well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
+            st.markdown('[F1 Score](https://huggingface.co/metrics/f1)')
+            accuracy_code = '''from datasets import load_metric
+metric = load_metric("accuracy")'''
+            st.code(accuracy_code, language='python')
+            st.markdown('Since it takes into account both precision and recall, which works well to evaluate model performance on minority classes.')
+    except:
+        if task != 'automatic-speech-recognition':
+            st.markdown("### Unsupervised Metrics")
+            st.markdown("Since this dataset doesn't have any labels, the metrics that you can use for evaluation are:")
+            st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
+            perplexity_code = '''from datasets import load_metric
+metric = load_metric("perplexity")'''
+            st.code(perplexity_code, language='python')
+            st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')
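The hunk above wraps the labelled-metrics logic in the same `if dedicated_metric == False:` guard, adds `squad_es` to the datasets loaded with a config, and keeps the balance heuristic: the standard deviation of the label proportions is compared against `balanced_stdev` to decide between accuracy and F1. (As committed, the snippet string in the F1 branch is still named `accuracy_code` and calls `load_metric("accuracy")`.) The sketch below reproduces that heuristic in isolation; `balanced_stdev` is defined elsewhere in app.py, so the 0.2 threshold and the label counts here are assumed values, echoing the `#proportion = [0.85, 0.15]` example commented in the code.

```python
# Sketch of the balance heuristic: standard deviation of the label proportions.
import statistics

label_counts = [850, 150]                       # hypothetical per-label counts (as returned by the GROUP BY query)
total = sum(label_counts)
proportion = [c / total for c in label_counts]  # [0.85, 0.15]
stdev_dataset = statistics.stdev(proportion)

balanced_stdev = 0.2                            # assumed threshold; app.py defines its own
if stdev_dataset <= balanced_stdev:
    print(f"Roughly balanced (stdev {round(stdev_dataset, 2)}): accuracy is a reasonable choice.")
else:
    print(f"Imbalanced (stdev {round(stdev_dataset, 2)}): prefer F1, which accounts for precision and recall.")
```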
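When a dataset does have a dedicated metric, the new `dedicated_metric = True` path simply shows `load_metric(dataset_name)` and skips everything else. As a closing illustration, here is a sketch of using one such dedicated metric end to end, taking `squad` as the example; the prediction and reference entries are made up, and the dictionary format is the one the SQuAD metric expects in the `datasets.load_metric` API that app.py uses.

```python
# Sketch: using a dataset-specific metric (SQuAD) caught by the new dedicated_metric path.
from datasets import load_metric

squad_metric = load_metric("squad")
predictions = [{"id": "1", "prediction_text": "Paris"}]                                   # hypothetical model output
references  = [{"id": "1", "answers": {"text": ["Paris"], "answer_start": [0]}}]          # hypothetical gold answer

# Returns exact_match and f1 scores for the QA predictions.
print(squad_metric.compute(predictions=predictions, references=references))
```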