committed on
catching dataset-specific metrics
Browse files
@@ -31,6 +31,8 @@ tasks= ['classification', 'question answering', 'automatic speech recognition',
31 |
'textual entailment', 'commonsense reasoning', 'summarization']
32 |
metrics= ['matthews_correlation', 'perplexity', 'meteor', 'code_eval', 'super_glue', 'rouge', 'mauve', 'cer', 'accuracy', 'recall', 'bleurt', 'sari', 'precision', 'mean_iou', 'squad', 'mahalanobis', 'chrf', 'mae', 'squad_v2', 'seqeval', 'cuad', 'wiki_split', 'google_bleu', 'competition_math', 'pearsonr', 'xtreme_s', 'comet', 'gleu', 'spearmanr', 'f1', 'frugalscore', 'bertscore', 'indic_glue', 'mse', 'xnli', 'ter', 'coval', 'wer', 'bleu', 'glue', 'sacrebleu']
33 |
34 |
def find_task(dname):
35 |
task = None
36 |
dataset_builder = load_dataset_builder(dataset_name, dataset_config)
@@ -83,28 +85,29 @@ if dataset_name in metrics:
83 |
code = ''' from datasets import load_metric
84 |
metric = load_metric(\"'''+dataset_name+'''\")'''
85 |
st.code(code, language='python')
86 |
87 |
st.markdown("This dataset doesn't have a dedicated metric, but that's ok! :wink:")
88 |
dedicated_metric = False
89 |
90 |
91 |
92 |
task = find_task(dataset_name)
93 |
94 |
if task is not None:
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
@@ -119,44 +122,45 @@ else:
119 |
#print(dataset_name, dataset_config, dataset_split)
120 |
121 |
122 |
123 |
124 |
125 |
dataset = load_dataset(dataset_name, split=dataset_split)
126 |
127 |
128 |
num_classes =['label'].num_classes
129 |
labels = query("SELECT COUNT(*) from dataset GROUP BY label").to_pandas()
130 |
labels = labels.rename(columns={"count_star()": "count"})
131 |
labels.index =['label'].names
132 |
st.markdown("### Labelled Metrics")
133 |
st.markdown("This dataset has "+ str(['label'].num_classes) + " labels : " + ', '.join(['label'].names))
134 |
#TODO : figure out how to make a label plot
135 |
st.plotly_chart(px.pie(labels, values = "count", names = labels.index, width=800, height=400))
136 |
total = sum(c for c in labels['count'])
137 |
proportion = [c/total for c in labels['count']]
138 |
#proportion = [0.85, 0.15]
139 |
stdev_dataset= statistics.stdev(proportion)
140 |
if stdev_dataset <= balanced_stdev:
141 |
st.markdown("Since this dataset is well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
142 |
143 |
accuracy_code = '''from datasets import load_metric
144 |
metric = load_metric("accuracy")'''
145 |
st.code(accuracy_code, language='python')
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
31 |
'textual entailment', 'commonsense reasoning', 'summarization']
32 |
metrics= ['matthews_correlation', 'perplexity', 'meteor', 'code_eval', 'super_glue', 'rouge', 'mauve', 'cer', 'accuracy', 'recall', 'bleurt', 'sari', 'precision', 'mean_iou', 'squad', 'mahalanobis', 'chrf', 'mae', 'squad_v2', 'seqeval', 'cuad', 'wiki_split', 'google_bleu', 'competition_math', 'pearsonr', 'xtreme_s', 'comet', 'gleu', 'spearmanr', 'f1', 'frugalscore', 'bertscore', 'indic_glue', 'mse', 'xnli', 'ter', 'coval', 'wer', 'bleu', 'glue', 'sacrebleu']
33 |
34 |
dedicated_metric = False
35 |
36 |
def find_task(dname):
37 |
task = None
38 |
dataset_builder = load_dataset_builder(dataset_name, dataset_config)
85 |
code = ''' from datasets import load_metric
86 |
metric = load_metric(\"'''+dataset_name+'''\")'''
87 |
st.code(code, language='python')
88 |
dedicated_metric = True
89 |
90 |
st.markdown("This dataset doesn't have a dedicated metric, but that's ok! :wink:")
91 |
dedicated_metric = False
92 |
93 |
if dedicated_metric == False:
94 |
st.markdown("### Task-Specific Metrics")
95 |
task = find_task(dataset_name)
96 |
97 |
if task is not None:
98 |
st.markdown("The task associated to it this dataset is: " + task.replace('-',' '))
99 |
if task == 'automatic-speech-recognition':
100 |
st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
101 |
st.markdown('[Word Error Rate](')
102 |
wer_code = '''from datasets import load_metric
103 |
metric = load_metric("wer")'''
104 |
st.code(wer_code, language='python')
105 |
st.markdown('[Character Error Rate](')
106 |
cer_code = '''from datasets import load_metric
107 |
metric = load_metric("cer")'''
108 |
st.code(cer_code, language='python')
109 |
110 |
st.markdown("The task for this dataset doesn't have any dedicated metrics, but you can still use general ones! :cowboy_hat_face:")
111 |
112 |
113 |
122 |
#print(dataset_name, dataset_config, dataset_split)
123 |
124 |
125 |
if dedicated_metric == False:
126 |
if dataset_name in ['glue','super_glue', 'paws', 'squad_es']:
127 |
dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
128 |
129 |
dataset = load_dataset(dataset_name, split=dataset_split)
130 |
131 |
132 |
num_classes =['label'].num_classes
133 |
labels = query("SELECT COUNT(*) from dataset GROUP BY label").to_pandas()
134 |
labels = labels.rename(columns={"count_star()": "count"})
135 |
labels.index =['label'].names
136 |
st.markdown("### Labelled Metrics")
137 |
st.markdown("This dataset has "+ str(['label'].num_classes) + " labels : " + ', '.join(['label'].names))
138 |
#TODO : figure out how to make a label plot
139 |
st.plotly_chart(px.pie(labels, values = "count", names = labels.index, width=800, height=400))
140 |
total = sum(c for c in labels['count'])
141 |
proportion = [c/total for c in labels['count']]
142 |
#proportion = [0.85, 0.15]
143 |
stdev_dataset= statistics.stdev(proportion)
144 |
if stdev_dataset <= balanced_stdev:
145 |
st.markdown("Since this dataset is well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
146 |
147 |
accuracy_code = '''from datasets import load_metric
148 |
metric = load_metric("accuracy")'''
149 |
st.code(accuracy_code, language='python')
150 |
151 |
152 |
st.markdown("Since this dataset is not well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
153 |
st.markdown('[F1 Score](')
154 |
accuracy_code = '''from datasets import load_metric
155 |
metric = load_metric("accuracy")'''
156 |
st.code(accuracy_code, language='python')
157 |
st.markdown('Since it takes into account both precision and recall, which works well to evaluate model performance on minority classes.')
158 |
159 |
if task != 'automatic-speech-recognition':
160 |
st.markdown("### Unsupervised Metrics")
161 |
st.markdown("Since this dataset doesn't have any labels, the metrics that you can use for evaluation are:")
162 |
163 |
perplexity_code = '''from datasets import load_metric
164 |
metric = load_metric("perplexity")'''
165 |
st.code(perplexity_code, language='python')
166 |
st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')