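"""Tests for the Evaluation section compliance check.

Each fixture below is a Markdown model card (or an excerpt of one). The tests
render the Markdown to HTML, parse it with BeautifulSoup, and assert whether
EvaluationCheck recognises the Evaluation section as filled in.
"""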
import pytest
import markdown
from bs4 import BeautifulSoup

from compliance_checks.evaluation import (
    EvaluationCheck, EvaluationResult,
)
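
# The stock model card template: every Evaluation subsection still reads
# "[More Information Needed]", so the check should fail on it.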
empty_template = """\
## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Data Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary
"""
model_card_template = """\
## Evaluation

Some info...

### Testing Data, Factors & Metrics

#### Testing Data

Some information here

#### Factors

Etc...

#### Metrics

There are some metrics listed out here

### Results

And some results

#### Summary

Summarizing everything up!
"""
albert = """\
# ALBERT Base v2

## Evaluation results

When fine-tuned on downstream tasks, the ALBERT models achieve the following results:
"""
helsinki = """\
### eng-spa

## Benchmarks

| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| newssyscomb2009-engspa.eng.spa | 31.0 | 0.583 |
| news-test2008-engspa.eng.spa | 29.7 | 0.564 |
| newstest2009-engspa.eng.spa | 30.2 | 0.578 |
| newstest2010-engspa.eng.spa | 36.9 | 0.620 |
| newstest2011-engspa.eng.spa | 38.2 | 0.619 |
| newstest2012-engspa.eng.spa | 39.0 | 0.625 |
| newstest2013-engspa.eng.spa | 35.0 | 0.598 |
| Tatoeba-test.eng.spa | 54.9 | 0.721 |
"""
phil = """\
## Results

| key | value |
| --- | ----- |
| eval_rouge1 | 42.621 |
| eval_rouge2 | 21.9825 |
| eval_rougeL | 33.034 |
| eval_rougeLsum | 39.6783 |
"""
runway = """\
## Evaluation Results

Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
"""
success_result = EvaluationResult(
    status=True
)
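
# Each of the populated cards defined above should satisfy the check, so the
# test is parametrized over all of them.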
@pytest.mark.parametrize("card", [model_card_template, albert, helsinki, phil, runway])
def test_run_checks(card):
    model_card_html = markdown.markdown(card)
    card_soup = BeautifulSoup(model_card_html, features="html.parser")
    results = EvaluationCheck().run_check(card_soup)
    assert results == success_result
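
# The untouched template should fail the check, yielding a default
# EvaluationResult (presumably status=False).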
def test_fail_on_empty_template():
    model_card_html = markdown.markdown(empty_template)
    card_soup = BeautifulSoup(model_card_html, features="html.parser")
    results = EvaluationCheck().run_check(card_soup)
    assert results == EvaluationResult()
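
# Run with pytest from the repository root, e.g. (assuming this file lives
# under the project's tests/ directory):
#
#     python -m pytest tests/test_evaluation_check.py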