Spaces:
Running
on
Zero
Running
on
Zero
DongfuJiang
committed on
Commit
•
152a862
1
Parent(s):
7ad9a04
update
Browse files
- app.py +57 -17
- requirements.txt +5 -2
app.py
CHANGED
@@ -2,10 +2,12 @@ import os
|
|
2 |
# os.environ["HF_HOME"] = "/data/.huggingface"
|
3 |
import gradio as gr
|
4 |
import sys
|
|
|
5 |
from datasets import load_dataset
|
6 |
from typing import List
|
7 |
-
from
|
8 |
-
|
|
|
9 |
|
10 |
DESCRIPTIONS = """
|
11 |
We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction **G**uidance to perform **E**xplainable, and **R**eference-free evaluation over a wide spectrum of text generation tasks. Different from other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by the natural language instruction to provide error analysis to pinpoint the mistakes in the generated text.
|
@@ -24,18 +26,56 @@ for i, ex in enumerate(SHUFFLED_EXAMPLES_DATASET.take(100)):
|
|
24 |
# continue
|
25 |
EXAMPLES.append([ex[field] for field in fields])
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
|
41 |
def get_examples(inst_textbox, input_textbox, hypo_output_textbox):
|
@@ -66,7 +106,7 @@ with gr.Blocks(theme='gradio/soft') as demo:
|
|
66 |
minimum=256,
|
67 |
maximum=1024,
|
68 |
step=1,
|
69 |
-
value=
|
70 |
)
|
71 |
temperature = gr.Slider(
|
72 |
label='Temperature of generation',
|
@@ -85,10 +125,10 @@ with gr.Blocks(theme='gradio/soft') as demo:
|
|
85 |
|
86 |
gr.Markdown("## TIGERScore Outputs")
|
87 |
evaluation_output_textbox = gr.Textbox(lines=4, label="Evaluation Output", placeholder="Evaluation output", show_label=True)
|
88 |
-
|
89 |
|
90 |
submit_button.click(
|
91 |
-
fn=
|
92 |
inputs=[input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
|
93 |
outputs=evaluation_output_textbox,
|
94 |
)
|
|
|
2 |
# os.environ["HF_HOME"] = "/data/.huggingface"
|
3 |
import gradio as gr
|
4 |
import sys
|
5 |
+
import copy
|
6 |
from datasets import load_dataset
|
7 |
from typing import List
|
8 |
+
from llama_cpp import Llama
|
9 |
+
from huggingface_hub import hf_hub_download
|
10 |
+
from string import Template
|
11 |
|
12 |
DESCRIPTIONS = """
|
13 |
We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction **G**uidance to perform **E**xplainable, and **R**eference-free evaluation over a wide spectrum of text generation tasks. Different from other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by the natural language instruction to provide error analysis to pinpoint the mistakes in the generated text.
|
|
|
26 |
# continue
|
27 |
EXAMPLES.append([ex[field] for field in fields])
|
28 |
|
29 |
+
TEMPLATE = """You are evaluating errors in a model-generated output for a given instruction.
|
30 |
+
Instruction:
|
31 |
+
${generation_instruction}
|
32 |
+
${input_context}
|
33 |
+
|
34 |
+
Model-generated Output:
|
35 |
+
${hypothesis_output}
|
36 |
+
|
37 |
+
For each error you give in the response, please also elaborate the following information:
|
38 |
+
- error location (the words that are wrong in the output)
|
39 |
+
- error aspect it belongs to.
|
40 |
+
- explanation why it's an error, and the correction suggestions.
|
41 |
+
- severity of the error ("Major" or "Minor").
|
42 |
+
- reduction of score (between 0.5 and 5 given the severity of the error)
|
43 |
+
|
44 |
+
Your evaluation output:
|
45 |
+
"""
|
46 |
+
|
47 |
+
llm = Llama(
|
48 |
+
model_path=hf_hub_download(
|
49 |
+
repo_id=os.environ.get("REPO_ID", "TIGER-Lab/TIGERScore-7B-GGUF"),
|
50 |
+
filename=os.environ.get("MODEL_FILE", "ggml-model-q4_0.gguf"),
|
51 |
+
),
|
52 |
+
n_ctx=2048,
|
53 |
+
# n_gpu_layers=50, # change n_gpu_layers if you have more or less VRAM
|
54 |
+
)
|
55 |
+
|
56 |
+
def generate_text(input_context, generation_instruction, hypo_output, max_new_tokens=1024, temperature=0.7, top_p=1.0):
    """Stream a TIGERScore evaluation for one (instruction, input, output) triple.

    The three text fields are substituted into the module-level ``TEMPLATE``
    and the resulting prompt is sent to the module-level ``llm`` in
    streaming mode.

    Args:
        input_context: Source/input text the instruction was applied to.
        generation_instruction: Instruction the evaluated model was given.
        hypo_output: Model-generated output to be evaluated.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Yields:
        The cumulative evaluation text generated so far, once per streamed
        chunk, so a UI bound to this generator can refresh progressively.
    """
    prompt = Template(TEMPLATE).substitute(
        generation_instruction=generation_instruction,
        input_context=input_context,
        hypothesis_output=hypo_output,
    ).strip("\n ")

    gen_params = {
        # UI sliders may deliver the value as a float; the completion API
        # expects an integer token count.
        "max_tokens": int(max_new_tokens),
        "top_p": top_p,
        "top_k": 40,
        "temperature": temperature,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0,
        "echo": False,
        "stream": True,
    }

    generated = ""
    # Each chunk is only read, never mutated, so no defensive copy is needed.
    for chunk in llm(prompt, **gen_params):
        generated += chunk["choices"][0]["text"]
        yield generated
|
79 |
|
80 |
|
81 |
def get_examples(inst_textbox, input_textbox, hypo_output_textbox):
|
|
|
106 |
minimum=256,
|
107 |
maximum=1024,
|
108 |
step=1,
|
109 |
+
value=1024,
|
110 |
)
|
111 |
temperature = gr.Slider(
|
112 |
label='Temperature of generation',
|
|
|
125 |
|
126 |
gr.Markdown("## TIGERScore Outputs")
|
127 |
evaluation_output_textbox = gr.Textbox(lines=4, label="Evaluation Output", placeholder="Evaluation output", show_label=True)
|
128 |
+
|
129 |
|
130 |
submit_button.click(
|
131 |
+
fn=generate_text,
|
132 |
inputs=[input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
|
133 |
outputs=evaluation_output_textbox,
|
134 |
)
|
requirements.txt
CHANGED
@@ -1,2 +1,5 @@
|
|
1 |
-
|
2 |
-
|
|
|
|
|
|
|
|
1 |
+
datasets==2.14.5
|
2 |
+
torch
|
3 |
+
transformers
|
4 |
+
llama
|
5 |
+
llama-cpp-python
|