Commit
•
e36d40b
1
Parent(s):
ff44e29
fix: mappings
Browse files
feat: add max number of rows
src/distilabel_dataset_generator/sft.py
CHANGED
@@ -142,16 +142,10 @@ DEFAULT_DATASET = pd.DataFrame(
|
|
142 |
|
143 |
|
144 |
def _run_pipeline(result_queue, num_turns, num_rows, system_prompt, token: str = None):
|
145 |
-
|
146 |
-
{
|
147 |
-
|
148 |
-
|
149 |
-
}
|
150 |
-
if num_turns == 1
|
151 |
-
else {
|
152 |
-
"conversation": "messages",
|
153 |
-
}
|
154 |
-
)
|
155 |
with Pipeline(name="sft") as pipeline:
|
156 |
magpie = MagpieGenerator(
|
157 |
llm=InferenceEndpointsLLM(
|
@@ -181,7 +175,7 @@ def _run_pipeline(result_queue, num_turns, num_rows, system_prompt, token: str =
|
|
181 |
columns=list(output_mappings.values()) + ["model_name"],
|
182 |
)
|
183 |
magpie.connect(keep_columns)
|
184 |
-
distiset: Distiset = pipeline.run()
|
185 |
result_queue.put(distiset)
|
186 |
|
187 |
|
@@ -227,6 +221,16 @@ def generate_dataset(
|
|
227 |
raise gr.Error(
|
228 |
"Please sign in with Hugging Face to be able to push the dataset to the Hub."
|
229 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
|
231 |
gr.Info(
|
232 |
"Started pipeline execution. This might take a while, depending on the number of rows and turns you have selected. Don't close this page."
|
@@ -316,7 +320,7 @@ More information on distilabel and techniques can be found in the "FAQ" tab. The
|
|
316 |
num_turns = gr.Number(
|
317 |
value=1,
|
318 |
label="Number of turns in the conversation",
|
319 |
-
|
320 |
info="Whether the dataset is for a single turn with 'instruction-response' columns or a multi-turn conversation with a 'conversation' column.",
|
321 |
)
|
322 |
num_rows = gr.Number(
|
|
|
142 |
|
143 |
|
144 |
def _run_pipeline(result_queue, num_turns, num_rows, system_prompt, token: str = None):
|
145 |
+
if num_turns == 1:
|
146 |
+
output_mappings = {"instruction": "prompt", "response": "completion"}
|
147 |
+
else:
|
148 |
+
output_mappings = {"conversation": "messages"}
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
with Pipeline(name="sft") as pipeline:
|
150 |
magpie = MagpieGenerator(
|
151 |
llm=InferenceEndpointsLLM(
|
|
|
175 |
columns=list(output_mappings.values()) + ["model_name"],
|
176 |
)
|
177 |
magpie.connect(keep_columns)
|
178 |
+
distiset: Distiset = pipeline.run(use_cache=False)
|
179 |
result_queue.put(distiset)
|
180 |
|
181 |
|
|
|
221 |
raise gr.Error(
|
222 |
"Please sign in with Hugging Face to be able to push the dataset to the Hub."
|
223 |
)
|
224 |
+
if num_turns > 4:
|
225 |
+
raise gr.Info(
|
226 |
+
"You can only generate a dataset with 4 or fewer turns. Setting to 4."
|
227 |
+
)
|
228 |
+
num_turns = 4
|
229 |
+
if num_rows > 5000:
|
230 |
+
raise gr.Info(
|
231 |
+
"You can only generate a dataset with 5000 or fewer rows. Setting to 5000."
|
232 |
+
)
|
233 |
+
num_rows = 5000
|
234 |
|
235 |
gr.Info(
|
236 |
"Started pipeline execution. This might take a while, depending on the number of rows and turns you have selected. Don't close this page."
|
|
|
320 |
num_turns = gr.Number(
|
321 |
value=1,
|
322 |
label="Number of turns in the conversation",
|
323 |
+
maximum=4,
|
324 |
info="Whether the dataset is for a single turn with 'instruction-response' columns or a multi-turn conversation with a 'conversation' column.",
|
325 |
)
|
326 |
num_rows = gr.Number(
|