davidberenstein1957 HF staff committed on
Commit
e36d40b
1 Parent(s): ff44e29

fix: mappings

Browse files

feat: add max number of rows

src/distilabel_dataset_generator/sft.py CHANGED
@@ -142,16 +142,10 @@ DEFAULT_DATASET = pd.DataFrame(
142
 
143
 
144
  def _run_pipeline(result_queue, num_turns, num_rows, system_prompt, token: str = None):
145
- output_mappings = (
146
- {
147
- "instruction": "prompt",
148
- "response": "completion",
149
- }
150
- if num_turns == 1
151
- else {
152
- "conversation": "messages",
153
- }
154
- )
155
  with Pipeline(name="sft") as pipeline:
156
  magpie = MagpieGenerator(
157
  llm=InferenceEndpointsLLM(
@@ -181,7 +175,7 @@ def _run_pipeline(result_queue, num_turns, num_rows, system_prompt, token: str =
181
  columns=list(output_mappings.values()) + ["model_name"],
182
  )
183
  magpie.connect(keep_columns)
184
- distiset: Distiset = pipeline.run()
185
  result_queue.put(distiset)
186
 
187
 
@@ -227,6 +221,16 @@ def generate_dataset(
227
  raise gr.Error(
228
  "Please sign in with Hugging Face to be able to push the dataset to the Hub."
229
  )
 
 
 
 
 
 
 
 
 
 
230
 
231
  gr.Info(
232
  "Started pipeline execution. This might take a while, depending on the number of rows and turns you have selected. Don't close this page."
@@ -316,7 +320,7 @@ More information on distilabel and techniques can be found in the "FAQ" tab. The
316
  num_turns = gr.Number(
317
  value=1,
318
  label="Number of turns in the conversation",
319
- minimum=1,
320
  info="Whether the dataset is for a single turn with 'instruction-response' columns or a multi-turn conversation with a 'conversation' column.",
321
  )
322
  num_rows = gr.Number(
 
142
 
143
 
144
  def _run_pipeline(result_queue, num_turns, num_rows, system_prompt, token: str = None):
145
+ if num_turns == 1:
146
+ output_mappings = {"instruction": "prompt", "response": "completion"}
147
+ else:
148
+ output_mappings = {"conversation": "messages"}
 
 
 
 
 
 
149
  with Pipeline(name="sft") as pipeline:
150
  magpie = MagpieGenerator(
151
  llm=InferenceEndpointsLLM(
 
175
  columns=list(output_mappings.values()) + ["model_name"],
176
  )
177
  magpie.connect(keep_columns)
178
+ distiset: Distiset = pipeline.run(use_cache=False)
179
  result_queue.put(distiset)
180
 
181
 
 
221
  raise gr.Error(
222
  "Please sign in with Hugging Face to be able to push the dataset to the Hub."
223
  )
224
+ if num_turns > 4:
225
+ raise gr.Info(
226
+ "You can only generate a dataset with 4 or fewer turns. Setting to 4."
227
+ )
228
+ num_turns = 4
229
+ if num_rows > 5000:
230
+ raise gr.Info(
231
+ "You can only generate a dataset with 5000 or fewer rows. Setting to 5000."
232
+ )
233
+ num_rows = 5000
234
 
235
  gr.Info(
236
  "Started pipeline execution. This might take a while, depending on the number of rows and turns you have selected. Don't close this page."
 
320
  num_turns = gr.Number(
321
  value=1,
322
  label="Number of turns in the conversation",
323
+ maximum=4,
324
  info="Whether the dataset is for a single turn with 'instruction-response' columns or a multi-turn conversation with a 'conversation' column.",
325
  )
326
  num_rows = gr.Number(