fix: apply feedback
Browse files
src/distilabel_dataset_generator/apps/base.py
CHANGED
@@ -38,8 +38,8 @@ def get_main_ui(
|
|
38 |
if task == TEXTCAT_TASK:
|
39 |
result = fn_generate_dataset(
|
40 |
system_prompt=system_prompt,
|
41 |
-
difficulty="
|
42 |
-
clarity="
|
43 |
labels=[],
|
44 |
num_labels=1,
|
45 |
num_rows=1,
|
@@ -271,7 +271,11 @@ def get_iterate_on_sample_dataset_ui(
|
|
271 |
with gr.Row():
|
272 |
sample_dataset = gr.Dataframe(
|
273 |
value=default_datasets[0],
|
274 |
-
label=
|
|
|
|
|
|
|
|
|
275 |
interactive=False,
|
276 |
wrap=True,
|
277 |
)
|
|
|
38 |
if task == TEXTCAT_TASK:
|
39 |
result = fn_generate_dataset(
|
40 |
system_prompt=system_prompt,
|
41 |
+
difficulty="high school",
|
42 |
+
clarity="clear",
|
43 |
labels=[],
|
44 |
num_labels=1,
|
45 |
num_rows=1,
|
|
|
271 |
with gr.Row():
|
272 |
sample_dataset = gr.Dataframe(
|
273 |
value=default_datasets[0],
|
274 |
+
label=(
|
275 |
+
"Sample dataset. Text truncated to 256 tokens."
|
276 |
+
if task == TEXTCAT_TASK
|
277 |
+
else "Sample dataset. Prompts and completions truncated to 256 tokens."
|
278 |
+
),
|
279 |
interactive=False,
|
280 |
wrap=True,
|
281 |
)
|
src/distilabel_dataset_generator/apps/textcat.py
CHANGED
@@ -215,7 +215,6 @@ def generate_dataset(
|
|
215 |
system_prompt=system_prompt,
|
216 |
labels=labels,
|
217 |
num_labels=num_labels,
|
218 |
-
is_sample=is_sample,
|
219 |
)
|
220 |
total_steps: int = num_rows * 2
|
221 |
batch_size = DEFAULT_BATCH_SIZE
|
@@ -309,6 +308,9 @@ def validate_input_labels(labels):
|
|
309 |
)
|
310 |
return labels
|
311 |
|
|
|
|
|
|
|
312 |
|
313 |
(
|
314 |
app,
|
@@ -354,7 +356,7 @@ with app:
|
|
354 |
],
|
355 |
value="mixed",
|
356 |
label="Difficulty",
|
357 |
-
info="
|
358 |
)
|
359 |
clarity = gr.Dropdown(
|
360 |
choices=[
|
@@ -368,7 +370,7 @@ with app:
|
|
368 |
],
|
369 |
value="mixed",
|
370 |
label="Clarity",
|
371 |
-
info="
|
372 |
)
|
373 |
with gr.Column():
|
374 |
labels = gr.Dropdown(
|
@@ -385,18 +387,18 @@ with app:
|
|
385 |
size="sm",
|
386 |
)
|
387 |
num_labels = gr.Number(
|
388 |
-
label="Number of labels",
|
389 |
value=1,
|
390 |
minimum=1,
|
391 |
maximum=10,
|
392 |
-
info="
|
393 |
)
|
394 |
num_rows = gr.Number(
|
395 |
label="Number of rows",
|
396 |
value=10,
|
397 |
minimum=1,
|
398 |
maximum=500,
|
399 |
-
info="More rows will take
|
400 |
)
|
401 |
|
402 |
pipeline_code = get_pipeline_code_ui(
|
@@ -415,6 +417,10 @@ with app:
|
|
415 |
fn=update_suggested_labels,
|
416 |
inputs=[system_prompt],
|
417 |
outputs=labels,
|
|
|
|
|
|
|
|
|
418 |
)
|
419 |
|
420 |
gr.on(
|
@@ -540,6 +546,10 @@ with app:
|
|
540 |
fn=generate_pipeline_code,
|
541 |
inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
|
542 |
outputs=[pipeline_code],
|
|
|
|
|
|
|
|
|
543 |
)
|
544 |
num_labels.change(
|
545 |
fn=generate_pipeline_code,
|
|
|
215 |
system_prompt=system_prompt,
|
216 |
labels=labels,
|
217 |
num_labels=num_labels,
|
|
|
218 |
)
|
219 |
total_steps: int = num_rows * 2
|
220 |
batch_size = DEFAULT_BATCH_SIZE
|
|
|
308 |
)
|
309 |
return labels
|
310 |
|
311 |
+
def update_max_num_labels(labels):
|
312 |
+
return gr.update(maximum=len(labels) if labels else 1)
|
313 |
+
|
314 |
|
315 |
(
|
316 |
app,
|
|
|
356 |
],
|
357 |
value="mixed",
|
358 |
label="Difficulty",
|
359 |
+
info="Select the comprehension level for the text. Ensure it matches the task context.",
|
360 |
)
|
361 |
clarity = gr.Dropdown(
|
362 |
choices=[
|
|
|
370 |
],
|
371 |
value="mixed",
|
372 |
label="Clarity",
|
373 |
+
info="Set how easily the correct label can be identified.",
|
374 |
)
|
375 |
with gr.Column():
|
376 |
labels = gr.Dropdown(
|
|
|
387 |
size="sm",
|
388 |
)
|
389 |
num_labels = gr.Number(
|
390 |
+
label="Number of labels per text",
|
391 |
value=1,
|
392 |
minimum=1,
|
393 |
maximum=10,
|
394 |
+
info="Select 1 for single-label and >1 for multi-label.",
|
395 |
)
|
396 |
num_rows = gr.Number(
|
397 |
label="Number of rows",
|
398 |
value=10,
|
399 |
minimum=1,
|
400 |
maximum=500,
|
401 |
+
info="Select the number of rows in the dataset. More rows will take more time.",
|
402 |
)
|
403 |
|
404 |
pipeline_code = get_pipeline_code_ui(
|
|
|
417 |
fn=update_suggested_labels,
|
418 |
inputs=[system_prompt],
|
419 |
outputs=labels,
|
420 |
+
).then(
|
421 |
+
fn=update_max_num_labels,
|
422 |
+
inputs=[labels],
|
423 |
+
outputs=[num_labels],
|
424 |
)
|
425 |
|
426 |
gr.on(
|
|
|
546 |
fn=generate_pipeline_code,
|
547 |
inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
|
548 |
outputs=[pipeline_code],
|
549 |
+
).then(
|
550 |
+
fn=update_max_num_labels,
|
551 |
+
inputs=[labels],
|
552 |
+
outputs=[num_labels],
|
553 |
)
|
554 |
num_labels.change(
|
555 |
fn=generate_pipeline_code,
|
src/distilabel_dataset_generator/pipelines/textcat.py
CHANGED
@@ -176,7 +176,8 @@ def get_textcat_generator(difficulty, clarity, is_sample):
|
|
176 |
api_key=_get_next_api_key(),
|
177 |
generation_kwargs={
|
178 |
"temperature": 0.8,
|
179 |
-
"max_new_tokens": 256 if is_sample else
|
|
|
180 |
},
|
181 |
),
|
182 |
difficulty=None if difficulty == "mixed" else difficulty,
|
@@ -186,7 +187,7 @@ def get_textcat_generator(difficulty, clarity, is_sample):
|
|
186 |
return textcat_generator
|
187 |
|
188 |
|
189 |
-
def get_labeller_generator(system_prompt, labels, num_labels, is_sample):
|
190 |
labeller_generator = TextClassification(
|
191 |
llm=InferenceEndpointsLLM(
|
192 |
model_id=MODEL,
|
@@ -194,7 +195,7 @@ def get_labeller_generator(system_prompt, labels, num_labels, is_sample):
|
|
194 |
api_key=_get_next_api_key(),
|
195 |
generation_kwargs={
|
196 |
"temperature": 0.8,
|
197 |
-
"max_new_tokens":
|
198 |
},
|
199 |
),
|
200 |
context=system_prompt,
|
|
|
176 |
api_key=_get_next_api_key(),
|
177 |
generation_kwargs={
|
178 |
"temperature": 0.8,
|
179 |
+
"max_new_tokens": 256 if is_sample else 2048,
|
180 |
+
"do_sample": True,
|
181 |
},
|
182 |
),
|
183 |
difficulty=None if difficulty == "mixed" else difficulty,
|
|
|
187 |
return textcat_generator
|
188 |
|
189 |
|
190 |
+
def get_labeller_generator(system_prompt, labels, num_labels):
|
191 |
labeller_generator = TextClassification(
|
192 |
llm=InferenceEndpointsLLM(
|
193 |
model_id=MODEL,
|
|
|
195 |
api_key=_get_next_api_key(),
|
196 |
generation_kwargs={
|
197 |
"temperature": 0.8,
|
198 |
+
"max_new_tokens": 2048,
|
199 |
},
|
200 |
),
|
201 |
context=system_prompt,
|