Spaces:

argilla
/

synthetic-data-generator

Running

sdiazlor HF staff committed 23 days ago

Commit

46f00bc

•

1 Parent(s): d27c1e6

fix: add seed for more randomized samples

Files changed (1) hide show

src/distilabel_dataset_generator/pipelines/textcat.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from typing import List
 import pandas as pd
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import (
     GenerateTextClassificationData,
@@ -88,6 +89,7 @@ def generate_pipeline_code(
     base_code = f"""
 # Requirements: `pip install distilabel[hf-inference-endpoints]`
 import os
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.pipeline import Pipeline
 from distilabel.steps import LoadDataFromDicts, KeepColumns
@@ -111,6 +113,8 @@ with Pipeline(name="textcat") as pipeline:
             generation_kwargs={{
                 "temperature": 0.8,
                 "max_new_tokens": 2048,
             }},
         ),
         difficulty={None if difficulty == "mixed" else repr(difficulty)},
@@ -151,6 +155,7 @@ with Pipeline(name="textcat") as pipeline:
             generation_kwargs={{
                 "temperature": 0.8,
                 "max_new_tokens": 2048,
             }},
         ),
         n={num_labels},
@@ -175,9 +180,10 @@ def get_textcat_generator(difficulty, clarity, is_sample):
             tokenizer_id=MODEL,
             api_key=_get_next_api_key(),
             generation_kwargs={
-                "temperature": 0.8,
                 "max_new_tokens": 256 if is_sample else 2048,
                 "do_sample": True,
             },
         ),
         difficulty=None if difficulty == "mixed" else difficulty,

 from typing import List
 import pandas as pd
+import random
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import (
     GenerateTextClassificationData,
     base_code = f"""
 # Requirements: `pip install distilabel[hf-inference-endpoints]`
 import os
+import random
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.pipeline import Pipeline
 from distilabel.steps import LoadDataFromDicts, KeepColumns
             generation_kwargs={{
                 "temperature": 0.8,
                 "max_new_tokens": 2048,
+                "do_sample": True,
+                "seed": random.randint(0, 2**32 - 1),
             }},
         ),
         difficulty={None if difficulty == "mixed" else repr(difficulty)},
             generation_kwargs={{
                 "temperature": 0.8,
                 "max_new_tokens": 2048,
+                "do_sample": True,
             }},
         ),
         n={num_labels},
             tokenizer_id=MODEL,
             api_key=_get_next_api_key(),
             generation_kwargs={
+                "temperature": 0.9,
                 "max_new_tokens": 256 if is_sample else 2048,
                 "do_sample": True,
+                "seed": random.randint(0, 2**32 - 1),
             },
         ),
         difficulty=None if difficulty == "mixed" else difficulty,