burtenshaw HF staff committed on
Commit
7b3a105
1 Parent(s): df69857

first commit

Browse files
Files changed (2) hide show
  1. app.py +147 -0
  2. personas.py +71 -0
app.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import io
import multiprocessing
import os
import queue
import time

import gradio as gr
import nltk
import pandas as pd
from distilabel.llms import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns, LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration
from unstructured.partition.pdf import partition_pdf

from personas import *  # noqa: F401,F403 -- provides TextToPersona
16
+
17
+ nltk.download("punkt", quiet=True)
18
+
19
+ PROMPT_TEMPLATE = """\
20
+ Generate a single prompt the persona below might ask to an AI assistant:
21
+
22
+ {{ persona }}
23
+ """
24
+
25
+ # Get HF_TOKEN from environment variable
26
+ HF_TOKEN = os.environ.get("HF_TOKEN")
27
+
28
+
29
def process_pdfs(pdf_files):
    """Extract the plain text of each uploaded PDF.

    Args:
        pdf_files: iterable of uploaded file objects; each must expose a
            ``.name`` attribute holding the on-disk path (Gradio file
            objects do).

    Returns:
        A list with one dict per PDF, shaped ``{"text": <document text>}``,
        with surrounding whitespace stripped.
    """
    all_data = []
    for pdf_file in pdf_files:
        # partition_pdf splits the document into layout elements (titles,
        # paragraphs, ...); only their raw text is needed here.
        elements = partition_pdf(pdf_file.name)
        # Join once instead of quadratic += concatenation; skip elements
        # whose .text is None or empty so they cannot break the join.
        full_text = "\n".join(element.text for element in elements if element.text)
        all_data.append({"text": full_text.strip()})
    return all_data
41
+
42
+
43
def _run_pipeline(result_queue, pdf_files):
    """Build and execute the distilabel pipeline (runs in a child process).

    Args:
        result_queue: multiprocessing.Queue the resulting distiset is put on
            so the parent process (generate_dataset) can collect it.
        pdf_files: uploaded PDF file objects, converted to text rows first.
    """
    data = process_pdfs(pdf_files)

    with Pipeline(name="personahub-fineweb-edu-text-to-persona") as pipeline:
        input_batch_size = 10

        # Feed the extracted {"text": ...} rows into the pipeline.
        data_loader = LoadDataFromDicts(data=data)

        # Single serverless Inference Endpoints LLM shared by all LLM steps.
        llm = InferenceEndpointsLLM(
            model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
            api_key=HF_TOKEN,
        )

        # text -> persona description (custom task defined in personas.py).
        text_to_persona = TextToPersona(
            llm=llm,
            input_batch_size=input_batch_size,
        )

        # persona -> a prompt that persona might ask; the generated column is
        # renamed to "instruction" for the next step.
        text_gen = TextGeneration(
            llm=llm,
            system_prompt="You are an AI assistant expert at simulating user interactions.",
            template=PROMPT_TEMPLATE,
            columns="persona",
            output_mappings={"generation": "instruction"},
            num_generations=1,
        )

        # instruction -> assistant response. No template/columns given, so
        # this presumably relies on TextGeneration's default "instruction"
        # input column — TODO confirm against the distilabel version pinned.
        response_gen = TextGeneration(
            llm=llm,
            system_prompt="You are an AI assistant expert in responding to tasks",
            output_mappings={"generation": "response"},
        )

        # Drop intermediate columns from the final dataset.
        keep = KeepColumns(
            columns=["text", "persona", "model_name", "instruction", "response"],
            input_batch_size=input_batch_size,
        )

        # Wire the steps into a linear DAG.
        (data_loader >> text_to_persona >> text_gen >> response_gen >> keep)

    distiset = pipeline.run(use_cache=False)
    result_queue.put(distiset)
85
+
86
+
87
def generate_dataset(pdf_files, progress=gr.Progress()):
    """Run the distilabel pipeline in a child process and return the dataset.

    Args:
        pdf_files: uploaded PDF file objects, forwarded to the worker.
        progress: Gradio progress tracker used for coarse status updates.

    Returns:
        pandas.DataFrame with the generated "default"/"train" split.

    Raises:
        gr.Error: if the worker fails or produces no result.
    """
    result_queue = multiprocessing.Queue()
    p = multiprocessing.Process(
        target=_run_pipeline,
        args=(result_queue, pdf_files),
    )

    try:
        p.start()
        # Coarse, time-based progress: real pipeline progress is not
        # observable from here, so tick for up to ~200s while the worker
        # runs. is_alive() alone is sufficient to detect completion — no
        # need to poke the private p._popen handle.
        total_steps = 100
        for step in range(total_steps):
            if not p.is_alive():
                break
            progress(
                (step + 1) / total_steps,
                desc="Generating dataset. Don't close this window.",
            )
            time.sleep(2)  # Adjust this value based on your needs
        p.join()
    except Exception as e:
        raise gr.Error(f"An error occurred during dataset generation: {str(e)}")

    # A crashed worker leaves the queue empty; fail loudly instead of
    # blocking forever on a bare get().
    try:
        distiset = result_queue.get(timeout=5)
    except queue.Empty:
        raise gr.Error(
            f"Dataset generation produced no result (worker exit code {p.exitcode})."
        )
    df = distiset["default"]["train"].to_pandas()
    progress(1.0, desc="Dataset generation completed")
    return df
113
+
114
+
115
def gradio_interface(pdf_files):
    """Validate configuration, then build the dataset for the uploaded PDFs.

    Raises:
        gr.Error: when no HF_TOKEN is configured in the environment.
    """
    # Fail fast before spawning the worker process.
    if HF_TOKEN is None:
        raise gr.Error(
            "HF_TOKEN environment variable is not set. Please set it and restart the application."
        )
    return generate_dataset(pdf_files)
122
+
123
+
124
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks(title="MyPersonas Dataset Generator") as app:
    gr.Markdown("# MyPersonas Dataset Generator")
    gr.Markdown("Upload one or more PDFs to generate a persona based SFT dataset.")

    with gr.Row():
        # Multiple PDFs allowed; the file objects are passed to
        # gradio_interface on click.
        pdf_files = gr.File(label="Upload PDFs", file_count="multiple")

    with gr.Row():
        generate_button = gr.Button("Generate Dataset")

    # Result table: read-only, with long text wrapped for readability.
    output_dataframe = gr.DataFrame(
        label="Generated Dataset",
        interactive=False,
        wrap=True,
    )

    generate_button.click(
        fn=gradio_interface,
        inputs=[pdf_files],
        outputs=[output_dataframe],
    )

if __name__ == "__main__":
    app.launch()
personas.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING, Any, Dict, List, Union
2
+
3
+ from distilabel.llms import InferenceEndpointsLLM
4
+ from distilabel.pipeline import Pipeline
5
+ from distilabel.steps import LoadDataFromDicts
6
+ from distilabel.steps.tasks.base import Task
7
+ from distilabel.steps import KeepColumns
8
+ from distilabel.steps.base import StepResources
9
+
10
+ from distilabel.steps.tasks.typing import ChatType
11
+ from distilabel.steps.tasks import TextGeneration
12
+
13
+
14
# System prompt for the TextToPersona task (text-to-persona prompt in the
# style of the PersonaHub paper). Fixes two defects in the LLM-facing text:
# the garbled "assigning finding the" and the misspelling "pedriatric".
SYSTEM_PROMPT_TEXT_TO_PERSONA: str = (
    "You are an expert in analyzing the text content and finding the general type of persona that could be associated with such a way of expressing. "
    "Please use one or two sentences for the definition, but try to make it as fine-grained if input texts involve many detailed elements. "
    "The persona definition must go straight to the point, be assertive. The following are starts of persona definitions:\n"
    "A machine learning researcher...\n"
    "A pediatric nurse whose...\n"
    "An urban planner focused on..."
)

# User prompt; {text} is filled with the extracted document text in
# TextToPersona.format_input.
TEXT_TO_PERSONA_PROMPT: str = (
    "What is the likely profession, interest, or role of the person who would write or be interested in this text?\n\n"
    "## Text\n"
    "{text}"
)
28
+
29
+
30
class TextToPersona(Task):
    """Infer the persona that could plausibly have produced a given text.

    Given an input ``text`` column, asks the LLM for a one- or two-sentence,
    assertive persona definition (e.g. "A machine learning researcher...",
    "A pediatric nurse whose...", "An urban planner focused on...").
    See Figure 3 in the PersonaHub paper.
    """

    # Default system prompt; a distilabel Task field, so it can be
    # overridden per instance.
    system_prompt: str = SYSTEM_PROMPT_TEXT_TO_PERSONA

    @property
    def inputs(self) -> List[str]:
        """The inputs for the task are the `text`."""
        return ["text"]

    def format_input(self, input: Dict[str, Any]) -> "ChatType":
        """Format the row as a system + user chat for the LLM."""
        return [
            {"role": "system", "content": self.system_prompt},
            {
                "role": "user",
                "content": TEXT_TO_PERSONA_PROMPT.format(text=input["text"]),  # type: ignore
            },
        ]

    @property
    def outputs(self) -> List[str]:
        """The output for the task is the persona definition."""
        # model_name is filled in by the distilabel Task machinery,
        # not by format_output below.
        return ["persona", "model_name"]

    def format_output(
        self, output: Union[str, None], input: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Wrap the raw LLM completion as the persona column.

        Args:
            output: the raw output of the LLM (None when generation failed).
            input: the input to the task (unused here).

        Returns:
            A dict with the persona definition.
        """
        return {"persona": output}