from typing import TYPE_CHECKING, Any, Dict, List, Union

from distilabel.llms import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks.base import Task
from distilabel.steps import KeepColumns
from distilabel.steps.base import StepResources
from distilabel.steps.tasks.typing import ChatType
from distilabel.steps.tasks import TextGeneration


# System message sent with every request built by `TextToPersona.format_input`.
# NOTE(review): the wording (including typos such as "pedriatric") is kept
# verbatim because it is part of what the model receives; change it deliberately,
# not as a cosmetic clean-up.
SYSTEM_PROMPT_TEXT_TO_PERSONA: str = (
    "You are an expert in analyzing the text content and assigning finding the general type of persona that could be associated with such a way of expressing. "
    "Please use one or two sentences for the definition, but try to make it as fine-grained if input texts involve many detailed elements. "
    "The persona definition must go straight to the point, be assertive. The following are starts of persona definitions:\n"
    "A machine learning researcher...\n"
    "A pedriatric nurse whose...\n"
    "An urban planner focused on..."
)

# User message template; `{text}` is filled with the row's `text` column.
TEXT_TO_PERSONA_PROMPT: str = (
    "What is the likely profession, interest, or role of the person who would write or be interested in this text?\n\n"
    "## Text\n"
    "{text}"
)


class TextToPersona(Task):
    """Task that asks an LLM for a short persona definition matching a text.

    For each input row the task builds a two-message chat: a system message
    (`system_prompt`) instructing the model to describe, in one or two
    assertive sentences, the kind of persona that would write or be interested
    in the text, and a user message containing the text itself.

    See Figure 3 in PersonaHub paper.
    """

    # Defaults to the module-level system prompt; used as the system message
    # in `format_input`.
    system_prompt: str = SYSTEM_PROMPT_TEXT_TO_PERSONA

    @property
    def inputs(self) -> List[str]:
        """Return the input column names required by the task: `text`."""
        return ["text"]

    def format_input(self, input: Dict[str, Any]) -> "ChatType":
        """Build the chat messages sent to the LLM for one input row.

        Args:
            input: the input row; its `text` value is inserted into
                `TEXT_TO_PERSONA_PROMPT`.

        Returns:
            A two-item list: the system message holding `self.system_prompt`,
            followed by the user message holding the formatted prompt.

        Raises:
            KeyError: if `input` has no `text` key.
        """
        return [
            {"role": "system", "content": self.system_prompt},
            {
                "role": "user",
                "content": TEXT_TO_PERSONA_PROMPT.format(text=input["text"]),  # type: ignore
            },
        ]

    @property
    def outputs(self) -> List[str]:
        """Return the output column names: `persona` and `model_name`.

        NOTE(review): `format_output` only produces `persona`; `model_name` is
        presumably filled in by the `Task` base class -- confirm against the
        distilabel version in use.
        """
        return ["persona", "model_name"]

    def format_output(
        self, output: Union[str, None], input: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Wrap the raw LLM output as the persona definition.

        Args:
            output: the raw output of the LLM. It is stored as-is, so a `None`
                output yields a `None` persona.
            input: the input to the task. Not used here.

        Returns:
            A dict with the persona definition under the `persona` key.
        """
        return {"persona": output}