|
from typing import TYPE_CHECKING, Any, Dict, List, Union |
|
|
|
from distilabel.llms import InferenceEndpointsLLM |
|
from distilabel.pipeline import Pipeline |
|
from distilabel.steps import LoadDataFromDicts |
|
from distilabel.steps.tasks.base import Task |
|
from distilabel.steps import KeepColumns |
|
from distilabel.steps.base import StepResources |
|
|
|
from distilabel.steps.tasks.typing import ChatType |
|
from distilabel.steps.tasks import TextGeneration |
|
|
|
|
|
# System prompt for the text-to-persona task: three instruction sentences
# (separated by single spaces) followed by three example openings of persona
# definitions, one per line.
# NOTE(review): this is runtime text sent to the model, so the wording is kept
# verbatim, including the typos "assigning finding" and "pedriatric". Correct
# them only as a deliberate prompt change.
SYSTEM_PROMPT_TEXT_TO_PERSONA: str = "\n".join(
    (
        " ".join(
            (
                "You are an expert in analyzing the text content and assigning finding the general type of persona that could be associated with such a way of expressing.",
                "Please use one or two sentences for the definition, but try to make it as fine-grained if input texts involve many detailed elements.",
                "The persona definition must go straight to the point, be assertive. The following are starts of persona definitions:",
            )
        ),
        "A machine learning researcher...",
        "A pedriatric nurse whose...",
        "An urban planner focused on...",
    )
)
|
|
|
# User-message template for the text-to-persona task. The `{text}` placeholder
# is filled in with `str.format(text=...)`; the question and the "## Text"
# header are separated by an empty line.
TEXT_TO_PERSONA_PROMPT: str = "\n".join(
    (
        "What is the likely profession, interest, or role of the person who would write or be interested in this text?",
        "",
        "## Text",
        "{text}",
    )
)
|
|
|
|
|
class TextToPersona(Task):
    """Task that asks an LLM for a short persona definition matching a text.

    For each input row the task builds a two-message conversation: a system
    message holding `system_prompt` and a user message produced by filling
    `TEXT_TO_PERSONA_PROMPT` with the row's `text`. The raw generation is
    returned under the `persona` key.

    See Figure 3 in PersonaHub paper.

    Attributes:
        system_prompt: content of the system message. Defaults to
            `SYSTEM_PROMPT_TEXT_TO_PERSONA`.
    """

    system_prompt: str = SYSTEM_PROMPT_TEXT_TO_PERSONA

    @property
    def inputs(self) -> List[str]:
        """Names of the input columns the task reads: only `text`."""
        return ["text"]

    def format_input(self, input: Dict[str, Any]) -> "ChatType":
        """Build the chat conversation for one input row.

        Args:
            input: the input row; its `text` entry is inserted into the
                user prompt template.

        Returns:
            A two-element list: the system message followed by the user
            message.
        """
        system_message = {"role": "system", "content": self.system_prompt}
        user_message = {
            "role": "user",
            "content": TEXT_TO_PERSONA_PROMPT.format(text=input["text"]),
        }
        return [system_message, user_message]

    @property
    def outputs(self) -> List[str]:
        """Names of the output columns: `persona` and `model_name`."""
        # NOTE(review): `format_output` only sets `persona`; `model_name` is
        # presumably filled in by the base `Task` - confirm.
        return ["persona", "model_name"]

    def format_output(
        self, output: Union[str, None], input: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Wrap the raw LLM generation as the persona definition.

        Args:
            output: the raw output of the LLM; passed through unchanged,
                including when it is `None`.
            input: the input to the task. Not used by this method.

        Returns:
            A dict with the persona definition under the `persona` key.
        """
        persona = output
        return {"persona": persona}
|
|