burtenshaw HF staff committed on
Commit
7b3a105
1 Parent(s): df69857

first commit

Browse files
Files changed (2) hide show
  1. app.py +147 -0
  2. personas.py +71 -0
app.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import io
import multiprocessing
import os
import queue
import time

import gradio as gr
import nltk
import pandas as pd
from distilabel.llms import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns, LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration
from unstructured.partition.pdf import partition_pdf

from personas import *  # noqa: F401,F403 -- provides TextToPersona
16
+
17
+ nltk.download("punkt", quiet=True)
18
+
19
+ PROMPT_TEMPLATE = """\
20
+ Generate a single prompt the persona below might ask to an AI assistant:
21
+
22
+ {{ persona }}
23
+ """
24
+
25
+ # Get HF_TOKEN from environment variable
26
+ HF_TOKEN = os.environ.get("HF_TOKEN")
27
+
28
+
29
def process_pdfs(pdf_files):
    """Extract the plain text of each uploaded PDF.

    Args:
        pdf_files: iterable of uploaded file objects; each must expose a
            ``.name`` attribute holding the on-disk path (Gradio file
            objects do).

    Returns:
        A list with one dict per PDF, shaped ``{"text": <document text>}``,
        with surrounding whitespace stripped.
    """
    all_data = []
    for pdf_file in pdf_files:
        # partition_pdf splits the document into layout elements (titles,
        # paragraphs, ...); only their raw text is needed here.
        elements = partition_pdf(pdf_file.name)
        # Join once instead of quadratic += concatenation; skip elements
        # whose .text is None or empty so they cannot break the join.
        full_text = "\n".join(element.text for element in elements if element.text)
        all_data.append({"text": full_text.strip()})
    return all_data
41
+
42
+
43
def _run_pipeline(result_queue, pdf_files):
    """Build and execute the distilabel pipeline (runs in a child process).

    Args:
        result_queue: multiprocessing.Queue the resulting distiset is put on
            so the parent process (generate_dataset) can collect it.
        pdf_files: uploaded PDF file objects, converted to text rows first.
    """
    data = process_pdfs(pdf_files)

    with Pipeline(name="personahub-fineweb-edu-text-to-persona") as pipeline:
        input_batch_size = 10

        # Feed the extracted {"text": ...} rows into the pipeline.
        data_loader = LoadDataFromDicts(data=data)

        # Single serverless Inference Endpoints LLM shared by all LLM steps.
        llm = InferenceEndpointsLLM(
            model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
            api_key=HF_TOKEN,
        )

        # text -> persona description (custom task defined in personas.py).
        text_to_persona = TextToPersona(
            llm=llm,
            input_batch_size=input_batch_size,
        )

        # persona -> a prompt that persona might ask; the generated column is
        # renamed to "instruction" for the next step.
        text_gen = TextGeneration(
            llm=llm,
            system_prompt="You are an AI assistant expert at simulating user interactions.",
            template=PROMPT_TEMPLATE,
            columns="persona",
            output_mappings={"generation": "instruction"},
            num_generations=1,
        )

        # instruction -> assistant response. No template/columns given, so
        # this presumably relies on TextGeneration's default "instruction"
        # input column — TODO confirm against the distilabel version pinned.
        response_gen = TextGeneration(
            llm=llm,
            system_prompt="You are an AI assistant expert in responding to tasks",
            output_mappings={"generation": "response"},
        )

        # Drop intermediate columns from the final dataset.
        keep = KeepColumns(
            columns=["text", "persona", "model_name", "instruction", "response"],
            input_batch_size=input_batch_size,
        )

        # Wire the steps into a linear DAG.
        (data_loader >> text_to_persona >> text_gen >> response_gen >> keep)

    distiset = pipeline.run(use_cache=False)
    result_queue.put(distiset)
85
+
86
+
87
def generate_dataset(pdf_files, progress=gr.Progress()):
    """Run the distilabel pipeline in a child process and return the dataset.

    Args:
        pdf_files: uploaded PDF file objects, forwarded to the worker.
        progress: Gradio progress tracker used for coarse status updates.

    Returns:
        pandas.DataFrame with the generated "default"/"train" split.

    Raises:
        gr.Error: if the worker fails or produces no result.
    """
    result_queue = multiprocessing.Queue()
    p = multiprocessing.Process(
        target=_run_pipeline,
        args=(result_queue, pdf_files),
    )

    try:
        p.start()
        # Coarse, time-based progress: real pipeline progress is not
        # observable from here, so tick for up to ~200s while the worker
        # runs. is_alive() alone is sufficient to detect completion — no
        # need to poke the private p._popen handle.
        total_steps = 100
        for step in range(total_steps):
            if not p.is_alive():
                break
            progress(
                (step + 1) / total_steps,
                desc="Generating dataset. Don't close this window.",
            )
            time.sleep(2)  # Adjust this value based on your needs
        p.join()
    except Exception as e:
        raise gr.Error(f"An error occurred during dataset generation: {str(e)}")

    # A crashed worker leaves the queue empty; fail loudly instead of
    # blocking forever on a bare get().
    try:
        distiset = result_queue.get(timeout=5)
    except queue.Empty:
        raise gr.Error(
            f"Dataset generation produced no result (worker exit code {p.exitcode})."
        )
    df = distiset["default"]["train"].to_pandas()
    progress(1.0, desc="Dataset generation completed")
    return df
113
+
114
+
115
def gradio_interface(pdf_files):
    """Validate configuration, then build the dataset for the uploaded PDFs.

    Raises:
        gr.Error: when no HF_TOKEN is configured in the environment.
    """
    # Fail fast before spawning the worker process.
    if HF_TOKEN is None:
        raise gr.Error(
            "HF_TOKEN environment variable is not set. Please set it and restart the application."
        )
    return generate_dataset(pdf_files)
122
+
123
+
124
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks(title="MyPersonas Dataset Generator") as app:
    gr.Markdown("# MyPersonas Dataset Generator")
    gr.Markdown("Upload one or more PDFs to generate a persona based SFT dataset.")

    with gr.Row():
        # Multiple PDFs allowed; the file objects are passed to
        # gradio_interface on click.
        pdf_files = gr.File(label="Upload PDFs", file_count="multiple")

    with gr.Row():
        generate_button = gr.Button("Generate Dataset")

    # Result table: read-only, with long text wrapped for readability.
    output_dataframe = gr.DataFrame(
        label="Generated Dataset",
        interactive=False,
        wrap=True,
    )

    generate_button.click(
        fn=gradio_interface,
        inputs=[pdf_files],
        outputs=[output_dataframe],
    )

if __name__ == "__main__":
    app.launch()
personas.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING, Any, Dict, List, Union
2
+
3
+ from distilabel.llms import InferenceEndpointsLLM
4
+ from distilabel.pipeline import Pipeline
5
+ from distilabel.steps import LoadDataFromDicts
6
+ from distilabel.steps.tasks.base import Task
7
+ from distilabel.steps import KeepColumns
8
+ from distilabel.steps.base import StepResources
9
+
10
+ from distilabel.steps.tasks.typing import ChatType
11
+ from distilabel.steps.tasks import TextGeneration
12
+
13
+
14
# System prompt for the TextToPersona task (text-to-persona prompt in the
# style of the PersonaHub paper). Fixes two defects in the LLM-facing text:
# the garbled "assigning finding the" and the misspelling "pedriatric".
SYSTEM_PROMPT_TEXT_TO_PERSONA: str = (
    "You are an expert in analyzing the text content and finding the general type of persona that could be associated with such a way of expressing. "
    "Please use one or two sentences for the definition, but try to make it as fine-grained if input texts involve many detailed elements. "
    "The persona definition must go straight to the point, be assertive. The following are starts of persona definitions:\n"
    "A machine learning researcher...\n"
    "A pediatric nurse whose...\n"
    "An urban planner focused on..."
)

# User prompt; {text} is filled with the extracted document text in
# TextToPersona.format_input.
TEXT_TO_PERSONA_PROMPT: str = (
    "What is the likely profession, interest, or role of the person who would write or be interested in this text?\n\n"
    "## Text\n"
    "{text}"
)
28
+
29
+
30
class TextToPersona(Task):
    """Infer the persona that could plausibly have produced a given text.

    Given an input ``text`` column, asks the LLM for a one- or two-sentence,
    assertive persona definition (e.g. "A machine learning researcher...",
    "A pediatric nurse whose...", "An urban planner focused on...").
    See Figure 3 in the PersonaHub paper.
    """

    # Default system prompt; a distilabel Task field, so it can be
    # overridden per instance.
    system_prompt: str = SYSTEM_PROMPT_TEXT_TO_PERSONA

    @property
    def inputs(self) -> List[str]:
        """The inputs for the task are the `text`."""
        return ["text"]

    def format_input(self, input: Dict[str, Any]) -> "ChatType":
        """Format the row as a system + user chat for the LLM."""
        return [
            {"role": "system", "content": self.system_prompt},
            {
                "role": "user",
                "content": TEXT_TO_PERSONA_PROMPT.format(text=input["text"]),  # type: ignore
            },
        ]

    @property
    def outputs(self) -> List[str]:
        """The output for the task is the persona definition."""
        # model_name is filled in by the distilabel Task machinery,
        # not by format_output below.
        return ["persona", "model_name"]

    def format_output(
        self, output: Union[str, None], input: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Wrap the raw LLM completion as the persona column.

        Args:
            output: the raw output of the LLM (None when generation failed).
            input: the input to the task (unused here).

        Returns:
            A dict with the persona definition.
        """
        return {"persona": output}