Alexander Watson
committed on
Commit
•
faef657
1
Parent(s):
63a19c9
Add HF data support to SDK code generator
Browse files
app.py
CHANGED
@@ -115,7 +115,12 @@ def main():
|
|
115 |
)
|
116 |
|
117 |
df = None
|
|
|
|
|
|
|
|
|
118 |
if data_source == "Upload a file":
|
|
|
119 |
uploaded_file = st.file_uploader(
|
120 |
"Upload a CSV, JSON, or JSONL file",
|
121 |
type=["csv", "json", "jsonl"],
|
@@ -132,16 +137,19 @@ def main():
|
|
132 |
st.success(f"File uploaded successfully: {uploaded_file.name}")
|
133 |
|
134 |
elif data_source == "Select a dataset from Hugging Face":
|
|
|
135 |
huggingface_dataset = st.text_input(
|
136 |
"Hugging Face Dataset Repository",
|
137 |
help="Enter the name of the Hugging Face dataset repository (e.g., 'squad')",
|
138 |
)
|
|
|
139 |
|
140 |
huggingface_split = st.selectbox(
|
141 |
"Dataset Split",
|
142 |
options=["train", "validation", "test"],
|
143 |
help="Select the dataset split to use",
|
144 |
)
|
|
|
145 |
|
146 |
if st.button("Load Hugging Face Dataset"):
|
147 |
if huggingface_dataset:
|
@@ -160,6 +168,7 @@ def main():
|
|
160 |
st.warning("Please provide a Hugging Face dataset repository name.")
|
161 |
|
162 |
elif data_source == "Use a sample dataset":
|
|
|
163 |
st.write("Try a sample dataset to get started quickly.")
|
164 |
if st.button("Try Sample Dataset"):
|
165 |
try:
|
@@ -422,14 +431,27 @@ def main():
|
|
422 |
import logging
|
423 |
import pandas as pd
|
424 |
from navigator_helpers import InstructionResponseConfig, TrainingDataSynthesizer
|
|
|
425 |
|
426 |
# Configure the logger
|
427 |
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
428 |
|
429 |
-
DATASET = "YOUR_DATASET"
|
430 |
API_KEY = "YOUR_API_KEY"
|
431 |
-
|
432 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
433 |
|
434 |
# Create the instruction response configuration
|
435 |
config = InstructionResponseConfig(
|
@@ -583,7 +605,7 @@ new_df = synthesizer.generate()
|
|
583 |
|
584 |
time.sleep(0.1)
|
585 |
logger.removeHandler(handler)
|
586 |
-
st.success("Data
|
587 |
st.stop()
|
588 |
|
589 |
if stop_button:
|
@@ -638,7 +660,7 @@ new_df = synthesizer.generate()
|
|
638 |
zip_file.write(log_file_path, "complete_logs.jsonl")
|
639 |
if synthesized_data_jsonl:
|
640 |
zip_file.write(
|
641 |
-
synthesized_data_file_path, "
|
642 |
)
|
643 |
zip_file.write(sdk_file_path, "data_synthesis_code.py")
|
644 |
|
|
|
115 |
)
|
116 |
|
117 |
df = None
|
118 |
+
dataset_source_type = ""
|
119 |
+
huggingface_dataset = ""
|
120 |
+
huggingface_split = ""
|
121 |
+
|
122 |
if data_source == "Upload a file":
|
123 |
+
dataset_source_type = "uploaded"
|
124 |
uploaded_file = st.file_uploader(
|
125 |
"Upload a CSV, JSON, or JSONL file",
|
126 |
type=["csv", "json", "jsonl"],
|
|
|
137 |
st.success(f"File uploaded successfully: {uploaded_file.name}")
|
138 |
|
139 |
elif data_source == "Select a dataset from Hugging Face":
|
140 |
+
dataset_source_type = "huggingface"
|
141 |
huggingface_dataset = st.text_input(
|
142 |
"Hugging Face Dataset Repository",
|
143 |
help="Enter the name of the Hugging Face dataset repository (e.g., 'squad')",
|
144 |
)
|
145 |
+
st.session_state.huggingface_dataset = huggingface_dataset
|
146 |
|
147 |
huggingface_split = st.selectbox(
|
148 |
"Dataset Split",
|
149 |
options=["train", "validation", "test"],
|
150 |
help="Select the dataset split to use",
|
151 |
)
|
152 |
+
st.session_state.huggingface_split = huggingface_split
|
153 |
|
154 |
if st.button("Load Hugging Face Dataset"):
|
155 |
if huggingface_dataset:
|
|
|
168 |
st.warning("Please provide a Hugging Face dataset repository name.")
|
169 |
|
170 |
elif data_source == "Use a sample dataset":
|
171 |
+
dataset_source_type = "sample"
|
172 |
st.write("Try a sample dataset to get started quickly.")
|
173 |
if st.button("Try Sample Dataset"):
|
174 |
try:
|
|
|
431 |
import logging
|
432 |
import pandas as pd
|
433 |
from navigator_helpers import InstructionResponseConfig, TrainingDataSynthesizer
|
434 |
+
from datasets import load_dataset
|
435 |
|
436 |
# Configure the logger
|
437 |
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
438 |
|
|
|
439 |
API_KEY = "YOUR_API_KEY"
|
440 |
+
DATASET_SOURCE = "{dataset_source_type}"
|
441 |
+
HUGGINGFACE_DATASET = "{huggingface_dataset}"
|
442 |
+
HUGGINGFACE_SPLIT = "{huggingface_split}"
|
443 |
+
SAMPLE_DATASET_URL = "{SAMPLE_DATASET_URL}"
|
444 |
+
|
445 |
+
# Load dataset
|
446 |
+
if DATASET_SOURCE == 'uploaded':
|
447 |
+
df = pd.read_csv("YOUR_UPLOADED_FILE_PATH") # Replace with the actual file path
|
448 |
+
elif DATASET_SOURCE == 'huggingface':
|
449 |
+
dataset = load_dataset(HUGGINGFACE_DATASET, split=HUGGINGFACE_SPLIT)
|
450 |
+
df = dataset.to_pandas()
|
451 |
+
elif DATASET_SOURCE == 'sample':
|
452 |
+
df = pd.read_csv(SAMPLE_DATASET_URL)
|
453 |
+
else:
|
454 |
+
raise ValueError("Invalid DATASET_SOURCE specified")
|
455 |
|
456 |
# Create the instruction response configuration
|
457 |
config = InstructionResponseConfig(
|
|
|
605 |
|
606 |
time.sleep(0.1)
|
607 |
logger.removeHandler(handler)
|
608 |
+
st.success("Data synthesis completed!")
|
609 |
st.stop()
|
610 |
|
611 |
if stop_button:
|
|
|
660 |
zip_file.write(log_file_path, "complete_logs.jsonl")
|
661 |
if synthesized_data_jsonl:
|
662 |
zip_file.write(
|
663 |
+
synthesized_data_file_path, "synthetic_data.jsonl"
|
664 |
)
|
665 |
zip_file.write(sdk_file_path, "data_synthesis_code.py")
|
666 |
|