asoria committed
Commit fb1a11c
1 Parent(s): ab6348d

Adding contributing tutorial

CONTRIBUTING.md ADDED
@@ -0,0 +1,43 @@
+ # How to contribute
+
+ Thanks for your interest in contributing! 🙌 This project helps users quickly create notebooks that showcase how they use datasets. The generated code can be added to their repositories or used in research projects.
+
+ ## Ways to Contribute
+ There are a few ways you can help:
+
+ - 💡 **Share ideas**: Got a cool feature in mind? Let us know!
+ - 🐞 **Report bugs**: If something isn’t working, we’d love to fix it.
+ - 🚀 **Suggest improvements**: Any suggestions to make the tool better are welcome.
+ - 📓 **Add new notebook types**: This is one of the most exciting ways to contribute!
+
+ ## For Ideas, Bugs, or Suggestions:
+ - Start a new discussion [here](https://huggingface.co/spaces/asoria/auto-notebook-creator/discussions/new).
+ - Tell me what’s on your mind and include any details that might help.
+
+ ## For Adding New Notebook Types:
+ - Open a pull request (PR) [here](https://huggingface.co/spaces/asoria/auto-notebook-creator/discussions?new_pr=true).
+ - Add a new `.json` file in the `notebooks` folder. There’s a sample file you can copy and tweak; a minimal sketch follows this list.
+ - Submit your PR! 🎉
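+
+ For reference, here is a minimal sketch of a template file (the field names follow `notebooks/eda.json`; the title, type, and cells below are placeholders):
+
+ ```json
+ {
+     "notebook_title": "My notebook type",
+     "notebook_type": "my-type",
+     "dataset_type": "text",
+     "notebook_template": [
+         {
+             "cell_type": "markdown",
+             "source": "# **My notebook for {dataset_name} dataset**"
+         },
+         {
+             "cell_type": "code",
+             "source": "# Load the dataset as a DataFrame\n{first_code}"
+         }
+     ]
+ }
+ ```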
+
+ ## Running the Space Application
+ To run the Space, follow these steps:
+
+ 1. Set the required environment variables:
+    - `NOTEBOOKS_REPOSITORY`: The name of the repository where the generated notebooks will be stored. Ensure that you have **write** permission on this repository. For example, I use the [asoria/dataset-notebook-creator-content](https://huggingface.co/datasets/asoria/dataset-notebook-creator-content) repository.
+    - `HF_TOKEN`: Your Hugging Face token, used for authentication to push changes to the repository.
+
+    Example setup:
+
+    ```bash
+    export HF_TOKEN=your_huggingface_token
+    export NOTEBOOKS_REPOSITORY=your_repository_name
+    ```
+
+ 2. Run the following command to start the application:
+
+    ```bash
+    python app.py
+    ```
+
+ I am excited to see what you come up with. Thanks for helping make this project even better! 💖
+
app.py CHANGED
@@ -6,25 +6,23 @@ from httpx import Client
  import logging
  import pandas as pd
  from utils.notebook_utils import (
-     eda_cells,
      replace_wildcards,
-     rag_cells,
-     embeggins_cells,
+     load_json_files_from_folder,
  )
  from dotenv import load_dotenv
  import os
  from nbconvert import HTMLExporter
 
- # TODOs:
- # Improve UI code preview
- # Add template for training
-
  load_dotenv()
 
  HF_TOKEN = os.getenv("HF_TOKEN")
  assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
 
- NOTEBOOKS_REPOSITORY = "asoria/dataset-notebook-creator-content"
+ NOTEBOOKS_REPOSITORY = os.getenv("NOTEBOOKS_REPOSITORY")
+ assert (
+     NOTEBOOKS_REPOSITORY is not None
+ ), "You need to set NOTEBOOKS_REPOSITORY in your environment variables"
+
 
  BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
  HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
@@ -96,18 +94,6 @@ def longest_string_column(df):
      return longest_col
 
 
- def generate_eda_cells(dataset_id):
-     return generate_cells(dataset_id, eda_cells, "eda")
-
-
- def generate_rag_cells(dataset_id):
-     return generate_cells(dataset_id, rag_cells, "rag")
-
-
- def generate_embedding_cells(dataset_id):
-     return generate_cells(dataset_id, embeggins_cells, "embeddings")
-
-
  def _push_to_hub(
      dataset_id,
      notebook_file,
@@ -129,8 +115,15 @@ def _push_to_hub(
      raise
 
 
- def generate_cells(dataset_id, cells, notebook_type="eda"):
-     logging.info(f"Generating notebook for dataset {dataset_id}")
+ folder_path = "notebooks"
+ notebook_templates = load_json_files_from_folder(folder_path)
+ logging.info(f"Available notebooks {notebook_templates.keys()}")
+
+
+ def generate_cells(dataset_id, notebook_title):
+     logging.info(f"Generating {notebook_title} notebook for dataset {dataset_id}")
+     cells = notebook_templates[notebook_title]["notebook_template"]
+     notebook_type = notebook_templates[notebook_title]["notebook_type"]
      try:
          libraries = get_compatible_libraries(dataset_id)
      except Exception as err:
@@ -161,6 +154,7 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
      has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
      has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
 
+     # TODO: Validate by notebook type
      if notebook_type in ("rag", "embeddings") and not has_categoric_columns:
          logging.error(
              "Dataset does not have categorical columns, which are required for RAG generation."
@@ -250,34 +244,25 @@ with gr.Blocks(
 
      gr.Markdown("## 2. Select the type of notebook you want to generate")
      with gr.Row():
-         with gr.Column():
-             generate_eda_btn = gr.Button("EDA", size="sm")
-         with gr.Column():
-             generate_embedding_btn = gr.Button("Embeddings", size="sm")
-         with gr.Column():
-             generate_rag_btn = gr.Button("RAG", size="sm")
-         with gr.Column():
-             generate_training_btn = gr.Button("Training", interactive=False, size="sm")
+         notebook_type = gr.Dropdown(
+             choices=notebook_templates.keys(), label="Notebook type"
+         )
+         generate_button = gr.Button("Generate Notebook", variant="primary")
+         contribute_btn = gr.Button(
+             "Or Contribute",
+             visible=True,
+             variant="secondary",
+             size="sm",
+             link="https://huggingface.co/spaces/asoria/auto-notebook-creator/blob/main/CONTRIBUTING.md",
+         )
 
      gr.Markdown("## 3. Notebook code result")
      code_component = gr.HTML(elem_id="box")
      go_to_notebook = gr.Markdown("", visible=True)
 
-     generate_eda_btn.click(
-         generate_eda_cells,
-         inputs=[dataset_name],
-         outputs=[code_component, go_to_notebook],
-     )
-
-     generate_embedding_btn.click(
-         generate_embedding_cells,
-         inputs=[dataset_name],
-         outputs=[code_component, go_to_notebook],
-     )
-
-     generate_rag_btn.click(
-         generate_rag_cells,
-         inputs=[dataset_name],
+     generate_button.click(
+         generate_cells,
+         inputs=[dataset_name, notebook_type],
          outputs=[code_component, go_to_notebook],
      )
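In short, the three hard-coded `generate_*_cells` wrappers are gone: templates are discovered from the `notebooks/` folder at startup, and a single `generate_cells` looks up the selected template by title. A minimal standalone sketch of that dispatch (the stub dict below stands in for what `load_json_files_from_folder("notebooks")` returns):

```python
import logging

# Stub for the dict returned by load_json_files_from_folder("notebooks"):
# one entry per JSON file, keyed by its "notebook_title".
notebook_templates = {
    "Exploratory data analysis (EDA)": {
        "notebook_type": "eda",
        "notebook_template": [
            {"cell_type": "markdown", "source": "# EDA for {dataset_name}"}
        ],
    },
}


def generate_cells(dataset_id, notebook_title):
    # Both the cells and the notebook type come from the selected template.
    logging.info(f"Generating {notebook_title} notebook for dataset {dataset_id}")
    cells = notebook_templates[notebook_title]["notebook_template"]
    notebook_type = notebook_templates[notebook_title]["notebook_type"]
    return notebook_type, cells


print(generate_cells("user/dataset", "Exploratory data analysis (EDA)"))
```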
 
notebooks/eda.json ADDED
@@ -0,0 +1,80 @@
+ {
+     "notebook_title": "Exploratory data analysis (EDA)",
+     "notebook_type": "eda",
+     "dataset_type": "numeric",
+     "notebook_template": [
+         {
+             "cell_type": "markdown",
+             "source": "\n---\n# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**\n---\n"
+         },
+         {
+             "cell_type": "markdown",
+             "source": "## 1. Setup necessary libraries and load the dataset"
+         },
+         {
+             "cell_type": "code",
+             "source": "\n# Install and import necessary libraries.\n!pip install pandas matplotlib seaborn\n"
+         },
+         {
+             "cell_type": "code",
+             "source": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n"
+         },
+         {
+             "cell_type": "code",
+             "source": "\n# Load the dataset as a DataFrame\n{first_code}\n"
+         },
+         {
+             "cell_type": "markdown",
+             "source": "## 2. Understanding the Dataset"
+         },
+         {
+             "cell_type": "code",
+             "source": "\n# First rows of the dataset and info\nprint(df.head())\nprint(df.info())\n"
+         },
+         {
+             "cell_type": "code",
+             "source": "\n# Check for missing values\nprint(df.isnull().sum())\n"
+         },
+         {
+             "cell_type": "code",
+             "source": "\n# Identify data types of each column\nprint(df.dtypes)\n"
+         },
+         {
+             "cell_type": "code",
+             "source": "\n# Detect duplicated rows\nprint(df.duplicated().sum())\n"
+         },
+         {
+             "cell_type": "code",
+             "source": "\n# Generate descriptive statistics\nprint(df.describe())\n"
+         },
+         {
+             "type": "categoric",
+             "cell_type": "code",
+             "source": "\n# Unique values in categorical columns\ndf.select_dtypes(include=['object']).nunique()\n"
+         },
+         {
+             "cell_type": "markdown",
+             "source": "## 3. Data Visualization"
+         },
+         {
+             "type": "numeric",
+             "cell_type": "code",
+             "source": "\n# Correlation matrix for numerical columns\ncorr_matrix = df.corr(numeric_only=True)\nplt.figure(figsize=(10, 8))\nsns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)\nplt.title('Correlation Matrix')\nplt.show()\n"
+         },
+         {
+             "type": "numeric",
+             "cell_type": "code",
+             "source": "\n# Distribution plots for numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.histplot(df[column], kde=True)\n    plt.title(f'Distribution of {column}')\n    plt.xlabel(column)\n    plt.ylabel('Frequency')\n    plt.show()\n"
+         },
+         {
+             "type": "categoric",
+             "cell_type": "code",
+             "source": "\n# Count plots for categorical columns\nfor column in df.select_dtypes(include=['object']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.countplot(x=column, data=df)\n    plt.title(f'Count Plot of {column}')\n    plt.xlabel(column)\n    plt.ylabel('Count')\n    plt.show()\n"
+         },
+         {
+             "type": "numeric",
+             "cell_type": "code",
+             "source": "\n# Box plots for detecting outliers in numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.boxplot(df[column])\n    plt.title(f'Box Plot of {column}')\n    plt.xlabel(column)\n    plt.show()\n"
+         }
+     ]
+ }
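The optional per-cell `"type"` field (`"numeric"` or `"categoric"`) is how templates opt cells in or out depending on the dataset's columns. A sketch of that gating, under the assumption that untagged cells always apply (the actual filtering lives in `replace_wildcards` in `utils/notebook_utils.py`):

```python
# Hedged sketch: keep a cell unless it is tagged for a column kind the
# dataset lacks (assumption; see replace_wildcards for the real logic).
def keep_cell(cell, has_numeric_columns, has_categoric_columns):
    tag = cell.get("type")
    if tag == "numeric":
        return has_numeric_columns
    if tag == "categoric":
        return has_categoric_columns
    return True  # untagged cells apply to every dataset


cells = [{"type": "numeric", "cell_type": "code", "source": "df.describe()"}]
print([c for c in cells if keep_cell(c, has_numeric_columns=False, has_categoric_columns=True)])
# [] -- the numeric-only cell is dropped for a text-only dataset
```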
notebooks/embeddings.json ADDED
@@ -0,0 +1,67 @@
+ {
+     "notebook_title": "Text Embeddings",
+     "notebook_type": "embeddings",
+     "dataset_type": "text",
+     "notebook_template": [
+         {
+             "cell_type": "markdown",
+             "source": "---\n# **Embeddings Notebook for {dataset_name} dataset**\n---"
+         },
+         {
+             "cell_type": "markdown",
+             "source": "## 1. Setup necessary libraries and load the dataset"
+         },
+         {
+             "cell_type": "code",
+             "source": "# Install and import necessary libraries.\n!pip install pandas sentence-transformers faiss-cpu"
+         },
+         {
+             "cell_type": "code",
+             "source": "from sentence_transformers import SentenceTransformer\nimport faiss"
+         },
+         {
+             "cell_type": "code",
+             "source": "# Load the dataset as a DataFrame\n{first_code}"
+         },
+         {
+             "cell_type": "code",
+             "source": "# Specify the column name that contains the text data to generate embeddings\ncolumn_to_generate_embeddings = '{longest_col}'"
+         },
+         {
+             "cell_type": "markdown",
+             "source": "## 2. Loading embedding model and creating FAISS index"
+         },
+         {
+             "cell_type": "code",
+             "source": "# Remove duplicate entries based on the specified column\ndf = df.drop_duplicates(subset=column_to_generate_embeddings)"
+         },
+         {
+             "cell_type": "code",
+             "source": "# Convert the column data to a list of text entries\ntext_list = df[column_to_generate_embeddings].tolist()"
+         },
+         {
+             "cell_type": "code",
+             "source": "# Specify the embedding model you want to use\nmodel = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"
+         },
+         {
+             "cell_type": "code",
+             "source": "vectors = model.encode(text_list)\nvector_dimension = vectors.shape[1]\n\n# Initialize the FAISS index with the appropriate dimension (384 for this model)\nindex = faiss.IndexFlatL2(vector_dimension)\n\n# Encode the text list into embeddings and add them to the FAISS index\nindex.add(vectors)"
+         },
+         {
+             "cell_type": "markdown",
+             "source": "## 3. Perform a text search"
+         },
+         {
+             "cell_type": "code",
+             "source": "# Specify the text you want to search for in the list\ntext_to_search = text_list[0]\nprint(f\"Text to search: {text_to_search}\")"
+         },
+         {
+             "cell_type": "code",
+             "source": "# Generate the embedding for the search query\nquery_embedding = model.encode([text_to_search])"
+         },
+         {
+             "cell_type": "code",
+             "source": "# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)\nD, I = index.search(query_embedding, k=10)\n\n# Print the similar documents\nprint(f\"Similar documents: {[text_list[i] for i in I[0]]}\")"
+         }
+     ]
+ }
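One nice property of this template: the FAISS index dimension is taken from `vectors.shape[1]`, so it stays correct if the model is swapped. To double-check the default model's output size (384 for all-MiniLM-L6-v2):

```python
# Confirm the embedding dimension of the template's default model.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print(model.get_sentence_embedding_dimension())  # 384
```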
notebooks/finetuning.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "notebook_title": "Supervised fine-tuning (SFT)",
+     "notebook_type": "sft",
+     "dataset_type": "numeric",
+     "notebook_template": []
+ }
notebooks/rag.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "notebook_title": "Retrieval-augmented generation (RAG)",
+     "notebook_type": "rag",
+     "dataset_type": "text",
+     "notebook_template": []
+ }
utils/notebook_utils.py CHANGED
@@ -1,3 +1,7 @@
+ import os
+ import json
+
+
  def replace_wildcards(
      templates, wildcards, replacements, has_numeric_columns, has_categoric_columns
  ):
@@ -20,7 +24,7 @@ def replace_wildcards(
      return new_templates
 
 
- embeggins_cells = [
+ embeddings_cells = [
      {
          "cell_type": "markdown",
          "source": """
@@ -475,3 +479,17 @@ def generate_rag_system_prompt():
 
      Use the provided code to load the dataset; do not use any other method.
      """
+
+
+ def load_json_files_from_folder(folder_path):
+     components = {}
+
+     for filename in os.listdir(folder_path):
+         if filename.endswith(".json"):
+             file_path = os.path.join(folder_path, filename)
+
+             with open(file_path, "r") as json_file:
+                 data = json.load(json_file)
+                 components[data["notebook_title"]] = data
+
+     return components
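A quick usage sketch for the new helper, run from the repository root (the printed titles come from the JSON files added in this commit):

```python
# Discover all notebook templates under notebooks/ and list their titles.
from utils.notebook_utils import load_json_files_from_folder

templates = load_json_files_from_folder("notebooks")
print(sorted(templates.keys()))
# ['Exploratory data analysis (EDA)', 'Retrieval-augmented generation (RAG)',
#  'Supervised fine-tuning (SFT)', 'Text Embeddings']
```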