asoria (HF staff) committed on
Commit 7afe0ab • 1 parent: ed28c62

Upload 3 files

Files changed (3)
  1. README.md +5 -5
  2. app.py +206 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,12 +1,12 @@
  ---
- title: Dataset Insight Explorer
- emoji: 🐠
- colorFrom: red
- colorTo: blue
+ title: Dataset Insights Explorer
+ emoji: 💻
+ colorFrom: gray
+ colorTo: pink
  sdk: gradio
  sdk_version: 4.36.1
  app_file: app.py
  pinned: false
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,206 @@
+ """
+ TODOS:
+ - Improve prompts
+ - Improve model usage (Quantization?)
+ - Improve error handling
+ - Add more tests
+ - Improve response in a friendly way
+ """
+
+ import gradio as gr
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
+ import duckdb
+ import pandas as pd
+ import requests
+ from outlines import prompt
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import spaces
+ import json
+ import torch
+ import logging
+
+ BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
+ logger = logging.getLogger(__name__)
+
+ """
+ Methods for generating potential questions and SQL queries
+ """
+ device = "cuda"
+ gemma_model_id = "google/gemma-2b-it"
+ gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_model_id)
+ gemma_model = AutoModelForCausalLM.from_pretrained(
+     gemma_model_id,
+     device_map="auto",
+     torch_dtype=torch.bfloat16
+ )
+
+ @spaces.GPU
+ def generate_potential_questions_with_gemma(prompt):
+     input_ids = gemma_tokenizer(prompt, return_tensors="pt").to(device)
+     outputs = gemma_model.generate(**input_ids, max_new_tokens=1024)
+     return gemma_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+
+ @prompt
+ def prompt_for_questions(dataset, schema, first_rows):
+     """
+     You are a data analyst tasked with exploring a dataset named {{ dataset }}.
+     Below is the dataset schema in SQL format along with a sample of 3 rows:
+     {{ schema }}
+     Sample rows:
+     {% for example in first_rows %}
+     {{ example }}
+     {% endfor %}
+     Your goal is to generate a list of 5 potential questions that a user might want
+     to ask about this dataset. Consider the information contained in the provided
+     columns and rows, and try to think of meaningful questions that could
+     provide insights or useful information. For each question, provide the SQL query
+     that would extract the relevant information from the dataset.
+     Output JSON format:
+     {
+         "questions": [
+             {"question": "[Insert question here]", "sql_query": "[Insert SQL query here]"},
+             {"question": "[Insert question here]", "sql_query": "[Insert SQL query here]"},
+             {"question": "[Insert question here]", "sql_query": "[Insert SQL query here]"},
+             {"question": "[Insert question here]", "sql_query": "[Insert SQL query here]"},
+             {"question": "[Insert question here]", "sql_query": "[Insert SQL query here]"}
+         ]
+     }
+     Please ensure that each SQL query retrieves relevant information from the dataset to answer the corresponding question accurately.
+     Return only the JSON object, do not add extra information.
+     """
+
+ """
+ Methods for generating SQL based on a user request
+ """
+ mother_duckdb_model_id = "motherduckdb/DuckDB-NSQL-7B-v0.1"
+ mother_duck_tokenizer = AutoTokenizer.from_pretrained(mother_duckdb_model_id)
+ mother_duck_model = AutoModelForCausalLM.from_pretrained(
+     mother_duckdb_model_id,
+     device_map="auto",
+     torch_dtype=torch.bfloat16
+ )
+
+ @spaces.GPU
+ def generate_sql_with_mother_duck(prompt):
+     input_ids = mother_duck_tokenizer(prompt, return_tensors="pt").to(device).input_ids
+     generated_ids = mother_duck_model.generate(input_ids, max_length=1024)
+     return mother_duck_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+
+
+ @prompt
+ def prompt_for_sql(ddl_create, query_input):
+     """
+     ### Instruction:
+     Your task is to generate valid duckdb SQL to answer the following question.
+     ### Input:
+     Here is the database schema that the SQL query will run on:
+     {{ ddl_create }}
+
+     ### Question:
+     {{ query_input }}
+     ### Response (use duckdb shorthand if possible):
+     """
+
+
+ """
+ Datasets Viewer Methods
+ https://huggingface.co/docs/datasets-server/index
+ """
+
+ def get_first_parquet(dataset: str):
+     resp = requests.get(f"{BASE_DATASETS_SERVER_URL}/parquet?dataset={dataset}")
+     return resp.json()["parquet_files"][0]
+
+
+ def get_dataset_schema(parquet_url: str):
+     con = duckdb.connect()
+     con.execute(f"CREATE TABLE data AS SELECT * FROM '{parquet_url}' LIMIT 1;")
+     result = con.sql("SELECT sql FROM duckdb_tables() WHERE table_name = 'data';").df()
+     ddl_create = result.iloc[0, 0]
+     con.close()
+     return ddl_create
+
+
+ def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
+     resp = requests.get(f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}")
+     rows = resp.json()["rows"]
+     rows = [row['row'] for row in rows]
+     return pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
+
+ """
+ Main logic, to get the recommended queries
+ """
+ def get_recommended_queries(dataset: str):
+     ddl_create, prompt = "", ""
+     try:
+         first_split = get_first_parquet(dataset)
+         df_first_rows = get_first_rows_as_df(dataset, first_split["config"], first_split["split"], 3)
+         first_parquet_url = first_split["url"]
+         logger.info(f"First parquet URL: {first_parquet_url}")
+         ddl_create = get_dataset_schema(first_parquet_url)
+         # Ask Gemma for candidate questions and SQL, then parse the fenced JSON block it returns
+         prompt = prompt_for_questions(dataset, ddl_create, df_first_rows.to_dict('records'))
+         txt_questions = generate_potential_questions_with_gemma(prompt).split("```json")[1].replace('\n', ' ').strip()[:-4]
+         data = json.loads(txt_questions)
+         questions = data["questions"]
+         potential_questions = []
+         for question in questions:
+             try:
+                 sql = question["sql_query"].replace("FROM data", f"FROM '{first_parquet_url}'")
+                 result = duckdb.sql(sql).df()
+                 potential_questions.append({"question": question["question"], "result": result, "sql_query": sql})
+                 continue
+             except Exception as err:
+                 logger.error(f"Error in running SQL query: {question['sql_query']} {err}")
+             # Fall back to the DuckDB-NSQL model when the Gemma-generated SQL fails
+             mother_duck_prompt = prompt_for_sql(ddl_create, question["question"])
+             sql = generate_sql_with_mother_duck(mother_duck_prompt).split("### Response (use duckdb shorthand if possible):")[-1].strip()
+             sql = sql.replace("FROM data", f"FROM '{first_parquet_url}'")
+             try:
+                 result = duckdb.sql(sql).df()
+                 potential_questions.append({"question": question["question"], "result": result, "sql_query": sql})
+             except Exception:
+                 pass
+         df_result = pd.DataFrame(potential_questions)
+     except Exception as err:
+         logger.error(f"Error in getting recommended queries: {err}")
+         return {
+             gr_txt_ddl: ddl_create,
+             gr_txt_prompt: prompt,
+             gr_df_result: pd.DataFrame([{"error": f"❌ {err=}"}])
+         }
+     return {
+         gr_txt_ddl: ddl_create,
+         gr_txt_prompt: prompt,
+         gr_df_result: df_result
+     }
+
+
+ def preview_dataset(dataset: str):
+     try:
+         first_split = get_first_parquet(dataset)
+         df = get_first_rows_as_df(dataset, first_split["config"], first_split["split"], 4)
+     except Exception as err:
+         df = pd.DataFrame([{"Unable to preview dataset": f"❌ {err=}"}])
+     return {
+         gr_df_first_rows: df
+     }
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# 💫 Dataset Insights Explorer 💫")
+     gr_dataset_name = HuggingfaceHubSearch(
+         label="Hub Dataset ID",
+         placeholder="Search for a dataset id on Hugging Face",
+         search_type="dataset",
+         value="jamescalam/world-cities-geo",
+     )
+     gr_preview_btn = gr.Button("Preview Dataset")
+     gr_df_first_rows = gr.DataFrame(datatype="markdown")
+     gr_recommend_btn = gr.Button("Show Insights")
+     gr_df_result = gr.DataFrame(datatype="markdown")
+     with gr.Accordion("Open for details", open=False):
+         gr_txt_ddl = gr.Textbox(label="Dataset as CREATE DDL", interactive=False)
+         gr_txt_prompt = gr.Textbox(label="Generated prompt to get recommended questions", interactive=False)
+     gr_preview_btn.click(preview_dataset, inputs=[gr_dataset_name], outputs=[gr_df_first_rows])
+     gr_recommend_btn.click(get_recommended_queries, inputs=[gr_dataset_name], outputs=[gr_txt_ddl, gr_txt_prompt, gr_df_result])
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio_huggingfacehub_search==0.0.7
+ duckdb
+ pandas
+ outlines
+ transformers
+ accelerate
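
Below is a minimal sketch, for illustration only and not part of the uploaded files, of the Datasets Server + DuckDB pattern that app.py builds on. It assumes `duckdb` and `requests` are available (`requests` is imported by app.py but not pinned above) and uses the Space's default dataset id:

```python
# Sketch of the pattern behind get_first_parquet / get_dataset_schema in app.py:
# resolve a dataset's auto-converted parquet file via the Datasets Server API,
# then query it remotely with DuckDB.
import duckdb
import requests

BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
dataset = "jamescalam/world-cities-geo"  # default dataset id used by the Space

resp = requests.get(f"{BASE_DATASETS_SERVER_URL}/parquet?dataset={dataset}")
first_parquet_url = resp.json()["parquet_files"][0]["url"]

# DuckDB reads the parquet file over HTTPS; no local download step is needed.
print(duckdb.sql(f"SELECT COUNT(*) AS n FROM '{first_parquet_url}'").df())
```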