asoria (HF staff) committed
Commit 39f0f76 • 1 Parent(s): e6bb5bf

fix details

Files changed (2)
  1. README.md +1 -1
  2. app.py +27 -30
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Datasets Similarity Tool
-emoji: 🐨
+emoji: 🕵️‍♀️
 colorFrom: blue
 colorTo: purple
 sdk: gradio
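The app.py changes below touch three things: the datasets-server requests, the embedding model, and the Gradio wiring. For context, here is a minimal sketch of the two datasets-server calls the app relies on; the BASE_DATASETS_SERVER_URL value and the example hub_id are assumptions for illustration, and no auth token is assumed for public datasets.

from httpx import Client

# Assumed value; app.py defines BASE_DATASETS_SERVER_URL elsewhere (not shown in this diff).
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
client = Client()


def get_first_config_and_split_name(hub_id: str):
    # /splits lists every config/split pair for a dataset; the app keeps only the first one.
    resp = client.get(f"{BASE_DATASETS_SERVER_URL}/splits?dataset={hub_id}")
    resp.raise_for_status()
    data = resp.json()
    return data["splits"][0]["config"], data["splits"][0]["split"]


def get_dataset_info(hub_id: str, config: str):
    # /info returns metadata (features/columns, splits, sizes) for one config.
    resp = client.get(f"{BASE_DATASETS_SERVER_URL}/info?dataset={hub_id}&config={config}")
    resp.raise_for_status()
    return resp.json()


config, split = get_first_config_and_split_name("asoria/bolivian-population")  # example hub_id
info = get_dataset_info("asoria/bolivian-population", config)
print(config, split, list(info.keys()))

That metadata is what feeds the "text" field built in dataset_with_info below (dataset id plus its column names), which is the string that gets embedded.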
app.py CHANGED
@@ -1,17 +1,19 @@
-from dotenv import load_dotenv
+# Inspired by https://huggingface.co/spaces/davanstrien/dataset_column_search
+
 import os
+from functools import lru_cache
+from urllib.parse import quote
 
+import faiss
+import gradio as gr
+import numpy as np
 import pandas as pd
+from dotenv import load_dotenv
 from httpx import Client
-from huggingface_hub.utils import logging
-from functools import lru_cache
-from tqdm.contrib.concurrent import thread_map
 from huggingface_hub import HfApi
-import gradio as gr
+from huggingface_hub.utils import logging
 from sentence_transformers import SentenceTransformer
-import faiss
-import numpy as np
-from urllib.parse import quote
+from tqdm.contrib.concurrent import thread_map
 
 load_dotenv()
 
@@ -53,10 +55,7 @@ def dataset_is_valid(dataset):
 
 def get_first_config_and_split_name(hub_id: str):
     try:
-        resp = client.get(
-            f"https://datasets-server.huggingface.co/splits?dataset={hub_id}"
-        )
-
+        resp = client.get(f"https://datasets-server.huggingface.co/splits?dataset={hub_id}")
         data = resp.json()
         return data["splits"][0]["config"], data["splits"][0]["split"]
     except Exception as e:
@@ -71,9 +70,7 @@ def get_dataset_info(hub_id: str, config: str | None = None):
             return None
         else:
             config = config[0]
-    resp = client.get(
-        f"{BASE_DATASETS_SERVER_URL}/info?dataset={hub_id}&config={config}"
-    )
+    resp = client.get(f"{BASE_DATASETS_SERVER_URL}/info?dataset={hub_id}&config={config}")
     resp.raise_for_status()
     return resp.json()
 
@@ -91,35 +88,34 @@ def dataset_with_info(dataset):
             "downloads": dataset.downloads,
             "created_at": dataset.created_at,
             "tags": dataset.tags,
+            "text": f"{dataset.id}-{','.join(list(columns.keys()))}",
         }
     except Exception as e:
         logger.error(f"Failed to get info for {dataset.id}: {e}")
         return None
 
 
-
 @lru_cache(maxsize=100)
 def prep_data():
     datasets = list(api.list_datasets(limit=None, sort="createdAt", direction=-1))
     print(f"Found {len(datasets)} datasets in the hub.")
-    logger.info(f"Found {len(datasets)} datasets.")
     has_server = thread_map(
         dataset_is_valid,
         datasets,
     )
     datasets_with_server = [x for x in has_server if x is not None]
-    print(f"Found {len(datasets_with_server)} datasets with server.")
+    print(f"Found {len(datasets_with_server)} valid datasets.")
    dataset_infos = thread_map(dataset_with_info, datasets_with_server)
    dataset_infos = [x for x in dataset_infos if x is not None]
-    print(f"Found {len(dataset_infos)} datasets with server data.")
-    print(dataset_infos[0])
+    print(f"Found {len(dataset_infos)} datasets with info.")
     return dataset_infos
 
+
 all_datasets = prep_data()
 all_datasets_df = pd.DataFrame.from_dict(all_datasets)
 print(all_datasets_df.head())
 text = all_datasets_df['text']
-encoder = SentenceTransformer("paraphrase-mpnet-base-v2")
+encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 vectors = encoder.encode(text)
 vector_dimension = vectors.shape[1]
 print("Start indexing")
@@ -128,24 +124,25 @@ faiss.normalize_L2(vectors)
 index.add(vectors)
 print("Indexing done")
 
+
 def render_model_hub_link(hub_id):
     link = f"https://huggingface.co/datasets/{quote(hub_id)}"
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{hub_id}</a>'
 
 
-def search(dataset_name):
+def search(dataset_name, k):
     print(f"start search for {dataset_name}")
     try:
         dataset_row = all_datasets_df[all_datasets_df.dataset == dataset_name].iloc[0]
         print(dataset_row)
     except IndexError:
-        return pd.DataFrame([{"error": f"❌ Dataset does not exist or is not supported"}])
+        return pd.DataFrame([{"error": "❌ Dataset does not exist or is not supported"}])
     text = dataset_row["text"]
     search_vector = encoder.encode(text)
     _vector = np.array([search_vector])
     faiss.normalize_L2(_vector)
-    distances, ann = index.search(_vector, k=20)
-    results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})
+    distances, ann = index.search(_vector, k=k)
+    results = pd.DataFrame({"distances": distances[0], "ann": ann[0]})
     print("results for distances and ann")
     print(results)
     merge = pd.merge(results, all_datasets_df, left_on="ann", right_index=True)
@@ -153,14 +150,14 @@ def search(dataset_name):
     merge["dataset"] = merge["dataset"].apply(render_model_hub_link)
     return merge
 
+
 with gr.Blocks() as demo:
     gr.Markdown("# Search similar Datasets on Hugging Face")
-    gr.Markdown("This space shows similar dataset based on column name and types")
-    dataset_name = gr.Textbox(
-        "asoria/bolivian-population", label="Dataset Name"
-    )
+    gr.Markdown("This space shows similar datasets based on a name and columns. It uses https://github.com/facebookresearch/faiss for vector indexing.")
+    dataset_name = gr.Textbox("asoria/bolivian-population", label="Dataset Name")
+    k = gr.Slider(5, 200, 20, step=5, interactive=True, label="K Nearest Neighbors")
     btn = gr.Button("Show similar datasets")
     df = gr.DataFrame(datatype="markdown")
-    btn.click(search, dataset_name, df)
+    btn.click(search, inputs=[dataset_name, k], outputs=df)
 
 demo.launch()
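After this commit, the search flow is: build one descriptive string per dataset (its id plus column names), embed it with sentence-transformers/all-MiniLM-L6-v2, L2-normalize the vectors, add them to a FAISS index, and query with a user-selected k. A minimal self-contained sketch of that flow follows; the two-row toy corpus is made up, and the IndexFlatIP choice is an assumption, since the index-construction line is not part of this diff.

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Toy stand-in for all_datasets_df; in the app the "text" column is
# f"{dataset.id}-{','.join(columns)}" built from datasets-server metadata.
df = pd.DataFrame(
    {
        "dataset": ["asoria/bolivian-population", "example/cities"],
        "text": ["asoria/bolivian-population-name,age,city", "example/cities-name,population"],
    }
)

encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
vectors = encoder.encode(df["text"].tolist())  # float32 array, shape (n_rows, 384)
faiss.normalize_L2(vectors)                    # normalized vectors: inner product == cosine similarity
index = faiss.IndexFlatIP(vectors.shape[1])    # assumed index type; not shown in the diff
index.add(vectors)


def search(dataset_name: str, k: int) -> pd.DataFrame:
    # Look up the query dataset's own text, embed it, and find its nearest neighbours.
    row = df[df.dataset == dataset_name].iloc[0]
    query = np.array([encoder.encode(row["text"])])
    faiss.normalize_L2(query)
    distances, ann = index.search(query, k=min(k, len(df)))
    results = pd.DataFrame({"distances": distances[0], "ann": ann[0]})
    return pd.merge(results, df, left_on="ann", right_index=True)


print(search("asoria/bolivian-population", k=2))

In the Space itself this function is bound to the UI with btn.click(search, inputs=[dataset_name, k], outputs=df), so the Textbox and Slider values arrive as the two arguments.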