Spaces:
Running
Running
fix details
Browse files
README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
title: Datasets Similarity Tool
|
3 |
-
emoji:
|
4 |
colorFrom: blue
|
5 |
colorTo: purple
|
6 |
sdk: gradio
|
|
|
1 |
---
|
2 |
title: Datasets Similarity Tool
|
3 |
+
emoji: 🕵️‍♀️
|
4 |
colorFrom: blue
|
5 |
colorTo: purple
|
6 |
sdk: gradio
|
app.py
CHANGED
@@ -1,17 +1,19 @@
|
|
1 |
-
|
|
|
2 |
import os
|
|
|
|
|
3 |
|
|
|
|
|
|
|
4 |
import pandas as pd
|
|
|
5 |
from httpx import Client
|
6 |
-
from huggingface_hub.utils import logging
|
7 |
-
from functools import lru_cache
|
8 |
-
from tqdm.contrib.concurrent import thread_map
|
9 |
from huggingface_hub import HfApi
|
10 |
-
|
11 |
from sentence_transformers import SentenceTransformer
|
12 |
-
import
|
13 |
-
import numpy as np
|
14 |
-
from urllib.parse import quote
|
15 |
|
16 |
load_dotenv()
|
17 |
|
@@ -53,10 +55,7 @@ def dataset_is_valid(dataset):
|
|
53 |
|
54 |
def get_first_config_and_split_name(hub_id: str):
|
55 |
try:
|
56 |
-
resp = client.get(
|
57 |
-
f"https://datasets-server.huggingface.co/splits?dataset={hub_id}"
|
58 |
-
)
|
59 |
-
|
60 |
data = resp.json()
|
61 |
return data["splits"][0]["config"], data["splits"][0]["split"]
|
62 |
except Exception as e:
|
@@ -71,9 +70,7 @@ def get_dataset_info(hub_id: str, config: str | None = None):
|
|
71 |
return None
|
72 |
else:
|
73 |
config = config[0]
|
74 |
-
resp = client.get(
|
75 |
-
f"{BASE_DATASETS_SERVER_URL}/info?dataset={hub_id}&config={config}"
|
76 |
-
)
|
77 |
resp.raise_for_status()
|
78 |
return resp.json()
|
79 |
|
@@ -91,35 +88,34 @@ def dataset_with_info(dataset):
|
|
91 |
"downloads": dataset.downloads,
|
92 |
"created_at": dataset.created_at,
|
93 |
"tags": dataset.tags,
|
|
|
94 |
}
|
95 |
except Exception as e:
|
96 |
logger.error(f"Failed to get info for {dataset.id}: {e}")
|
97 |
return None
|
98 |
|
99 |
|
100 |
-
|
101 |
@lru_cache(maxsize=100)
|
102 |
def prep_data():
|
103 |
datasets = list(api.list_datasets(limit=None, sort="createdAt", direction=-1))
|
104 |
print(f"Found {len(datasets)} datasets in the hub.")
|
105 |
-
logger.info(f"Found {len(datasets)} datasets.")
|
106 |
has_server = thread_map(
|
107 |
dataset_is_valid,
|
108 |
datasets,
|
109 |
)
|
110 |
datasets_with_server = [x for x in has_server if x is not None]
|
111 |
-
print(f"Found {len(datasets_with_server)} datasets
|
112 |
dataset_infos = thread_map(dataset_with_info, datasets_with_server)
|
113 |
dataset_infos = [x for x in dataset_infos if x is not None]
|
114 |
-
print(f"Found {len(dataset_infos)} datasets with
|
115 |
-
print(dataset_infos[0])
|
116 |
return dataset_infos
|
117 |
|
|
|
118 |
all_datasets = prep_data()
|
119 |
all_datasets_df = pd.DataFrame.from_dict(all_datasets)
|
120 |
print(all_datasets_df.head())
|
121 |
text = all_datasets_df['text']
|
122 |
-
encoder = SentenceTransformer("
|
123 |
vectors = encoder.encode(text)
|
124 |
vector_dimension = vectors.shape[1]
|
125 |
print("Start indexing")
|
@@ -128,24 +124,25 @@ faiss.normalize_L2(vectors)
|
|
128 |
index.add(vectors)
|
129 |
print("Indexing done")
|
130 |
|
|
|
131 |
def render_model_hub_link(hub_id):
|
132 |
link = f"https://huggingface.co/datasets/{quote(hub_id)}"
|
133 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{hub_id}</a>'
|
134 |
|
135 |
|
136 |
-
def search(dataset_name):
|
137 |
print(f"start search for {dataset_name}")
|
138 |
try:
|
139 |
dataset_row = all_datasets_df[all_datasets_df.dataset == dataset_name].iloc[0]
|
140 |
print(dataset_row)
|
141 |
except IndexError:
|
142 |
-
return pd.DataFrame([{"error":
|
143 |
text = dataset_row["text"]
|
144 |
search_vector = encoder.encode(text)
|
145 |
_vector = np.array([search_vector])
|
146 |
faiss.normalize_L2(_vector)
|
147 |
-
distances, ann = index.search(_vector, k=
|
148 |
-
results = pd.DataFrame({
|
149 |
print("results for distances and ann")
|
150 |
print(results)
|
151 |
merge = pd.merge(results, all_datasets_df, left_on="ann", right_index=True)
|
@@ -153,14 +150,14 @@ def search(dataset_name):
|
|
153 |
merge["dataset"] = merge["dataset"].apply(render_model_hub_link)
|
154 |
return merge
|
155 |
|
|
|
156 |
with gr.Blocks() as demo:
|
157 |
gr.Markdown("# Search similar Datasets on Hugging Face")
|
158 |
-
gr.Markdown("This space shows similar
|
159 |
-
dataset_name = gr.Textbox(
|
160 |
-
|
161 |
-
)
|
162 |
btn = gr.Button("Show similar datasets")
|
163 |
df = gr.DataFrame(datatype="markdown")
|
164 |
-
btn.click(search, dataset_name, df)
|
165 |
|
166 |
demo.launch()
|
|
|
1 |
+
# Inspired by https://huggingface.co/spaces/davanstrien/dataset_column_search
|
2 |
+
|
3 |
import os
|
4 |
+
from functools import lru_cache
|
5 |
+
from urllib.parse import quote
|
6 |
|
7 |
+
import faiss
|
8 |
+
import gradio as gr
|
9 |
+
import numpy as np
|
10 |
import pandas as pd
|
11 |
+
from dotenv import load_dotenv
|
12 |
from httpx import Client
|
|
|
|
|
|
|
13 |
from huggingface_hub import HfApi
|
14 |
+
from huggingface_hub.utils import logging
|
15 |
from sentence_transformers import SentenceTransformer
|
16 |
+
from tqdm.contrib.concurrent import thread_map
|
|
|
|
|
17 |
|
18 |
load_dotenv()
|
19 |
|
|
|
55 |
|
56 |
def get_first_config_and_split_name(hub_id: str):
|
57 |
try:
|
58 |
+
resp = client.get(f"https://datasets-server.huggingface.co/splits?dataset={hub_id}")
|
|
|
|
|
|
|
59 |
data = resp.json()
|
60 |
return data["splits"][0]["config"], data["splits"][0]["split"]
|
61 |
except Exception as e:
|
|
|
70 |
return None
|
71 |
else:
|
72 |
config = config[0]
|
73 |
+
resp = client.get(f"{BASE_DATASETS_SERVER_URL}/info?dataset={hub_id}&config={config}")
|
|
|
|
|
74 |
resp.raise_for_status()
|
75 |
return resp.json()
|
76 |
|
|
|
88 |
"downloads": dataset.downloads,
|
89 |
"created_at": dataset.created_at,
|
90 |
"tags": dataset.tags,
|
91 |
+
"text": f"{dataset.id}-{','.join(list(columns.keys()))}",
|
92 |
}
|
93 |
except Exception as e:
|
94 |
logger.error(f"Failed to get info for {dataset.id}: {e}")
|
95 |
return None
|
96 |
|
97 |
|
|
|
98 |
@lru_cache(maxsize=100)
|
99 |
def prep_data():
|
100 |
datasets = list(api.list_datasets(limit=None, sort="createdAt", direction=-1))
|
101 |
print(f"Found {len(datasets)} datasets in the hub.")
|
|
|
102 |
has_server = thread_map(
|
103 |
dataset_is_valid,
|
104 |
datasets,
|
105 |
)
|
106 |
datasets_with_server = [x for x in has_server if x is not None]
|
107 |
+
print(f"Found {len(datasets_with_server)} valid datasets.")
|
108 |
dataset_infos = thread_map(dataset_with_info, datasets_with_server)
|
109 |
dataset_infos = [x for x in dataset_infos if x is not None]
|
110 |
+
print(f"Found {len(dataset_infos)} datasets with info.")
|
|
|
111 |
return dataset_infos
|
112 |
|
113 |
+
|
114 |
all_datasets = prep_data()
|
115 |
all_datasets_df = pd.DataFrame.from_dict(all_datasets)
|
116 |
print(all_datasets_df.head())
|
117 |
text = all_datasets_df['text']
|
118 |
+
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
119 |
vectors = encoder.encode(text)
|
120 |
vector_dimension = vectors.shape[1]
|
121 |
print("Start indexing")
|
|
|
124 |
index.add(vectors)
|
125 |
print("Indexing done")
|
126 |
|
127 |
+
|
128 |
def render_model_hub_link(hub_id):
|
129 |
link = f"https://huggingface.co/datasets/{quote(hub_id)}"
|
130 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{hub_id}</a>'
|
131 |
|
132 |
|
133 |
+
def search(dataset_name, k):
|
134 |
print(f"start search for {dataset_name}")
|
135 |
try:
|
136 |
dataset_row = all_datasets_df[all_datasets_df.dataset == dataset_name].iloc[0]
|
137 |
print(dataset_row)
|
138 |
except IndexError:
|
139 |
+
return pd.DataFrame([{"error": "❌ Dataset does not exist or is not supported"}])
|
140 |
text = dataset_row["text"]
|
141 |
search_vector = encoder.encode(text)
|
142 |
_vector = np.array([search_vector])
|
143 |
faiss.normalize_L2(_vector)
|
144 |
+
distances, ann = index.search(_vector, k=k)
|
145 |
+
results = pd.DataFrame({"distances": distances[0], "ann": ann[0]})
|
146 |
print("results for distances and ann")
|
147 |
print(results)
|
148 |
merge = pd.merge(results, all_datasets_df, left_on="ann", right_index=True)
|
|
|
150 |
merge["dataset"] = merge["dataset"].apply(render_model_hub_link)
|
151 |
return merge
|
152 |
|
153 |
+
|
154 |
with gr.Blocks() as demo:
|
155 |
gr.Markdown("# Search similar Datasets on Hugging Face")
|
156 |
+
gr.Markdown("This space shows similar datasets based on a name and columns. It uses https://github.com/facebookresearch/faiss for vector indexing.")
|
157 |
+
dataset_name = gr.Textbox("asoria/bolivian-population", label="Dataset Name")
|
158 |
+
k = gr.Slider(5, 200, 20, step=5, interactive=True, label="K Nearest Neighbors")
|
|
|
159 |
btn = gr.Button("Show similar datasets")
|
160 |
df = gr.DataFrame(datatype="markdown")
|
161 |
+
btn.click(search, inputs=[dataset_name, k], outputs=df)
|
162 |
|
163 |
demo.launch()
|