Updates
app.py
CHANGED
@@ -14,19 +14,19 @@ default_dataset_split = "train"
 default_text_column = "sentence"
 default_threshold = 0.9
 
-def batch_iterable(iterable, batch_size):
-    """Yield successive batches from an iterable."""
-    for i in range(0, len(iterable), batch_size):
-        yield iterable[i:i + batch_size]
-
-def compute_embeddings(texts, batch_size, progress, desc):
-    """Compute embeddings for a list of texts with progress tracking."""
-    embeddings = []
-    total_batches = (len(texts) + batch_size - 1) // batch_size
-    for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
-        embeddings.append(model.encode(batch_texts, show_progressbar=False))
-        progress((i + 1) / total_batches, desc=desc)
-    return np.concatenate(embeddings, axis=0)
+# def batch_iterable(iterable, batch_size):
+#     """Yield successive batches from an iterable."""
+#     for i in range(0, len(iterable), batch_size):
+#         yield iterable[i:i + batch_size]
+
+# def compute_embeddings(texts, batch_size, progress, desc):
+#     """Compute embeddings for a list of texts with progress tracking."""
+#     embeddings = []
+#     total_batches = (len(texts) + batch_size - 1) // batch_size
+#     for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
+#         embeddings.append(model.encode(batch_texts, show_progressbar=False))
+#         progress((i + 1) / total_batches, desc=desc)
+#     return np.concatenate(embeddings, axis=0)
 
 def deduplicate_embeddings(
     embeddings_a: np.ndarray,
@@ -90,8 +90,8 @@ def perform_deduplication(
     yield "Loading Dataset 1...", ""
     texts1 = load_dataset_texts(dataset1_name, dataset1_split, dataset1_text_column)
     yield "Computing embeddings for Dataset 1...", ""
-    embeddings1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Dataset 1 embeddings")
-
+    # embeddings1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Dataset 1 embeddings")
+    embeddings1 = model.encode(texts1, show_progressbar=True)
     if deduplication_type == "Single dataset":
         # Deduplicate within Dataset 1
         yield "Deduplicating within Dataset 1...", ""
@@ -128,8 +128,8 @@ def perform_deduplication(
     yield "Loading Dataset 2...", ""
     texts2 = load_dataset_texts(dataset2_name, dataset2_split, dataset2_text_column)
     yield "Computing embeddings for Dataset 2...", ""
-    embeddings2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Dataset 2 embeddings")
-
+    # embeddings2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Dataset 2 embeddings")
+    embeddings2 = model.encode(texts2, show_progressbar=True)
     # Deduplicate Dataset 2 against Dataset 1
     yield "Deduplicating Dataset 2 against Dataset 1...", ""
     duplicate_indices, duplicate_mapping = deduplicate_embeddings(
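Note on the change: the hand-rolled batch_iterable/compute_embeddings helpers, which fed a Gradio-style progress callback, are commented out in favour of a single encode call that uses the encoder's built-in progress bar. A minimal standalone sketch of the new call, assuming model is a model2vec StaticModel (the model setup is outside this diff, and the checkpoint name below is an assumption, not taken from the commit):

import numpy as np
from model2vec import StaticModel

# Assumption: app.py loads a static embedding model roughly like this.
model = StaticModel.from_pretrained("minishlab/potion-base-8M")

texts = ["the cat sat on the mat", "a cat sat on a mat"]
# One call replaces batch_iterable/compute_embeddings: model2vec batches
# internally, and show_progressbar=True renders its own progress bar.
embeddings = model.encode(texts, show_progressbar=True)
print(embeddings.shape)  # (2, embedding_dim)

The trade-off is that progress is now reported in the server logs by the encoder rather than surfaced through the Gradio progress callback.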
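The body of deduplicate_embeddings is not part of this diff; only the start of its signature and its (duplicate_indices, duplicate_mapping) return values are visible. A minimal sketch of what such a helper could look like, assuming exact cosine-similarity thresholding with default_threshold = 0.9; the embeddings_b handling, loop logic, and return types are assumptions, and a real implementation would likely use an approximate nearest-neighbour index rather than this O(n^2) similarity matrix:

import numpy as np

def deduplicate_embeddings_sketch(embeddings_a, embeddings_b=None, threshold=0.9):
    """Hypothetical: find near-duplicates by cosine similarity >= threshold."""
    # Normalise rows so a dot product equals cosine similarity.
    a = embeddings_a / np.linalg.norm(embeddings_a, axis=1, keepdims=True)
    if embeddings_b is None:
        # Single dataset: mark each row whose best *earlier* row clears the threshold.
        sims = a @ a.T
        duplicate_indices, duplicate_mapping = [], {}
        for i in range(1, len(a)):
            j = int(np.argmax(sims[i, :i]))
            if sims[i, j] >= threshold:
                duplicate_indices.append(i)
                duplicate_mapping[i] = j
        return duplicate_indices, duplicate_mapping
    # Cross-dataset: mark rows of embeddings_b that match anything in embeddings_a.
    b = embeddings_b / np.linalg.norm(embeddings_b, axis=1, keepdims=True)
    sims = b @ a.T
    best = sims.argmax(axis=1)
    duplicate_indices = [i for i in range(len(b)) if sims[i, best[i]] >= threshold]
    duplicate_mapping = {i: int(best[i]) for i in duplicate_indices}
    return duplicate_indices, duplicate_mapping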