Updates
Browse files
app.py
CHANGED
@@ -69,17 +69,11 @@ def perform_deduplication(
|
|
69 |
batch_size = 64
|
70 |
total_batches = (len(texts) + batch_size - 1) // batch_size
|
71 |
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
with concurrent.futures.ThreadPoolExecutor() as executor:
|
79 |
-
future = executor.submit(compute_embeddings)
|
80 |
-
while not future.done():
|
81 |
-
pass # Wait for embeddings to be computed
|
82 |
-
embedding_matrix = future.result()
|
83 |
|
84 |
# Deduplicate
|
85 |
status = "Deduplicating embeddings..."
|
@@ -125,6 +119,7 @@ def perform_deduplication(
|
|
125 |
yield f"An error occurred: {e}", ""
|
126 |
raise e
|
127 |
|
|
|
128 |
def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
|
129 |
"""
|
130 |
Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
|
|
|
69 |
batch_size = 64
|
70 |
total_batches = (len(texts) + batch_size - 1) // batch_size
|
71 |
|
72 |
+
for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches):
|
73 |
+
batch_embeddings = model.encode(batch_texts, show_progressbar=False)
|
74 |
+
embeddings.append(batch_embeddings)
|
75 |
+
|
76 |
+
embedding_matrix = np.concatenate(embeddings, axis=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
# Deduplicate
|
79 |
status = "Deduplicating embeddings..."
|
|
|
119 |
yield f"An error occurred: {e}", ""
|
120 |
raise e
|
121 |
|
122 |
+
|
123 |
def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
|
124 |
"""
|
125 |
Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
|