Pringled committed on
Commit
a9118ee
1 Parent(s): 4f0286f
Files changed (1) hide show
  1. app.py +6 -11
app.py CHANGED
@@ -69,17 +69,11 @@ def perform_deduplication(
69
  batch_size = 64
70
  total_batches = (len(texts) + batch_size - 1) // batch_size
71
 
72
- def compute_embeddings():
73
- for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches):
74
- batch_embeddings = model.encode(batch_texts, show_progressbar=False)
75
- embeddings.append(batch_embeddings)
76
- return np.concatenate(embeddings, axis=0)
77
-
78
- with concurrent.futures.ThreadPoolExecutor() as executor:
79
- future = executor.submit(compute_embeddings)
80
- while not future.done():
81
- pass # Wait for embeddings to be computed
82
- embedding_matrix = future.result()
83
 
84
  # Deduplicate
85
  status = "Deduplicating embeddings..."
@@ -125,6 +119,7 @@ def perform_deduplication(
125
  yield f"An error occurred: {e}", ""
126
  raise e
127
 
 
128
  def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
129
  """
130
  Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
 
69
  batch_size = 64
70
  total_batches = (len(texts) + batch_size - 1) // batch_size
71
 
72
+ for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches):
73
+ batch_embeddings = model.encode(batch_texts, show_progressbar=False)
74
+ embeddings.append(batch_embeddings)
75
+
76
+ embedding_matrix = np.concatenate(embeddings, axis=0)
 
 
 
 
 
 
77
 
78
  # Deduplicate
79
  status = "Deduplicating embeddings..."
 
119
  yield f"An error occurred: {e}", ""
120
  raise e
121
 
122
+
123
  def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
124
  """
125
  Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.