Pringled committed
Commit f39d105
1 Parent(s): 95530b9
Files changed (1):
  1. app.py +17 -17
app.py CHANGED
@@ -14,19 +14,19 @@ default_dataset_split = "train"
 default_text_column = "sentence"
 default_threshold = 0.9
 
-def batch_iterable(iterable, batch_size):
-    """Yield successive batches from an iterable."""
-    for i in range(0, len(iterable), batch_size):
-        yield iterable[i:i + batch_size]
-
-def compute_embeddings(texts, batch_size, progress, desc):
-    """Compute embeddings for a list of texts with progress tracking."""
-    embeddings = []
-    total_batches = (len(texts) + batch_size - 1) // batch_size
-    for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
-        embeddings.append(model.encode(batch_texts, show_progressbar=False))
-        progress((i + 1) / total_batches, desc=desc)
-    return np.concatenate(embeddings, axis=0)
+# def batch_iterable(iterable, batch_size):
+#     """Yield successive batches from an iterable."""
+#     for i in range(0, len(iterable), batch_size):
+#         yield iterable[i:i + batch_size]
+
+# def compute_embeddings(texts, batch_size, progress, desc):
+#     """Compute embeddings for a list of texts with progress tracking."""
+#     embeddings = []
+#     total_batches = (len(texts) + batch_size - 1) // batch_size
+#     for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
+#         embeddings.append(model.encode(batch_texts, show_progressbar=False))
+#         progress((i + 1) / total_batches, desc=desc)
+#     return np.concatenate(embeddings, axis=0)
 
 def deduplicate_embeddings(
     embeddings_a: np.ndarray,
@@ -90,8 +90,8 @@ def perform_deduplication(
     yield "Loading Dataset 1...", ""
     texts1 = load_dataset_texts(dataset1_name, dataset1_split, dataset1_text_column)
     yield "Computing embeddings for Dataset 1...", ""
-    embeddings1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Dataset 1 embeddings")
-
+    # embeddings1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Dataset 1 embeddings")
+    embeddings1 = model.encode(texts1, show_progressbar=True)
     if deduplication_type == "Single dataset":
         # Deduplicate within Dataset 1
         yield "Deduplicating within Dataset 1...", ""
@@ -128,8 +128,8 @@ def perform_deduplication(
     yield "Loading Dataset 2...", ""
     texts2 = load_dataset_texts(dataset2_name, dataset2_split, dataset2_text_column)
     yield "Computing embeddings for Dataset 2...", ""
-    embeddings2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Dataset 2 embeddings")
-
+    # embeddings2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Dataset 2 embeddings")
+    embeddings2 = model.encode(texts2, show_progressbar=True)
     # Deduplicate Dataset 2 against Dataset 1
     yield "Deduplicating Dataset 2 against Dataset 1...", ""
     duplicate_indices, duplicate_mapping = deduplicate_embeddings(
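
For readers comparing the two approaches: the commit retires the hand-rolled batching-and-progress helpers (now commented out) in favor of the encoder's built-in progress bar. Below is a minimal, self-contained sketch of the retired pattern; DummyModel and the print-based callback are hypothetical stand-ins for the app's embedding model and its gr.Progress object (which is callable in exactly this way).

import numpy as np

class DummyModel:
    """Hypothetical stand-in for the app's embedding model."""
    def encode(self, texts, show_progressbar=False):
        # One fixed-size random vector per input text.
        return np.random.rand(len(texts), 8)

model = DummyModel()

def batch_iterable(iterable, batch_size):
    """Yield successive slices of batch_size items."""
    for i in range(0, len(iterable), batch_size):
        yield iterable[i:i + batch_size]

def compute_embeddings(texts, batch_size, progress, desc):
    """Encode texts in batches, reporting fractional progress after each batch."""
    embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size  # ceiling division
    for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
        embeddings.append(model.encode(batch_texts, show_progressbar=False))
        progress((i + 1) / total_batches, desc=desc)
    return np.concatenate(embeddings, axis=0)

texts = [f"sentence {n}" for n in range(200)]
emb = compute_embeddings(texts, batch_size=64,
                         progress=lambda frac, desc: print(f"{desc}: {frac:.0%}"),
                         desc="Dataset 1 embeddings")
print(emb.shape)  # (200, 8)

The per-batch callback is what kept the Gradio progress bar moving; its cost is a second batching loop layered on top of the one the encoder already runs internally.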
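The replacement call delegates progress to the encoder itself. One hedged caveat, not stated in the commit: a show_progressbar-style flag typically renders a tqdm bar on the console, so per-batch updates no longer flow through the handler's progress callback; if the handler declares progress=gr.Progress(track_tqdm=True), Gradio can mirror such tqdm bars in the UI. A sketch of the new call path, reusing DummyModel from above:

import gradio as gr

def embed_dataset(texts, progress=gr.Progress(track_tqdm=True)):
    # Hypothetical handler: the encoder draws its own progress bar, and
    # track_tqdm=True lets Gradio mirror any tqdm bar it emits in the UI.
    return model.encode(texts, show_progressbar=True)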