Pringled committed on
Commit
1744dee
1 Parent(s): ed5b7bd
Files changed (1) hide show
  1. app.py +42 -8
app.py CHANGED
@@ -201,7 +201,7 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
201
  deduplication_type = gr.Radio(
202
  choices=["Single dataset", "Cross-dataset"],
203
  label="Deduplication Type",
204
- value="Single dataset",
205
  )
206
 
207
  with gr.Row():
@@ -209,7 +209,7 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
209
  dataset1_split = gr.Textbox(value=default_dataset_split, label="Dataset 1 Split")
210
  dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
211
 
212
- dataset2_inputs = gr.Column(visible=False)
213
  with dataset2_inputs:
214
  gr.Markdown("### Dataset 2")
215
  with gr.Row():
@@ -245,8 +245,6 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
245
 
246
  demo.launch()
247
 
248
-
249
-
250
  # import gradio as gr
251
  # from datasets import load_dataset
252
  # import numpy as np
@@ -270,7 +268,16 @@ demo.launch()
270
  # batch_size: int = 1024,
271
  # progress=None
272
  # ) -> tuple[np.ndarray, dict[int, int]]:
273
- # """Deduplicate embeddings within one dataset or across two datasets."""
 
 
 
 
 
 
 
 
 
274
  # if embeddings_b is None:
275
  # reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
276
  # duplicate_to_original = {}
@@ -298,13 +305,27 @@ demo.launch()
298
  # return duplicate_indices_in_b, duplicate_to_original
299
 
300
  # def display_word_differences(x: str, y: str) -> str:
301
- # """Display word-level differences between two texts, avoiding Markdown issues."""
 
 
 
 
 
 
 
302
  # diff = ndiff(x.split(), y.split())
303
  # formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
304
  # return f"```\n{formatted_diff}\n```"
305
 
306
  # def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
307
- # """Load texts from a specified dataset and split."""
 
 
 
 
 
 
 
308
  # ds = load_dataset(dataset_name, split=dataset_split)
309
  # return [example[text_column] for example in ds]
310
 
@@ -319,7 +340,20 @@ demo.launch()
319
  # threshold: float = default_threshold,
320
  # progress: gr.Progress = gr.Progress(track_tqdm=True)
321
  # ):
322
- # """Perform deduplication on one or two datasets."""
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  # try:
324
  # threshold = float(threshold)
325
 
 
201
  deduplication_type = gr.Radio(
202
  choices=["Single dataset", "Cross-dataset"],
203
  label="Deduplication Type",
204
+ value="Cross-dataset", # Set "Cross-dataset" as the default value
205
  )
206
 
207
  with gr.Row():
 
209
  dataset1_split = gr.Textbox(value=default_dataset_split, label="Dataset 1 Split")
210
  dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
211
 
212
+ dataset2_inputs = gr.Column(visible=True) # Make dataset2_inputs visible by default
213
  with dataset2_inputs:
214
  gr.Markdown("### Dataset 2")
215
  with gr.Row():
 
245
 
246
  demo.launch()
247
 
 
 
248
  # import gradio as gr
249
  # from datasets import load_dataset
250
  # import numpy as np
 
268
  # batch_size: int = 1024,
269
  # progress=None
270
  # ) -> tuple[np.ndarray, dict[int, int]]:
271
+ # """
272
+ # Deduplicate embeddings within one dataset or across two datasets.
273
+
274
+ # :param embeddings_a: Embeddings of Dataset 1.
275
+ # :param embeddings_b: Optional, embeddings of Dataset 2.
276
+ # :param threshold: Similarity threshold for deduplication.
277
+ # :param batch_size: Batch size for similarity computation.
278
+ # :param progress: Gradio progress tracker for feedback.
279
+ # :return: Deduplicated indices (or, for cross-dataset mode, the indices of duplicates found in Dataset 2) and a mapping of removed indices to their original counterparts.
280
+ # """
281
  # if embeddings_b is None:
282
  # reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
283
  # duplicate_to_original = {}
 
305
  # return duplicate_indices_in_b, duplicate_to_original
306
 
307
  # def display_word_differences(x: str, y: str) -> str:
308
+ # """
309
+ # Display the word-level differences between two texts, formatted to avoid
310
+ # misinterpretation of Markdown syntax.
311
+
312
+ # :param x: First text.
313
+ # :param y: Second text.
314
+ # :return: A string showing word-level differences, wrapped in a code block.
315
+ # """
316
  # diff = ndiff(x.split(), y.split())
317
  # formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
318
  # return f"```\n{formatted_diff}\n```"
319
 
320
  # def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
321
+ # """
322
+ # Load texts from a specified dataset and split.
323
+
324
+ # :param dataset_name: Name of the dataset.
325
+ # :param dataset_split: Split of the dataset (e.g., 'train', 'validation').
326
+ # :param text_column: Name of the text column.
327
+ # :return: A list of texts from the dataset.
328
+ # """
329
  # ds = load_dataset(dataset_name, split=dataset_split)
330
  # return [example[text_column] for example in ds]
331
 
 
340
  # threshold: float = default_threshold,
341
  # progress: gr.Progress = gr.Progress(track_tqdm=True)
342
  # ):
343
+ # """
344
+ # Perform deduplication on one or two datasets based on the deduplication type.
345
+
346
+ # :param deduplication_type: 'Single dataset' or 'Cross-dataset'.
347
+ # :param dataset1_name: Name of the first dataset.
348
+ # :param dataset1_split: Split of the first dataset.
349
+ # :param dataset1_text_column: Text column of the first dataset.
350
+ # :param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
351
+ # :param dataset2_split: Optional, split of the second dataset.
352
+ # :param dataset2_text_column: Optional, text column of the second dataset.
353
+ # :param threshold: Similarity threshold for deduplication.
354
+ # :param progress: Gradio progress tracker.
355
+ # :return: Status updates and result text for the Gradio interface.
356
+ # """
357
  # try:
358
  # threshold = float(threshold)
359