Spaces:

minishlab
/

semantic-deduplication

Running

App Files Files Community

Pringled committed on Oct 12

Commit

a81fb12

•

1 Parent(s): 82a1d00

Updates

Browse files

Files changed (1) hide show

app.py +38 -39

app.py CHANGED Viewed

@@ -1,13 +1,14 @@
 import gradio as gr
 from datasets import load_dataset
 import numpy as np
-from model2vec import StaticModel
 import model2vec
 from reach import Reach
 from difflib import ndiff
 # Load the model at startup
-model = StaticModel.from_pretrained("minishlab/M2V_base_output")
 # Default dataset parameters
 default_dataset1_name = "sst2"
@@ -23,43 +24,43 @@ ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
 # Patch tqdm to use Gradio's progress bar
-from tqdm import tqdm as original_tqdm
 # Patch tqdm to use Gradio's progress bar
 # Patch tqdm to use Gradio's progress bar
-def patch_tqdm_for_gradio(progress):
-    class GradioTqdm(original_tqdm):
-        def __init__(self, *args, **kwargs):
-            super().__init__(*args, **kwargs)
-            self.progress = progress
-            self.total_batches = kwargs.get('total', len(args[0])) if len(args) > 0 else 1
-            self.update_interval = max(1, self.total_batches // 100)  # Update every 1%
-        def update(self, n=1):
-            super().update(n)
-            # Update Gradio progress bar every update_interval steps
-            if self.n % self.update_interval == 0 or self.n == self.total_batches:
-                self.progress(self.n / self.total_batches)
-    return GradioTqdm
-def patch_model2vec_tqdm(progress):
-    patched_tqdm = patch_tqdm_for_gradio(progress)
-    model2vec.tqdm = patched_tqdm  # Replace tqdm in model2vec
-# Function to patch the original encode function with our Gradio tqdm
-def original_encode_with_tqdm(original_encode_func, patched_tqdm):
-    def new_encode(*args, **kwargs):
-        original_tqdm_backup = original_tqdm
-        try:
-            # Patch the `tqdm` within encode
-            globals()['tqdm'] = patched_tqdm
-            return original_encode_func(*args, **kwargs)
-        finally:
-            # Restore original tqdm after calling encode
-            globals()['tqdm'] = original_tqdm_backup
-    return new_encode
 def batch_iterable(iterable, batch_size):
@@ -157,12 +158,10 @@ def perform_deduplication(
             texts = [example[dataset1_text_column] for example in ds]
             #patched_tqdm = patch_tqdm_for_gradio(progress)
-            patch_model2vec_tqdm(progress)
             #model.encode = original_encode_with_tqdm(model.encode, patched_tqdm)
             # Compute embeddings
             status = "Computing embeddings for Dataset 1..."
-            # Remove?
             yield status, ""

 import gradio as gr
 from datasets import load_dataset
 import numpy as np
+#from model2vec import StaticModel
 import model2vec
 from reach import Reach
 from difflib import ndiff
 # Load the model at startup
+model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
 # Default dataset parameters
 default_dataset1_name = "sst2"
 # Patch tqdm to use Gradio's progress bar
+#from tqdm import tqdm as original_tqdm
 # Patch tqdm to use Gradio's progress bar
 # Patch tqdm to use Gradio's progress bar
+# def patch_tqdm_for_gradio(progress):
+#     class GradioTqdm(original_tqdm):
+#         def __init__(self, *args, **kwargs):
+#             super().__init__(*args, **kwargs)
+#             self.progress = progress
+#             self.total_batches = kwargs.get('total', len(args[0])) if len(args) > 0 else 1
+#             self.update_interval = max(1, self.total_batches // 100)  # Update every 1%
+#         def update(self, n=1):
+#             super().update(n)
+#             # Update Gradio progress bar every update_interval steps
+#             if self.n % self.update_interval == 0 or self.n == self.total_batches:
+#                 self.progress(self.n / self.total_batches)
+#     return GradioTqdm
+# def patch_model2vec_tqdm(progress):
+#     patched_tqdm = patch_tqdm_for_gradio(progress)
+#     model2vec.tqdm = patched_tqdm  # Replace tqdm in model2vec
+# # Function to patch the original encode function with our Gradio tqdm
+# def original_encode_with_tqdm(original_encode_func, patched_tqdm):
+#     def new_encode(*args, **kwargs):
+#         original_tqdm_backup = original_tqdm
+#         try:
+#             # Patch the `tqdm` within encode
+#             globals()['tqdm'] = patched_tqdm
+#             return original_encode_func(*args, **kwargs)
+#         finally:
+#             # Restore original tqdm after calling encode
+#             globals()['tqdm'] = original_tqdm_backup
+#     return new_encode
 def batch_iterable(iterable, batch_size):
             texts = [example[dataset1_text_column] for example in ds]
             #patched_tqdm = patch_tqdm_for_gradio(progress)
+            #patch_model2vec_tqdm(progress)
             #model.encode = original_encode_with_tqdm(model.encode, patched_tqdm)
             # Compute embeddings
             status = "Computing embeddings for Dataset 1..."
             yield status, ""