fix-plot-issue

#1
by asoria - opened
Files changed (1)
  1. app.py +216 -209
app.py CHANGED
@@ -37,7 +37,6 @@ DATASETS_TOPICS_ORGANIZATION = os.getenv(
     "DATASETS_TOPICS_ORGANIZATION", "datasets-topics"
 )
 USE_CUML = int(os.getenv("USE_CUML", "1"))
-USE_LLM_TEXT_GENERATION = int(os.getenv("USE_LLM_TEXT_GENERATION", "1"))
 
 # Use cuml lib only if configured
 if USE_CUML:
@@ -53,19 +52,17 @@ logging.basicConfig(
 )
 
 api = HfApi(token=HF_TOKEN)
-sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
 
-# Representation model
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
-
-representation_model = KeyBERTInspired()
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
 vectorizer_model = CountVectorizer(stop_words="english")
+representation_model = KeyBERTInspired()
 
 inference_client = InferenceClient(model_id)
 
 
 def calculate_embeddings(docs):
-    return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
+    return embedding_model.encode(docs, show_progress_bar=True, batch_size=32)
 
 
 def calculate_n_neighbors_and_components(n_rows):
@@ -95,7 +92,7 @@ def fit_model(docs, embeddings, n_neighbors, n_components):
     new_model = BERTopic(
         language="english",
         # Sub-models
-        embedding_model=sentence_model,  # Step 1 - Extract embeddings
+        embedding_model=embedding_model,  # Step 1 - Extract embeddings
         umap_model=umap_model,  # Step 2 - UMAP model
         hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
         vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
@@ -169,44 +166,146 @@ def generate_topics(dataset, config, split, column, plot_type):
         "",
     )
 
-    while offset < limit:
-        logging.info(f"----> Getting records from {offset=} with {CHUNK_SIZE=}")
-        docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
-        if not docs:
-            break
-        logging.info(f"Got {len(docs)} docs ✓")
-        embeddings = calculate_embeddings(docs)
-        new_model = fit_model(docs, embeddings, n_neighbors, n_components)
-
-        if base_model is None:
-            base_model = new_model
-            logging.info(
-                f"The following topics are newly found: {base_model.topic_labels_}"
+    try:
+        while offset < limit:
+            logging.info(f"----> Getting records from {offset=} with {CHUNK_SIZE=}")
+            docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
+            if not docs:
+                break
+            logging.info(f"Got {len(docs)} docs ✓")
+            embeddings = calculate_embeddings(docs)
+            new_model = fit_model(docs, embeddings, n_neighbors, n_components)
+
+            if base_model is None:
+                base_model = new_model
+                logging.info(
+                    f"The following topics are newly found: {base_model.topic_labels_}"
+                )
+            else:
+                updated_model = BERTopic.merge_models([base_model, new_model])
+                nr_new_topics = len(set(updated_model.topics_)) - len(
+                    set(base_model.topics_)
+                )
+                new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
+                logging.info(f"The following topics are newly found: {new_topics}")
+                base_model = updated_model
+
+            logging.info("Reducing embeddings to 2D")
+            reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
+            reduced_embeddings_list.append(reduced_embeddings)
+
+            all_docs.extend(docs)
+            reduced_embeddings_array = np.vstack(reduced_embeddings_list)
+            logging.info("Reducing embeddings to 2D ✓")
+
+            topics_info = base_model.get_topic_info()
+            all_topics = base_model.topics_
+            logging.info(f"Preparing topics {plot_type} plot")
+
+            topic_plot = (
+                base_model.visualize_document_datamap(
+                    docs=all_docs,
+                    topics=all_topics,
+                    reduced_embeddings=reduced_embeddings_array,
+                    title="",
+                    sub_title=sub_title,
+                    width=800,
+                    height=700,
+                    arrowprops={
+                        "arrowstyle": "wedge,tail_width=0.5",
+                        "connectionstyle": "arc3,rad=0.05",
+                        "linewidth": 0,
+                        "fc": "#33333377",
+                    },
+                    dynamic_label_size=True,
+                    # label_wrap_width=12,
+                    label_over_points=True,
+                    max_font_size=36,
+                    min_font_size=4,
+                )
+                if plot_type == "DataMapPlot"
+                else base_model.visualize_documents(
+                    docs=all_docs,
+                    topics=all_topics,
+                    reduced_embeddings=reduced_embeddings_array,
+                    custom_labels=True,
+                    title="",
+                )
             )
-        else:
-            updated_model = BERTopic.merge_models([base_model, new_model])
-            nr_new_topics = len(set(updated_model.topics_)) - len(
-                set(base_model.topics_)
+            logging.info("Plot done ✓")
+            rows_processed += len(docs)
+            progress = min(rows_processed / limit, 1.0)
+            logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
+            message = (
+                f"Processing topics for full dataset: {rows_processed} of {limit}"
+                if full_processing
+                else f"Processing topics for partial dataset: {rows_processed} of {limit} rows"
            )
-            new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
-            logging.info(f"The following topics are newly found: {new_topics}")
-            base_model = updated_model
 
-        logging.info("Reducing embeddings to 2D")
-        reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
-        reduced_embeddings_list.append(reduced_embeddings)
-        logging.info("Reducing embeddings to 2D ✓")
+            yield (
+                gr.Accordion(open=False),
+                topics_info,
+                topic_plot,
+                gr.Label({"⏳ " + message: progress}, visible=True),
+                "",
+            )
 
-        all_docs.extend(docs)
-        reduced_embeddings_array = np.vstack(reduced_embeddings_list)
+            offset += CHUNK_SIZE
+            del docs, embeddings, new_model, reduced_embeddings
+        logging.info("Finished processing topic modeling data")
+
+        yield (
+            gr.Accordion(open=False),
+            topics_info,
+            topic_plot,
+            gr.Label(
+                {
+                    "✅ " + message: 1.0,
+                    f"⏳ Generating topic names with {model_id}": 0.0,
+                },
+                visible=True,
+            ),
+            "",
+        )
 
-        topics_info = base_model.get_topic_info()
         all_topics = base_model.topics_
-        logging.info(f"Preparing topics {plot_type} plot")
+        topics_info = base_model.get_topic_info()
+
+        new_topics_by_text_generation = {}
+        for _, row in topics_info.iterrows():
+            logging.info(
+                f"Processing topic: {row['Topic']} - Representation: {row['Representation']}"
+            )
+            prompt = f"{LLAMA_3_8B_PROMPT.replace('[KEYWORDS]', ','.join(row['Representation']))}"
+            prompt_messages = [
+                {
+                    "role": "system",
+                    "content": "You are a helpful, respectful and honest assistant for labeling topics.",
+                },
+                {"role": "user", "content": prompt},
+            ]
+            output = inference_client.chat_completion(
+                messages=prompt_messages,
+                stream=False,
+                max_tokens=500,
+                top_p=0.8,
+                seed=42,
+            )
+            inference_response = output.choices[0].message.content
+            logging.info("Inference response:")
+            logging.info(inference_response)
+            new_topics_by_text_generation[row["Topic"]] = inference_response.replace(
+                "Topic=", ""
+            ).strip()
+        base_model.set_topic_labels(new_topics_by_text_generation)
+
+        topics_info = base_model.get_topic_info()
+
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
                 topics=all_topics,
+                custom_labels=True,
                 reduced_embeddings=reduced_embeddings_array,
                 title="",
                 sub_title=sub_title,
@@ -227,192 +326,100 @@ def generate_topics(dataset, config, split, column, plot_type):
             if plot_type == "DataMapPlot"
             else base_model.visualize_documents(
                 docs=all_docs,
-                topics=all_topics,
                 reduced_embeddings=reduced_embeddings_array,
+                custom_labels=True,
                 title="",
             )
         )
-        logging.info("Plot done ✓")
-        rows_processed += len(docs)
-        progress = min(rows_processed / limit, 1.0)
-        logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
-        message = (
-            f"Processing topics for full dataset: {rows_processed} of {limit}"
-            if full_processing
-            else f"Processing topics for partial dataset: {rows_processed} of {limit} rows"
-        )
 
+        dataset_clear_name = dataset.replace("/", "-")
+        plot_png = f"{dataset_clear_name}-{plot_type.lower()}.png"
+        if plot_type == "DataMapPlot":
+            topic_plot.savefig(plot_png, format="png", dpi=300)
+        else:
+            topic_plot.write_image(plot_png)
+
+        custom_labels = base_model.custom_labels_
+        topic_names_array = [custom_labels[doc_topic + 1] for doc_topic in all_topics]
         yield (
             gr.Accordion(open=False),
             topics_info,
             topic_plot,
-            gr.Label({"⏳ " + message: progress}, visible=True),
+            gr.Label(
+                {
+                    "✅ " + message: 1.0,
+                    f"✅ Generating topic names with {model_id}": 1.0,
+                    "⏳ Creating Interactive Space": 0.0,
+                },
+                visible=True,
+            ),
             "",
         )
+        interactive_plot = datamapplot.create_interactive_plot(
+            reduced_embeddings_array,
+            topic_names_array,
+            hover_text=all_docs,
+            title=dataset,
+            sub_title=sub_title.replace(
+                "dataset",
+                f"<a href='https://huggingface.co/datasets/{dataset}/viewer/{config}/{split}' target='_blank'>dataset</a>",
+            ),
+            enable_search=True,
+            # TODO: Export data to .arrow and also serve it
+            inline_data=True,
+            # offline_data_prefix=dataset_clear_name,
+            initial_zoom_fraction=0.8,
+        )
+        html_content = str(interactive_plot)
+        html_file_path = f"{dataset_clear_name}.html"
+        with open(html_file_path, "w", encoding="utf-8") as html_file:
+            html_file.write(html_content)
+
+        repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_clear_name}"
+
+        space_id = create_space_with_content(
+            api=api,
+            repo_id=repo_id,
+            dataset_id=dataset,
+            html_file_path=html_file_path,
+            plot_file_path=plot_png,
+            space_card=SPACE_REPO_CARD_CONTENT,
+            token=HF_TOKEN,
+        )
 
-        offset += CHUNK_SIZE
-        del docs, embeddings, new_model, reduced_embeddings
-    logging.info("Finished processing all data")
-
-    yield (
-        gr.Accordion(open=False),
-        topics_info,
-        topic_plot,
-        gr.Label(
-            {
-                "✅ " + message: 1.0,
-                f"⏳ Generating topic names with {model_id}": 0.0,
-            },
-            visible=True,
-        ),
-        "",
-    )
-
-    all_topics = base_model.topics_
-    topics_info = base_model.get_topic_info()
+        space_link = f"https://huggingface.co/spaces/{space_id}"
 
-    new_topics_by_text_generation = {}
-    for _, row in topics_info.iterrows():
-        logging.info(
-            f"Processing topic: {row['Topic']} - Representation: {row['Representation']}"
-        )
-        prompt = f"{LLAMA_3_8B_PROMPT.replace('[KEYWORDS]', ','.join(row['Representation']))}"
-        prompt_messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful, respectful and honest assistant for labeling topics.",
-            },
-            {"role": "user", "content": prompt},
-        ]
-        output = inference_client.chat_completion(
-            messages=prompt_messages,
-            stream=False,
-            max_tokens=500,
-            top_p=0.8,
-            seed=42,
+        yield (
+            gr.Accordion(open=False),
+            topics_info,
+            topic_plot,
+            gr.Label(
+                {
+                    "✅ " + message: 1.0,
+                    f"✅ Generating topic names with {model_id}": 1.0,
+                    "✅ Creating Interactive Space": 1.0,
+                },
+                visible=True,
+            ),
+            f"[![Go to interactive plot](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue)]({space_link})",
        )
-        inference_response = output.choices[0].message.content
-        logging.info("Inference response:")
-        logging.info(inference_response)
-        new_topics_by_text_generation[row["Topic"]] = inference_response.replace(
-            "Topic=", ""
-        ).strip()
-    base_model.set_topic_labels(new_topics_by_text_generation)
-
-    topics_info = base_model.get_topic_info()
-
-    topic_plot = (
-        base_model.visualize_document_datamap(
-            docs=all_docs,
-            topics=all_topics,
-            custom_labels=True,
-            reduced_embeddings=reduced_embeddings_array,
-            title="",
-            sub_title=sub_title,
-            width=800,
-            height=700,
-            arrowprops={
-                "arrowstyle": "wedge,tail_width=0.5",
-                "connectionstyle": "arc3,rad=0.05",
-                "linewidth": 0,
-                "fc": "#33333377",
-            },
-            dynamic_label_size=True,
-            # label_wrap_width=12,
-            label_over_points=True,
-            max_font_size=36,
-            min_font_size=4,
+        del reduce_umap_model, all_docs, reduced_embeddings_list
+        del (
+            base_model,
+            all_topics,
+            topics_info,
+            topic_names_array,
+            interactive_plot,
        )
-        if plot_type == "DataMapPlot"
-        else base_model.visualize_documents(
-            docs=all_docs,
-            reduced_embeddings=reduced_embeddings_array,
-            custom_labels=True,
-            title="",
+        cuda.empty_cache()
+    except Exception as error:
+        return (
+            gr.Accordion(open=True),
+            gr.DataFrame(value=[], interactive=False, visible=True),
+            gr.Plot(value=None, visible=True),
+            gr.Label({f"❌ Error: {error}": 0.0}, visible=True),
+            "",
        )
-    )
-
-    dataset_clear_name = dataset.replace("/", "-")
-    plot_png = f"{dataset_clear_name}-{plot_type.lower()}.png"
-    if plot_type == "DataMapPlot":
-        topic_plot.savefig(plot_png, format="png", dpi=300)
-    else:
-        topic_plot.write_image(plot_png)
-
-    custom_labels = base_model.custom_labels_
-    topic_names_array = [custom_labels[doc_topic + 1] for doc_topic in all_topics]
-    yield (
-        gr.Accordion(open=False),
-        topics_info,
-        topic_plot,
-        gr.Label(
-            {
-                "✅ " + message: 1.0,
-                f"✅ Generating topic names with {model_id}": 1.0,
-                "⏳ Creating Interactive Space": 0.0,
-            },
-            visible=True,
-        ),
-        "",
-    )
-    interactive_plot = datamapplot.create_interactive_plot(
-        reduced_embeddings_array,
-        topic_names_array,
-        hover_text=all_docs,
-        title=dataset,
-        sub_title=sub_title.replace(
-            "dataset",
-            f"<a href='https://huggingface.co/datasets/{dataset}/viewer/{config}/{split}' target='_blank'>dataset</a>",
-        ),
-        enable_search=True,
-        # TODO: Export data to .arrow and also serve it
-        inline_data=True,
-        # offline_data_prefix=dataset_clear_name,
-        initial_zoom_fraction=0.9,
-        cluster_boundary_polygons=True
-    )
-    html_content = str(interactive_plot)
-    html_file_path = f"{dataset_clear_name}.html"
-    with open(html_file_path, "w", encoding="utf-8") as html_file:
-        html_file.write(html_content)
-
-    repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_clear_name}"
-
-    space_id = create_space_with_content(
-        api=api,
-        repo_id=repo_id,
-        dataset_id=dataset,
-        html_file_path=html_file_path,
-        plot_file_path=plot_png,
-        space_card=SPACE_REPO_CARD_CONTENT,
-        token=HF_TOKEN,
-    )
-
-    space_link = f"https://huggingface.co/spaces/{space_id}"
-    yield (
-        gr.Accordion(open=False),
-        topics_info,
-        topic_plot,
-        gr.Label(
-            {
-                "✅ " + message: 1.0,
-                f"✅ Generating topic names with {model_id}": 1.0,
-                "✅ Creating Interactive Space": 1.0,
-            },
-            visible=True,
-        ),
-        f"[![Go to interactive plot](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue)]({space_link})",
-    )
-    del reduce_umap_model, all_docs, reduced_embeddings_list
-    del (
-        base_model,
-        all_topics,
-        topics_info,
-        topic_plot,
-        topic_names_array,
-        interactive_plot,
-    )
-    cuda.empty_cache()
 
 
 with gr.Blocks() as demo:
@@ -461,11 +468,11 @@ with gr.Blocks() as demo:
         generate_button = gr.Button("Generate Topics", variant="primary")
 
         gr.Markdown("## Data map")
-        full_topics_generation_label = gr.Label(visible=False, show_label=False)
+        progress_label = gr.Label(visible=False, show_label=False)
        open_space_label = gr.Markdown()
        topics_plot = gr.Plot()
-        with gr.Accordion("Topics Info", open=False):
-            topics_df = gr.DataFrame(interactive=False, visible=True)
+        # with gr.Accordion("Topics Info", open=False):
+        topics_df = gr.DataFrame(interactive=False, visible=True)
        gr.HTML(
            f"<p style='text-align: center; color:orange;'>⚠ This space processes datasets in batches of <b>{CHUNK_SIZE}</b>, with a maximum of <b>{MAX_ROWS}</b> rows. If you need further assistance, please open a new issue in the Community tab.</p>"
        )
@@ -487,7 +494,7 @@
             data_details_accordion,
             topics_df,
             topics_plot,
-            full_topics_generation_label,
+            progress_label,
             open_space_label,
         ],
     )