Spaces:

rameshmoorthy
/

NAC-Product-Clustering-analysis

Sleeping

App Files Files Community

rameshmoorthy committed on Jan 27

Commit

7efeab0

•

1 Parent(s): ba6d530

Upload 3 files

Browse files

Files changed (3) hide show

app.py +197 -0
functions.py +206 -0
requirements.txt.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,197 @@

+import gradio as gr
+import pandas as pd
+from functions import process_file_bm25 , process_file_bert , generate_plot , generate
+#------------------------------------------------------
+# Create the state object that holds the most recent clustering results.
+# It is created exactly once: re-creating it (as the previous revision did
+# half-way through) silently dropped the attributes assigned before that point.
+# NOTE(review): these are plain Python attributes on a module-level object, so
+# they are shared by everything running in this process - confirm that
+# per-session isolation is not required.
+state = gr.State()
+state.df_bm25 = pd.DataFrame({"column1": [1, 2, 3], "column2": ["A", "B", "C"]})
+state.df_bert = pd.DataFrame({"column1": [1, 2, 3], "column2": ["A", "B", "C"]})
+state.df_topics_bert = pd.DataFrame({"column1": [1, 2, 3], "column2": ["A", "B", "C"]})
+state.df_topics_bm25 = pd.DataFrame({"column1": [1, 2, 3], "column2": ["A", "B", "C"]})
+# Tab 1: upload a file and cluster its 'products' column with BM25-weighted c-TF-IDF.
+with gr.Blocks() as bm25:
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown(
+                """
+                # Select a CSV/Excel file with column as 'products'
+                """)
+            inputfile = gr.File(file_types=['.csv','.xlsx'], label="Upload CSV/Excel file")
+            def confirmation():
+                """Return the status text shown once a file has been uploaded."""
+                return 'File uploaded! Press Cluster button'
+            def download_doc(doc):
+                """Return ``doc`` unchanged (not wired to any event in this block)."""
+                return doc
+            def download_df():
+                """Return the dataframe kept under ``state.df_bm25``."""
+                # The previous revision read `state.df`, an attribute that is
+                # never assigned, so the button always raised AttributeError.
+                return state.df_bm25
+            out = gr.Textbox()
+            mode = gr.Radio(["Automated clustering", "Manually choose parameters"], label="Type of algorithm", value="Automated clustering", info="Choose any mode u want")
+            inputfile.upload(confirmation, inputs=[], outputs=out)
+            # One column per tuning slider, side by side in a single row.
+            with gr.Row():
+                with gr.Column():
+                    min_cluster_size = gr.Slider(1, 100, value=5, step=1, label="min_cluster_size", info="Choose minimum No. of docs in a cluster. Lower the value ,higher the clusters created")
+                with gr.Column():
+                    top_n_words = gr.Slider(1, 25, value=10, step=1, label="top_n_words", info="Choose no of key words for a cluster")
+                with gr.Column():
+                    ngram = gr.Slider(1, 3, value=2, step=1, label="ngram", info="Choose no of n-grams words to be taken for clustering")
+            cluster_btn = gr.Button(value="Cluster")
+            # Output order must match the tuple returned by process_file_bm25:
+            # (df, xlsx path, topics_info, barchart, topics_plot, heatmap, hierarchy).
+            cluster_btn.click(
+                process_file_bm25,
+                inputs=[inputfile, mode, min_cluster_size, top_n_words, ngram],
+                outputs=[
+                    gr.Dataframe(),
+                    gr.File(label="Download CSV"),
+                    gr.Dataframe(),
+                    gr.Plot(label="Barchart"),
+                    gr.Plot(label="Topics Plot"),
+                    gr.Plot(label="Heatmap"),
+                    gr.Plot(label="Hierarchy"),
+                ],
+            )
+            llm_btn = gr.Button(value="AI generation ")
+            llm_btn.click(download_df, inputs=[], outputs=gr.Dataframe(label="Output"))
+# Tab 4: same upload-and-cluster flow, using the KeyBERT-style topic representation.
+with gr.Blocks() as bert:
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown(
+                """
+                # Select a CSV/Excel file with column as 'products'
+                """)
+            inputfile = gr.File(file_types=['.csv','.xlsx'], label="Upload CSV/Excel file")
+            def confirmation():
+                """Return the status text shown once a file has been uploaded."""
+                return 'File uploaded! Press Cluster button'
+            out = gr.Textbox()
+            mode = gr.Radio(["Automated clustering", "Manually choose parameters"], label="Type of algorithm", value="Automated clustering", info="Choose any mode u want")
+            inputfile.upload(confirmation, inputs=[], outputs=out)
+            # One column per tuning slider, side by side in a single row.
+            with gr.Row():
+                with gr.Column():
+                    min_cluster_size = gr.Slider(1, 100, value=5, step=1, label="min_cluster_size", info="Choose minimum No. of docs in a cluster. Lower the value ,higher the clusters created")
+                with gr.Column():
+                    top_n_words = gr.Slider(1, 25, value=10, step=1, label="top_n_words", info="Choose no of key words for a cluster")
+                with gr.Column():
+                    ngram = gr.Slider(1, 3, value=2, step=1, label="ngram", info="Choose no of n-grams words to be taken for clustering")
+            cluster_btn = gr.Button(value="Cluster")
+            # process_file_bert takes (file, mode, min_cluster_size, top_n_words, ngram);
+            # the previous wiring passed only the first three, so every click
+            # failed with a TypeError and two sliders had no effect.
+            # Output order must match its return tuple:
+            # (df, topics_info, barchart, topics_plot, heatmap, hierarchy).
+            cluster_btn.click(
+                process_file_bert,
+                inputs=[inputfile, mode, min_cluster_size, top_n_words, ngram],
+                outputs=[
+                    gr.Dataframe(),
+                    gr.Dataframe(),
+                    gr.Plot(label="Barchart"),
+                    gr.Plot(label="Topics Plot"),
+                    gr.Plot(label="Heatmap"),
+                    gr.Plot(label="Hierarchy"),
+                ],
+            )
+#___________________________________________
+# Extra controls for the chat tab. Their order lines up with the parameters
+# that follow (prompt, history) in `generate`: system prompt, temperature,
+# max new tokens, top-p, repetition penalty.
+additional_inputs=[
+gr.Textbox(
+    label="System Prompt",
+    max_lines=1,
+    interactive=True,
+),
+gr.Slider(
+    label="Temperature",
+    value=0.9,
+    minimum=0.0,
+    maximum=1.0,
+    step=0.05,
+    interactive=True,
+    info="Higher values produce more diverse outputs",
+),
+# NOTE(review): with step=64 starting at 0 the largest reachable value is
+# 1024, not 1048 - confirm whether maximum was meant to be 1024. A value of
+# 0 new tokens is also selectable; verify the backend accepts it.
+gr.Slider(
+    label="Max new tokens",
+    value=256,
+    minimum=0,
+    maximum=1048,
+    step=64,
+    interactive=True,
+    info="The maximum numbers of new tokens",
+),
+gr.Slider(
+    label="Top-p (nucleus sampling)",
+    value=0.90,
+    minimum=0.0,
+    maximum=1,
+    step=0.05,
+    interactive=True,
+    info="Higher values sample more low-probability tokens",
+),
+gr.Slider(
+    label="Repetition penalty",
+    value=1.2,
+    minimum=1.0,
+    maximum=2.0,
+    step=0.05,
+    interactive=True,
+    info="Penalize repeated tokens",
+)
+]
+# Each example row is the chat message followed by one entry per additional
+# input (five here). NOTE(review): the None entries presumably leave those
+# controls at their current values - confirm against the Gradio version in use.
+examples=[["I'm planning a vacation to Japan. Can you suggest a one-week itinerary including must-visit places and local cuisines to try?", None, None, None, None, None, ],
+          ["Can you write a short story about a time-traveling detective who solves historical mysteries?", None, None, None, None, None,],
+          ["I'm trying to learn French. Can you provide some common phrases that would be useful for a beginner, along with their pronunciations?", None, None, None, None, None,],
+          ["I have chicken, rice, and bell peppers in my kitchen. Can you suggest an easy recipe I can make with these ingredients?", None, None, None, None, None,],
+          ["Can you explain how the QuickSort algorithm works and provide a Python implementation?", None, None, None, None, None,],
+          ["What are some unique features of Rust that make it stand out compared to other systems programming languages like C++?", None, None, None, None, None,],
+        ]
+# Chat tab: `generate` (imported from functions.py) produces the replies.
+chat_interface=gr.ChatInterface(
+    fn=generate,
+    chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
+    additional_inputs=additional_inputs,
+    title="Mixtral 46.7B",
+    examples=examples,
+    concurrency_limit=20,
+)
+#______________________________________________________
+# Create the topic-analysis interface and assemble the tabbed app.
+df = state.df_topics_bm25
+# The dropdown choices are computed once, at start-up, from whatever frame
+# `state.df_topics_bm25` holds at that moment. The placeholder frame has no
+# 'Topic' column, so indexing it unconditionally raised KeyError before the
+# app could launch; fall back to an empty choice list in that case.
+topic_choices = df['Topic'].unique().tolist() if 'Topic' in df.columns else []
+axis_choices = [column for column in df.columns if column != 'Topic']
+excel_analysis_bm25 = gr.Interface(
+    fn=generate_plot,
+    inputs=[
+        gr.Dropdown(topic_choices, label="Select Topic Number", type="index"),
+        gr.Dropdown(axis_choices, label="Select X Axis", type="index"),
+        gr.Dropdown(axis_choices, label="Select Y Axis", type="index"),
+        gr.Radio(["scatter", "bar", "line", "box", "wordcloud", "pie"], label="Select Chart Type"),
+        gr.Dropdown(["count", "count_distinct", "sum", "average"], label="Select Aggregation Function"),
+    ],
+    outputs=gr.Plot(label="Visualization"),
+)
+# Tab order and tab titles are matched by position.
+demo = gr.TabbedInterface(
+    [bm25, chat_interface, excel_analysis_bm25, bert],
+    ["TFIDF-BM25 Clustering", "TFIDF-BM25-Topics AI", "TFIDF-BM25-Topic analysis", "keyBERT"],
+)
+demo.launch(share=True,debug=True)

functions.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import pandas as pd
+from bertopic import BERTopic
+from huggingface_hub import InferenceClient
+from bertopic.vectorizers import ClassTfidfTransformer
+from sentence_transformers import SentenceTransformer
+from sklearn import preprocessing
+from sklearn.preprocessing import LabelEncoder
+from tempfile import NamedTemporaryFile
+import matplotlib.pyplot as plt
+import plotly.express as px
+from wordcloud import WordCloud
+def _read_products_frame(file):
+    """Load the uploaded CSV/Excel file and locate its 'products' column.
+    Returns ``(df, column)`` where ``column`` is the label exactly as it
+    appears in the file. Raises ValueError when nothing was uploaded, the
+    extension is unsupported, or no 'products' column exists."""
+    if file is None:
+        raise ValueError("Please upload a CSV or Excel file first.")
+    # Accept both an object exposing `.name` and a plain path string.
+    path = getattr(file, "name", file)
+    lowered = str(path).lower()
+    if lowered.endswith('.csv'):
+        df = pd.read_csv(path)
+    elif lowered.endswith(('.xls', '.xlsx')):
+        df = pd.read_excel(path)
+    else:
+        raise ValueError("Unsupported file format. Please provide a CSV or Excel file.")
+    # The header is matched case-insensitively, so look the data up under the
+    # real label; indexing with the literal 'products' raised KeyError for a
+    # header such as 'Products' even though the check had passed.
+    column = next((label for label in df.columns if str(label).lower() == 'products'), None)
+    if column is None:
+        raise ValueError("The input file must have a column named 'products'.")
+    return df, column
+def _topic_model_kwargs(mode, min_cluster_size, top_n_words, ngram):
+    """Build the BERTopic keyword arguments shared by both clustering modes."""
+    kwargs = {"n_gram_range": (1, int(ngram)), "top_n_words": int(top_n_words)}
+    # Only the manual mode overrides BERTopic's own minimum topic size.
+    if mode != "Automated clustering":
+        kwargs["min_topic_size"] = int(min_cluster_size)
+    return kwargs
+def _safe_figure(build, **kwargs):
+    """Call a BERTopic ``visualize_*`` method and return None if it raises.
+    None is returned instead of a placeholder string because the value is
+    sent to a plot output, and a string is not a figure object."""
+    try:
+        return build(**kwargs)
+    except Exception as exc:
+        print(f"Skipping {getattr(build, '__name__', 'figure')}: {exc}")
+        return None
+def _remember(attr, value):
+    """Store ``value`` on a module-level ``state`` object when one exists.
+    This module never defines ``state`` itself, so the bare ``state.x = ...``
+    assignments used before raised NameError on every run."""
+    holder = globals().get("state")
+    if holder is not None:
+        setattr(holder, attr, value)
+def _cluster(df, column, topic_model):
+    """Fit ``topic_model`` on ``df[column]`` and label each row with its topic.
+    Returns ``(df, topics_info, barchart, topics_plot, heatmap, hierarchy)``;
+    any figure that could not be built is None."""
+    sentences_list = df[column].tolist()
+    print(len(sentences_list))
+    topics, _probabilities = topic_model.fit_transform(sentences_list)
+    topics_info = topic_model.get_topic_info()
+    barchart = _safe_figure(topic_model.visualize_barchart, top_n_topics=10)
+    topics_plot = _safe_figure(topic_model.visualize_topics)
+    heatmap = _safe_figure(topic_model.visualize_heatmap)
+    hierarchy = _safe_figure(topic_model.visualize_hierarchy)
+    df['topic_number'] = topics
+    # Encode the topic numbers to make them categorical
+    df['topic_number_encoded'] = LabelEncoder().fit_transform(df['topic_number'])
+    return df, topics_info, barchart, topics_plot, heatmap, hierarchy
+def process_file_bm25(file,mode,min_cluster_size,top_n_words,ngram):
+    """Cluster the 'products' column using BM25-weighted c-TF-IDF topics.
+    Returns ``(df, xlsx_path, topics_info, barchart, topics_plot, heatmap,
+    hierarchy)``. ``df`` is the input frame with 'topic_number' and
+    'topic_number_encoded' added, and ``xlsx_path`` is a temporary Excel copy
+    of it. Raises ValueError for a missing/unsupported file or missing column."""
+    df, column = _read_products_frame(file)
+    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)
+    topic_model = BERTopic(ctfidf_model=ctfidf_model, **_topic_model_kwargs(mode, min_cluster_size, top_n_words, ngram))
+    df, topics_info, barchart, topics_plot, heatmap, hierarchy = _cluster(df, column, topic_model)
+    # Reserve a path and close the handle before pandas writes to it, so no
+    # open descriptor is left behind.
+    with NamedTemporaryFile(delete=False, suffix=".xlsx") as handle:
+        xlsx_path = handle.name
+    df.to_excel(xlsx_path, index=False)
+    _remember("df_topics_bm25", topics_info)
+    _remember("df_bm25", df)
+    return df, xlsx_path, topics_info, barchart, topics_plot, heatmap, hierarchy
+def process_file_bert(file,mode,min_cluster_size,top_n_words=10,ngram=2):
+    """Cluster the 'products' column using a KeyBERT-inspired topic representation.
+    ``top_n_words`` and ``ngram`` default to 10 and 2 so the function can also
+    be called with only the first three arguments.
+    Returns ``(df, topics_info, barchart, topics_plot, heatmap, hierarchy)``.
+    Raises ValueError for a missing/unsupported file or missing column."""
+    # Imported here because the module-level imports do not bring this name in.
+    from bertopic.representation import KeyBERTInspired
+    df, column = _read_products_frame(file)
+    representation_model = KeyBERTInspired()
+    topic_model = BERTopic(representation_model=representation_model, **_topic_model_kwargs(mode, min_cluster_size, top_n_words, ngram))
+    df, topics_info, barchart, topics_plot, heatmap, hierarchy = _cluster(df, column, topic_model)
+    # No Excel copy is written here: this function never returned its path, so
+    # the file the previous revision created could not be reached by anyone.
+    _remember("df_topics_bert", topics_info)
+    _remember("df_bert", df)
+    return df, topics_info, barchart, topics_plot, heatmap, hierarchy
+# Module-level Hugging Face inference client; `generate` below streams its
+# completions through this object.
+client = InferenceClient(
+    "mistralai/Mixtral-8x7B-Instruct-v0.1"
+)
+def format_prompt(message, history):
+    """Render the chat history plus the new message as one instruction prompt.
+    ``history`` is an iterable of ``(user_prompt, bot_response)`` pairs. Each
+    pair becomes ``[INST] user [/INST] reply</s> ``, the whole prompt starts
+    with ``<s>``, and it ends with the new ``message`` wrapped in
+    ``[INST] ... [/INST]``."""
+    past_turns = [
+        f"[INST] {asked} [/INST]" f" {answered}</s> "
+        for asked, answered in history
+    ]
+    return "".join(["<s>", *past_turns, f"[INST] {message} [/INST]"])
+def generate(
+    prompt, history, system_prompt, temperature=0.9, max_new_tokens=4096, top_p=0.95, repetition_penalty=1.0,
+):
+    """Stream a reply to ``prompt`` from the shared inference ``client``.
+    ``history`` holds earlier ``(user, bot)`` turns and ``system_prompt`` is
+    optional text placed in front of the user message. This is a generator:
+    after every received token it yields the full text accumulated so far."""
+    # Keep the sampling temperature at or above 0.01.
+    temperature = max(float(temperature), 1e-2)
+    # UI sliders may deliver these as floats or ints; normalise them once.
+    generate_kwargs = dict(
+        temperature=temperature,
+        max_new_tokens=int(max_new_tokens),
+        top_p=float(top_p),
+        repetition_penalty=float(repetition_penalty),
+        do_sample=True,
+        seed=42,
+    )
+    # An empty or missing system prompt used to be interpolated anyway, so the
+    # model received a message starting with ", " or "None, ".
+    user_turn = f"{system_prompt}, {prompt}" if system_prompt else prompt
+    formatted_prompt = format_prompt(user_turn, history)
+    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+    output = ""
+    for response in stream:
+        output += response.token.text
+        yield output
+# Define the function to generate the plot based on user inputs
+def generate_plot(topic, x_axis_index, y_axis_index, chart_type, agg_func):
+    """Build the chart requested on the topic-analysis tab.
+    ``x_axis_index`` / ``y_axis_index`` are positions within ``df.columns[1:]``.
+    ``chart_type`` is one of "scatter", "bar", "line", "box", "wordcloud" or
+    "pie"; ``agg_func`` only affects bar charts, where "count_distinct"
+    switches to grouped bars. Returns a Plotly figure, or a Matplotlib figure
+    for "wordcloud". Raises ValueError for any other ``chart_type``.
+    NOTE(review): ``df`` is read as a module global, but nothing visible in
+    this module assigns it, and no visible code creates a 'Topic Number'
+    column - confirm where this frame is meant to come from."""
+    x_axis = df.columns[1:][x_axis_index]
+    y_axis = df.columns[1:][y_axis_index]
+    print(x_axis,y_axis)
+    filtered_df = df[df['Topic Number'] == topic]
+    if chart_type == "wordcloud":
+        text = ' '.join(filtered_df[y_axis].astype(str))
+        wordcloud = WordCloud(width=800, height=400, random_state=21, max_font_size=110).generate(text)
+        # Return the figure so the caller can display it; the previous code
+        # called plt.show() and returned None, leaving the output empty.
+        figure, axes = plt.subplots(figsize=(10, 7))
+        axes.imshow(wordcloud, interpolation="bilinear")
+        axes.axis('off')
+        return figure
+    if chart_type == "bar":
+        print('Bar chart selected')
+        extra = {"barmode": 'group'} if agg_func == "count_distinct" else {}
+        return px.bar(filtered_df, x=x_axis, y=y_axis, color=y_axis, **extra)
+    if chart_type == "pie":
+        print('Pie chart selected')
+        return px.pie(filtered_df, names=x_axis, values=y_axis)
+    # The remaining chart types share the same (frame, x, y) call shape.
+    builders = {"scatter": px.scatter, "line": px.line, "box": px.box}
+    if chart_type not in builders:
+        # Previously this fell through to `return fig` with `fig` unbound.
+        raise ValueError(f"Unsupported chart type: {chart_type!r}")
+    return builders[chart_type](filtered_df, x=x_axis, y=y_axis)

requirements.txt.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+pandas
+bertopic
+huggingface_hub
+sentence-transformers
+scikit-learn
+matplotlib
+plotly