rameshmoorthy committed on
Commit 7efeab0
1 Parent(s): ba6d530

Upload 3 files

Files changed (3)
  1. app.py +197 -0
  2. functions.py +206 -0
  3. requirements.txt.txt +7 -0
app.py ADDED
@@ -0,0 +1,197 @@
+ import gradio as gr
+ import pandas as pd
+ from functions import process_file_bm25, process_file_bert, generate_plot, generate, state
+
+
+ #------------------------------------------------------
+
+ # The shared state object is defined in functions.py; seed it with placeholder
+ # frames so the interface (and the dropdowns below) can be built before any
+ # clustering has run.
+ state.df_bm25 = pd.DataFrame({"column1": [1, 2, 3], "column2": ["A", "B", "C"]})
+ state.df_bert = pd.DataFrame({"column1": [1, 2, 3], "column2": ["A", "B", "C"]})
+ state.df_topics_bm25 = pd.DataFrame({"Topic": [-1, 0, 1], "Count": [0, 0, 0], "Name": ["-1", "0", "1"]})
+ state.df_topics_bert = pd.DataFrame({"Topic": [-1, 0, 1], "Count": [0, 0, 0], "Name": ["-1", "0", "1"]})
+
+
+ with gr.Blocks() as bm25:
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown(
+                 """
+                 # Select a CSV/Excel file with a 'products' column
+                 """)
+             inputfile = gr.File(file_types=['.csv', '.xlsx'], label="Upload CSV/Excel file")
+
+     def confirmation():
+         return 'File uploaded! Press the Cluster button'
+
+     def download_doc(doc):
+         return doc
+
+     def download_df():
+         # Return the most recent BM25 clustering result from the shared state.
+         df1 = state.df_bm25
+         print(df1)
+         return df1
+
+     out = gr.Textbox()
+     mode = gr.Radio(["Automated clustering", "Manually choose parameters"],
+                     label="Type of algorithm", value="Automated clustering",
+                     info="Choose the clustering mode you want")
+     inputfile.upload(confirmation, inputs=[], outputs=out)
+     with gr.Row():
+         with gr.Column():
+             min_cluster_size = gr.Slider(1, 100, value=5, step=1, label="min_cluster_size",
+                                          info="Minimum number of documents per cluster; lower values create more clusters")
+         with gr.Column():
+             top_n_words = gr.Slider(1, 25, value=10, step=1, label="top_n_words",
+                                     info="Number of keywords to show per cluster")
+         with gr.Column():
+             ngram = gr.Slider(1, 3, value=2, step=1, label="ngram",
+                               info="Maximum n-gram length used for clustering")
+
+     cluster_btn = gr.Button(value="Cluster")
+     # Outputs, in order: df, download file, topics_info, barchart, topics_plot, heatmap, hierarchy
+     cluster_btn.click(process_file_bm25,
+                       inputs=[inputfile, mode, min_cluster_size, top_n_words, ngram],
+                       outputs=[
+                           gr.Dataframe(),
+                           gr.File(label="Download Excel"),
+                           gr.Dataframe(),
+                           gr.Plot(label="Barchart"),
+                           gr.Plot(label="Topics Plot"),
+                           gr.Plot(label="Heatmap"),
+                           gr.Plot(label="Hierarchy")
+                       ])
+
+     llm_btn = gr.Button(value="AI generation")
+     llm_btn.click(download_df, inputs=[], outputs=gr.Dataframe(label="Output"))
+
+
+ with gr.Blocks() as bert:
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown(
+                 """
+                 # Select a CSV/Excel file with a 'products' column
+                 """)
+             inputfile = gr.File(file_types=['.csv', '.xlsx'], label="Upload CSV/Excel file")
+
+     def confirmation():
+         return 'File uploaded! Press the Cluster button'
+
+     out = gr.Textbox()
+     mode = gr.Radio(["Automated clustering", "Manually choose parameters"],
+                     label="Type of algorithm", value="Automated clustering",
+                     info="Choose the clustering mode you want")
+     inputfile.upload(confirmation, inputs=[], outputs=out)
+     with gr.Row():
+         with gr.Column():
+             min_cluster_size = gr.Slider(1, 100, value=5, step=1, label="min_cluster_size",
+                                          info="Minimum number of documents per cluster; lower values create more clusters")
+         with gr.Column():
+             top_n_words = gr.Slider(1, 25, value=10, step=1, label="top_n_words",
+                                     info="Number of keywords to show per cluster")
+         with gr.Column():
+             ngram = gr.Slider(1, 3, value=2, step=1, label="ngram",
+                               info="Maximum n-gram length used for clustering")
+
+     cluster_btn = gr.Button(value="Cluster")
+
+     # Outputs, in order: df, topics_info, barchart, topics_plot, heatmap, hierarchy
+     cluster_btn.click(process_file_bert,
+                       inputs=[inputfile, mode, min_cluster_size, top_n_words, ngram],
+                       outputs=[
+                           gr.Dataframe(),
+                           gr.Dataframe(),
+                           gr.Plot(label="Barchart"),
+                           gr.Plot(label="Topics Plot"),
+                           gr.Plot(label="Heatmap"),
+                           gr.Plot(label="Hierarchy")
+                       ])
+
+ #___________________________________________
+ additional_inputs = [
+     gr.Textbox(
+         label="System Prompt",
+         max_lines=1,
+         interactive=True,
+     ),
+     gr.Slider(
+         label="Temperature",
+         value=0.9,
+         minimum=0.0,
+         maximum=1.0,
+         step=0.05,
+         interactive=True,
+         info="Higher values produce more diverse outputs",
+     ),
+     gr.Slider(
+         label="Max new tokens",
+         value=256,
+         minimum=0,
+         maximum=1048,
+         step=64,
+         interactive=True,
+         info="The maximum number of new tokens",
+     ),
+     gr.Slider(
+         label="Top-p (nucleus sampling)",
+         value=0.90,
+         minimum=0.0,
+         maximum=1.0,
+         step=0.05,
+         interactive=True,
+         info="Higher values sample more low-probability tokens",
+     ),
+     gr.Slider(
+         label="Repetition penalty",
+         value=1.2,
+         minimum=1.0,
+         maximum=2.0,
+         step=0.05,
+         interactive=True,
+         info="Penalize repeated tokens",
+     )
+ ]
+
+ examples = [
+     ["I'm planning a vacation to Japan. Can you suggest a one-week itinerary including must-visit places and local cuisines to try?", None, None, None, None, None],
+     ["Can you write a short story about a time-traveling detective who solves historical mysteries?", None, None, None, None, None],
+     ["I'm trying to learn French. Can you provide some common phrases that would be useful for a beginner, along with their pronunciations?", None, None, None, None, None],
+     ["I have chicken, rice, and bell peppers in my kitchen. Can you suggest an easy recipe I can make with these ingredients?", None, None, None, None, None],
+     ["Can you explain how the QuickSort algorithm works and provide a Python implementation?", None, None, None, None, None],
+     ["What are some unique features of Rust that make it stand out compared to other systems programming languages like C++?", None, None, None, None, None],
+ ]
+
+ chat_interface = gr.ChatInterface(
+     fn=generate,
+     chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
+     additional_inputs=additional_inputs,
+     title="Mixtral 46.7B",
+     examples=examples,
+     concurrency_limit=20,
+ )
+
+ #______________________________________________________
+ # Topic-analysis interface: dropdown choices are built from the topics frame in the shared state.
+ df = state.df_topics_bm25
+ excel_analysis_bm25 = gr.Interface(
+     fn=generate_plot,
+     inputs=[
+         gr.Dropdown(df['Topic'].unique().tolist(), label="Select Topic Number", type="index"),
+         gr.Dropdown(list(df.columns[~df.columns.isin(['Topic'])]), label="Select X Axis", type="index"),
+         gr.Dropdown(list(df.columns[~df.columns.isin(['Topic'])]), label="Select Y Axis", type="index"),
+         gr.Radio(["scatter", "bar", "line", "box", "wordcloud", "pie"], label="Select Chart Type"),
+         gr.Dropdown(["count", "count_distinct", "sum", "average"], label="Select Aggregation Function")
+     ],
+     outputs=gr.Plot(label="Visualization")
+ )
+
+ demo = gr.TabbedInterface(
+     [bm25, chat_interface, excel_analysis_bm25, bert],
+     ["TFIDF-BM25 Clustering", "TFIDF-BM25-Topics AI", "TFIDF-BM25-Topic analysis", "keyBERT"])
+
+
+ demo.launch(share=True, debug=True)
functions.py ADDED
@@ -0,0 +1,206 @@
+ import pandas as pd
+ from bertopic import BERTopic
+ from bertopic.representation import KeyBERTInspired
+ from huggingface_hub import InferenceClient
+ from bertopic.vectorizers import ClassTfidfTransformer
+ from sentence_transformers import SentenceTransformer
+ from sklearn.preprocessing import LabelEncoder
+ from tempfile import NamedTemporaryFile
+ from types import SimpleNamespace
+ import matplotlib.pyplot as plt
+ import plotly.express as px
+
+ from wordcloud import WordCloud
+
+ # Shared store for clustering results; app.py imports this object, seeds it
+ # with placeholder frames, and reads the results written here.
+ state = SimpleNamespace()
+
+
+ def process_file_bm25(file, mode, min_cluster_size, top_n_words, ngram):
+     # Read the Excel sheet or CSV file
+     if file.name.endswith('.csv'):
+         df = pd.read_csv(file)
+     elif file.name.endswith('.xls') or file.name.endswith('.xlsx'):
+         df = pd.read_excel(file)
+     else:
+         raise ValueError("Unsupported file format. Please provide a CSV or Excel file.")
+
+     # Normalise column names and ensure the 'products' column is present
+     df.columns = df.columns.str.lower()
+     if 'products' not in df.columns:
+         raise ValueError("The input file must have a column named 'products'.")
+
+     # Convert the 'products' column to a list
+     sentences_list = df['products'].tolist()
+     print(len(sentences_list))
+     ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)
+
+     if mode == "Automated clustering":
+         topic_model = BERTopic(ctfidf_model=ctfidf_model, n_gram_range=(1, int(ngram)), top_n_words=int(top_n_words))
+     else:
+         topic_model = BERTopic(ctfidf_model=ctfidf_model, n_gram_range=(1, int(ngram)), top_n_words=int(top_n_words),
+                                min_topic_size=int(min_cluster_size))
+
+     # Perform topic modeling
+     topics, probabilities = topic_model.fit_transform(sentences_list)
+
+     # Visualize all graphs
+     topics_info = topic_model.get_topic_info()
+     state.df_topics_bm25 = topics_info
+     try:
+         barchart = topic_model.visualize_barchart(top_n_topics=10)
+     except Exception:
+         barchart = None
+     try:
+         topics_plot = topic_model.visualize_topics()
+     except Exception:
+         topics_plot = None
+     heatmap = topic_model.visualize_heatmap()
+     hierarchy = topic_model.visualize_hierarchy()
+     df['topic_number'] = topics
+
+     # Encode the topic numbers to make them categorical
+     label_encoder = LabelEncoder()
+     df['topic_number_encoded'] = label_encoder.fit_transform(df['topic_number'])
+     temp_file = NamedTemporaryFile(delete=False, suffix=".xlsx")
+     df.to_excel(temp_file.name, index=False)
+     state.df_bm25 = df
+
+     return df, temp_file.name, topics_info, barchart, topics_plot, heatmap, hierarchy
+
+
+ def process_file_bert(file, mode, min_cluster_size, top_n_words, ngram):
+     # Read the Excel sheet or CSV file
+     if file.name.endswith('.csv'):
+         df = pd.read_csv(file)
+     elif file.name.endswith('.xls') or file.name.endswith('.xlsx'):
+         df = pd.read_excel(file)
+     else:
+         raise ValueError("Unsupported file format. Please provide a CSV or Excel file.")
+
+     # Normalise column names and ensure the 'products' column is present
+     df.columns = df.columns.str.lower()
+     if 'products' not in df.columns:
+         raise ValueError("The input file must have a column named 'products'.")
+
+     # Convert the 'products' column to a list
+     sentences_list = df['products'].tolist()
+     print(len(sentences_list))
+     representation_model = KeyBERTInspired()
+     if mode == "Automated clustering":
+         # Fine-tune the topic representations with the KeyBERT-inspired model
+         topic_model = BERTopic(representation_model=representation_model, n_gram_range=(1, int(ngram)),
+                                top_n_words=int(top_n_words))
+     else:
+         topic_model = BERTopic(representation_model=representation_model, n_gram_range=(1, int(ngram)),
+                                top_n_words=int(top_n_words), min_topic_size=int(min_cluster_size))
+
+     topics, probabilities = topic_model.fit_transform(sentences_list)
+
+     # Visualize all graphs
+     topics_info = topic_model.get_topic_info()
+     state.df_topics_bert = topics_info
+     try:
+         barchart = topic_model.visualize_barchart(top_n_topics=10)
+     except Exception:
+         barchart = None
+     try:
+         topics_plot = topic_model.visualize_topics()
+     except Exception:
+         topics_plot = None
+     heatmap = topic_model.visualize_heatmap()
+     hierarchy = topic_model.visualize_hierarchy()
+     df['topic_number'] = topics
+
+     # Encode the topic numbers to make them categorical
+     label_encoder = LabelEncoder()
+     df['topic_number_encoded'] = label_encoder.fit_transform(df['topic_number'])
+     temp_file = NamedTemporaryFile(delete=False, suffix=".xlsx")
+     df.to_excel(temp_file.name, index=False)
+
+     state.df_bert = df
+     return df, topics_info, barchart, topics_plot, heatmap, hierarchy
+
+
+ client = InferenceClient(
+     "mistralai/Mixtral-8x7B-Instruct-v0.1"
+ )
+
+
+ def format_prompt(message, history):
+     prompt = "<s>"
+     for user_prompt, bot_response in history:
+         prompt += f"[INST] {user_prompt} [/INST]"
+         prompt += f" {bot_response}</s> "
+     prompt += f"[INST] {message} [/INST]"
+     return prompt
+
+
+ def generate(
+     prompt, history, system_prompt, temperature=0.9, max_new_tokens=4096, top_p=0.95, repetition_penalty=1.0,
+ ):
+     temperature = float(temperature)
+     if temperature < 1e-2:
+         temperature = 1e-2
+     top_p = float(top_p)
+
+     generate_kwargs = dict(
+         temperature=temperature,
+         max_new_tokens=max_new_tokens,
+         top_p=top_p,
+         repetition_penalty=repetition_penalty,
+         do_sample=True,
+         seed=42,
+     )
+
+     formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
+     stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+     output = ""
+
+     # Stream tokens back to the chat interface as they arrive
+     for response in stream:
+         output += response.token.text
+         yield output
+     return output
+
+
+ # Define the function to generate the plot based on user inputs
+ def generate_plot(topic_index, x_axis_index, y_axis_index, chart_type, agg_func):
+     # Work on the latest BM25 topic summary stored in the shared state.
+     df = state.df_topics_bm25
+     # The dropdowns use type="index", so map the indices back to topic/column values.
+     topic = df['Topic'].unique().tolist()[topic_index]
+     x_axis = df.columns[1:][x_axis_index]
+     y_axis = df.columns[1:][y_axis_index]
+     print(x_axis, y_axis)
+     filtered_df = df[df['Topic'] == topic]
+
+     if chart_type == "scatter":
+         fig = px.scatter(filtered_df, x=x_axis, y=y_axis)
+     elif chart_type == "bar":
+         print('Bar chart selected')
+         if agg_func == "count_distinct":
+             fig = px.bar(filtered_df, x=x_axis, y=y_axis, color=y_axis, barmode='group')
+         else:
+             fig = px.bar(filtered_df, x=x_axis, y=y_axis, color=y_axis)
+     elif chart_type == "line":
+         fig = px.line(filtered_df, x=x_axis, y=y_axis)
+     elif chart_type == "box":
+         fig = px.box(filtered_df, x=x_axis, y=y_axis)
+     elif chart_type == "wordcloud":
+         text = ' '.join(filtered_df[y_axis].astype(str))
+         wordcloud = WordCloud(width=800, height=400, random_state=21, max_font_size=110).generate(text)
+         fig = plt.figure(figsize=(10, 7))
+         plt.imshow(wordcloud, interpolation="bilinear")
+         plt.axis('off')
+     elif chart_type == "pie":
+         fig = px.pie(filtered_df, names=x_axis, values=y_axis)
+         print('Pie chart selected')
+     else:
+         fig = None
+
+     return fig
requirements.txt.txt ADDED
@@ -0,0 +1,7 @@
+ pandas
+ bertopic
+ huggingface_hub
+ sentence-transformers
+ scikit-learn
+ matplotlib
+ plotly
+ wordcloud
+ openpyxl  # required by pandas for reading/writing .xlsx files