isimorfizam committed on
Commit
b79abc2
1 Parent(s): d0d5329

Trying to fix LFS issue

Files changed (1)
  1. app.py +308 -3
app.py CHANGED
@@ -1,3 +1,308 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:ec776a7d993d37a267ba115d1f7b6ed3f47543372183e0aba99d3ec5cdbf443d
- size 12135
+ import streamlit as st
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers.utils import is_flash_attn_2_available
+ from transformers import BitsAndBytesConfig
+ import pandas as pd
+ import os
+ import torch
+ import numpy as np
+ from scipy import sparse
+ from sklearn.metrics.pairwise import cosine_similarity
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.embeddings.sentence_transformer import (
+     SentenceTransformerEmbeddings,
+ )
+
+ # SET TO WIDE LAYOUT
+ st.set_page_config(layout="wide")
+
+ #_______________________________________________SET VARIABLES_____________________________________________________
+
+ MODEL_ID = 'google/gemma-2b-it'
+ CHUNK_SIZE = 1000
+ OVERLAP_SIZE = 100
+ EMBEDDING = "all-MiniLM-L6-v2"
+ COLLECTION_NAME = f'vb_summarizer_{EMBEDDING}_test'
+ CHROMA_DATA_PATH = 'feedback_360'
+
+ #_______________________________________________LOAD MODELS_____________________________________________________
+ # LOAD MODEL
+ @st.cache_resource
+ def load_model(model_id):
+
+     HF_TOKEN = os.environ['HF_TOKEN']
+     print(torch.backends.mps.is_available())
+     # device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"
+     device = 'cpu'
+     print(device)
+
+     if device == 'cpu':
+         print('Warning! No GPU available')
+
+     # IMPORT MODEL
+     print(model_id)
+
+     quantization_config = BitsAndBytesConfig(load_in_4bit=True,
+                                              bnb_4bit_compute_dtype=torch.float16)
+
+     # if is_flash_attn_2_available() and (torch.cuda.get_device_capability(0)[0] >= 8):
+     #     attn_implementation = "flash_attention_2"
+     # else:
+     #     attn_implementation = "sdpa"
+     # print(f"[INFO] Using attention implementation: {attn_implementation}")
+
+     tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id, token=HF_TOKEN)
+
+     llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
+                                                      token=HF_TOKEN,
+                                                      torch_dtype=torch.float16,
+                                                      # quantization_config=quantization_config if quantization_config else None,
+                                                      low_cpu_mem_usage=False)  # use full memory
+                                                      # attn_implementation=attn_implementation)  # which attention version to use
+     llm_model.to(device)
+     return llm_model, tokenizer, device
+
+ # LOAD VECTORSTORE
+ @st.cache_resource
+ def load_data(embedding):
+     # CREATE EMBEDDING
+     embedding_function = SentenceTransformerEmbeddings(model_name=embedding)
+     db3 = Chroma(collection_name=COLLECTION_NAME, persist_directory="./chroma", embedding_function=embedding_function)
+     return db3
+
+
+ # Create a text element and let the reader know the model is loading.
+ model_load_state = st.text('Loading model...')
+ llm_model, tokenizer, device = load_model(MODEL_ID)
+ # Notify the reader that the model was successfully loaded.
+ model_load_state.text('Loading model...done!')
+
+ # Create a text element and let the reader know the data is loading.
+ data_load_state = st.text('Loading data...')
+ vectorstore = load_data(EMBEDDING)
+ # Notify the reader that the data was successfully loaded.
+ data_load_state.text('Loading data...done!')
+
+
+ #_______________________________________________SUMMARIZATION_____________________________________________________
+ # INFERENCE
+ # def prompt_formatter(reviews, type_of_doc):
+ #     return f"""You are a summarization bot.
+ #     You will receive {type_of_doc} and you will extract all relevant information from {type_of_doc} and return one paragraph in which you will summarize what was said.
+ #     {type_of_doc} are listed below under inputs.
+ #     Inputs: {reviews}
+ #     Answer :
+ #     """
+ # def prompt_formatter(reviews, type_of_doc):
+ #     return f"""You are a summarization bot.
+ #     You will receive {type_of_doc} and you will summarize what was said in the input.
+ #     {type_of_doc} are listed below under inputs.
+ #     Inputs: {reviews}
+ #     Answer :
+ #     """
+ def prompt_formatter(reviews):
+     return f"""You are a summarization bot.
+     You will receive reviews of Clockify from different users.
+     You will summarize what these reviews said while keeping the information about each of the users.
+     Reviews are listed below.
+     Reviews: {reviews}
+     Answer :
+     """
+
+ def mirror_mirror(inputs, prompt_formatter, tokenizer):
+     print('Mirror_mirror')
+     prompt = prompt_formatter(inputs)
+     input_ids = tokenizer(prompt, return_tensors="pt").to(device)
+     outputs = llm_model.generate(**input_ids,
+                                  temperature=0.3,
+                                  do_sample=True,
+                                  max_new_tokens=275)
+     output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     # strip the prompt from the decoded output so only the generated summary remains
+     return prompt, output_text.replace(prompt, '')
+
+
+ def summarization(example: str, results_df: pd.DataFrame = pd.DataFrame()) -> pd.DataFrame:
+
+     # INFERENCE
+     results = []
+     for cnt in range(0, 2):
+
+         prompt, result = mirror_mirror(example, prompt_formatter, tokenizer)
+         # score the candidate summary against the input via cosine similarity of the padded token-id rows
+         list_temp = [result, example]
+         tokenized = tokenizer(list_temp, return_tensors="pt", padding=True)
+         A = tokenized.input_ids.numpy()
+         A = sparse.csr_matrix(A)
+         score = cosine_similarity(A)[0, 1]
+         # print(cosine_similarity(A)[0, 1])
+         # print(cosine_similarity(A)[1, 0])
+         print(score)
+         if score > 0.1:
+             fin_result = result
+             max_score = score
+             break
+
+         results.append(result)
+         # print(result + '\n\n')
+
+     # tokenize results and example together
+     try:
+         fin_result
+     except NameError:
+         # if fin_result is not already defined, use the best of the available results;
+         # add example to results so tokenization is done together (due to padding limitations)
+         results.append(example)
+         tokenized = tokenizer(results, return_tensors="pt", padding=True)
+         A = tokenized.input_ids.numpy()
+         A = sparse.csr_matrix(A)
+         # calculate the cosine similarity of each pair and keep only the example column,
+         # dropping the example's (trivial) similarity to itself
+         scores = cosine_similarity(A)[:-1, -1]
+         # the final result is the one with the greatest cos_score
+         fin_result = results[np.argmax(scores)]
+         max_score = max(scores)
+
+     # print(fin_result)
+     # save the final result and its attributes
+     row = pd.DataFrame({'model': MODEL_ID, 'prompt': prompt, 'reviews': example, 'summarization': fin_result, 'score': [max_score]})
+     results_df = pd.concat([results_df, row], ignore_index=True)
+
+     return results_df
+
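+ # Usage sketch (illustrative only, not part of the app flow): calling
+ #   df = summarization("Great time tracker.\nEasy to use.")
+ # returns a one-row DataFrame with columns
+ #   ['model', 'prompt', 'reviews', 'summarization', 'score'].
+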
+ def create_filter(group: str = None, platform: str = None, ReviewerPosition: str = None, Industry: str = None, CompanySize: str = None,
+                   UsagePeriod: str = None, LinkedinVerified: str = None, Date: str = None, Rating: str = None):
+     keys = ['group', 'Platform', 'ReviewerPosition', 'Industry', 'CompanySize',
+             'UsagePeriod', 'LinkedinVerified', 'Date', 'Rating']
+     input_keys = [group, platform, ReviewerPosition, Industry, CompanySize, UsagePeriod, LinkedinVerified, Date, Rating]
+
+     # create filter dict, skipping unset drop-downs (None or the blank ' ' option)
+     filter_dict = {}
+     for key, in_key in zip(keys, input_keys):
+         if in_key is not None and in_key != ' ':
+             filter_dict[key] = {'$eq': in_key}
+
+     print(filter_dict)
+     return filter_dict
+
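+ # Illustrative example (hypothetical selections): unset arguments and the blank
+ # ' ' option are skipped, so
+ #   create_filter(platform='Capterra', LinkedinVerified='True')
+ # returns {'Platform': {'$eq': 'Capterra'}, 'LinkedinVerified': {'$eq': 'True'}}
+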
+ #_______________________________________________UI_____________________________________________________
+
+ st.title("Mirror, mirror, on the cloud, what do Clockify users say aloud?")
+ st.subheader("--Clockify review summarizer--")
+
+ col1, col2, col3 = st.columns(3, gap='small')
+
+ with col1:
+     platform = st.selectbox(label='Platform',
+                             options=[' ', 'Capterra', 'Chrome Extension', 'GetApp', 'AppStore', 'GooglePlay',
+                                      'Firefox Extension', 'JIRA Plugin', 'Trustpilot', 'G2',
+                                      'TrustRadius']
+                             )
+
+ with col2:
+     company_size = st.selectbox(label='Company Size',
+                                 options=[' ', '1-10 employees', 'Self-employed', 'self-employed',
+                                          'Small-Business(50 or fewer emp.)', '51-200 employees',
+                                          'Mid-Market(51-1000 emp.)', '11-50 employees',
+                                          '501-1,000 employees', '10,001+ employees', '201-500 employees',
+                                          '1,001-5,000 employees', '5,001-10,000 employees',
+                                          'Enterprise(> 1000 emp.)', 'Unknown', '1001-5000 employees']
+                                 )
+
+ with col3:
+     linkedin_verified = st.selectbox(label='Linkedin Verified',
+                                      options=[' ', 'True', 'False'],
+                                      placeholder='Choose an option'
+                                      )
+
+ num_to_return = int(st.number_input(label='Number of documents to return', min_value=2, max_value=50, step=1))
+
+ # group = st.selectbox(label='Review Platform Group',
+ #                      options=['Software Review Platforms', 'Browser Extension Stores', 'Mobile App Stores', 'Plugin Marketplace']
+ #                      )
+
+
+ default_value = "Clockify"
+
+ query = st.text_area("Query", default_value, height=50)
+ # type_of_doc = st.text_area("Type of text", 'text', height=25)
+
+ # result = ''
+ # score = ''
+ # reviews = ''
+
+ if 'result' not in st.session_state:
+     st.session_state['result'] = ''
+
+ if 'score' not in st.session_state:
+     st.session_state['score'] = ''
+
+ if 'reviews' not in st.session_state:
+     st.session_state['reviews'] = ''
+
+ col11, col21 = st.columns(2, gap='small')
+
+ with col11:
+     button_query = st.button('Conquer and query!')
+ with col21:
+     button_summarize = st.button('Summon the summarizer!')
+
+
+ if button_query:
+     print('Querying')
+     # create filter from drop-downs
+     filter_dict = create_filter(  # group=group,
+                                 platform=platform,
+                                 CompanySize=company_size,
+                                 LinkedinVerified=linkedin_verified
+                                 )
+     # FILTER BY META
+     if filter_dict == {}:
+         retriever = vectorstore.as_retriever(search_kwargs={"k": num_to_return})
+
+     elif len(filter_dict.keys()) == 1:
+         retriever = vectorstore.as_retriever(search_kwargs={"k": num_to_return,
+                                                             "filter": filter_dict})
+     else:
+         # Chroma expects multiple metadata conditions to be combined under '$and'
+         retriever = vectorstore.as_retriever(search_kwargs={"k": num_to_return,
+                                                             "filter": {'$and': [{key: value} for key, value in filter_dict.items()]}
+                                                             }
+                                              )
+
+     reviews = retriever.get_relevant_documents(query=query)
+     # only keep the page content
+     st.session_state['reviews'] = [review.page_content for review in reviews]
+     print(st.session_state['reviews'])
+     result = 'You may summarize now!'
+
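+ # For example, two hypothetical metadata selections would yield a combined filter like:
+ #   {'$and': [{'Platform': {'$eq': 'Capterra'}}, {'LinkedinVerified': {'$eq': 'True'}}]}
+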
+ if button_summarize:
+     print('Summarization in progress')
+     st.session_state['result'] = 'Summarization in progress'
+     results_df = summarization("\n".join(st.session_state['reviews']))
+     # only one input row, so take the first summary
+     st.session_state['result'] = results_df.summarization[0]
+     # store the score in session state so the text area below can display it
+     st.session_state['score'] = results_df.score[0]
+
+
+ col12, col22 = st.columns(2, gap='small')
+
+ with col12:
+     chosen_reviews = st.text_area("Reviews to be summarized", "\n".join(st.session_state['reviews']), height=275)
+ with col22:
+     summarized_text = st.text_area("Summarized text", st.session_state['result'], height=275)
+
+ score = st.text_area("Cosine similarity score", st.session_state['score'], height=25)
+
+
+ # max_length = st.sidebar.slider("Max Length", min_value=10, max_value=30)
+ # temperature = st.sidebar.slider("Temperature", value=1.0, min_value=0.0, max_value=1.0, step=0.05)
+ # top_k = st.sidebar.slider("Top-k", min_value=0, max_value=5, value=0)
+ # top_p = st.sidebar.slider("Top-p", min_value=0.0, max_value=1.0, step=0.05, value=0.9)
+ # num_return_sequences = st.sidebar.number_input('Number of Return Sequences', min_value=1, max_value=5, value=1, step=1)