isimorfizam committed on
Commit
412b3ca
1 Parent(s): 9a3f791

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +333 -0
app.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ #from transformers import AutoModelForCausalLM, AutoTokenizer
4
+ from langchain_community.llms import CTransformers
5
+ from transformers.utils import is_flash_attn_2_available
6
+ from transformers import BitsAndBytesConfig
7
+ import pandas as pd
8
+ import os
9
+ import torch
10
+ import numpy as np
11
+ from sklearn.metrics.pairwise import cosine_similarity
12
+ import streamlit as st
13
+ import llama_cpp
14
+ from llama_cpp import Llama
15
+ import llama_cpp.llama_tokenizer
16
+ from langchain.llms.base import LLM
17
+ from typing import Optional, List, Mapping, Any
18
+ from langchain_community.vectorstores import Chroma
19
+ from langchain_community.embeddings.sentence_transformer import (
20
+ SentenceTransformerEmbeddings,
21
+ )
22
# Use the full browser width for the page layout.
st.set_page_config(layout="wide")

#_______________________________________________SET VARIABLES_____________________________________________________
# Sentence-transformers model name; passed to load_data() to build the
# embedding function for the vector store.
EMBEDDING = "all-MiniLM-L6-v2"
# Name of the Chroma collection opened by load_data(); derived from EMBEDDING.
COLLECTION_NAME = f'vb_summarizer_{EMBEDDING}_test'
# NOTE(review): not referenced anywhere in the visible file - load_data()
# hard-codes persist_directory="./chroma". Confirm whether this is still needed.
CHROMA_DATA_PATH = 'feedback_360'
29
+
30
+ #_______________________________________________LOAD MODELS_____________________________________________________
31
+ # LOAD MODEL
32
# NOTE(review): st.cache_resource is documented as a decorator for functions.
# Applied to a class it presumably rebinds ``LlamaLLM`` to a cached factory
# rather than the class itself - confirm this is intended. Kept unchanged so
# the module-level behavior stays the same.
@st.cache_resource
class LlamaLLM(LLM):
    """LangChain ``LLM`` wrapper around a local llama-cpp-python model.

    Attributes are declared as fields so the LangChain base class can
    validate and store them.
    """

    # Path of the model file handed to ``Llama``.
    model_path: str
    # The loaded llama-cpp-python model used for completions.
    llm: Llama

    @property
    def _llm_type(self) -> str:
        """Return the identifier LangChain uses for this LLM type."""
        return "llama-cpp-python"

    def __init__(self, model_path: str, **kwargs: Any):
        """Load the model at *model_path* and initialise the base class.

        Args:
            model_path: Path of the model file to load with ``Llama``.
            **kwargs: Extra field values forwarded to the base initialiser.
        """
        llm = Llama(model_path=model_path)
        super().__init__(model_path=model_path, llm=llm, **kwargs)

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Run a completion for *prompt* and return the generated text.

        Args:
            prompt: Text to complete.
            stop: Optional stop sequences; ``None`` is passed on as ``[]``.
            run_manager: Accepted for compatibility with LangChain's
                ``_call`` signature; not used here.
            **kwargs: Accepted so call-time options do not raise
                ``TypeError``; intentionally not forwarded to the model.

        Returns:
            The ``text`` of the first choice in the model response.
        """
        response = self.llm(prompt, stop=stop or [])
        return response["choices"][0]["text"]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Return the parameters that identify this LLM instance."""
        return {"model_path": self.model_path}
53
+
54
@st.cache_resource
def load_model():
    """Fetch and load the Llama-2 7B chat GGUF model via llama-cpp-python.

    The function is wrapped in ``st.cache_resource``. ``embedding=True`` is
    passed because ``summarization`` also calls ``create_embedding`` on the
    returned model.

    Returns:
        The object returned by ``llama_cpp.Llama.from_pretrained``.
    """
    # Runtime options passed alongside the repo id and file name.
    load_options = {
        "embedding": True,
        "verbose": False,
        "n_ctx": 2048,
        "cache_dir": './model_cached',
    }
    return llama_cpp.Llama.from_pretrained(
        repo_id="TheBloke/Llama-2-7b-Chat-GGUF",
        filename="llama-2-7b-chat.Q4_K_M.gguf",
        **load_options,
    )
77
+
78
+ # LOAD VECTORSTORE
79
@st.cache_resource
def load_data(embedding):
    """Open the Chroma collection that holds the embedded reviews.

    Args:
        embedding: Model name passed to ``SentenceTransformerEmbeddings``.

    Returns:
        A ``Chroma`` store for ``COLLECTION_NAME`` with
        ``persist_directory="./chroma"``.
    """
    return Chroma(
        collection_name=COLLECTION_NAME,
        persist_directory="./chroma",
        embedding_function=SentenceTransformerEmbeddings(model_name=embedding),
    )
85
+
86
# Show a status line while the language model is being loaded.
model_load_state = st.text('Loading model...')
# Load the llama-cpp model; load_model() is wrapped in st.cache_resource.
llm_model = load_model()
# Update the status line once the model is available.
model_load_state.text('Loading model...done!')

# Show a status line while the vector store is being opened.
data_load_state = st.text('Loading data...')
# Open the Chroma vector store built with the EMBEDDING model.
vectorstore = load_data(EMBEDDING)
# Update the status line once the vector store is available.
data_load_state.text('Loading data...done!')
99
+
100
+
101
+ # INFERENCE
102
+ # def prompt_formatter(reviews, type_of_doc):
103
+ # return f"""You are a summarization bot.
104
+ # You will receive {type_of_doc} and you will extract all relevant information from {type_of_doc} and return one paragraph in which you will summarize what was said.
105
+ # {type_of_doc} are listed below under inputs.
106
+ # Inputs: {reviews}
107
+ # Answer :
108
+ # """
109
+ # def prompt_formatter(reviews):
110
+ # return f"""You are a summarization bot.
111
+ # You will an input and summarize in one paragraph the meaning of the input.
112
+ # Do not quote from the input and do not repeat what was said in the input.
113
+ # Do not make things up.
114
+ # Input: {reviews}
115
+ # Answer :
116
+ # """
117
+
118
+ # def prompt_formatter(reviews):
119
+ # return f"""You are a summarization bot.
120
+ # You will receive reviews of Clockify from different users.
121
+ # You will summarize what these reviews said while keeping the information about each of the user.
122
+ # You will return the answer in the form : Review [number of review] : [summarization of review].
123
+ # Reviews are listed below.
124
+ # Reviews: {reviews}
125
+ # Answer :
126
+ # """
127
+
128
def prompt_formatter(reviews):
    """Build the summarization prompt for a batch of Clockify reviews.

    Args:
        reviews: Review text, interpolated after ``Reviews: ``.

    Returns:
        The prompt string, one instruction per line, ending with an
        ``Answer :`` line followed by a newline.
    """
    prompt_lines = (
        "You are a summarization bot.",
        "You will receive reviews of Clockify from different users.",
        "You will create one paragraph with the summarization of what the reviews say about Clockify.",
        "Reviews are listed below.",
        "Do not make things up. Use only information from reviews.",
        f"Reviews: {reviews}",
        "Answer :",
        "",  # produces the trailing newline after "Answer :"
    )
    return "\n".join(prompt_lines)
137
+
138
def mirror_mirror(inputs, prompt_formatter):
    """Generate one summary of *inputs* with the module-level chat model.

    Args:
        inputs: Text handed to *prompt_formatter*.
        prompt_formatter: Callable that turns *inputs* into the prompt string.

    Returns:
        A ``(prompt, output_text)`` tuple: the prompt that was sent and the
        message content of the first choice in the response.
    """
    prompt = prompt_formatter(inputs)

    # Single-turn conversation: the whole prompt goes in as one user message.
    chat_messages = [{"role": "user", "content": prompt}]
    sampling_options = {
        "temperature": 0.4,
        "min_p": 0.01,
        "max_tokens": 256,
        "repeat_penalty": 2,
    }
    response = llm_model.create_chat_completion(
        messages=chat_messages,
        response_format={"type": "text"},
        **sampling_options,
    )

    first_choice = response['choices'][0]
    return prompt, first_choice['message']['content']
159
+
160
+
161
+
162
def _embed_text(text):
    """Return the model embedding of *text* as a ``(1, n)`` NumPy array."""
    embedding = llm_model.create_embedding(text)["data"][0]["embedding"]
    return np.array(embedding).reshape(1, -1)


def summarization(example: str, results_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """Summarize *example* three times and keep the best-scoring summary.

    Each candidate summary is scored by the cosine similarity between its
    embedding and the embedding of the source text. The candidate with the
    highest score is appended to *results_df* as one row.

    Args:
        example: The text to summarize. The visible caller passes the
            retrieved reviews joined into a single string.
        results_df: Frame to append the result row to. A new empty frame is
            used when omitted.

    Returns:
        A new DataFrame holding the rows of *results_df* plus one row with
        the columns ``model``, ``prompt``, ``reviews``, ``summarization``
        and ``scores`` (the score is stored as a string, as before).
    """
    # Create the default per call rather than once at definition time.
    if results_df is None:
        results_df = pd.DataFrame()

    # The source text is the same for every attempt, so embed it once.
    example_embedded = _embed_text(example)

    results = []
    scores = []
    for cnt in range(3):
        print(cnt)
        prompt, result = mirror_mirror(example, prompt_formatter)

        result_embedded = _embed_text(result)
        score = cosine_similarity(example_embedded, result_embedded)
        # Keep scores numeric: ranking their string form compares
        # lexicographically, which misorders negative or exponent-formatted
        # values.
        scores.append(float(score[0][0]))
        results.append(f'Summary{cnt} : ' + result)

    # Pick the candidate whose embedding is closest to the source text.
    best = int(np.argmax(scores))
    row = pd.DataFrame({
        'model': 'llama_neka_cpp',
        'prompt': prompt,
        'reviews': example,
        'summarization': results[best],
        'scores': [str(scores[best])],
    })
    return pd.concat([results_df, row], ignore_index=True)
210
+
211
def create_filter(group: Optional[str] = None, platform: Optional[str] = None,
                  ReviewerPosition: Optional[str] = None, Industry: Optional[str] = None,
                  CompanySize: Optional[str] = None, UsagePeriod: Optional[str] = None,
                  LinkedinVerified: Optional[str] = None, Date: Optional[str] = None,
                  Rating: Optional[str] = None):
    """Build a metadata filter dict from the selected review attributes.

    Every argument that is set becomes an equality clause
    ``{key: {'$eq': value}}``. An argument counts as unset when it is
    ``None`` or a blank string; the UI drop-downs use ``' '`` for
    "no selection".

    Args:
        group, platform, ReviewerPosition, Industry, CompanySize,
        UsagePeriod, LinkedinVerified, Date, Rating: Values to match.
            ``platform`` is stored under the key ``'Platform'``; all other
            arguments use their own name as the key.

    Returns:
        A dict mapping metadata keys to ``{'$eq': value}`` clauses, in the
        order of the parameters. Empty when nothing is selected.
    """
    selected = {
        'group': group,
        'Platform': platform,
        'ReviewerPosition': ReviewerPosition,
        'Industry': Industry,
        'CompanySize': CompanySize,
        'UsagePeriod': UsagePeriod,
        'LinkedinVerified': LinkedinVerified,
        'Date': Date,
        'Rating': Rating,
    }

    filter_dict = {}
    for key, value in selected.items():
        if value is None:
            continue
        # Blank strings ('' or whitespace such as the UI's ' ') mean "unset".
        if isinstance(value, str) and not value.strip():
            continue
        filter_dict[key] = {'$eq': value}

    print(filter_dict)
    return filter_dict
225
+
226
+ #_______________________________________________UI_____________________________________________________
227
+
228
st.title("Mirror, mirror, on the cloud, what do Clockify users say aloud?")
st.subheader("--Clockify review summarizer--")

# Metadata filters, one per column. The leading ' ' option means
# "no selection"; create_filter() skips it.
col1, col2, col3 = st.columns(3, gap='small')

with col1:
    platform = st.selectbox(
        label='Platform',
        options=[' ', 'Capterra', 'Chrome Extension', 'GetApp', 'AppStore', 'GooglePlay',
                 'Firefox Extension', 'JIRA Plugin', 'Trustpilot', 'G2',
                 'TrustRadius'],
    )

with col2:
    company_size = st.selectbox(
        label='Company Size',
        options=[' ', '1-10 employees', 'Self-employed', 'self-employed',
                 'Small-Business(50 or fewer emp.)', '51-200 employees',
                 'Mid-Market(51-1000 emp.)', '11-50 employees',
                 '501-1,000 employees', '10,001+ employees', '201-500 employees',
                 '1,001-5,000 employees', '5,001-10,000 employees',
                 'Enterprise(> 1000 emp.)', 'Unknown', '1001-5000 employees'],
    )

with col3:
    linkedin_verified = st.selectbox(
        label='Linkedin Verified',
        options=[' ', 'True', 'False'],
        placeholder='Choose an option',
    )

# Number of documents the retriever is asked to return.
num_to_return = int(st.number_input(label='Number of documents to return', min_value=2, max_value=50, step=1))

# group = st.selectbox(label = 'Review Platform Group',
#                      options = ['Software Review Platforms', 'Browser Extension Stores', 'Mobile App Stores', 'Plugin Marketplace']
#                      )


default_value = "Clockify"

# Streamlit documents a 68 px minimum height for st.text_area; newer
# releases reject smaller values, so the former 50 px is raised to 68.
query = st.text_area("Query", default_value, height=68)
266
+ #type_of_doc = st.text_area("Type of text", 'text', height = 25)
267
+
268
+ # result = ''
269
+ # score = ''
270
+ # reviews = ''
271
+
272
# Seed the session values read by the output widgets, so they exist on the
# first run; values already present are left untouched.
for state_key in ('result', 'score', 'reviews'):
    if state_key not in st.session_state:
        st.session_state[state_key] = ''

# Action buttons, side by side.
col11, col21 = st.columns(2, gap='small')

with col11:
    button_query = st.button('Conquer and query!')
with col21:
    button_summarize = st.button('Summon the summarizer!')
287
+
288
+
289
if button_query:
    print('Querying')
    # Build the metadata filter from the drop-downs.
    filter_dict = create_filter(  # group = group,
        platform=platform,
        CompanySize=company_size,
        LinkedinVerified=linkedin_verified,
    )

    # FILTER BY META
    # No selection: no filter. One clause: pass it directly. Several clauses:
    # wrap them in '$and' (presumably because '$and' needs two or more
    # clauses - confirm against the Chroma filter documentation).
    search_kwargs = {"k": num_to_return}
    if len(filter_dict) == 1:
        search_kwargs["filter"] = filter_dict
    elif len(filter_dict) > 1:
        search_kwargs["filter"] = {'$and': [{key: value} for key, value in filter_dict.items()]}
    retriever = vectorstore.as_retriever(search_kwargs=search_kwargs)

    reviews = retriever.get_relevant_documents(query=query)
    # Keep only the page content, numbered, for display and summarization.
    st.session_state['reviews'] = [f'Review {cnt} : {review.page_content}\n\n' for cnt, review in enumerate(reviews)]
    # Store the hint in session state so the "Summarized text" box shows it;
    # the previous plain local assignment was never displayed.
    st.session_state['result'] = 'You may summarize now!'
    # A score from an earlier summary no longer matches the new reviews.
    st.session_state['score'] = ''
315
+
316
if button_summarize:
    if not st.session_state['reviews']:
        # Nothing retrieved yet: skip the slow model calls instead of
        # summarizing an empty prompt.
        st.session_state['result'] = 'No reviews to summarize yet - run a query first.'
        st.session_state['score'] = ''
    else:
        print('Summarization in progress')
        st.session_state['result'] = 'Summarization in progress'
        results_df = summarization("\n".join(st.session_state['reviews']))
        # summarization() returns a single row for a single input.
        st.session_state['result'] = results_df.summarization[0]
        st.session_state['score'] = results_df.scores[0]
323
+
324
+
325
# Output area: retrieved reviews on the left, generated summary on the right.
col12, col22 = st.columns(2, gap='small')

with col12:
    chosen_reviews = st.text_area("Reviews to be summarized", "\n".join(st.session_state['reviews']), height=275)
with col22:
    summarized_text = st.text_area("Summarized text", st.session_state['result'], height=275)

# Streamlit documents a 68 px minimum height for st.text_area; newer
# releases reject smaller values, so the former 25 px is raised to 68.
score = st.text_area("Cosine similarity score", st.session_state['score'], height=68)
333
+