hakim committed on
Commit
2bd8718
•
1 Parent(s): fd31bf7

app updated

Browse files
Files changed (2) hide show
  1. app.py +28 -1
  2. research/model_evaluatoin.ipynb +19 -15
app.py CHANGED
@@ -1,5 +1,32 @@
1
  import streamlit as st
2
- from textsummarizer.pipeline.predict import PredictionPipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  def main():
5
  # Set page config
 
1
  import streamlit as st
2
+ from textsummarizer.config.configuration import ConfigurationManager
3
+ from transformers import AutoTokenizer
4
+ from transformers import pipeline
5
+
6
+
7
+
8
+ class PredictionPipeline:
9
+ def __init__(self):
10
+ self.config = ConfigurationManager().get_model_evaluation_config()
11
+
12
+ def predict(self,text):
13
+ tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
14
+ gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}
15
+
16
+ pipe = pipeline("summarization", model=self.config.model_path,tokenizer=tokenizer)
17
+
18
+ print("Dialogue:")
19
+ print(text)
20
+
21
+ output = pipe(text, **gen_kwargs)[0]["summary_text"]
22
+ print("\nModel Summary:")
23
+ print(output)
24
+
25
+ return output
26
+
27
+
28
+
29
+
30
 
31
  def main():
32
  # Set page config
research/model_evaluatoin.ipynb CHANGED
@@ -12,7 +12,7 @@
12
  },
13
  {
14
  "cell_type": "code",
15
- "execution_count": 2,
16
  "metadata": {},
17
  "outputs": [],
18
  "source": [
@@ -30,7 +30,7 @@
30
  },
31
  {
32
  "cell_type": "code",
33
- "execution_count": 3,
34
  "metadata": {},
35
  "outputs": [],
36
  "source": [
@@ -52,6 +52,7 @@
52
  " \n",
53
  " def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n",
54
  " config = self.config.model_evaluation\n",
 
55
  "\n",
56
  " create_directories([config.root_dir])\n",
57
  "\n",
@@ -60,7 +61,8 @@
60
  " data_path=config.data_path,\n",
61
  " model_path = config.model_path,\n",
62
  " tokenizer_path = config.tokenizer_path,\n",
63
- " metric_file_name = config.metric_file_name\n",
 
64
  " \n",
65
  " )\n",
66
  "\n",
@@ -91,7 +93,7 @@
91
  },
92
  {
93
  "cell_type": "code",
94
- "execution_count": 6,
95
  "metadata": {},
96
  "outputs": [],
97
  "source": [
@@ -155,6 +157,7 @@
155
  " with mlflow.start_run():\n",
156
  " mlflow.log_param(\"model_name\", \"pegasus\")\n",
157
  " mlflow.log_param(\"dataset\", \"samsum\")\n",
 
158
  "\n",
159
  " score = self.calculate_metric_on_test_ds(\n",
160
  " dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, \n",
@@ -162,6 +165,7 @@
162
  " )\n",
163
  "\n",
164
  " rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)\n",
 
165
  "\n",
166
  " # Log metrics to MLflow\n",
167
  " for rouge_name, rouge_score in rouge_dict.items():\n",
@@ -177,18 +181,18 @@
177
  },
178
  {
179
  "cell_type": "code",
180
- "execution_count": 8,
181
  "metadata": {},
182
  "outputs": [
183
  {
184
  "name": "stdout",
185
  "output_type": "stream",
186
  "text": [
187
- "[2024-08-11 22:27:18,954: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
188
- "[2024-08-11 22:27:18,967: INFO: common: yaml file: params.yaml loaded successfully]\n",
189
- "[2024-08-11 22:27:18,971: INFO: common: created directory at: artifacts]\n",
190
- "[2024-08-11 22:27:18,973: INFO: common: created directory at: artifacts/model_evaluation]\n",
191
- "[2024-08-11 22:27:19,619: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/azizulhakim8291/text-summarization \"HTTP/1.1 200 OK\"]\n"
192
  ]
193
  },
194
  {
@@ -208,7 +212,7 @@
208
  "name": "stdout",
209
  "output_type": "stream",
210
  "text": [
211
- "[2024-08-11 22:27:20,037: INFO: helpers: Initialized MLflow to track repo \"azizulhakim8291/text-summarization\"]\n"
212
  ]
213
  },
214
  {
@@ -228,8 +232,8 @@
228
  "name": "stdout",
229
  "output_type": "stream",
230
  "text": [
231
- "[2024-08-11 22:27:20,040: INFO: helpers: Repository azizulhakim8291/text-summarization initialized!]\n",
232
- "[2024-08-11 22:27:20,119: WARNING: connectionpool: Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'RemoteDisconnected('Remote end closed connection without response')': /azizulhakim8291/text-summarization.mlflow/api/2.0/mlflow/experiments/get-by-name?experiment_name=text-summarization-evaluation]\n"
233
  ]
234
  },
235
  {
@@ -240,14 +244,14 @@
240
  "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
241
  "Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n",
242
  " warnings.warn(\n",
243
- "100%|██████████| 5/5 [00:21<00:00, 4.26s/it]"
244
  ]
245
  },
246
  {
247
  "name": "stdout",
248
  "output_type": "stream",
249
  "text": [
250
- "[2024-08-11 22:28:20,351: INFO: rouge_scorer: Using default tokenizer.]\n"
251
  ]
252
  },
253
  {
 
12
  },
13
  {
14
  "cell_type": "code",
15
+ "execution_count": 9,
16
  "metadata": {},
17
  "outputs": [],
18
  "source": [
 
30
  },
31
  {
32
  "cell_type": "code",
33
+ "execution_count": 10,
34
  "metadata": {},
35
  "outputs": [],
36
  "source": [
 
52
  " \n",
53
  " def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n",
54
  " config = self.config.model_evaluation\n",
55
+ " params = self.params.TrainingArguments\n",
56
  "\n",
57
  " create_directories([config.root_dir])\n",
58
  "\n",
 
61
  " data_path=config.data_path,\n",
62
  " model_path = config.model_path,\n",
63
  " tokenizer_path = config.tokenizer_path,\n",
64
+ " metric_file_name = config.metric_file_name,\n",
65
+ " all_params = params\n",
66
  " \n",
67
  " )\n",
68
  "\n",
 
93
  },
94
  {
95
  "cell_type": "code",
96
+ "execution_count": 11,
97
  "metadata": {},
98
  "outputs": [],
99
  "source": [
 
157
  " with mlflow.start_run():\n",
158
  " mlflow.log_param(\"model_name\", \"pegasus\")\n",
159
  " mlflow.log_param(\"dataset\", \"samsum\")\n",
160
+ " mlflow.log_param('parameter name', 'value')\n",
161
  "\n",
162
  " score = self.calculate_metric_on_test_ds(\n",
163
  " dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, \n",
 
165
  " )\n",
166
  "\n",
167
  " rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)\n",
168
+ " mlflow.log_params(self.config.all_params)\n",
169
  "\n",
170
  " # Log metrics to MLflow\n",
171
  " for rouge_name, rouge_score in rouge_dict.items():\n",
 
181
  },
182
  {
183
  "cell_type": "code",
184
+ "execution_count": 12,
185
  "metadata": {},
186
  "outputs": [
187
  {
188
  "name": "stdout",
189
  "output_type": "stream",
190
  "text": [
191
+ "[2024-08-11 22:39:28,983: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
192
+ "[2024-08-11 22:39:28,986: INFO: common: yaml file: params.yaml loaded successfully]\n",
193
+ "[2024-08-11 22:39:28,989: INFO: common: created directory at: artifacts]\n",
194
+ "[2024-08-11 22:39:28,992: INFO: common: created directory at: artifacts/model_evaluation]\n",
195
+ "[2024-08-11 22:39:29,723: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/azizulhakim8291/text-summarization \"HTTP/1.1 200 OK\"]\n"
196
  ]
197
  },
198
  {
 
212
  "name": "stdout",
213
  "output_type": "stream",
214
  "text": [
215
+ "[2024-08-11 22:39:29,731: INFO: helpers: Initialized MLflow to track repo \"azizulhakim8291/text-summarization\"]\n"
216
  ]
217
  },
218
  {
 
232
  "name": "stdout",
233
  "output_type": "stream",
234
  "text": [
235
+ "[2024-08-11 22:39:29,735: INFO: helpers: Repository azizulhakim8291/text-summarization initialized!]\n",
236
+ "[2024-08-11 22:39:29,802: WARNING: connectionpool: Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'RemoteDisconnected('Remote end closed connection without response')': /azizulhakim8291/text-summarization.mlflow/api/2.0/mlflow/experiments/get-by-name?experiment_name=text-summarization-evaluation]\n"
237
  ]
238
  },
239
  {
 
244
  "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
245
  "Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n",
246
  " warnings.warn(\n",
247
+ "100%|██████████| 5/5 [00:17<00:00, 3.48s/it]"
248
  ]
249
  },
250
  {
251
  "name": "stdout",
252
  "output_type": "stream",
253
  "text": [
254
+ "[2024-08-11 22:39:59,553: INFO: rouge_scorer: Using default tokenizer.]\n"
255
  ]
256
  },
257
  {