Spaces:

Md-Hakim
/

text-summarization

Sleeping

App Files Files Community

hakim committed on Aug 11

Commit

2bd8718

•

1 Parent(s): fd31bf7

app updated

Browse files

Files changed (2) hide show

app.py +28 -1
research/model_evaluatoin.ipynb +19 -15

app.py CHANGED Viewed

@@ -1,5 +1,32 @@
 import streamlit as st
-from textsummarizer.pipeline.predict import PredictionPipeline
 def main():
     # Set page config

 import streamlit as st
+from textsummarizer.config.configuration import ConfigurationManager
+from transformers import AutoTokenizer
+from transformers import pipeline
+class PredictionPipeline:
+    def __init__(self):
+        self.config = ConfigurationManager().get_model_evaluation_config()
+    def predict(self,text):
+        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
+        gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}
+        pipe = pipeline("summarization", model=self.config.model_path,tokenizer=tokenizer)
+        print("Dialogue:")
+        print(text)
+        output = pipe(text, **gen_kwargs)[0]["summary_text"]
+        print("\nModel Summary:")
+        print(output)
+        return output
 def main():
     # Set page config

research/model_evaluatoin.ipynb CHANGED Viewed

@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -30,7 +30,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -52,6 +52,7 @@
     "    \n",
     "    def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n",
     "        config = self.config.model_evaluation\n",
     "\n",
     "        create_directories([config.root_dir])\n",
     "\n",
@@ -60,7 +61,8 @@
     "            data_path=config.data_path,\n",
     "            model_path = config.model_path,\n",
     "            tokenizer_path = config.tokenizer_path,\n",
-    "            metric_file_name = config.metric_file_name\n",
     "           \n",
     "        )\n",
     "\n",
@@ -91,7 +93,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -155,6 +157,7 @@
     "        with mlflow.start_run():\n",
     "            mlflow.log_param(\"model_name\", \"pegasus\")\n",
     "            mlflow.log_param(\"dataset\", \"samsum\")\n",
     "\n",
     "            score = self.calculate_metric_on_test_ds(\n",
     "                dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, \n",
@@ -162,6 +165,7 @@
     "            )\n",
     "\n",
     "            rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)\n",
     "\n",
     "            # Log metrics to MLflow\n",
     "            for rouge_name, rouge_score in rouge_dict.items():\n",
@@ -177,18 +181,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[2024-08-11 22:27:18,954: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
-      "[2024-08-11 22:27:18,967: INFO: common: yaml file: params.yaml loaded successfully]\n",
-      "[2024-08-11 22:27:18,971: INFO: common: created directory at: artifacts]\n",
-      "[2024-08-11 22:27:18,973: INFO: common: created directory at: artifacts/model_evaluation]\n",
-      "[2024-08-11 22:27:19,619: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/azizulhakim8291/text-summarization \"HTTP/1.1 200 OK\"]\n"
      ]
     },
     {
@@ -208,7 +212,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[2024-08-11 22:27:20,037: INFO: helpers: Initialized MLflow to track repo \"azizulhakim8291/text-summarization\"]\n"
      ]
     },
     {
@@ -228,8 +232,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[2024-08-11 22:27:20,040: INFO: helpers: Repository azizulhakim8291/text-summarization initialized!]\n",
-      "[2024-08-11 22:27:20,119: WARNING: connectionpool: Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'RemoteDisconnected('Remote end closed connection without response')': /azizulhakim8291/text-summarization.mlflow/api/2.0/mlflow/experiments/get-by-name?experiment_name=text-summarization-evaluation]\n"
      ]
     },
     {
@@ -240,14 +244,14 @@
       "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
       "Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n",
       "  warnings.warn(\n",
-      "100%|██████████| 5/5 [00:21<00:00,  4.26s/it]"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[2024-08-11 22:28:20,351: INFO: rouge_scorer: Using default tokenizer.]\n"
      ]
     },
     {

   },
   {
    "cell_type": "code",
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
     "    \n",
     "    def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n",
     "        config = self.config.model_evaluation\n",
+    "        params = self.params.TrainingArguments\n",
     "\n",
     "        create_directories([config.root_dir])\n",
     "\n",
     "            data_path=config.data_path,\n",
     "            model_path = config.model_path,\n",
     "            tokenizer_path = config.tokenizer_path,\n",
+    "            metric_file_name = config.metric_file_name,\n",
+    "            all_params = params\n",
     "           \n",
     "        )\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
     "        with mlflow.start_run():\n",
     "            mlflow.log_param(\"model_name\", \"pegasus\")\n",
     "            mlflow.log_param(\"dataset\", \"samsum\")\n",
+    "            mlflow.log_param('parameter name', 'value')\n",
     "\n",
     "            score = self.calculate_metric_on_test_ds(\n",
     "                dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, \n",
     "            )\n",
     "\n",
     "            rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)\n",
+    "            mlflow.log_params(self.config.all_params)\n",
     "\n",
     "            # Log metrics to MLflow\n",
     "            for rouge_name, rouge_score in rouge_dict.items():\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "[2024-08-11 22:39:28,983: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
+      "[2024-08-11 22:39:28,986: INFO: common: yaml file: params.yaml loaded successfully]\n",
+      "[2024-08-11 22:39:28,989: INFO: common: created directory at: artifacts]\n",
+      "[2024-08-11 22:39:28,992: INFO: common: created directory at: artifacts/model_evaluation]\n",
+      "[2024-08-11 22:39:29,723: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/azizulhakim8291/text-summarization \"HTTP/1.1 200 OK\"]\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "[2024-08-11 22:39:29,731: INFO: helpers: Initialized MLflow to track repo \"azizulhakim8291/text-summarization\"]\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "[2024-08-11 22:39:29,735: INFO: helpers: Repository azizulhakim8291/text-summarization initialized!]\n",
+      "[2024-08-11 22:39:29,802: WARNING: connectionpool: Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'RemoteDisconnected('Remote end closed connection without response')': /azizulhakim8291/text-summarization.mlflow/api/2.0/mlflow/experiments/get-by-name?experiment_name=text-summarization-evaluation]\n"
      ]
     },
     {
       "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
       "Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n",
       "  warnings.warn(\n",
+      "100%|██████████| 5/5 [00:17<00:00,  3.48s/it]"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "[2024-08-11 22:39:59,553: INFO: rouge_scorer: Using default tokenizer.]\n"
      ]
     },
     {