{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2022-11-25T13:49:41.215612Z","iopub.execute_input":"2022-11-25T13:49:41.216011Z","iopub.status.idle":"2022-11-25T13:49:41.221723Z","shell.execute_reply.started":"2022-11-25T13:49:41.215977Z","shell.execute_reply":"2022-11-25T13:49:41.220827Z"},"_kg_hide-input":true,"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"markdown","source":"## Scikit-learn with Transformers\n\nIn this notebook, I will show how you can use scikit-learn estimators with model weights from [🤗 
Transformers](https://huggingface.co/docs/transformers/main/en/index) thanks to [whatlies](https://github.com/koaning/whatlies). We will later push our model with a model card using [skops](https://skops.readthedocs.org/) to Hugging Face Hub.","metadata":{}},{"cell_type":"markdown","source":"# Installing whatlies, datasets, scikit-learn, gradio and skops","metadata":{}},{"cell_type":"code","source":"!pip install datasets\n!pip install gradio\n!pip install whatlies[transformers]\n!pip install scikit-learn\n!pip install skops","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2022-11-25T13:49:50.291974Z","iopub.execute_input":"2022-11-25T13:49:50.292348Z","iopub.status.idle":"2022-11-25T13:50:42.641689Z","shell.execute_reply.started":"2022-11-25T13:49:50.292320Z","shell.execute_reply":"2022-11-25T13:50:42.640626Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Requirement already satisfied: datasets in /opt/conda/lib/python3.7/site-packages (2.1.0)\nRequirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.7/site-packages (from datasets) (2.27.1)\nRequirement already satisfied: multiprocess in /opt/conda/lib/python3.7/site-packages (from datasets) (0.70.13)\nRequirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from datasets) (4.11.4)\nRequirement already satisfied: pandas in /opt/conda/lib/python3.7/site-packages (from datasets) (1.3.5)\nRequirement already satisfied: xxhash in /opt/conda/lib/python3.7/site-packages (from datasets) (3.0.0)\nRequirement already satisfied: aiohttp in /opt/conda/lib/python3.7/site-packages (from datasets) (3.8.1)\nRequirement already satisfied: responses<0.19 in /opt/conda/lib/python3.7/site-packages (from datasets) (0.18.0)\nRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.7/site-packages (from datasets) (1.21.6)\nRequirement already satisfied: packaging in /opt/conda/lib/python3.7/site-packages (from datasets) 
(21.3)\nRequirement already satisfied: pyarrow>=5.0.0 in /opt/conda/lib/python3.7/site-packages (from datasets) (5.0.0)\nRequirement already satisfied: fsspec[http]>=2021.05.0 in /opt/conda/lib/python3.7/site-packages (from datasets) (2022.5.0)\nRequirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.7/site-packages (from datasets) (4.64.0)\nRequirement already satisfied: dill in /opt/conda/lib/python3.7/site-packages (from datasets) (0.3.5.1)\nRequirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /opt/conda/lib/python3.7/site-packages (from datasets) (0.11.0)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.7/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.6.0)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.2.0)\nRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (6.0)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.7/site-packages (from packaging->datasets) (3.0.9)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests>=2.19.0->datasets) (2022.5.18.1)\nRequirement already satisfied: charset-normalizer~=2.0.0 in /opt/conda/lib/python3.7/site-packages (from requests>=2.19.0->datasets) (2.0.12)\nRequirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests>=2.19.0->datasets) (1.26.9)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests>=2.19.0->datasets) (3.3)\nRequirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (4.0.2)\nRequirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (1.7.2)\nRequirement already satisfied: 
multidict<7.0,>=4.5 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (6.0.2)\nRequirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (1.2.0)\nRequirement already satisfied: asynctest==0.13.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (0.13.0)\nRequirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (1.3.0)\nRequirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (21.4.0)\nRequirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->datasets) (3.8.0)\nRequirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.7/site-packages (from pandas->datasets) (2.8.2)\nRequirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.7/site-packages (from pandas->datasets) (2022.1)\nRequirement already satisfied: six>=1.5 in /opt/conda/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.16.0)\n\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0mRequirement already satisfied: gradio in /opt/conda/lib/python3.7/site-packages (3.11.0)\nRequirement already satisfied: requests in /opt/conda/lib/python3.7/site-packages (from gradio) (2.27.1)\nRequirement already satisfied: websockets>=10.0 in /opt/conda/lib/python3.7/site-packages (from gradio) (10.3)\nRequirement already satisfied: jinja2 in /opt/conda/lib/python3.7/site-packages (from gradio) (3.1.2)\nRequirement already satisfied: pycryptodome in /opt/conda/lib/python3.7/site-packages (from gradio) (3.15.0)\nRequirement already satisfied: aiohttp in /opt/conda/lib/python3.7/site-packages (from gradio) (3.8.1)\nRequirement already satisfied: fsspec in /opt/conda/lib/python3.7/site-packages (from gradio) (2022.5.0)\nRequirement already satisfied: pydub in /opt/conda/lib/python3.7/site-packages (from gradio) (0.25.1)\nRequirement already satisfied: uvicorn in /opt/conda/lib/python3.7/site-packages (from gradio) (0.17.6)\nRequirement already satisfied: pyyaml in /opt/conda/lib/python3.7/site-packages (from gradio) (6.0)\nRequirement already satisfied: httpx in /opt/conda/lib/python3.7/site-packages (from gradio) (0.23.1)\nRequirement already satisfied: fastapi in /opt/conda/lib/python3.7/site-packages (from gradio) (0.78.0)\nRequirement already satisfied: matplotlib in /opt/conda/lib/python3.7/site-packages (from gradio) (3.5.2)\nRequirement already satisfied: ffmpy in /opt/conda/lib/python3.7/site-packages (from gradio) (0.3.0)\nRequirement already satisfied: markdown-it-py[linkify,plugins] in /opt/conda/lib/python3.7/site-packages (from gradio) (2.1.0)\nRequirement already satisfied: pandas in /opt/conda/lib/python3.7/site-packages (from gradio) (1.3.5)\nRequirement already satisfied: pydantic in /opt/conda/lib/python3.7/site-packages (from gradio) (1.8.2)\nRequirement already satisfied: numpy in /opt/conda/lib/python3.7/site-packages (from 
gradio) (1.21.6)\nRequirement already satisfied: orjson in /opt/conda/lib/python3.7/site-packages (from gradio) (3.6.8)\nRequirement already satisfied: python-multipart in /opt/conda/lib/python3.7/site-packages (from gradio) (0.0.5)\nRequirement already satisfied: pillow in /opt/conda/lib/python3.7/site-packages (from gradio) (9.1.0)\nRequirement already satisfied: h11<0.13,>=0.11 in /opt/conda/lib/python3.7/site-packages (from gradio) (0.12.0)\nRequirement already satisfied: paramiko in /opt/conda/lib/python3.7/site-packages (from gradio) (2.12.0)\nRequirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.7/site-packages (from aiohttp->gradio) (4.0.2)\nRequirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.7/site-packages (from aiohttp->gradio) (6.0.2)\nRequirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.7/site-packages (from aiohttp->gradio) (1.3.0)\nRequirement already satisfied: charset-normalizer<3.0,>=2.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->gradio) (2.0.12)\nRequirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->gradio) (21.4.0)\nRequirement already satisfied: asynctest==0.13.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->gradio) (0.13.0)\nRequirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->gradio) (1.7.2)\nRequirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.7/site-packages (from aiohttp->gradio) (1.2.0)\nRequirement already satisfied: typing-extensions>=3.7.4 in /opt/conda/lib/python3.7/site-packages (from aiohttp->gradio) (4.2.0)\nRequirement already satisfied: starlette==0.19.1 in /opt/conda/lib/python3.7/site-packages (from fastapi->gradio) (0.19.1)\nRequirement already satisfied: anyio<5,>=3.4.0 in /opt/conda/lib/python3.7/site-packages (from starlette==0.19.1->fastapi->gradio) (3.6.1)\nRequirement already satisfied: 
httpcore<0.17.0,>=0.15.0 in /opt/conda/lib/python3.7/site-packages (from httpx->gradio) (0.15.0)\nRequirement already satisfied: sniffio in /opt/conda/lib/python3.7/site-packages (from httpx->gradio) (1.2.0)\nRequirement already satisfied: rfc3986[idna2008]<2,>=1.3 in /opt/conda/lib/python3.7/site-packages (from httpx->gradio) (1.5.0)\nRequirement already satisfied: certifi in /opt/conda/lib/python3.7/site-packages (from httpx->gradio) (2022.5.18.1)\nRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.7/site-packages (from jinja2->gradio) (2.0.1)\nRequirement already satisfied: mdurl~=0.1 in /opt/conda/lib/python3.7/site-packages (from markdown-it-py[linkify,plugins]->gradio) (0.1.0)\nRequirement already satisfied: linkify-it-py~=1.0 in /opt/conda/lib/python3.7/site-packages (from markdown-it-py[linkify,plugins]->gradio) (1.0.3)\nRequirement already satisfied: mdit-py-plugins in /opt/conda/lib/python3.7/site-packages (from markdown-it-py[linkify,plugins]->gradio) (0.3.0)\nRequirement already satisfied: pyparsing>=2.2.1 in /opt/conda/lib/python3.7/site-packages (from matplotlib->gradio) (3.0.9)\nRequirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.7/site-packages (from matplotlib->gradio) (2.8.2)\nRequirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.7/site-packages (from matplotlib->gradio) (0.11.0)\nRequirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.7/site-packages (from matplotlib->gradio) (1.4.2)\nRequirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.7/site-packages (from matplotlib->gradio) (4.33.3)\nRequirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.7/site-packages (from matplotlib->gradio) (21.3)\nRequirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.7/site-packages (from pandas->gradio) (2022.1)\nRequirement already satisfied: pynacl>=1.0.1 in /opt/conda/lib/python3.7/site-packages (from paramiko->gradio) 
(1.5.0)\nRequirement already satisfied: cryptography>=2.5 in /opt/conda/lib/python3.7/site-packages (from paramiko->gradio) (36.0.2)\nRequirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from paramiko->gradio) (1.16.0)\nRequirement already satisfied: bcrypt>=3.1.3 in /opt/conda/lib/python3.7/site-packages (from paramiko->gradio) (4.0.1)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->gradio) (3.3)\nRequirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->gradio) (1.26.9)\nRequirement already satisfied: asgiref>=3.4.0 in /opt/conda/lib/python3.7/site-packages (from uvicorn->gradio) (3.5.2)\nRequirement already satisfied: click>=7.0 in /opt/conda/lib/python3.7/site-packages (from uvicorn->gradio) (8.0.4)\nRequirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from click>=7.0->uvicorn->gradio) (4.11.4)\nRequirement already satisfied: cffi>=1.12 in /opt/conda/lib/python3.7/site-packages (from cryptography>=2.5->paramiko->gradio) (1.15.0)\nRequirement already satisfied: uc-micro-py in /opt/conda/lib/python3.7/site-packages (from linkify-it-py~=1.0->markdown-it-py[linkify,plugins]->gradio) (1.0.1)\nRequirement already satisfied: pycparser in /opt/conda/lib/python3.7/site-packages (from cffi>=1.12->cryptography>=2.5->paramiko->gradio) (2.21)\nRequirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->click>=7.0->uvicorn->gradio) (3.8.0)\n\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0mRequirement already satisfied: whatlies[transformers] in /opt/conda/lib/python3.7/site-packages (0.7.0)\nRequirement already satisfied: altair>=4.2.0 in /opt/conda/lib/python3.7/site-packages (from whatlies[transformers]) (4.2.0)\nRequirement already satisfied: matplotlib>=3.5.0 in /opt/conda/lib/python3.7/site-packages (from whatlies[transformers]) (3.5.2)\nRequirement already satisfied: scikit-learn>=1.0.0 in /opt/conda/lib/python3.7/site-packages (from whatlies[transformers]) (1.0.2)\nRequirement already satisfied: gensim~=3.8.3 in /opt/conda/lib/python3.7/site-packages (from whatlies[transformers]) (3.8.3)\nRequirement already satisfied: bpemb>=0.3.0 in /opt/conda/lib/python3.7/site-packages (from whatlies[transformers]) (0.3.4)\nRequirement already satisfied: transformers>=4.19.0 in /opt/conda/lib/python3.7/site-packages (from whatlies[transformers]) (4.24.0)\nRequirement already satisfied: numpy in /opt/conda/lib/python3.7/site-packages (from altair>=4.2.0->whatlies[transformers]) (1.21.6)\nRequirement already satisfied: jsonschema>=3.0 in /opt/conda/lib/python3.7/site-packages (from altair>=4.2.0->whatlies[transformers]) (4.5.1)\nRequirement already satisfied: pandas>=0.18 in /opt/conda/lib/python3.7/site-packages (from altair>=4.2.0->whatlies[transformers]) (1.3.5)\nRequirement already satisfied: jinja2 in /opt/conda/lib/python3.7/site-packages (from altair>=4.2.0->whatlies[transformers]) (3.1.2)\nRequirement already satisfied: toolz in /opt/conda/lib/python3.7/site-packages (from altair>=4.2.0->whatlies[transformers]) (0.11.2)\nRequirement already satisfied: entrypoints in /opt/conda/lib/python3.7/site-packages (from altair>=4.2.0->whatlies[transformers]) (0.4)\nRequirement already satisfied: tqdm in /opt/conda/lib/python3.7/site-packages (from bpemb>=0.3.0->whatlies[transformers]) (4.64.0)\nRequirement already satisfied: sentencepiece in 
/opt/conda/lib/python3.7/site-packages (from bpemb>=0.3.0->whatlies[transformers]) (0.1.96)\nRequirement already satisfied: requests in /opt/conda/lib/python3.7/site-packages (from bpemb>=0.3.0->whatlies[transformers]) (2.27.1)\nRequirement already satisfied: smart-open>=1.8.1 in /opt/conda/lib/python3.7/site-packages (from gensim~=3.8.3->whatlies[transformers]) (5.2.1)\nRequirement already satisfied: scipy>=0.18.1 in /opt/conda/lib/python3.7/site-packages (from gensim~=3.8.3->whatlies[transformers]) (1.7.3)\nRequirement already satisfied: six>=1.5.0 in /opt/conda/lib/python3.7/site-packages (from gensim~=3.8.3->whatlies[transformers]) (1.16.0)\nRequirement already satisfied: pillow>=6.2.0 in /opt/conda/lib/python3.7/site-packages (from matplotlib>=3.5.0->whatlies[transformers]) (9.1.0)\nRequirement already satisfied: pyparsing>=2.2.1 in /opt/conda/lib/python3.7/site-packages (from matplotlib>=3.5.0->whatlies[transformers]) (3.0.9)\nRequirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.7/site-packages (from matplotlib>=3.5.0->whatlies[transformers]) (0.11.0)\nRequirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.7/site-packages (from matplotlib>=3.5.0->whatlies[transformers]) (2.8.2)\nRequirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.7/site-packages (from matplotlib>=3.5.0->whatlies[transformers]) (4.33.3)\nRequirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.7/site-packages (from matplotlib>=3.5.0->whatlies[transformers]) (1.4.2)\nRequirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.7/site-packages (from matplotlib>=3.5.0->whatlies[transformers]) (21.3)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=1.0.0->whatlies[transformers]) (3.1.0)\nRequirement already satisfied: joblib>=0.11 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=1.0.0->whatlies[transformers]) 
(1.1.0)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.7/site-packages (from transformers>=4.19.0->whatlies[transformers]) (3.6.0)\nRequirement already satisfied: huggingface-hub<1.0,>=0.10.0 in /opt/conda/lib/python3.7/site-packages (from transformers>=4.19.0->whatlies[transformers]) (0.11.0)\nRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.7/site-packages (from transformers>=4.19.0->whatlies[transformers]) (6.0)\nRequirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.7/site-packages (from transformers>=4.19.0->whatlies[transformers]) (2021.11.10)\nRequirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from transformers>=4.19.0->whatlies[transformers]) (4.11.4)\nRequirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /opt/conda/lib/python3.7/site-packages (from transformers>=4.19.0->whatlies[transformers]) (0.12.1)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub<1.0,>=0.10.0->transformers>=4.19.0->whatlies[transformers]) (4.2.0)\nRequirement already satisfied: importlib-resources>=1.4.0 in /opt/conda/lib/python3.7/site-packages (from jsonschema>=3.0->altair>=4.2.0->whatlies[transformers]) (5.7.1)\nRequirement already satisfied: attrs>=17.4.0 in /opt/conda/lib/python3.7/site-packages (from jsonschema>=3.0->altair>=4.2.0->whatlies[transformers]) (21.4.0)\nRequirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /opt/conda/lib/python3.7/site-packages (from jsonschema>=3.0->altair>=4.2.0->whatlies[transformers]) (0.18.1)\nRequirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.7/site-packages (from pandas>=0.18->altair>=4.2.0->whatlies[transformers]) (2022.1)\nRequirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->transformers>=4.19.0->whatlies[transformers]) (3.8.0)\nRequirement already 
satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.7/site-packages (from jinja2->altair>=4.2.0->whatlies[transformers]) (2.0.1)\nRequirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->bpemb>=0.3.0->whatlies[transformers]) (1.26.9)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->bpemb>=0.3.0->whatlies[transformers]) (3.3)\nRequirement already satisfied: charset-normalizer~=2.0.0 in /opt/conda/lib/python3.7/site-packages (from requests->bpemb>=0.3.0->whatlies[transformers]) (2.0.12)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests->bpemb>=0.3.0->whatlies[transformers]) (2022.5.18.1)\n\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0mRequirement already satisfied: scikit-learn in /opt/conda/lib/python3.7/site-packages (1.0.2)\nRequirement already satisfied: numpy>=1.14.6 in /opt/conda/lib/python3.7/site-packages (from scikit-learn) (1.21.6)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from scikit-learn) (3.1.0)\nRequirement already satisfied: scipy>=1.1.0 in /opt/conda/lib/python3.7/site-packages (from scikit-learn) (1.7.3)\nRequirement already satisfied: joblib>=0.11 in /opt/conda/lib/python3.7/site-packages (from scikit-learn) (1.1.0)\n\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0mRequirement already satisfied: skops in /opt/conda/lib/python3.7/site-packages (0.2)\nRequirement already satisfied: scikit-learn>=0.24 in /opt/conda/lib/python3.7/site-packages (from skops) (1.0.2)\nRequirement already satisfied: tabulate>=0.8.8 in /opt/conda/lib/python3.7/site-packages (from skops) (0.8.9)\nRequirement already satisfied: modelcards>=0.1.6 in /opt/conda/lib/python3.7/site-packages (from skops) (0.1.6)\nRequirement already satisfied: huggingface-hub>=0.9.0rc3 in /opt/conda/lib/python3.7/site-packages (from skops) (0.11.0)\nRequirement already satisfied: typing-extensions>=3.7 in /opt/conda/lib/python3.7/site-packages (from skops) (4.2.0)\nRequirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.9.0rc3->skops) (4.11.4)\nRequirement already satisfied: tqdm in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.9.0rc3->skops) (4.64.0)\nRequirement already satisfied: requests in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.9.0rc3->skops) (2.27.1)\nRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.9.0rc3->skops) (6.0)\nRequirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.9.0rc3->skops) (21.3)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.9.0rc3->skops) (3.6.0)\nRequirement already satisfied: Jinja2 in /opt/conda/lib/python3.7/site-packages (from modelcards>=0.1.6->skops) (3.1.2)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.24->skops) (3.1.0)\nRequirement already satisfied: numpy>=1.14.6 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.24->skops) (1.21.6)\nRequirement already 
satisfied: joblib>=0.11 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.24->skops) (1.1.0)\nRequirement already satisfied: scipy>=1.1.0 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.24->skops) (1.7.3)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.7/site-packages (from packaging>=20.9->huggingface-hub>=0.9.0rc3->skops) (3.0.9)\nRequirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->huggingface-hub>=0.9.0rc3->skops) (3.8.0)\nRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.7/site-packages (from Jinja2->modelcards>=0.1.6->skops) (2.0.1)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->huggingface-hub>=0.9.0rc3->skops) (3.3)\nRequirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->huggingface-hub>=0.9.0rc3->skops) (1.26.9)\nRequirement already satisfied: charset-normalizer~=2.0.0 in /opt/conda/lib/python3.7/site-packages (from requests->huggingface-hub>=0.9.0rc3->skops) (2.0.12)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests->huggingface-hub>=0.9.0rc3->skops) (2022.5.18.1)\n\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"code","source":"import datasets\nimport sklearn\nimport gradio as gr\nimport whatlies\nfrom whatlies.language import HFTransformersLanguage\nfrom transformers import pipeline\nfrom sklearn.pipeline import Pipeline # yeah it's a bit confusing! 
😅\nfrom sklearn.linear_model import LogisticRegression","metadata":{"execution":{"iopub.status.busy":"2022-11-25T13:50:46.200063Z","iopub.execute_input":"2022-11-25T13:50:46.200452Z","iopub.status.idle":"2022-11-25T13:50:53.433011Z","shell.execute_reply.started":"2022-11-25T13:50:46.200418Z","shell.execute_reply":"2022-11-25T13:50:53.432126Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"markdown","source":"## Load and preprocess the dataset\nWe'll drop NaN values, remove entries with 1024 or more characters (both for simplicity and to keep the inputs short enough for the transformer model), and convert the columns to lists (as whatlies accepts lists).","metadata":{}},{"cell_type":"code","source":"train_set, test_set = datasets.load_dataset('imdb', split =['train[0:1000]+train[24000:25000]', 'test[0:1000]+test[24000:25000]'])","metadata":{"execution":{"iopub.status.busy":"2022-11-25T13:51:23.321696Z","iopub.execute_input":"2022-11-25T13:51:23.322412Z","iopub.status.idle":"2022-11-25T13:51:25.550318Z","shell.execute_reply.started":"2022-11-25T13:51:23.322374Z","shell.execute_reply":"2022-11-25T13:51:25.549509Z"},"trusted":true},"execution_count":5,"outputs":[{"output_type":"display_data","data":{"text/plain":" 0%| | 0/2 [00:00, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"12771f9ca2974ea885c9f1490a6ddfc3"}},"metadata":{}}]},{"cell_type":"code","source":"df_train = pd.DataFrame(train_set)\ndf_test = pd.DataFrame(test_set)\ndf_train.dropna(inplace=True)\ndf_test.dropna(inplace=True)\ndf_train = df_train[df_train['text'].apply(lambda x: len(x) < 1024)]\ndf_test = df_test[df_test['text'].apply(lambda x: len(x) < 
1024)]","metadata":{"execution":{"iopub.status.busy":"2022-11-25T13:51:30.973618Z","iopub.execute_input":"2022-11-25T13:51:30.974022Z","iopub.status.idle":"2022-11-25T13:51:31.223343Z","shell.execute_reply.started":"2022-11-25T13:51:30.973990Z","shell.execute_reply":"2022-11-25T13:51:31.222539Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"X_train = df_train[\"text\"].tolist()\ny_train = df_train[\"label\"].tolist()\nX_test = df_test[\"text\"].tolist()\ny_test = df_test[\"label\"].tolist()","metadata":{"execution":{"iopub.status.busy":"2022-11-25T13:51:33.322281Z","iopub.execute_input":"2022-11-25T13:51:33.322649Z","iopub.status.idle":"2022-11-25T13:51:33.328371Z","shell.execute_reply.started":"2022-11-25T13:51:33.322619Z","shell.execute_reply":"2022-11-25T13:51:33.327284Z"},"trusted":true},"execution_count":7,"outputs":[]},{"cell_type":"markdown","source":"# Setup classifier","metadata":{}},{"cell_type":"markdown","source":"We'll use the `facebook/bart-base` weights to embed the text and train a logistic regression on top of the embeddings.","metadata":{}},{"cell_type":"code","source":"pipe = Pipeline([\n (\"embedding\", HFTransformersLanguage(\"facebook/bart-base\")),\n (\"model\", LogisticRegression())\n])","metadata":{"execution":{"iopub.status.busy":"2022-11-25T13:51:43.068329Z","iopub.execute_input":"2022-11-25T13:51:43.068705Z","iopub.status.idle":"2022-11-25T13:51:47.292129Z","shell.execute_reply.started":"2022-11-25T13:51:43.068671Z","shell.execute_reply":"2022-11-25T13:51:47.291213Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"markdown","source":"## Visualize the pipeline and see the hyperparameters","metadata":{}},{"cell_type":"code","source":"from sklearn import 
set_config\nset_config(display=\"diagram\")\npipe","metadata":{"execution":{"iopub.status.busy":"2022-11-25T13:51:53.359407Z","iopub.execute_input":"2022-11-25T13:51:53.359791Z","iopub.status.idle":"2022-11-25T13:51:53.375506Z","shell.execute_reply.started":"2022-11-25T13:51:53.359759Z","shell.execute_reply":"2022-11-25T13:51:53.374429Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"Pipeline(steps=[('embedding',\n HFTransformersLanguage(model_name_or_path='facebook/bart-base')),\n ('model', LogisticRegression())])","text/html":"
Pipeline(steps=[('embedding',\n HFTransformersLanguage(model_name_or_path='facebook/bart-base')),\n ('model', LogisticRegression())])Please rerun this cell to show the HTML repr or trust the notebook.
Pipeline(steps=[('embedding',\n HFTransformersLanguage(model_name_or_path='facebook/bart-base')),\n ('model', LogisticRegression())])
HFTransformersLanguage(model_name_or_path='facebook/bart-base')
LogisticRegression()
Pipeline(steps=[('embedding',\n HFTransformersLanguage(model_name_or_path='facebook/bart-base')),\n ('model', LogisticRegression())])Please rerun this cell to show the HTML repr or trust the notebook.
Pipeline(steps=[('embedding',\n HFTransformersLanguage(model_name_or_path='facebook/bart-base')),\n ('model', LogisticRegression())])
HFTransformersLanguage(model_name_or_path='facebook/bart-base')
LogisticRegression()