{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'model'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[5], line 7\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mIPython\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdisplay\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mipd\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m nn\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmodel\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CNNEmotinoalClassifier\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'model'" ] } ], "source": [ "import gradio as gr\n", "import torch\n", "# from lr_ed.model import CNNEmotinoalClassifier\n", "import torchaudio\n", "import IPython.display as ipd\n", "from torch import nn\n", "from model import CNNEmotinoalClassifier" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "CNNEmotinoalClassifier(\n", " (conv1): Sequential(\n", " (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (1): ReLU()\n", " (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", " )\n", " (conv2): Sequential(\n", " (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (1): ReLU()\n", " (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", " )\n", " (conv3): Sequential(\n", " (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))\n", " (1): ReLU()\n", " (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", " )\n", " (conv4): Sequential(\n", " (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))\n", " (1): ReLU()\n", " (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", " )\n", " (flatten): Flatten(start_dim=1, end_dim=-1)\n", " (fully_connected): Sequential(\n", " (0): Linear(in_features=32000, out_features=128, bias=True)\n", " (1): ReLU()\n", " (2): Linear(in_features=128, out_features=64, bias=True)\n", " (3): ReLU()\n", " (4): Linear(in_features=64, out_features=32, bias=True)\n", " (5): ReLU()\n", " (6): Linear(in_features=32, out_features=16, bias=True)\n", " (7): ReLU()\n", " (8): Linear(in_features=16, out_features=6, bias=True)\n", " )\n", " (softmax): Softmax(dim=1)\n", ")" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = CNNEmotinoalClassifier()\n", "model.load_state_dict(torch.load('/raid/adal_abilbekov/lr_ed/CNN_emotional_classifier/cnn_class_17.pt'))\n", "model.eval()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# path = '/raid/adal_abilbekov/emodiff_try_2/Emo_diff/demo_190224/Akzhol_happy.wav'\n", "# path = '/raid/adal_abilbekov/emodiff_try_2/Emo_diff/demo_190224/Akzhol_neutral.wav'\n", "path = '/raid/adal_abilbekov/emodiff_try_2/Emo_diff/demo_190224/Marzhan_happy.wav'\n", 
"waveform, sr = torchaudio.load(path)\n", "ipd.Audio(data=waveform, rate=sr)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "to_melspec = torchaudio.transforms.MelSpectrogram(\n", " sample_rate= 22050,\n", " n_fft = 1024,\n", " hop_length = 512,\n", " n_mels=64\n", ")" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "def _get_right_pad(target_waveform, waveform):\n", " target_waveform = target_waveform\n", " waveform_samples_number = waveform.shape[1]\n", " if waveform_samples_number < target_waveform:\n", " right_pad = target_waveform - waveform_samples_number\n", " padding_touple = (0, right_pad)\n", " waveform_padded = nn.functional.pad(waveform, padding_touple)\n", " else:\n", " waveform_padded = waveform\n", " return waveform_padded" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "waveform = _get_right_pad(400384, waveform)\n", "input_x = to_melspec(waveform)\n", "input_x = torch.unsqueeze(input_x, dim=1)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "probs = model(input_x)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "emotions = ['happy', 'angry', 'sad', 'neutral', 'surprised', 'fear']\n", "emotions = sorted(emotions)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "# def get_probs(input_x, emotions):\n", "# probs = model(input_x)\n", "# prediction = emotions[probs.argmax(dim=1).item()]\n", "# return prediction, dict(zip(emotions, list(map(float, probs))))" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[2.9495e-18, 6.7292e-20, 9.9882e-01, 2.4566e-18, 1.0296e-12, 1.1847e-03]],\n", " grad_fn=)" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "probs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "asr_hug", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 2 }