acanivet committed
Commit e72f4c2
1 Parent(s): 2237ddd

Formatting

Files changed (6)
  1. README.md +1 -1
  2. app.py +40 -17
  3. cvae/__init__.py +1 -7
  4. cvae/blocks.py +7 -7
  5. cvae/models.py +84 -53
  6. model.py +64 -17
README.md CHANGED
@@ -13,7 +13,7 @@ pinned: false
 
 ![Screenshot of the app](app.png)
 
-This is a demo for the sound generation models built in `pytorch`. It relies on a simple `streamlit` app calling the model with the parameters given by the user.
+This is a demo for the sound generation models built in `pytorch`. It relies on a simple `streamlit` app calling the model with the parameters given by the user. Due to time and material difficulties, the model isn't properly trained and isn't able to produce interesting sounds now.
 
 ## Install :
 
app.py CHANGED
@@ -4,42 +4,66 @@ import io
 import numpy as np
 from scipy.io.wavfile import write
 
+# -----
+# Utils
+# -----
+
+
 @st.cache_data
-def np_to_wav(waveform, sample_rate) -> bytes:
+def np_to_wav(waveform: np.Array, sample_rate: int) -> bytes:
     bytes_wav = bytes()
     byte_io = io.BytesIO(bytes_wav)
     write(byte_io, sample_rate, waveform.T)
     return byte_io.read()
 
+
+# ------------------
+# App initialization
+# ------------------
+
 if "result" not in st.session_state:
     st.session_state["result"] = None
 
+# ---
+# App
+# ---
+
 st.title("Sound Exploration")
 
 col1, col2 = st.columns(2)
-
 with col1:
     instrument = st.selectbox(
-        'Which intrument do you want ?',
-        ('🎸 Bass', '🎺 Brass', '🪈 Flute', '🪕 Guitar', '🎹 Keyboard', '🔨 Mallet', '🪗 Organ', '🎷 Reed', '🎻 String', '⚡ Synth lead', '🎤 Vocal')
+        "Which intrument do you want ?",
+        (
+            "🎸 Bass",
+            "🎺 Brass",
+            "🪈 Flute",
+            "🪕 Guitar",
+            "🎹 Keyboard",
+            "🔨 Mallet",
+            "🪗 Organ",
+            "🎷 Reed",
+            "🎻 String",
+            "⚡ Synth lead",
+            "🎤 Vocal",
+        ),
    )
-
 with col2:
     instrument_t = st.selectbox(
-        'Which type intrument do you want ?',
-        ('📯 Acoustic', '🎙️ Electronic', '🎛️ Synthetic')
-    )
-
+        "Which type intrument do you want ?",
+        ("📯 Acoustic", "🎙️ Electronic", "🎛️ Synthetic"),
+    )
+
 with st.expander("Magical parameters 🪄"):
     col1, col2 = st.columns(2)
     with col1:
-        p1 = st.slider('p1', 0., 1., step=0.001, label_visibility='collapsed')
-        p2 = st.slider('p2', 0., 1., step=0.001, label_visibility='collapsed')
-        p3 = st.slider('p3', 0., 1., step=0.001, label_visibility='collapsed')
+        p1 = st.slider("p1", 0.0, 1.0, step=0.001, label_visibility="collapsed")
+        p2 = st.slider("p2", 0.0, 1.0, step=0.001, label_visibility="collapsed")
+        p3 = st.slider("p3", 0.0, 1.0, step=0.001, label_visibility="collapsed")
     with col2:
-        p4 = st.slider('p4', 0., 1., step=0.001, label_visibility='collapsed')
-        p5 = st.slider('p5', 0., 1., step=0.001, label_visibility='collapsed')
-    use_params = st.toggle('Use magical parameters ?')
+        p4 = st.slider("p4", 0.0, 1.0, step=0.001, label_visibility="collapsed")
+        p5 = st.slider("p5", 0.0, 1.0, step=0.001, label_visibility="collapsed")
+    use_params = st.toggle("Use magical parameters ?")
 params = (p1, p2, p3, p4, p5) if use_params else None
 
 if st.button("Generate ✨", type="primary"):
@@ -53,6 +77,5 @@ if st.session_state["result"] is not None:
     st.download_button(
         label="Download ⬇️",
         data=np_to_wav(st.session_state["result"], 16000),
-        file_name='result.wav',
+        file_name="result.wav",
     )
-
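A standalone sketch of the `np_to_wav` helper above, runnable outside Streamlit. The 440 Hz test tone and 16 kHz rate are illustrative only, and the annotation is written as `np.ndarray` here because numpy exposes no `np.Array` type like the one used in the diff:

```python
import io

import numpy as np
from scipy.io.wavfile import write


def np_to_wav(waveform: np.ndarray, sample_rate: int) -> bytes:
    # Render the waveform into an in-memory WAV container; scipy seeks
    # file-like targets back to the start, so read() returns the full payload.
    byte_io = io.BytesIO()
    write(byte_io, sample_rate, waveform.T)
    return byte_io.read()


# Illustrative only: a 1-second 440 Hz tone at the app's 16 kHz sample rate.
t = np.linspace(0.0, 1.0, 16000, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32)
print(f"{len(np_to_wav(tone, 16000))} bytes of WAV data")
```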
 
cvae/__init__.py CHANGED
@@ -1,7 +1 @@
-from .models import (
-    Encoder, Decoder, VAE, CVAE
-)
-
-from .blocks import (
-    UpResConvBlock, DownResConvBlock
-)
+from .models import VAE, CVAE
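After this change the package only re-exports the two Lightning modules, so downstream code such as model.py keeps importing them the same way:

```python
# The only public re-exports left after this commit; Encoder, Decoder and the
# conv blocks are now reached through their modules (cvae.models / cvae.blocks).
from cvae import CVAE, VAE
```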
 
 
 
 
 
 
cvae/blocks.py CHANGED
@@ -1,7 +1,7 @@
-from torch import nn
+from torch import nn, Tensor
 
 class UpResConvBlock(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size):
+    def __init__(self, in_channels, out_channels, kernel_size) -> None:
         super(UpResConvBlock, self).__init__()
 
         self.residual = nn.Sequential(
@@ -19,11 +19,11 @@ class UpResConvBlock(nn.Module):
             nn.GELU()
         )
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
         return self.main(x) + self.residual(x)
 
 class DownResConvBlock(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size):
+    def __init__(self, in_channels, out_channels, kernel_size) -> None:
         super(DownResConvBlock, self).__init__()
 
         self.residual = nn.Conv1d(in_channels, out_channels, 1, 2, bias=False)
@@ -37,11 +37,11 @@ class DownResConvBlock(nn.Module):
             nn.GELU()
        )
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
         return self.main(x) + self.residual(x)
 
 class ResConvBlock(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size):
+    def __init__(self, in_channels, out_channels, kernel_size) -> None:
         super(ResConvBlock, self).__init__()
 
         self.residual = nn.Identity() if in_channels == out_channels else nn.Conv1d(in_channels, out_channels, 1, bias=False)
@@ -55,5 +55,5 @@ class ResConvBlock(nn.Module):
             nn.GELU()
         )
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
         return self.main(x) + self.residual(x)
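The stride-2, kernel-1 shortcut convolution in DownResConvBlock halves the time axis each block, which is what the Encoder's divisibility assert in cvae/models.py below guards against. A minimal shape check under assumed sizes (32 to 64 channels, 4 s of 16 kHz audio):

```python
import torch
from torch import nn

# Stand-in for DownResConvBlock's residual path: Conv1d(in, out, kernel_size=1, stride=2).
shortcut = nn.Conv1d(32, 64, 1, 2, bias=False)

x = torch.randn(1, 32, 64000)  # (batch, channels, samples): 4 s at 16 kHz
print(shortcut(x).shape)       # torch.Size([1, 64, 32000]) -> time axis halved
```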
cvae/models.py CHANGED
@@ -4,164 +4,195 @@ from torch.optim import Optimizer
 from .blocks import UpResConvBlock, DownResConvBlock
 import lightning as L
 from auraloss.freq import MultiResolutionSTFTLoss
+from typing import Sequence
+
 
 class Encoder(nn.Module):
-    def __init__(self,
+    def __init__(
+        self,
         in_channels: int,
         in_features: int,
         out_features: int,
-        channels: list = None,
-    ) -> None:
+        channels: Sequence[int],
+    ) -> None:
         super(Encoder, self).__init__()
 
-        assert in_features % 2**len(channels) == 0, f"in_features ({in_features}) must be a multiple of downscale factor ({2**len(channels)})"
+        assert (
+            in_features % 2 ** len(channels) == 0
+        ), f"in_features ({in_features}) must be a multiple of downscale factor ({2**len(channels)})"
 
-        modules = [
-            nn.Conv1d(in_channels, channels[0], 1),
-            nn.GELU()
-        ]
+        modules = [nn.Conv1d(in_channels, channels[0], 1), nn.GELU()]
 
-        for in_channel, out_channel in zip(channels, channels[1:]+[channels[-1]]):
+        for in_channel, out_channel in zip(channels, channels[1:] + [channels[-1]]):
             modules += [
                 DownResConvBlock(in_channel, out_channel, 1),
             ]
 
-        n_features = int(in_features*.5**len(channels))
+        n_features = int(in_features * 0.5 ** len(channels))
 
         modules += [
             nn.Flatten(),
-            nn.Linear(n_features*channels[-1], 2*out_features)
+            nn.Linear(n_features * channels[-1], 2 * out_features),
         ]
 
         self.net = nn.Sequential(*modules)
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
         mean, logvar = self.net(x).chunk(2, dim=1)
         return mean, logvar
 
+
 class Decoder(nn.Module):
-    def __init__(self,
+    def __init__(
+        self,
         out_channels: int,
         in_features: int,
         out_features: int,
-        channels: list = None,
-    ) -> None:
+        channels: Sequence[int],
+    ) -> None:
         super(Decoder, self).__init__()
 
-        n_features = int(out_features/2**len(channels))
+        n_features = int(out_features / 2 ** len(channels))
 
         modules = [
-            nn.Linear(in_features, n_features*channels[0]),
-            nn.Unflatten(-1, (channels[0], n_features))
+            nn.Linear(in_features, n_features * channels[0]),
+            nn.Unflatten(-1, (channels[0], n_features)),
         ]
 
-        for in_channel, out_channel in zip(channels, channels[1:]+[channels[-1]]):
+        for in_channel, out_channel in zip(channels, channels[1:] + [channels[-1]]):
             modules += [
                 UpResConvBlock(in_channel, out_channel, 1),
             ]
 
-        modules += [
-            nn.Conv1d(channels[-1], out_channels, 1),
-            nn.GELU()
-        ]
+        modules += [nn.Conv1d(channels[-1], out_channels, 1), nn.GELU()]
 
         self.net = nn.Sequential(*modules)
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
         x = torch.tanh(self.net(x))
         return x
 
 
 class VAE(L.LightningModule):
-    def __init__(self, io_channels: int, io_features: int, latent_features: int, channels: list, learning_rate: float):
+    def __init__(
+        self,
+        io_channels: int,
+        io_features: int,
+        latent_features: int,
+        channels: Sequence[int],
+        learning_rate: float,
+    ) -> None:
         super().__init__()
+
         self.encoder = Encoder(io_channels, io_features, latent_features, channels)
+
         channels.reverse()
         self.decoder = Decoder(io_channels, latent_features, io_features, channels)
+
         self.latent_features = latent_features
         self.audio_loss_func = MultiResolutionSTFTLoss()
         self.learning_rate = learning_rate
 
     @torch.no_grad()
-    def sample(self, eps=None):
+    def sample(self, eps: Tensor = None) -> Tensor:
         if eps is None:
             eps = torch.rand((1, self.latent_features))
         return self.decoder(eps)
 
-    def loss_function(self, x, x_hat, mean, logvar):
+    def loss_function(
+        self, x: Tensor, x_hat: Tensor, mean: Tensor, logvar: Tensor
+    ) -> Tensor:
         audio_loss = self.audio_loss_func(x, x_hat)
         kld_loss = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
         return audio_loss + kld_loss
 
-    def reparameterize(self, mean, logvar):
-        std= torch.exp(0.5 * logvar)
+    def reparameterize(self, mean: Tensor, logvar: Tensor) -> Tensor:
+        std = torch.exp(0.5 * logvar)
         eps = torch.randn_like(std)
         return eps * std + mean
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> tuple[Tensor]:
         mean, logvar = self.encoder(x)
         z = self.reparameterize(mean, logvar)
         return self.decoder(z), mean, logvar
 
     def training_step(self, batch: Tensor, batch_idx: int, log: bool = True) -> Tensor:
         x_hat, mean, logvar = self.forward(batch)
         loss = self.loss_function(batch, x_hat, mean, logvar)
-        if log: self.log("train_loss", loss, prog_bar=True)
+        if log:
+            self.log("train_loss", loss, prog_bar=True)
         return loss
 
     def configure_optimizers(self) -> Optimizer:
         optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
         return optimizer
 
 
 class CVAE(L.LightningModule):
-    def __init__(self, io_channels: int, io_features: int, latent_features: int, channels: list, num_classes: int, learning_rate: float):
+    def __init__(
+        self,
+        io_channels: int,
+        io_features: int,
+        latent_features: int,
+        channels: Sequence[int],
+        num_classes: int,
+        learning_rate: float,
+    ):
         super().__init__()
+
         self.class_embedder = nn.Linear(num_classes, io_features)
         self.data_embedder = nn.Conv1d(io_channels, io_channels, kernel_size=1)
-        self.encoder = Encoder(io_channels+1, io_features, latent_features, channels)
+
+        self.encoder = Encoder(io_channels + 1, io_features, latent_features, channels)
+
         channels.reverse()
-        self.decoder = Decoder(io_channels, latent_features+num_classes, io_features, channels)
+        self.decoder = Decoder(
+            io_channels, latent_features + num_classes, io_features, channels
+        )
+
         self.num_classes = num_classes
         self.latent_features = latent_features
         self.audio_loss_func = MultiResolutionSTFTLoss()
         self.learning_rate = learning_rate
 
     @torch.no_grad()
-    def sample(self, c, eps=None):
+    def sample(self, c, eps=None) -> Tensor:
         c = nn.functional.one_hot(c, num_classes=self.num_classes).float().unsqueeze(0)
         if eps is None:
             eps = torch.rand((1, self.latent_features))
         z = torch.cat([eps, c], dim=1)
         return self.decoder(z)
 
-    def loss_function(self, x, x_hat, mean, logvar):
+    def loss_function(
+        self, x: Tensor, x_hat: Tensor, mean: Tensor, logvar: Tensor
+    ) -> Tensor:
         audio_loss = self.audio_loss_func(x, x_hat)
         kld_loss = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
         return audio_loss + kld_loss
 
-    def reparameterize(self, mean, logvar):
-        std= torch.exp(0.5 * logvar)
+    def reparameterize(self, mean: Tensor, logvar: Tensor) -> Tensor:
+        std = torch.exp(0.5 * logvar)
         eps = torch.randn_like(std)
         return eps * std + mean
 
-    def forward(self, x, c):
+    def forward(self, x: Tensor, c: Tensor) -> tuple[Tensor]:
         c = nn.functional.one_hot(c, num_classes=self.num_classes).float()
         c_embedding = self.class_embedder(c).unsqueeze(1)
         x_embedding = self.data_embedder(x)
-        x = torch.cat([x_embedding, c_embedding], dim = 1)
+        x = torch.cat([x_embedding, c_embedding], dim=1)
         mean, logvar = self.encoder(x)
         z = self.reparameterize(mean, logvar)
-        z = torch.cat([z, c], dim = 1)
+        z = torch.cat([z, c], dim=1)
         return self.decoder(z), mean, logvar
 
     def training_step(self, batch: Tensor, batch_idx: int, log: bool = True) -> Tensor:
         x, c = batch
         x_hat, mean, logvar = self.forward(x, c)
         loss = self.loss_function(x, x_hat, mean, logvar)
-        if log: self.log("train_loss", loss, prog_bar=True)
+        if log:
+            self.log("train_loss", loss, prog_bar=True)
         return loss
 
     def configure_optimizers(self) -> Optimizer:
         optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
-        return optimizer
+        return optimizer
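Both VAE and CVAE combine auraloss's multi-resolution STFT reconstruction term with the standard closed-form KL divergence between the encoder posterior N(mean, exp(logvar)) and a standard normal, and draw latents with the reparameterization trick. A self-contained check of those two pieces, with toy shapes that are assumptions rather than values from the commit:

```python
import torch


def reparameterize(mean: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
    # z = mean + sigma * eps with eps ~ N(0, I); keeps the sampling step differentiable.
    std = torch.exp(0.5 * logvar)
    return mean + std * torch.randn_like(std)


def kld(mean: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
    # Closed-form KL( N(mean, exp(logvar)) || N(0, I) ), summed over the batch.
    return -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())


mean = torch.zeros(2, 5)    # toy batch of 2, latent_features=5
logvar = torch.zeros(2, 5)  # unit variance -> KL is exactly zero
z = reparameterize(mean, logvar)
print(z.shape, kld(mean, logvar).item())  # torch.Size([2, 5]) 0.0
```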
model.py CHANGED
@@ -2,34 +2,81 @@ from cvae import CVAE
 import torch
 from typing import Sequence
 import streamlit as st
+from lightning import LightningModule
 
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
-instruments = ['bass_acoustic', 'brass_acoustic', 'flute_acoustic', 'guitar_acoustic', 'keyboard_acoustic', 'mallet_acoustic', 'organ_acoustic', 'reed_acoustic', 'string_acoustic', 'synth_lead_acoustic', 'vocal_acoustic', 'bass_synthetic', 'brass_synthetic', 'flute_synthetic', 'guitar_synthetic', 'keyboard_synthetic', 'mallet_synthetic', 'organ_synthetic', 'reed_synthetic', 'string_synthetic', 'synth_lead_synthetic', 'vocal_synthetic', 'bass_electronic', 'brass_electronic', 'flute_electronic', 'guitar_electronic', 'keyboard_electronic', 'mallet_electronic', 'organ_electronic', 'reed_electronic', 'string_electronic', 'synth_lead_electronic', 'vocal_electronic']
+def format_instruments(text: str) -> str:
+    stems = text.split(" ")[1:]
+    stems = [stem.replace(" ", "").lower() for stem in stems]
+    return "_".join(stems)
+
+
+def choice_to_tensor(choice: Sequence[str]) -> torch.Tensor:
+    choice = "_".join([format_instruments(i) for i in choice])
+    return torch.tensor(instruments.index(choice))
+
 
 @st.cache_resource
-def load_model(device):
+def load_model(device: str) -> LightningModule:
     return CVAE.load_from_checkpoint(
-        'epoch=77-step=2819778.ckpt',
+        "epoch=77-step=2819778.ckpt",
         io_channels=1,
-        io_features=16000*4,
+        io_features=16000 * 4,
         latent_features=5,
         channels=[32, 64, 128, 256, 512],
         num_classes=len(instruments),
-        learning_rate=1e-5
+        learning_rate=1e-5,
     ).to(device)
 
-model = load_model(device)
 
-def format(text):
-    stems = text.split(' ')[1:]
-    stems = [stem.replace(" ", "").lower() for stem in stems]
-    return '_'.join(stems)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+instruments = [
+    "bass_acoustic",
+    "brass_acoustic",
+    "flute_acoustic",
+    "guitar_acoustic",
+    "keyboard_acoustic",
+    "mallet_acoustic",
+    "organ_acoustic",
+    "reed_acoustic",
+    "string_acoustic",
+    "synth_lead_acoustic",
+    "vocal_acoustic",
+    "bass_synthetic",
+    "brass_synthetic",
+    "flute_synthetic",
+    "guitar_synthetic",
+    "keyboard_synthetic",
+    "mallet_synthetic",
+    "organ_synthetic",
+    "reed_synthetic",
+    "string_synthetic",
+    "synth_lead_synthetic",
+    "vocal_synthetic",
+    "bass_electronic",
+    "brass_electronic",
+    "flute_electronic",
+    "guitar_electronic",
+    "keyboard_electronic",
+    "mallet_electronic",
+    "organ_electronic",
+    "reed_electronic",
+    "string_electronic",
+    "synth_lead_electronic",
+    "vocal_electronic",
+]
+
+
+model = load_model(device)
 
-def choice_to_tensor(choice: Sequence[str]) -> torch.Tensor:
-    choice = '_'.join([format(i) for i in choice])
-    return torch.tensor(instruments.index(choice))
 
-def generate(choice: Sequence[str], params: Sequence[int]=None):
-    noise = torch.tensor(params).unsqueeze(0).to(device) if params else torch.randn(1, 5).to(device)
-    return model.sample(eps=noise, c = choice_to_tensor(choice).to(device)).cpu().numpy()[0]
+def generate(choice: Sequence[str], params: Sequence[int] = None):
+    noise = (
+        torch.tensor(params).unsqueeze(0).to(device)
+        if params
+        else torch.randn(1, 5).to(device)
+    )
+    return (
+        model.sample(eps=noise, c=choice_to_tensor(choice).to(device)).cpu().numpy()[0]
+    )
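Taken together with app.py, the flow is: emoji labels from the select boxes, formatted and joined into an entry of `instruments`, one-hot encoded as the condition, then decoded by `CVAE.sample` from either the slider values or random noise. A hypothetical driver, assuming the checkpoint named in `load_model` sits next to model.py (nothing in this snippet is part of the commit):

```python
# Hypothetical usage of the functions defined in model.py above.
from model import generate

# "🪕 Guitar" + "📯 Acoustic" formats to "guitar_acoustic", an entry of `instruments`.
waveform = generate(("🪕 Guitar", "📯 Acoustic"))  # params=None -> random latent noise
print(waveform.shape)  # expected (1, 64000): one channel, 4 s of audio at 16 kHz
```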