Matthijs Hollemans committed
Commit 0ecd9fb
Parent(s): e2a288e

add app
- .gitattributes +1 -0
- .gitignore +4 -0
- LICENSE +21 -0
- app.py +261 -0
- checkpoints/image2reverb_f22.ckpt +3 -0
- checkpoints/mono_odom_640x192/depth.pth +3 -0
- checkpoints/mono_odom_640x192/encoder.pth +3 -0
- checkpoints/mono_odom_640x192/pose.pth +3 -0
- checkpoints/mono_odom_640x192/pose_encoder.pth +3 -0
- checkpoints/mono_odom_640x192/poses.npy +3 -0
- examples/input.0c3f5013.png +0 -0
- examples/input.2238dc21.png +0 -0
- examples/input.321eef38.png +0 -0
- examples/input.4d280b40.png +0 -0
- examples/input.4e2f71f6.png +0 -0
- examples/input.5416407f.png +0 -0
- examples/input.67bc502e.png +0 -0
- examples/input.98773b90.png +0 -0
- examples/input.ac61500f.png +0 -0
- examples/input.c9ee9d49.png +0 -0
- image2reverb/dataset.py +96 -0
- image2reverb/layers.py +88 -0
- image2reverb/mel.py +20 -0
- image2reverb/model.py +207 -0
- image2reverb/networks.py +344 -0
- image2reverb/stft.py +23 -0
- image2reverb/util.py +167 -0
- model.jpg +0 -0
- requirements.txt +6 -0
.gitattributes
CHANGED
@@ -28,6 +28,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,4 @@
*.pyc
__pycache__/
.DS_Store
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
app.py
ADDED
@@ -0,0 +1,261 @@
# Hacked together using the code from https://github.com/nikhilsinghmus/image2reverb

import os, types
import numpy as np
import gradio as gr
import soundfile as sf
import scipy
import librosa.display
from PIL import Image

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from pytorch_lightning import Trainer

from image2reverb.model import Image2Reverb
from image2reverb.stft import STFT


predicted_ir = None
predicted_spectrogram = None
predicted_depthmap = None


def test_step(self, batch, batch_idx):
    spec, label, paths = batch
    examples = [os.path.splitext(os.path.basename(s))[0] for _, s in zip(*paths)]

    f, img = self.enc.forward(label)

    shape = (
        f.shape[0],
        (self._latent_dimension - f.shape[1]) if f.shape[1] < self._latent_dimension else f.shape[1],
        f.shape[2],
        f.shape[3]
    )
    z = torch.cat((f, torch.randn(shape, device=model.device)), 1)

    fake_spec = self.g(z)

    stft = STFT()
    y_f = [stft.inverse(s.squeeze()) for s in fake_spec]

    # TODO: bit hacky
    global predicted_ir, predicted_spectrogram, predicted_depthmap
    predicted_ir = y_f[0]

    s = fake_spec.squeeze().cpu().numpy()
    predicted_spectrogram = np.exp((((s + 1) * 0.5) * 19.5) - 17.5) - 1e-8

    img = (img + 1) * 0.5
    predicted_depthmap = img.cpu().squeeze().permute(1, 2, 0)[:,:,-1].squeeze().numpy()

    return {"test_audio": y_f, "test_examples": examples}


def test_epoch_end(self, outputs):
    if not self.test_callback:
        return

    examples = []
    audio = []

    for output in outputs:
        for i in range(len(output["test_examples"])):
            audio.append(output["test_audio"][i])
            examples.append(output["test_examples"][i])

    self.test_callback(examples, audio)


checkpoint_path = "./checkpoints/image2reverb_f22.ckpt"
encoder_path = None
depthmodel_path = "./checkpoints/mono_odom_640x192"
constant_depth = None
latent_dimension = 512

model = Image2Reverb(encoder_path, depthmodel_path)
m = torch.load(checkpoint_path, map_location=model.device)
model.load_state_dict(m["state_dict"])

model.test_step = types.MethodType(test_step, model)
model.test_epoch_end = types.MethodType(test_epoch_end, model)

image_transforms = transforms.Compose([
    transforms.Resize([224, 224], transforms.functional.InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


class Image2ReverbDemoDataset(Dataset):
    def __init__(self, image):
        self.image = Image.fromarray(image)
        self.stft = STFT()

    def __getitem__(self, index):
        img_tensor = image_transforms(self.image.convert("RGB"))
        return torch.zeros(1, int(5.94 * 22050)), img_tensor, ("", "")

    def __len__(self):
        return 1

    def name(self):
        return "Image2ReverbDemo"


def convolve(audio, reverb):
    # convolve audio with reverb
    wet_audio = np.concatenate((audio, np.zeros(reverb.shape)))
    wet_audio = scipy.signal.oaconvolve(wet_audio, reverb, "full")[:len(wet_audio)]

    # normalize audio to roughly -1 dB peak and remove DC offset
    wet_audio /= np.max(np.abs(wet_audio))
    wet_audio -= np.mean(wet_audio)
    wet_audio *= 0.9
    return wet_audio


def predict(image, audio):
    # image = numpy (height, width, channels)
    # audio = tuple (sample_rate, frames) or (sample_rate, (frames, channels))

    test_set = Image2ReverbDemoDataset(image)
    test_loader = torch.utils.data.DataLoader(test_set, num_workers=0, batch_size=1)
    trainer = Trainer(limit_test_batches=1)
    trainer.test(model, test_loader, verbose=True)

    # depthmap output
    depthmap_fig = plt.figure()
    plt.imshow(predicted_depthmap)
    plt.close()

    # spectrogram output
    spectrogram_fig = plt.figure()
    librosa.display.specshow(predicted_spectrogram, sr=22050, x_axis="time", y_axis="hz")
    plt.close()

    # plot the IR as a waveform
    waveform_fig = plt.figure()
    librosa.display.waveshow(predicted_ir, sr=22050, alpha=0.5)
    plt.close()

    # output audio as 16-bit signed integer
    ir = (22050, (predicted_ir * 32767).astype(np.int16))

    sample_rate, original_audio = audio

    # incoming audio is 16-bit signed integer, convert to float and normalize
    original_audio = original_audio.astype(np.float32) / 32768.0
    original_audio /= np.max(np.abs(original_audio))

    # resample reverb to sample_rate first, also normalize
    reverb = predicted_ir.copy()
    reverb = scipy.signal.resample_poly(reverb, up=sample_rate, down=22050)
    reverb /= np.max(np.abs(reverb))

    # stereo?
    if len(original_audio.shape) > 1:
        wet_left = convolve(original_audio[:, 0], reverb)
        wet_right = convolve(original_audio[:, 1], reverb)
        wet_audio = np.concatenate([wet_left[:, None], wet_right[:, None]], axis=1)
    else:
        wet_audio = convolve(original_audio, reverb)

    # 50% dry-wet mix
    mixed_audio = wet_audio * 0.5
    mixed_audio[:len(original_audio), ...] += original_audio * 0.9 * 0.5

    # convert back to 16-bit signed integer
    wet_audio = (wet_audio * 32767).astype(np.int16)
    mixed_audio = (mixed_audio * 32767).astype(np.int16)

    convolved_audio_100 = (sample_rate, wet_audio)
    convolved_audio_50 = (sample_rate, mixed_audio)

    return depthmap_fig, spectrogram_fig, waveform_fig, ir, convolved_audio_100, convolved_audio_50


title = "Image2Reverb: Cross-Modal Reverb Impulse Response Synthesis"

description = """
<b>Image2Reverb</b> predicts the acoustic reverberation of a given environment from a 2D image. <a href="https://arxiv.org/abs/2103.14201">Read the paper</a>

How to use: Choose an image of a room or other environment and an audio file.
The model will predict what the reverb of the room sounds like and apply this to the audio file.

First, the image is resized to 224×224. The monodepth model is used to predict a depthmap, which is added as an
additional channel to the image input. A ResNet-based encoder then converts the image into features, and
finally a GAN predicts the spectrogram of the reverb's impulse response.

<center><img src="file/model.jpg" width="870" height="297" alt="model architecture"></center>

The predicted impulse response is mono 22050 Hz. It is upsampled to the sampling rate of the audio
file and applied to both channels if the audio is stereo.
Generating the impulse response involves a certain amount of randomness, making it sound a little
different every time you try it.
"""

article = """
<div style='margin:20px auto;'>

<p>Based on original work by Nikhil Singh, Jeff Mentch, Jerry Ng, Matthew Beveridge, Iddo Drori.
<a href="https://web.media.mit.edu/~nsingh1/image2reverb/">Project Page</a> |
<a href="https://arxiv.org/abs/2103.14201">Paper</a> |
<a href="https://github.com/nikhilsinghmus/image2reverb">GitHub</a></p>

<pre>
@InProceedings{Singh_2021_ICCV,
    author    = {Singh, Nikhil and Mentch, Jeff and Ng, Jerry and Beveridge, Matthew and Drori, Iddo},
    title     = {Image2Reverb: Cross-Modal Reverb Impulse Response Synthesis},
    booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
    month     = {October},
    year      = {2021},
    pages     = {286-295}
}
</pre>

<p>🌠 Example images from <a href="https://web.media.mit.edu/~nsingh1/image2reverb/">the original project page</a>.</p>

<p>🎶 Example sound from <a href="https://freesound.org/people/ashesanddreams/sounds/610414/">Ashes and Dreams @ freesound.org</a> (CC BY 4.0 license). This is a mono 48 kHz recording that has no reverb on it.</p>

</div>
"""

audio_example = "examples/ashesanddreams.wav"

examples = [
    ["examples/input.4e2f71f6.png", audio_example],
    ["examples/input.321eef38.png", audio_example],
    ["examples/input.2238dc21.png", audio_example],
    ["examples/input.4d280b40.png", audio_example],
    ["examples/input.0c3f5013.png", audio_example],
    ["examples/input.98773b90.png", audio_example],
    ["examples/input.ac61500f.png", audio_example],
    ["examples/input.5416407f.png", audio_example],
]

gr.Interface(
    fn=predict,
    inputs=[
        gr.inputs.Image(label="Upload Image"),
        gr.inputs.Audio(label="Upload Audio", source="upload"),
    ],
    outputs=[
        gr.Plot(label="Depthmap"),
        gr.Plot(label="Impulse Response Spectrogram"),
        gr.Plot(label="Impulse Response Waveform"),
        gr.outputs.Audio(label="Impulse Response"),
        gr.outputs.Audio(label="Output Audio (100% Wet)"),
        gr.outputs.Audio(label="Output Audio (50% Dry, 50% Wet)"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()
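The convolution and dry/wet mixing that predict() performs can be tried in isolation, without loading the model. The sketch below is only illustrative: the exponentially decaying noise burst stands in for predicted_ir (its decay constant, the random seed, and the 440 Hz test tone are assumptions, not values from this Space), but the padding, overlap-add convolution, normalization, and 50% mix mirror the convolve() and predict() steps above.

import numpy as np
import scipy.signal

sample_rate = 22050                                      # assumed rate, matching the model output above
t = np.arange(int(1.5 * sample_rate)) / sample_rate

# Hypothetical stand-in for predicted_ir: an exponentially decaying noise burst.
rng = np.random.default_rng(0)
ir = rng.standard_normal(t.shape) * np.exp(-6.9 * t)     # roughly -60 dB after 1.5 s
ir /= np.max(np.abs(ir))

# Dry test signal: one second of a 440 Hz tone.
dry = 0.5 * np.sin(2 * np.pi * 440 * np.arange(sample_rate) / sample_rate)

# Same steps as convolve(): pad, overlap-add convolve, normalize, remove DC.
wet = np.concatenate((dry, np.zeros(ir.shape)))
wet = scipy.signal.oaconvolve(wet, ir, "full")[:len(wet)]
wet /= np.max(np.abs(wet))
wet -= np.mean(wet)
wet *= 0.9

# 50% dry-wet mix, as in predict().
mix = wet * 0.5
mix[:len(dry)] += dry * 0.9 * 0.5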
checkpoints/image2reverb_f22.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d61422e95dc963e258b68536dc8135633a999c3a85a5a80925878ff75ca092e3
size 687498725
checkpoints/mono_odom_640x192/depth.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3a2f542e274a5b0567e3118bc16aea4c2f44ba09df4a08a6c3a47d6d98285b72
size 12617260
checkpoints/mono_odom_640x192/encoder.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:acbf2534608f06be40eecd5026c505ebd0c1d9442fe5864abba1b5d90bff2e3e
size 46819013
checkpoints/mono_odom_640x192/pose.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b4da0fe66fc1f781a05d8c4778f33ffa1851c219cb7fd561328479f5b439707e
size 5259718
checkpoints/mono_odom_640x192/pose_encoder.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:df8659ecf4363335c13ffc4510ff34556715c7f6435707622c3641a7fe055eb2
size 46856589
checkpoints/mono_odom_640x192/poses.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:71a413ff381d4a58345e9152e0ca8d0b45a71e550df7730633a8cf7693edcced
size 76928
examples/input.0c3f5013.png
ADDED
examples/input.2238dc21.png
ADDED
examples/input.321eef38.png
ADDED
examples/input.4d280b40.png
ADDED
examples/input.4e2f71f6.png
ADDED
examples/input.5416407f.png
ADDED
examples/input.67bc502e.png
ADDED
examples/input.98773b90.png
ADDED
examples/input.ac61500f.png
ADDED
examples/input.c9ee9d49.png
ADDED
image2reverb/dataset.py
ADDED
@@ -0,0 +1,96 @@
import os
import soundfile
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset
from PIL import Image
from .stft import STFT
from .mel import LogMel


F_EXTENSIONS = [
    ".jpg", ".JPG", ".jpeg", ".JPEG",
    ".png", ".PNG", ".ppm", ".PPM", ".bmp", ".BMP", ".tiff", ".wav", ".WAV", ".aif", ".aiff", ".AIF", ".AIFF"
]


def is_image_audio_file(filename):
    return any(filename.endswith(extension) for extension in F_EXTENSIONS)


def make_dataset(dir, extensions=F_EXTENSIONS):
    images = []
    assert os.path.isdir(dir), "%s is not a valid directory." % dir

    for root, _, fnames in sorted(os.walk(dir)):
        for fname in fnames:
            if is_image_audio_file(fname):
                path = os.path.join(root, fname)
                images.append(path)

    return images


class Image2ReverbDataset(Dataset):
    def __init__(self, dataroot, phase="train", spec="stft"):
        self.root = dataroot
        self.stft = LogMel() if spec == "mel" else STFT()

        ### input A (images)
        dir_A = "_A"
        self.dir_A = os.path.join(self.root, phase + dir_A)
        self.A_paths = sorted(make_dataset(self.dir_A))

        ### input B (audio)
        dir_B = "_B"
        self.dir_B = os.path.join(self.root, phase + dir_B)
        self.B_paths = sorted(make_dataset(self.dir_B))

    def __getitem__(self, index):
        if index > len(self):
            return None
        ### input A (images)
        A_path = self.A_paths[index]
        A = Image.open(A_path)
        t = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
        A_tensor = t(A.convert("RGB"))

        ### input B (audio)
        B_path = self.B_paths[index]
        B, _ = soundfile.read(B_path)
        B_spec = self.stft.transform(B)

        return B_spec, A_tensor, (B_path, A_path)

    def __len__(self):
        return len(self.A_paths)

    def name(self):
        return "Image2Reverb"


class Image2ReverbDemoDataset(Dataset):
    def __init__(self, image_paths):
        if isinstance(image_paths, str) and os.path.isdir(image_paths):
            self.paths = sorted(make_dataset(image_paths, [".jpg", ".JPG", ".jpeg", ".JPEG", ".png", ".PNG", ".ppm", ".PPM", ".bmp", ".BMP", ".tiff"]))
        else:
            self.paths = sorted(image_paths)

        self.stft = STFT()

    def __getitem__(self, index):
        if index > len(self):
            return None
        ### input A (images)
        path = self.paths[index]
        img = Image.open(path)
        t = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
        img_tensor = t(img.convert("RGB"))

        return torch.zeros(1, int(5.94 * 22050)), img_tensor, ("", path)

    def __len__(self):
        return len(self.paths)

    def name(self):
        return "Image2ReverbDemo"
image2reverb/layers.py
ADDED
@@ -0,0 +1,88 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import kaiming_normal_, calculate_gain


class PixelWiseNormLayer(nn.Module):
    """PixelNorm layer. Implementation is from https://github.com/shanexn/pytorch-pggan."""
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x/torch.sqrt(torch.mean(x ** 2, dim=1, keepdim=True) + 1e-8)


class MiniBatchAverageLayer(nn.Module):
    """Minibatch stat concatenation layer. Implementation is from https://github.com/shanexn/pytorch-pggan."""
    def __init__(self, offset=1e-8):
        super().__init__()
        self.offset = offset

    def forward(self, x):
        stddev = torch.sqrt(torch.mean((x - torch.mean(x, dim=0, keepdim=True))**2, dim=0, keepdim=True) + self.offset)
        inject_shape = list(x.size())[:]
        inject_shape[1] = 1
        inject = torch.mean(stddev, dim=1, keepdim=True)
        inject = inject.expand(inject_shape)
        return torch.cat((x, inject), dim=1)


class EqualizedLearningRateLayer(nn.Module):
    """Applies equalized learning rate to the preceding layer. Implementation is from https://github.com/shanexn/pytorch-pggan."""
    def __init__(self, layer):
        super().__init__()
        self.layer_ = layer

        kaiming_normal_(self.layer_.weight, a=calculate_gain("conv2d"))
        self.layer_norm_constant_ = (torch.mean(self.layer_.weight.data ** 2)) ** 0.5
        self.layer_.weight.data.copy_(self.layer_.weight.data / self.layer_norm_constant_)

        self.bias_ = self.layer_.bias if self.layer_.bias else None
        self.layer_.bias = None

    def forward(self, x):
        self.layer_norm_constant_ = self.layer_norm_constant_.type(torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor)
        x = self.layer_norm_constant_ * x
        if self.bias_ is not None:
            x += self.bias.view(1, self.bias.size()[0], 1, 1)
        return x


class ConvBlock(nn.Module):
    """Layer to perform a convolution followed by ELU"""
    def __init__(self, in_channels, out_channels):
        super(ConvBlock, self).__init__()

        self.conv = Conv3x3(in_channels, out_channels)
        self.nonlin = nn.ELU(inplace=True)

    def forward(self, x):
        out = self.conv(x)
        out = self.nonlin(out)
        return out


class Conv3x3(nn.Module):
    """Layer to pad and convolve input"""
    def __init__(self, in_channels, out_channels, use_refl=True):
        super(Conv3x3, self).__init__()

        if use_refl:
            self.pad = nn.ReflectionPad2d(1)
        else:
            self.pad = nn.ZeroPad2d(1)
        self.conv = nn.Conv2d(int(in_channels), int(out_channels), 3)

    def forward(self, x):
        out = self.pad(x)
        out = self.conv(out)
        return out


def upsample(x):
    """Upsample input tensor by a factor of 2"""
    return F.interpolate(x, scale_factor=2, mode="nearest")
image2reverb/mel.py
ADDED
@@ -0,0 +1,20 @@
import numpy
import torch
import librosa


class LogMel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self._eps = 1e-8

    def transform(self, audio):
        m = librosa.feature.melspectrogram(audio/numpy.abs(audio).max())
        m = numpy.log(m + self._eps)
        return torch.Tensor(((m - m.mean()) / m.std()) * 0.8).unsqueeze(0)

    def inverse(self, spec):
        s = spec.cpu().detach().numpy()
        s = numpy.exp((s * 5) - 15.96) - self._eps  # Empirical mean and standard deviation over test set
        y = librosa.feature.inverse.mel_to_audio(s)  # Reconstruct audio
        return y/numpy.abs(y).max()
image2reverb/model.py
ADDED
@@ -0,0 +1,207 @@
import os
import json
import numpy
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
import torchvision
import pyroomacoustics
from .networks import Encoder, Generator, Discriminator
from .stft import STFT
from .mel import LogMel
from .util import compare_t60


# Hyperparameters
G_LR = 4e-4
D_LR = 2e-4
ENC_LR = 1e-5
ADAM_BETA = (0.0, 0.99)
ADAM_EPS = 1e-8
LAMBDA = 100


class Image2Reverb(pl.LightningModule):
    def __init__(self, encoder_path, depthmodel_path, latent_dimension=512, spec="stft", d_threshold=0.2, t60p=True, constant_depth=None, test_callback=None):
        super().__init__()
        self._latent_dimension = latent_dimension
        self._d_threshold = d_threshold
        self.constant_depth = constant_depth
        self.t60p = t60p
        self.confidence = {}
        self.tau = 50
        self.test_callback = test_callback
        self._opt = (d_threshold != None) and (d_threshold > 0) and (d_threshold < 1)
        self.enc = Encoder(encoder_path, depthmodel_path, constant_depth=self.constant_depth, device=self.device)
        self.g = Generator(latent_dimension, spec == "mel")
        self.d = Discriminator(365, spec == "mel")
        self.validation_inputs = []
        self.stft_type = spec

    def forward(self, x):
        f = self.enc.forward(x)[0]
        z = torch.cat((f, torch.randn((f.shape[0], (self._latent_dimension - f.shape[1]) if f.shape[1] < self._latent_dimension else f.shape[1], f.shape[2], f.shape[3]), device=self.device)), 1)
        return self.g(z)

    def training_step(self, batch, batch_idx, optimizer_idx):
        opts = None
        if self._opt:
            opts = self.optimizers()

        spec, label, p = batch
        spec.requires_grad = True  # For the backward pass, seems necessary for now

        # Forward passes through models
        f = self.enc.forward(label)[0]
        z = torch.cat((f, torch.randn((f.shape[0], (self._latent_dimension - f.shape[1]) if f.shape[1] < self._latent_dimension else f.shape[1], f.shape[2], f.shape[3]), device=self.device)), 1)
        fake_spec = self.g(z)
        d_fake = self.d(fake_spec.detach(), f)
        d_real = self.d(spec, f)

        # Train Generator or Encoder
        if optimizer_idx == 0 or optimizer_idx == 1:
            d_fake2 = self.d(fake_spec.detach(), f)
            G_loss1 = F.mse_loss(d_fake2, torch.ones(d_fake2.shape, device=self.device))
            G_loss2 = F.l1_loss(fake_spec, spec)

            G_loss = G_loss1 + (LAMBDA * G_loss2)
            if self.t60p:
                t60_err = torch.Tensor([compare_t60(torch.exp(a).sum(-2).squeeze(), torch.exp(b).sum(-2).squeeze()) for a, b in zip(spec, fake_spec)]).to(self.device).mean()
                G_loss += t60_err
                self.log("t60", t60_err, on_step=True, on_epoch=True, prog_bar=True)

            if self._opt:
                self.manual_backward(G_loss, self.opts[optimizer_idx])
                opts[optimizer_idx].step()
                opts[optimizer_idx].zero_grad()

            self.log("G", G_loss, on_step=True, on_epoch=True, prog_bar=True)

            return G_loss
        else:  # Train Discriminator
            l_fakeD = F.mse_loss(d_fake, torch.zeros(d_fake.shape, device=self.device))
            l_realD = F.mse_loss(d_real, torch.ones(d_real.shape, device=self.device))
            D_loss = (l_realD + l_fakeD)

            if self._opt and (D_loss > self._d_threshold):
                self.manual_backward(D_loss, self.opts[optimizer_idx])
                opts[optimizer_idx].step()
                opts[optimizer_idx].zero_grad()

            self.log("D", D_loss, on_step=True, on_epoch=True, prog_bar=True)

            return D_loss

    def configure_optimizers(self):
        g_optim = torch.optim.Adam(self.g.parameters(), lr=G_LR, betas=ADAM_BETA, eps=ADAM_EPS)
        d_optim = torch.optim.Adam(self.d.parameters(), lr=D_LR, betas=ADAM_BETA, eps=ADAM_EPS)
        enc_optim = torch.optim.Adam(self.enc.parameters(), lr=ENC_LR, betas=ADAM_BETA, eps=ADAM_EPS)
        return [enc_optim, g_optim, d_optim], []

    def validation_step(self, batch, batch_idx):
        spec, label, paths = batch
        examples = [os.path.basename(s[:s.rfind("_")]) for s, _ in zip(*paths)]

        # Forward passes through models
        f = self.enc.forward(label)[0]
        z = torch.cat((f, torch.randn((f.shape[0], (self._latent_dimension - f.shape[1]) if f.shape[1] < self._latent_dimension else f.shape[1], f.shape[2], f.shape[3]), device=self.device)), 1)
        fake_spec = self.g(z)

        # Get audio
        stft = LogMel() if self.stft_type == "mel" else STFT()
        y_r = [stft.inverse(s.squeeze()) for s in spec]
        y_f = [stft.inverse(s.squeeze()) for s in fake_spec]

        # RT60 error (in percentages)
        val_pct = 1
        try:
            f = lambda x: pyroomacoustics.experimental.rt60.measure_rt60(x, 22050)
            t60_r = [f(y) for y in y_r if len(y)]
            t60_f = [f(y) for y in y_f if len(y)]
            val_pct = numpy.mean([((t_b - t_a)/t_a) for t_a, t_b in zip(t60_r, t60_f)])
        except:
            pass

        return {"val_t60err": val_pct, "val_spec": fake_spec, "val_audio": torch.Tensor(y_f), "val_img": label, "val_examples": examples}

    def validation_epoch_end(self, outputs):
        if not len(outputs):
            return
        # Log mean T60 errors (in percentages)
        val_t60errmean = torch.Tensor(numpy.array([output["val_t60err"] for output in outputs])).mean()
        self.log("val_t60err", val_t60errmean, on_epoch=True, prog_bar=True)

        # Log generated spectrogram images
        grid = torchvision.utils.make_grid([torch.flip(x, [0]) for y in [output["val_spec"] for output in outputs] for x in y])
        self.logger.experiment.add_image("generated_spectrograms", grid, self.current_epoch)

        # Log model input images
        grid = torchvision.utils.make_grid([x for y in [output["val_img"] for output in outputs] for x in y])
        self.logger.experiment.add_image("input_images_with_depthmaps", grid, self.current_epoch)

        # Log generated audio examples
        for output in outputs:
            for example, audio in zip(output["val_examples"], output["val_audio"]):
                y = audio
                self.logger.experiment.add_audio("generated_audio_%s" % example, y, self.current_epoch, sample_rate=22050)

    def test_step(self, batch, batch_idx):
        spec, label, paths = batch
        examples = [os.path.basename(s[:s.rfind("_")]) for s, _ in zip(*paths)]

        # Forward passes through models
        f, img = self.enc.forward(label)
        img = (img + 1) * 0.5
        z = torch.cat((f, torch.randn((f.shape[0], (self._latent_dimension - f.shape[1]) if f.shape[1] < self._latent_dimension else f.shape[1], f.shape[2], f.shape[3]), device=self.device)), 1)
        fake_spec = self.g(z)

        # Get audio
        stft = LogMel() if self.stft_type == "mel" else STFT()
        y_r = [stft.inverse(s.squeeze()) for s in spec]
        y_f = [stft.inverse(s.squeeze()) for s in fake_spec]

        # RT60 error (in percentages)
        val_pct = 1
        f = lambda x: pyroomacoustics.experimental.rt60.measure_rt60(x, 22050)
        val_pct = []
        for y_real, y_fake in zip(y_r, y_f):
            try:
                t_a = f(y_real)
                t_b = f(y_fake)
                val_pct.append((t_b - t_a)/t_a)
            except:
                val_pct.append(numpy.nan)

        return {"test_t60err": val_pct, "test_spec": fake_spec, "test_audio": y_f, "test_img": img, "test_examples": examples}

    def test_epoch_end(self, outputs):
        if not self.test_callback:
            return

        examples = []
        t60 = []
        spec_images = []
        audio = []
        input_images = []
        input_depthmaps = []

        for output in outputs:
            for i in range(len(output["test_examples"])):
                img = output["test_img"][i]
                if img.shape[0] == 3:
                    rgb = img
                    img = torch.cat((rgb, torch.zeros((1, rgb.shape[1], rgb.shape[2]), device=self.device)), 0)
                t60.append(output["test_t60err"][i])
                spec_images.append(output["test_spec"][i].cpu().squeeze().detach().numpy())
                audio.append(output["test_audio"][i])
                input_images.append(img.cpu().squeeze().permute(1, 2, 0)[:,:,:-1].detach().numpy())
                input_depthmaps.append(img.cpu().squeeze().permute(1, 2, 0)[:,:,-1].squeeze().detach().numpy())
                examples.append(output["test_examples"][i])

        self.test_callback(examples, t60, spec_images, audio, input_images, input_depthmaps)

    @property
    def automatic_optimization(self) -> bool:
        return not self._opt
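The latent construction that forward, training_step, validation_step, and test_step all repeat pads the encoder feature map with Gaussian noise up to the 512-channel latent size. Spelled out on its own (a sketch; the batch size and the 365-channel stand-in feature map are illustrative, 365 matching the Places365 head used by the Encoder):

import torch

latent_dimension = 512
f = torch.randn(1, 365, 1, 1)  # stand-in for the encoder output: Places365 logits as a 1x1 feature map
# pad with noise channels up to the latent size (or keep the feature size if it is already large enough)
noise_channels = (latent_dimension - f.shape[1]) if f.shape[1] < latent_dimension else f.shape[1]
z = torch.cat((f, torch.randn(f.shape[0], noise_channels, f.shape[2], f.shape[3])), 1)
print(z.shape)  # torch.Size([1, 512, 1, 1])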
image2reverb/networks.py
ADDED
@@ -0,0 +1,344 @@
import os
import numpy
import torch
import torch.nn as nn
import torchvision.models as models
import torch.utils.model_zoo as model_zoo
from collections import OrderedDict
from .layers import PixelWiseNormLayer, MiniBatchAverageLayer, EqualizedLearningRateLayer, Conv3x3, ConvBlock, upsample


class Encoder(nn.Module):
    """Load encoder from pre-trained ResNet50 (places365 CNNs) model. Link: http://places2.csail.mit.edu/models_places365/resnet50_places365.pth.tar"""
    def __init__(self, model_weights, depth_model, constant_depth=None, device="cuda", train_enc=True):
        super().__init__()
        self.device = device
        self._constant_depth = constant_depth
        self.model = models.resnet50(num_classes=365)

        if model_weights:
            c = torch.load(model_weights, map_location=self.device)
            state_dict = {k.replace("module.", ""): v for k, v in c["state_dict"].items()}
            self.model.load_state_dict(state_dict)

        self._has_depth = False
        if depth_model:
            f = self.model.conv1.weight
            self.model.conv1.weight = torch.nn.Parameter(torch.cat((f, torch.randn(64, 1, 7, 7)), 1))
            self.model.to(self.device)

            encoder_path = os.path.join(depth_model, "encoder.pth")
            depth_decoder_path = os.path.join(depth_model, "depth.pth")
            self.depth_encoder = ResnetEncoder(18, False)
            loaded_dict_enc = torch.load(encoder_path, map_location=self.device)

            self.feed_height = loaded_dict_enc["height"]
            self.feed_width = loaded_dict_enc["width"]
            filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in self.depth_encoder.state_dict()}
            self.depth_encoder.load_state_dict(filtered_dict_enc)
            self.depth_encoder.to(self.device)
            self.depth_encoder.eval()

            self.depth_decoder = DepthDecoder(num_ch_enc=self.depth_encoder.num_ch_enc, scales=range(4))
            loaded_dict = torch.load(depth_decoder_path, map_location=self.device)
            self.depth_decoder.load_state_dict(loaded_dict, strict=False)
            self.depth_decoder.to(self.device)
            self.depth_decoder.eval()

            self._has_depth = True

        if train_enc:
            self.model.train()

    def forward(self, x):
        if self._has_depth:
            d = torch.full((x.shape[0], 1, x.shape[2], x.shape[3]), self._constant_depth, device=x.device) if self._constant_depth is not None else list(self.depth_decoder(self.depth_encoder(x)).values())[-1]
            x = torch.cat((x, d), 1)
        return self.model.forward(x).unsqueeze(-1).unsqueeze(-1), x


class Generator(nn.Module):
    """Build non-progressive variant of GANSynth generator."""
    def __init__(self, latent_size=512, mel_spec=False):  # Encoder output should contain 2048 values
        super().__init__()
        self.latent_size = latent_size
        self._mel_spec = mel_spec
        self.build_model()

    def forward(self, x):
        return self.model(x)

    def build_model(self):
        model = []
        # Input block
        if self._mel_spec:
            model.append(nn.Conv2d(self.latent_size, 256, kernel_size=(4, 2), stride=1, padding=2, bias=False))
        else:
            model.append(nn.Conv2d(self.latent_size, 256, kernel_size=8, stride=1, padding=7, bias=False))  # Modified to k=8, p=7 for our image dimensions (i.e. 512x512)
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Upsample(scale_factor=2, mode="nearest"))

        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Upsample(scale_factor=2, mode="nearest"))

        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Upsample(scale_factor=2, mode="nearest"))

        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Upsample(scale_factor=2, mode="nearest"))

        model.append(nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Upsample(scale_factor=2, mode="nearest"))

        model.append(nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Upsample(scale_factor=2, mode="nearest"))

        model.append(nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())
        model.append(nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(PixelWiseNormLayer())

        model.append(nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.Tanh())
        self.model = nn.Sequential(*model)


class Discriminator(nn.Module):
    def __init__(self, label_size=365, mel_spec=False):
        super().__init__()
        self._label_size = 365
        self._mel_spec = mel_spec
        self.build_model()

    def forward(self, x, l):
        d = self.model(x)
        if self._mel_spec:
            s = list(l.squeeze().shape)
            s[-1] = 19
            z = torch.cat((l.squeeze(), torch.zeros(s).type_as(x)), -1).reshape(d.shape[0], -1, 2, 4)
        else:
            s = list(l.squeeze().shape)
            s[-1] = 512 - s[-1]
            z = torch.cat((l.squeeze(), torch.zeros(s).type_as(x)), -1).reshape(d.shape[0], -1, 8, 8)
        k = torch.cat((d, z), 1)
        return self.output(k)

    def build_model(self):
        model = []
        model.append(nn.Conv2d(1, 32, kernel_size=1, stride=1, padding=0, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=False, count_include_pad=False))

        model.append(nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=False, count_include_pad=False))

        model.append(nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=False, count_include_pad=False))

        model.append(nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=False, count_include_pad=False))

        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=False, count_include_pad=False))

        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=False, count_include_pad=False))

        model.append(MiniBatchAverageLayer())
        model.append(nn.Conv2d(257, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))
        model.append(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False))
        model.append(EqualizedLearningRateLayer(model[-1]))
        model.append(nn.LeakyReLU(negative_slope=0.2))

        output = []  # After the label concatenation
        if self._mel_spec:
            output.append(nn.Conv2d(304, 256, kernel_size=1, stride=1, padding=0, bias=False))
        else:
            output.append(nn.Conv2d(264, 256, kernel_size=1, stride=1, padding=0, bias=False))

        output.append(nn.Conv2d(256, 1, kernel_size=1, stride=1, padding=0, bias=False))

        # model.append(nn.Sigmoid()) # Output probability (in [0, 1])
        self.model = nn.Sequential(*model)
        self.output = nn.Sequential(*output)


class ResnetEncoder(nn.Module):
    """Pytorch module for a resnet encoder"""
    def __init__(self, num_layers, pretrained, num_input_images=1):
        super(ResnetEncoder, self).__init__()

        self.num_ch_enc = numpy.array([64, 64, 128, 256, 512])

        resnets = {18: models.resnet18,
                   34: models.resnet34,
                   50: models.resnet50,
                   101: models.resnet101,
                   152: models.resnet152}

        if num_layers not in resnets:
            raise ValueError("{} is not a valid number of resnet layers".format(num_layers))

        if num_input_images > 1:
            self.encoder = resnet_multiimage_input(num_layers, pretrained, num_input_images)
        else:
            self.encoder = resnets[num_layers](pretrained)

        if num_layers > 34:
            self.num_ch_enc[1:] *= 4

    def forward(self, input_image):
        self.features = []
        x = (input_image - 0.45) / 0.225
        x = self.encoder.conv1(x)
        x = self.encoder.bn1(x)
        self.features.append(self.encoder.relu(x))
        self.features.append(self.encoder.layer1(self.encoder.maxpool(self.features[-1])))
        self.features.append(self.encoder.layer2(self.features[-1]))
        self.features.append(self.encoder.layer3(self.features[-1]))
        self.features.append(self.encoder.layer4(self.features[-1]))

        return self.features


class DepthDecoder(nn.Module):
    def __init__(self, num_ch_enc, scales=range(4), num_output_channels=1, use_skips=True):
        super(DepthDecoder, self).__init__()

        self.num_output_channels = num_output_channels
        self.use_skips = use_skips
        self.upsample_mode = "nearest"
        self.scales = scales

        self.num_ch_enc = num_ch_enc
        self.num_ch_dec = numpy.array([16, 32, 64, 128, 256])

        # decoder
        self.convs = OrderedDict()
        for i in range(4, -1, -1):
            # upconv_0
            num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + 1]
            num_ch_out = self.num_ch_dec[i]
            # self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out)
            setattr(self, "upconv_{}_0".format(i), ConvBlock(num_ch_in, num_ch_out))

            # upconv_1
            num_ch_in = self.num_ch_dec[i]
            if self.use_skips and i > 0:
                num_ch_in += self.num_ch_enc[i - 1]
            num_ch_out = self.num_ch_dec[i]
            # self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out)
            setattr(self, "upconv_{}_1".format(i), ConvBlock(num_ch_in, num_ch_out))

        for s in self.scales:
            # self.convs[("dispconv", s)] = Conv3x3(self.num_ch_dec[s], self.num_output_channels)
            setattr(self, "disp_{}".format(s), Conv3x3(self.num_ch_dec[s], self.num_output_channels))

        self.decoder = nn.ModuleList(
            [x for y in [[getattr(self, "upconv_{}_0".format(i)), getattr(self, "upconv_{}_1".format(i))] for i in range(4, -1, -1)] for x in y] +
            [getattr(self, "disp_{}".format(s)) for s in self.scales]
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_features):
        outputs = {}

        # decoder
        x = input_features[-1]
        for i in range(4, -1, -1):
            # x = self.convs[("upconv", i, 0)](x)
            x = getattr(self, "upconv_{}_0".format(i))(x)
            x = [upsample(x)]
            if self.use_skips and i > 0:
                x += [input_features[i - 1]]
            x = torch.cat(x, 1)
            # x = self.convs[("upconv", i, 1)](x)
            x = getattr(self, "upconv_{}_1".format(i))(x)
            if i in self.scales:
                outputs[("disp", i)] = self.sigmoid(getattr(self, "disp_{}".format(i))(x))
                # setattr(self, "outputs_disp_{}".format(i), self.sigmoid(getattr(self, "disp_{}".format(i))(x)))

        return outputs
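A quick way to sanity-check the Generator geometry is to push a random latent through it: from a 1×1 input, the k=8/p=7 convolution produces an 8×8 map and the six nearest-neighbour upsampling stages bring it to 512×512, matching the spectrogram size the STFT class produces. A minimal sketch, assuming the image2reverb package is importable and that randomly initialized weights are acceptable for a shape check:

import torch
from image2reverb.networks import Generator

g = Generator(latent_size=512, mel_spec=False)
z = torch.randn(1, 512, 1, 1)          # batch of one latent vector
with torch.no_grad():
    spec = g(z)
print(spec.shape)                       # expected: torch.Size([1, 1, 512, 512])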
image2reverb/stft.py
ADDED
@@ -0,0 +1,23 @@
import numpy
import torch
import librosa


class STFT(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self._eps = 1e-8

    def transform(self, audio):
        m = numpy.abs(librosa.stft(audio/numpy.abs(audio).max(), 1024, 256))[:-1,:]
        m = numpy.log(m + self._eps)
        m = (((m - m.min())/(m.max() - m.min()) * 2) - 1)
        return (torch.FloatTensor if torch.cuda.is_available() else torch.Tensor)(m * 0.8).unsqueeze(0)

    def inverse(self, spec):
        s = spec.cpu().detach().numpy()
        s = numpy.exp((((s + 1) * 0.5) * 19.5) - 17.5) - self._eps  # Empirical (average) min and max over test set
        rp = numpy.random.uniform(-numpy.pi, numpy.pi, s.shape)
        f = s * (numpy.cos(rp) + (1.j * numpy.sin(rp)))
        y = librosa.istft(f)  # Reconstruct audio
        return y/numpy.abs(y).max()
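Note that inverse() above discards phase entirely and substitutes uniform random phase before the inverse STFT, which is why the Space's output sounds slightly different on every run. A minimal round-trip sketch, assuming the image2reverb package is importable and that some mono 22.05 kHz recording is available (the filename is hypothetical):

import soundfile
from image2reverb.stft import STFT

stft = STFT()
audio, sr = soundfile.read("some_impulse_response.wav")  # hypothetical mono 22050 Hz file
spec = stft.transform(audio)          # log-magnitude spectrogram scaled to roughly [-0.8, 0.8]
recon = stft.inverse(spec.squeeze())  # random-phase reconstruction: differs from run to run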
image2reverb/util.py
ADDED
@@ -0,0 +1,167 @@
import os
import math
import numpy
import torch
import torch.fft
from PIL import Image


def compare_t60(a, b, sr=86):
    try:
        a = a.detach().clone().abs()
        b = b.detach().clone().abs()
        a = (a - a.min())/(a.max() - a.min())
        b = (b - b.min())/(b.max() - b.min())
        t_a = estimate_t60(a, sr)
        t_b = estimate_t60(b, sr)
        return abs((t_b - t_a)/t_a) * 100
    except Exception as error:
        return 100


def estimate_t60(audio, sr):
    fs = float(sr)
    audio = audio.detach().clone()

    decay_db = 20

    # The power of the impulse response in dB
    power = audio ** 2
    energy = torch.flip(torch.cumsum(torch.flip(power, [0]), 0), [0])  # Integration according to Schroeder

    # remove the possibly all zero tail
    i_nz = torch.max(torch.where(energy > 0)[0])
    n = energy[:i_nz]
    db = 10 * torch.log10(n)
    db = db - db[0]

    # -5 dB headroom
    i_5db = torch.min(torch.where(-5 - db > 0)[0])
    e_5db = db[i_5db]
    t_5db = i_5db / fs

    # after decay
    i_decay = torch.min(torch.where(-5 - decay_db - db > 0)[0])
    t_decay = i_decay / fs

    # compute the decay time
    decay_time = t_decay - t_5db
    est_rt60 = (60 / decay_db) * decay_time

    return est_rt60


def hilbert(x):  # hilbert transform
    N = x.shape[1]
    Xf = torch.fft.fft(x, n=None, dim=-1)
    h = torch.zeros(N)
    if N % 2 == 0:
        h[0] = h[N//2] = 1
        h[1:N//2] = 2
    else:
        h[0] = 1
        h[1:(N + 1)//2] = 2
    x = torch.fft.ifft(Xf * h)
    return x


def spectral_centroid(x):  # calculate the spectral centroid "brightness" of an audio input
    Xf = torch.abs(torch.fft.fft(x, n=None, dim=-1))  # take fft and abs of x
    norm_Xf = Xf / sum(sum(Xf))  # like probability mass function
    norm_freqs = torch.linspace(0, 1, Xf.shape[1])
    spectral_centroid = sum(sum(norm_freqs * norm_Xf))
    return spectral_centroid


# Converts a Tensor into a Numpy array
# |imtype|: the desired type of the converted numpy array
def tensor2im(image_tensor, imtype=numpy.uint8, normalize=True):
    if isinstance(image_tensor, list):
        image_numpy = []
        for i in range(len(image_tensor)):
            image_numpy.append(tensor2im(image_tensor[i], imtype, normalize))
        return image_numpy
    image_numpy = image_tensor.cpu().float().numpy()
    if normalize:
        image_numpy = (numpy.transpose(image_numpy, (1, 2, 0)) + 1) / 2.0 * 255.0
    else:
        image_numpy = numpy.transpose(image_numpy, (1, 2, 0)) * 255.0
    image_numpy = numpy.clip(image_numpy, 0, 255)
    if image_numpy.shape[2] == 1 or image_numpy.shape[2] > 3:
        image_numpy = image_numpy[:,:,0]
    return image_numpy.astype(imtype)

# Converts a one-hot tensor into a colorful label map
def tensor2label(label_tensor, n_label, imtype=numpy.uint8):
    if n_label == 0:
        return tensor2im(label_tensor, imtype)
    label_tensor = label_tensor.cpu().float()
    if label_tensor.size()[0] > 1:
        label_tensor = label_tensor.max(0, keepdim=True)[1]
    label_tensor = Colorize(n_label)(label_tensor)
    label_numpy = numpy.transpose(label_tensor.numpy(), (1, 2, 0))
    return label_numpy.astype(imtype)

def save_image(image_numpy, image_path):
    image_pil = Image.fromarray(image_numpy)
    image_pil.save(image_path)

def mkdirs(paths):
    if isinstance(paths, list) and not isinstance(paths, str):
        for path in paths:
            mkdir(path)
    else:
        mkdir(paths)

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)

###############################################################################
# Code from
# https://github.com/ycszen/pytorch-seg/blob/master/transform.py
# Modified so it complies with the Cityscape label map colors
###############################################################################
def uint82bin(n, count=8):
    """returns the binary of integer n, count refers to amount of bits"""
    return ''.join([str((n >> y) & 1) for y in range(count-1, -1, -1)])

def labelcolormap(N):
    if N == 35:  # cityscape
        cmap = numpy.array([(  0,  0,  0), (  0,  0,  0), (  0,  0,  0), (  0,  0,  0), (  0,  0,  0), (111, 74,  0), ( 81,  0, 81),
                            (128, 64,128), (244, 35,232), (250,170,160), (230,150,140), ( 70, 70, 70), (102,102,156), (190,153,153),
                            (180,165,180), (150,100,100), (150,120, 90), (153,153,153), (153,153,153), (250,170, 30), (220,220,  0),
                            (107,142, 35), (152,251,152), ( 70,130,180), (220, 20, 60), (255,  0,  0), (  0,  0,142), (  0,  0, 70),
                            (  0, 60,100), (  0,  0, 90), (  0,  0,110), (  0, 80,100), (  0,  0,230), (119, 11, 32), (  0,  0,142)],
                           dtype=numpy.uint8)
    else:
        cmap = numpy.zeros((N, 3), dtype=numpy.uint8)
        for i in range(N):
            r, g, b = 0, 0, 0
            id = i
            for j in range(7):
                str_id = uint82bin(id)
                r = r ^ (numpy.uint8(str_id[-1]) << (7-j))
                g = g ^ (numpy.uint8(str_id[-2]) << (7-j))
                b = b ^ (numpy.uint8(str_id[-3]) << (7-j))
                id = id >> 3
            cmap[i, 0] = r
            cmap[i, 1] = g
            cmap[i, 2] = b
    return cmap

class Colorize(object):
    def __init__(self, n=35):
        self.cmap = labelcolormap(n)
        self.cmap = torch.from_numpy(self.cmap[:n])

    def __call__(self, gray_image):
        size = gray_image.size()
        color_image = torch.ByteTensor(3, size[1], size[2]).fill_(0)

        for label in range(0, len(self.cmap)):
            mask = (label == gray_image[0]).cpu()
            color_image[0][mask] = self.cmap[label][0]
            color_image[1][mask] = self.cmap[label][1]
            color_image[2][mask] = self.cmap[label][2]

        return color_image
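estimate_t60 above follows Schroeder backward integration: square the impulse response, integrate the energy from the tail, and read the time the resulting curve takes to fall from -5 dB to -25 dB, scaled up to a 60 dB decay. A small sanity check with a synthetic envelope whose true RT60 is 0.5 s (the decay rate and sample rate are illustrative choices, not values from the repository):

import torch
from image2reverb.util import estimate_t60

sr = 22050
t = torch.arange(int(1.0 * sr)) / sr
ir = 10 ** (-3 * t / 0.5)      # amplitude falls 60 dB over 0.5 s
print(estimate_t60(ir, sr))    # expected to be close to 0.5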
model.jpg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,6 @@
torch
torchvision
pytorch_lightning
pyroomacoustics
soundfile
librosa