FoleyCrafter

Running on Zero

App Files Files Community

ymzhang319 committed on Jun 27

Commit

7f2690b

•

1 Parent(s): 8c104ce

init

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +276 -0
configs/auffusion/vocoder/config.json +37 -0
configs/train/train_semantic_adapter.yaml +54 -0
configs/train/train_temporal_adapter.yaml +48 -0
environment.yaml +24 -0
foleycrafter/data/dataset.py +175 -0
foleycrafter/data/video_transforms.py +400 -0
foleycrafter/models/adapters/attention_processor.py +653 -0
foleycrafter/models/adapters/ip_adapter.py +217 -0
foleycrafter/models/adapters/resampler.py +158 -0
foleycrafter/models/adapters/transformer.py +327 -0
foleycrafter/models/adapters/utils.py +81 -0
foleycrafter/models/auffusion/attention.py +669 -0
foleycrafter/models/auffusion/attention_processor.py +0 -0
foleycrafter/models/auffusion/dual_transformer_2d.py +156 -0
foleycrafter/models/auffusion/loaders/ip_adapter.py +520 -0
foleycrafter/models/auffusion/loaders/unet.py +1100 -0
foleycrafter/models/auffusion/resnet.py +685 -0
foleycrafter/models/auffusion/transformer_2d.py +460 -0
foleycrafter/models/auffusion/unet_2d_blocks.py +0 -0
foleycrafter/models/auffusion_unet.py +1260 -0
foleycrafter/models/specvqgan/data/greatesthit.py +993 -0
foleycrafter/models/specvqgan/data/impactset.py +778 -0
foleycrafter/models/specvqgan/data/transforms.py +685 -0
foleycrafter/models/specvqgan/data/utils.py +265 -0
foleycrafter/models/specvqgan/models/av_cond_transformer.py +528 -0
foleycrafter/models/specvqgan/models/cond_transformer.py +455 -0
foleycrafter/models/specvqgan/models/vqgan.py +397 -0
foleycrafter/models/specvqgan/modules/diffusionmodules/model.py +999 -0
foleycrafter/models/specvqgan/modules/discriminator/model.py +295 -0
foleycrafter/models/specvqgan/modules/losses/__init__.py +7 -0
foleycrafter/models/specvqgan/modules/losses/lpaps.py +152 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/configs/melception.yaml +24 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/configs/vggish.yaml +34 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/configs/vggish_gh.yaml +25 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/configs/vggish_gh_action.yaml +25 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/configs/vggish_gh_material.yaml +25 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/dataset.py +295 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/logger.py +90 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/loss.py +41 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/metrics.py +69 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/model.py +77 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/predict.py +90 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/predict_gh.py +66 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/train_melception.py +241 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/train_vggishish.py +199 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/train_vggishish_gh.py +218 -0
foleycrafter/models/specvqgan/modules/losses/vggishish/transforms.py +98 -0
foleycrafter/models/specvqgan/modules/losses/vqperceptual.py +209 -0
foleycrafter/models/specvqgan/modules/misc/class_cond.py +21 -0

app.py ADDED Viewed

	@@ -0,0 +1,276 @@

+import torch
+import torchvision
+import os
+import os.path as osp
+import random
+from argparse import ArgumentParser
+from datetime import datetime
+import gradio as gr
+from foleycrafter.utils.util import build_foleycrafter, read_frames_with_moviepy
+from foleycrafter.pipelines.auffusion_pipeline import denormalize_spectrogram
+from foleycrafter.pipelines.auffusion_pipeline import Generator
+from foleycrafter.models.time_detector.model import VideoOnsetNet
+from foleycrafter.models.specvqgan.onset_baseline.utils import torch_utils
+from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
+from huggingface_hub import snapshot_download
+from diffusers import DDIMScheduler, EulerDiscreteScheduler, PNDMScheduler
+import soundfile as sf
+from moviepy.editor import AudioFileClip, VideoFileClip
+# Make Gradio keep its temporary files under ./tmp.
+os.environ['GRADIO_TEMP_DIR'] = './tmp'
+# NOTE(review): sample_idx is not referenced anywhere else in the visible part of this file.
+sample_idx = 0
+# Sampler names offered in the UI dropdown, mapped to diffusers scheduler classes.
+scheduler_dict = {
+    "DDIM": DDIMScheduler,
+    "Euler": EulerDiscreteScheduler,
+    "PNDM": PNDMScheduler,
+}
+# Extra CSS handed to gr.Blocks; it styles elements carrying the "toolbutton" class.
+# NOTE(review): "margin-buttom" is not a CSS property and looks like a typo -- confirm the intended rule.
+css = """
+.toolbutton {
+    margin-buttom: 0em 0em 0em 0em;
+    max-width: 2.5em;
+    min-width: 2.5em !important;
+    height: 2.5em;
+}
+"""
+# Command-line options. They are parsed here at import time, not under the __main__ guard.
+parser = ArgumentParser()
+parser.add_argument("--config", type=str, default="example/config/base.yaml")
+parser.add_argument("--server-name", type=str, default="0.0.0.0")
+parser.add_argument("--port", type=int, default=11451)
+parser.add_argument("--share", action="store_true")
+parser.add_argument("--save-path", default="samples")
+args = parser.parse_args()
+# Default content of the "Negative prompt" textbox (empty string).
+N_PROMPT = (
+    ""
+)
+class FoleyController:
+    def __init__(self):
+        # config dirs
+        self.basedir = os.getcwd()
+        self.model_dir = os.path.join(self.basedir, "models")
+        self.savedir = os.path.join(self.basedir, args.save_path, datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S"))
+        self.savedir_sample = os.path.join(self.savedir, "sample")
+        os.makedirs(self.savedir, exist_ok=True)
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.pipeline = None
+        self.loaded = False
+        self.load_model()
+    def load_model(self):
+        gr.Info("Start Load Models...")
+        print("Start Load Models...")
+        # download ckpt
+        pretrained_model_name_or_path = 'auffusion/auffusion-full-no-adapter'
+        if not os.path.isdir(pretrained_model_name_or_path):
+            pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path, local_dir='models/auffusion')
+        fc_ckpt = 'ymzhang319/FoleyCrafter'
+        if not os.path.isdir(fc_ckpt):
+            fc_ckpt = snapshot_download(fc_ckpt, local_dir='models/')
+        # set model config
+        temporal_ckpt_path = osp.join(self.model_dir, 'temporal_adapter.ckpt')
+        # load vocoder
+        vocoder_config_path= "./models/auffusion"
+        self.vocoder       = Generator.from_pretrained(
+                        vocoder_config_path,
+                        subfolder="vocoder").to(self.device)
+        # load time detector
+        time_detector_ckpt = osp.join(osp.join(self.model_dir, 'timestamp_detector.pth.tar'))
+        time_detector      = VideoOnsetNet(False)
+        self.time_detector, _   = torch_utils.load_model(time_detector_ckpt, time_detector, strict=True, device=self.device)
+        self.pipeline = build_foleycrafter().to(self.device)
+        ckpt = torch.load(temporal_ckpt_path)
+        # load temporal adapter
+        if 'state_dict' in ckpt.keys():
+            ckpt = ckpt['state_dict']
+        load_gligen_ckpt = {}
+        for key, value in ckpt.items():
+            if key.startswith('module.'):
+                load_gligen_ckpt[key[len('module.'):]] = value
+            else:
+                load_gligen_ckpt[key] = value
+        m, u        = self.pipeline.controlnet.load_state_dict(load_gligen_ckpt, strict=False)
+        print(f"### Control Net missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
+        self.image_processor      = CLIPImageProcessor()
+        self.image_encoder        = CLIPVisionModelWithProjection.from_pretrained('h94/IP-Adapter', subfolder='models/image_encoder').to(self.device)
+        self.pipeline.load_ip_adapter(fc_ckpt, subfolder='semantic', weight_name='semantic_adapter.bin', image_encoder_folder=None)
+        gr.Info("Load Finish!")
+        print("Load Finish!")
+        self.loaded = True
+        return "Load"
+    def foley(
+        self,
+        input_video,
+        prompt_textbox,
+        negative_prompt_textbox,
+        ip_adapter_scale,
+        temporal_scale,
+        sampler_dropdown,
+        sample_step_slider,
+        cfg_scale_slider,
+        seed_textbox,
+    ):
+        vision_transform_list = [
+            torchvision.transforms.Resize((128, 128)),
+            torchvision.transforms.CenterCrop((112, 112)),
+            torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        ]
+        video_transform = torchvision.transforms.Compose(vision_transform_list)
+        if not self.loaded:
+            raise gr.Error("Error with loading model")
+        generator  = torch.Generator()
+        if seed_textbox != "":
+            torch.manual_seed(int(seed_textbox))
+            generator.manual_seed(int(seed_textbox))
+        max_frame_nums = 15
+        frames, duration  = read_frames_with_moviepy(input_video, max_frame_nums=max_frame_nums)
+        if duration >= 10:
+            duration = 10
+        time_frames = torch.FloatTensor(frames).permute(0, 3, 1, 2)
+        time_frames = video_transform(time_frames)
+        time_frames = {'frames': time_frames.unsqueeze(0).permute(0, 2, 1, 3, 4)}
+        preds       = self.time_detector(time_frames)
+        preds       = torch.sigmoid(preds)
+        # duration
+        time_condition = [-1 if preds[0][int(i / (1024 / 10 * duration) * max_frame_nums)] < 0.5 else 1 for i in range(int(1024 / 10 * duration))]
+        time_condition = time_condition + [-1] * (1024 - len(time_condition))
+        # w -> b c h w
+        time_condition = torch.FloatTensor(time_condition).unsqueeze(0).unsqueeze(0).unsqueeze(0).repeat(1, 1, 256, 1)
+        images = self.image_processor(images=frames, return_tensors="pt").to(self.device)
+        image_embeddings = self.image_encoder(**images).image_embeds
+        image_embeddings = torch.mean(image_embeddings, dim=0, keepdim=True).unsqueeze(0).unsqueeze(0)
+        neg_image_embeddings = torch.zeros_like(image_embeddings)
+        image_embeddings = torch.cat([neg_image_embeddings, image_embeddings], dim=1)
+        self.pipeline.set_ip_adapter_scale(ip_adapter_scale)
+        sample = self.pipeline(
+            prompt=prompt_textbox,
+            negative_prompt=negative_prompt_textbox,
+            ip_adapter_image_embeds=image_embeddings,
+            image=time_condition,
+            controlnet_conditioning_scale=float(temporal_scale),
+            num_inference_steps=sample_step_slider,
+            height=256,
+            width=1024,
+            output_type="pt",
+            generator=generator,
+        )
+        name = 'output'
+        audio_img = sample.images[0]
+        audio     = denormalize_spectrogram(audio_img)
+        audio     = self.vocoder.inference(audio, lengths=160000)[0]
+        audio_save_path = osp.join(self.savedir_sample, 'audio')
+        os.makedirs(audio_save_path, exist_ok=True)
+        audio = audio[:int(duration * 16000)]
+        save_path = osp.join(audio_save_path, f'{name}.wav')
+        sf.write(save_path, audio, 16000)
+        audio = AudioFileClip(osp.join(audio_save_path, f'{name}.wav'))
+        video = VideoFileClip(input_video)
+        audio = audio.subclip(0, duration)
+        video.audio = audio
+        video = video.subclip(0, duration)
+        video.write_videofile(osp.join(self.savedir_sample, f'{name}.mp4'))
+        save_sample_path = os.path.join(self.savedir_sample, f"{name}.mp4")
+        return save_sample_path
+# Module-level controller shared by all requests; its __init__ calls load_model(),
+# so every model is loaded as soon as this module is imported.
+controller = FoleyController()
+def ui():
+    with gr.Blocks(css=css) as demo:
+        gr.HTML(
+            "<div align='center'><font size='6'>FoleyCrafter: Bring Silent Videos to Life with Lifelike and Synchronized Sounds</font></div>"
+        )
+        with gr.Row():
+            gr.Markdown(
+                "<div align='center'><font size='5'><a href='https://foleycrafter.github.io/'>Project Page</a> &ensp;"  # noqa
+                "<a href='https://arxiv.org/abs/xxxx.xxxxx/'>Paper</a> &ensp;"
+                "<a href='https://github.com/open-mmlab/foleycrafter'>Code</a> &ensp;"
+                "<a href='https://huggingface.co/spaces/ymzhang319/FoleyCrafter'>Demo</a> </font></div>"
+            )
+        with gr.Column(variant="panel"):
+            with gr.Row(equal_height=False):
+                with gr.Column():
+                    with gr.Row():
+                        init_img = gr.Video(label="Input Video")
+                    with gr.Row():
+                        prompt_textbox = gr.Textbox(value='', label="Prompt", lines=1)
+                    with gr.Row():
+                        negative_prompt_textbox = gr.Textbox(value=N_PROMPT, label="Negative prompt", lines=1)
+                    with gr.Row():
+                        sampler_dropdown = gr.Dropdown(
+                            label="Sampling method",
+                            choices=list(scheduler_dict.keys()),
+                            value=list(scheduler_dict.keys())[0],
+                        )
+                        sample_step_slider = gr.Slider(
+                            label="Sampling steps", value=25, minimum=10, maximum=100, step=1
+                        )
+                    cfg_scale_slider = gr.Slider(label="CFG Scale", value=7.5, minimum=0, maximum=20)
+                    ip_adapter_scale = gr.Slider(label="Visual Content Scale", value=1.0, minimum=0, maximum=1)
+                    temporal_scale = gr.Slider(label="Temporal Align Scale", value=0., minimum=0., maximum=1.0)
+                    with gr.Row():
+                        seed_textbox = gr.Textbox(label="Seed", value=42)
+                        seed_button = gr.Button(value="\U0001f3b2", elem_classes="toolbutton")
+                    seed_button.click(fn=lambda x: random.randint(1, 1e8), outputs=[seed_textbox], queue=False)
+                    generate_button = gr.Button(value="Generate", variant="primary")
+                result_video = gr.Video(label="Generated Audio", interactive=False)
+            generate_button.click(
+                fn=controller.foley,
+                inputs=[
+                    init_img,
+                    prompt_textbox,
+                    negative_prompt_textbox,
+                    ip_adapter_scale,
+                    temporal_scale,
+                    sampler_dropdown,
+                    sample_step_slider,
+                    cfg_scale_slider,
+                    seed_textbox,
+                ],
+                outputs=[result_video],
+            )
+    return demo
+# Script entry point: build the UI, enable request queueing and start the server
+# with the host, port and share settings taken from the command line.
+if __name__ == "__main__":
+    demo = ui()
+    # NOTE(review): the meaning of the positional argument to queue() depends on the
+    # installed Gradio version -- confirm it is the intended setting.
+    demo.queue(3)
+    demo.launch(server_name=args.server_name, server_port=args.port, share=args.share)

configs/auffusion/vocoder/config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+    "resblock": "1",
+    "num_gpus": 0,
+    "batch_size": 16,
+    "learning_rate": 0.0002,
+    "adam_b1": 0.8,
+    "adam_b2": 0.99,
+    "lr_decay": 0.999,
+    "seed": 1234,
+    "upsample_rates": [5,4,4,2],
+    "upsample_kernel_sizes": [11,8,8,4],
+    "upsample_initial_channel": 512,
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "segment_size": 5120,
+    "num_mels": 256,
+    "num_freq": 2049,
+    "n_fft": 2048,
+    "hop_size": 160,
+    "win_size": 1024,
+    "sampling_rate": 16000,
+    "fmin": 0,
+    "fmax": null,
+    "fmax_for_loss": null,
+    "num_workers": 4,
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321",
+        "world_size": 1
+    }
+}

configs/train/train_semantic_adapter.yaml ADDED Viewed

	@@ -0,0 +1,54 @@

+output_dir: "outputs"
+pretrained_model_path: ""
+motion_module_path: "models/mm_sd_v15_v2.ckpt"
+train_data:
+  csv_path: "./curated.csv"
+  audio_fps: 48000
+  audio_size: 480000
+validation_data:
+  prompts:
+    - "./data/input/lighthouse.png"
+    - "./data/input/guitar.png"
+    - "./data/input/lion.png"
+    - "./data/input/gun.png"
+  num_inference_steps: 25
+  guidance_scale: 7.5
+  sample_size: 512
+trainable_modules:
+  - 'to_k_ip'
+  - 'to_v_ip'
+audio_unet_checkpoint_path: ""
+learning_rate:    1.0e-4
+train_batch_size: 1 # max for mixed
+gradient_accumulation_steps: 1
+max_train_epoch:      -1
+max_train_steps:      200000
+checkpointing_epochs: 4000
+checkpointing_steps:  500
+validation_steps:       3000
+validation_steps_tuple: [2, 50, 300, 1000]
+global_seed: 42
+mixed_precision_training: true
+is_debug: False
+resume_ckpt: ""
+# params for adapter
+init_from_ip_adapter: false
+always_null_text: false
+reverse_null_text_prob: true
+frame_wise_condition: true

configs/train/train_temporal_adapter.yaml ADDED Viewed

	@@ -0,0 +1,48 @@

+output_dir: "outputs"
+pretrained_model_path: ""
+motion_module_path: "models/mm_sd_v15_v2.ckpt"
+train_data:
+  csv_path: "./curated.csv"
+  audio_fps: 48000
+  audio_size: 480000
+validation_data:
+  prompts:
+    - "./data/input/lighthouse.png"
+    - "./data/input/guitar.png"
+    - "./data/input/lion.png"
+    - "./data/input/gun.png"
+  num_inference_steps: 25
+  guidance_scale: 7.5
+  sample_size: 512
+trainable_modules:
+  - 'time_conv_in.'
+  - 'conv_in.'
+video_unet_checkpoint_path: "models/vggsound_unet.ckpt"
+audio_unet_checkpoint_path: ""
+learning_rate:    5.0e-5
+train_batch_size: 1 # max for mixed
+gradient_accumulation_steps: 1
+max_train_epoch:      -1
+max_train_steps:      500000
+checkpointing_epochs: 4000
+checkpointing_steps:  500
+validation_steps:       3000
+validation_steps_tuple: [2, 300, 1000]
+global_seed: 42
+mixed_precision_training: true
+is_debug: False
+resume_ckpt: ""
+zero_no_label_mel: false

environment.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+name: foleycrafter
+channels:
+  - pytorch
+  - nvidia
+dependencies:
+  - python=3.10
+  - pytorch=2.2.0
+  - torchvision=0.17.0
+  - pytorch-cuda=11.8
+  - pip
+  - pip:
+    - diffusers==0.25.1
+    - transformers==4.30.2
+    - xformers
+    - imageio==2.33.1
+    - decord==0.6.0
+    - einops
+    - omegaconf
+    - safetensors
+    - gradio
+    - tqdm==4.66.1
+    - soundfile==0.12.1
+    - wandb
+    - moviepy==1.0.3

foleycrafter/data/dataset.py ADDED Viewed

	@@ -0,0 +1,175 @@

+import torch
+import torchvision.transforms as transforms
+from torch.utils.data.dataset import Dataset
+import torch.distributed as dist
+import torchaudio
+import torchvision
+import torchvision.io
+import os, io, csv, math, random
+import os.path as osp
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from einops import rearrange
+import glob
+from decord import VideoReader, AudioReader
+import decord
+from copy import deepcopy
+import pickle
+from petrel_client.client import Client
+import sys
+sys.path.append('./')
+from foleycrafter.data import video_transforms
+from foleycrafter.utils.util import \
+    random_audio_video_clip, get_full_indices, video_tensor_to_np, get_video_frames
+from foleycrafter.utils.spec_to_mel import wav_tensor_to_fbank, read_wav_file_io, load_audio, normalize_wav, pad_wav
+from foleycrafter.utils.converter import get_mel_spectrogram_from_audio, pad_spec, normalize, normalize_spectrogram
+def zero_rank_print(s):
+    if (not dist.is_initialized()) or (dist.is_initialized() and dist.get_rank() == 0): print("### " + s, flush=True)
+@torch.no_grad()
+def get_mel(audio_data, audio_cfg):
+    # mel shape: (n_mels, T)
+    mel = torchaudio.transforms.MelSpectrogram(
+        sample_rate=audio_cfg["sample_rate"],
+        n_fft=audio_cfg["window_size"],
+        win_length=audio_cfg["window_size"],
+        hop_length=audio_cfg["hop_size"],
+        center=True,
+        pad_mode="reflect",
+        power=2.0,
+        norm=None,
+        onesided=True,
+        n_mels=64,
+        f_min=audio_cfg["fmin"],
+        f_max=audio_cfg["fmax"],
+    ).to(audio_data.device)
+    mel = mel(audio_data)
+    # we use log mel spectrogram as input
+    mel = torchaudio.transforms.AmplitudeToDB(top_db=None)(mel)
+    return mel  # (T, n_mels)
+def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5):
+    """
+    PARAMS
+    ------
+    C: compression factor
+    """
+    return normalize_fun(torch.clamp(x, min=clip_val) * C)
+class CPU_Unpickler(pickle.Unpickler):
+    def find_class(self, module, name):
+        if module == 'torch.storage' and name == '_load_from_bytes':
+            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
+        else:
+            return super().find_class(module, name)
+class AudioSetStrong(Dataset):
+    # read feature and audio
+    def __init__(
+        self,
+    ):
+        super().__init__()
+        self.data_path = 'data/AudioSetStrong/train/feature'
+        self.data_list = list(self._client.list(self.data_path))
+        self.length = len(self.data_list)
+        # get video feature
+        self.video_path = 'data/AudioSetStrong/train/video'
+        vision_transform_list = [
+            transforms.Resize((128, 128)),
+            transforms.CenterCrop((112, 112)),
+            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        ]
+        self.video_transform = transforms.Compose(vision_transform_list)
+    def get_batch(self, idx):
+        embeds = self.data_list[idx]
+        mel           = embeds['mel']
+        save_bsz      = mel.shape[0]
+        audio_info    = embeds['audio_info']
+        text_embeds   = embeds['text_embeds']
+        # audio_info['label_list'] = np.array(audio_info['label_list'])
+        audio_info_array = np.array(audio_info['label_list'])
+        prompts = []
+        for i in range(save_bsz):
+            prompts.append(', '.join(audio_info_array[i, :audio_info['event_num'][i]].tolist()))
+        # import ipdb; ipdb.set_trace()
+        # read videos
+        videos = None
+        for video_name in audio_info['audio_name']:
+            video_bytes  = self._client.Get(osp.join(self.video_path, video_name+'.mp4'))
+            video_bytes  = io.BytesIO(video_bytes)
+            video_reader = VideoReader(video_bytes)
+            video        = video_reader.get_batch(get_full_indices(video_reader)).asnumpy()
+            video        = get_video_frames(video, 150)
+            video        = torch.from_numpy(video).permute(0, 3, 1, 2).contiguous().float()
+            video        = self.video_transform(video)
+            video        = video.unsqueeze(0)
+            if videos is None:
+                videos = video
+            else:
+                videos = torch.cat([videos, video], dim=0)
+            # video        = torch.from_numpy(video).permute(0, 3, 1, 2).contiguous()
+        assert videos is not None, 'no video read'
+        return mel, audio_info, text_embeds, prompts, videos
+    def __len__(self):
+        return self.length
+    def __getitem__(self, idx):
+        while True:
+            try:
+                mel, audio_info, text_embeds, prompts, videos = self.get_batch(idx)
+                break
+            except Exception as e:
+                zero_rank_print(' >>> load error <<<')
+                idx = random.randint(0, self.length-1)
+        sample = dict(mel=mel, audio_info=audio_info, text_embeds=text_embeds, prompts=prompts, videos=videos)
+        return sample
+class VGGSound(Dataset):
+    # read feature and audio
+    def __init__(
+        self,
+    ):
+        super().__init__()
+        self.data_path = 'data/VGGSound/train/video'
+        self.visual_data_path = 'data/VGGSound/train/feature'
+        self.embeds_list = glob.glob(f'{self.data_path}/*.pt')
+        self.visual_list = glob.glob(f'{self.visual_data_path}/*.pt')
+        self.length = len(self.embeds_list)
+    def get_batch(self, idx):
+        embeds = torch.load(self.embeds_list[idx], map_location='cpu')
+        visual_embeds = torch.load(self.visual_list[idx], map_location='cpu')
+        # audio_embeds  = embeds['audio_embeds']
+        visual_embeds = visual_embeds['visual_embeds']
+        video_name    = embeds['video_name']
+        text          = embeds['text']
+        mel           = embeds['mel']
+        audio = mel
+        return visual_embeds, audio, text
+    def __len__(self):
+        return self.length
+    def __getitem__(self, idx):
+        while True:
+            try:
+                visual_embeds, audio, text = self.get_batch(idx)
+                break
+            except Exception as e:
+                zero_rank_print('load error')
+                idx = random.randint(0, self.length-1)
+        sample = dict(visual_embeds=visual_embeds, audio=audio, text=text)
+        return sample

foleycrafter/data/video_transforms.py ADDED Viewed

	@@ -0,0 +1,400 @@

+import torch
+import random
+import numbers
+from torchvision.transforms import RandomCrop, RandomResizedCrop
+def _is_tensor_video_clip(clip):
+    if not torch.is_tensor(clip):
+        raise TypeError("clip should be Tensor. Got %s" % type(clip))
+    if not clip.ndimension() == 4:
+        raise ValueError("clip should be 4D. Got %dD" % clip.dim())
+    return True
+def crop(clip, i, j, h, w):
+    """
+    Args:
+        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+    """
+    if len(clip.size()) != 4:
+        raise ValueError("clip should be a 4D tensor")
+    return clip[..., i : i + h, j : j + w]
+def resize(clip, target_size, interpolation_mode):
+    if len(target_size) != 2:
+        raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
+    return torch.nn.functional.interpolate(clip, size=target_size, mode=interpolation_mode, align_corners=False)
+def resize_scale(clip, target_size, interpolation_mode):
+    if len(target_size) != 2:
+        raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
+    _, _, H, W = clip.shape
+    scale_ = target_size[0] / min(H, W)
+    return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False)
+def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
+    """
+    Do spatial cropping and resizing to the video clip
+    Args:
+        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+        i (int): i in (i,j) i.e coordinates of the upper left corner.
+        j (int): j in (i,j) i.e coordinates of the upper left corner.
+        h (int): Height of the cropped region.
+        w (int): Width of the cropped region.
+        size (tuple(int, int)): height and width of resized clip
+    Returns:
+        clip (torch.tensor): Resized and cropped clip. Size is (T, C, H, W)
+    """
+    if not _is_tensor_video_clip(clip):
+        raise ValueError("clip should be a 4D torch.tensor")
+    clip = crop(clip, i, j, h, w)
+    clip = resize(clip, size, interpolation_mode)
+    return clip
+def center_crop(clip, crop_size):
+    if not _is_tensor_video_clip(clip):
+        raise ValueError("clip should be a 4D torch.tensor")
+    h, w = clip.size(-2), clip.size(-1)
+    th, tw = crop_size
+    if h < th or w < tw:
+        raise ValueError("height and width must be no smaller than crop_size")
+    i = int(round((h - th) / 2.0))
+    j = int(round((w - tw) / 2.0))
+    return crop(clip, i, j, th, tw)
+def random_shift_crop(clip):
+    '''
+    Slide along the long edge, with the short edge as crop size
+    '''
+    if not _is_tensor_video_clip(clip):
+        raise ValueError("clip should be a 4D torch.tensor")
+    h, w = clip.size(-2), clip.size(-1)
+    if h <= w:
+        long_edge = w
+        short_edge = h
+    else:
+        long_edge = h
+        short_edge =w
+    th, tw = short_edge, short_edge
+    i = torch.randint(0, h - th + 1, size=(1,)).item()
+    j = torch.randint(0, w - tw + 1, size=(1,)).item()
+    return crop(clip, i, j, th, tw)
+def to_tensor(clip):
+    """
+    Convert tensor data type from uint8 to float, divide value by 255.0 and
+    permute the dimensions of clip tensor
+    Args:
+        clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
+    Return:
+        clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
+    """
+    _is_tensor_video_clip(clip)
+    if not clip.dtype == torch.uint8:
+        raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
+    # return clip.float().permute(3, 0, 1, 2) / 255.0
+    return clip.float() / 255.0
+def normalize(clip, mean, std, inplace=False):
+    """
+    Args:
+        clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
+        mean (tuple): pixel RGB mean. Size is (3)
+        std (tuple): pixel standard deviation. Size is (3)
+    Returns:
+        normalized clip (torch.tensor): Size is (T, C, H, W)
+    """
+    if not _is_tensor_video_clip(clip):
+        raise ValueError("clip should be a 4D torch.tensor")
+    if not inplace:
+        clip = clip.clone()
+    mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
+    print(mean)
+    std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
+    clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
+    return clip
+def hflip(clip):
+    """
+    Args:
+        clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
+    Returns:
+        flipped clip (torch.tensor): Size is (T, C, H, W)
+    """
+    if not _is_tensor_video_clip(clip):
+        raise ValueError("clip should be a 4D torch.tensor")
+    return clip.flip(-1)
+class RandomCropVideo:
+    def __init__(self, size):
+        if isinstance(size, numbers.Number):
+            self.size = (int(size), int(size))
+        else:
+            self.size = size
+    def __call__(self, clip):
+        """
+        Args:
+            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+        Returns:
+            torch.tensor: randomly cropped video clip.
+                size is (T, C, OH, OW)
+        """
+        i, j, h, w = self.get_params(clip)
+        return crop(clip, i, j, h, w)
+    def get_params(self, clip):
+        h, w = clip.shape[-2:]
+        th, tw = self.size
+        if h < th or w < tw:
+            raise ValueError(f"Required crop size {(th, tw)} is larger than input image size {(h, w)}")
+        if w == tw and h == th:
+            return 0, 0, h, w
+        i = torch.randint(0, h - th + 1, size=(1,)).item()
+        j = torch.randint(0, w - tw + 1, size=(1,)).item()
+        return i, j, th, tw
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(size={self.size})"
+class UCFCenterCropVideo:
+    def __init__(
+        self,
+        size,
+        interpolation_mode="bilinear",
+    ):
+        if isinstance(size, tuple):
+            if len(size) != 2:
+                raise ValueError(f"size should be tuple (height, width), instead got {size}")
+            self.size = size
+        else:
+            self.size = (size, size)
+        self.interpolation_mode = interpolation_mode
+    def __call__(self, clip):
+        """
+        Args:
+            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+        Returns:
+            torch.tensor: scale resized / center cropped video clip.
+                size is (T, C, crop_size, crop_size)
+        """
+        clip_resize = resize_scale(clip=clip, target_size=self.size, interpolation_mode=self.interpolation_mode)
+        clip_center_crop = center_crop(clip_resize, self.size)
+        return clip_center_crop
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
+class KineticsRandomCropResizeVideo:
+    '''
+    Slide along the long edge, with the short edge as crop size. And resie to the desired size.
+    '''
+    def __init__(
+            self,
+            size,
+            interpolation_mode="bilinear",
+         ):
+        if isinstance(size, tuple):
+                if len(size) != 2:
+                    raise ValueError(f"size should be tuple (height, width), instead got {size}")
+                self.size = size
+        else:
+            self.size = (size, size)
+        self.interpolation_mode = interpolation_mode
+    def __call__(self, clip):
+        clip_random_crop = random_shift_crop(clip)
+        clip_resize = resize(clip_random_crop, self.size, self.interpolation_mode)
+        return clip_resize
+class CenterCropVideo:
+    def __init__(
+        self,
+        size,
+        interpolation_mode="bilinear",
+    ):
+        if isinstance(size, tuple):
+            if len(size) != 2:
+                raise ValueError(f"size should be tuple (height, width), instead got {size}")
+            self.size = size
+        else:
+            self.size = (size, size)
+        self.interpolation_mode = interpolation_mode
+    def __call__(self, clip):
+        """
+        Args:
+            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
+        Returns:
+            torch.tensor: center cropped video clip.
+                size is (T, C, crop_size, crop_size)
+        """
+        clip_center_crop = center_crop(clip, self.size)
+        return clip_center_crop
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
+class NormalizeVideo:
+    """
+    Normalize the video clip by mean subtraction and division by standard deviation
+    Args:
+        mean (3-tuple): pixel RGB mean
+        std (3-tuple): pixel RGB standard deviation
+        inplace (boolean): whether do in-place normalization
+    """
+    def __init__(self, mean, std, inplace=False):
+        self.mean = mean
+        self.std = std
+        self.inplace = inplace
+    def __call__(self, clip):
+        """
+        Args:
+            clip (torch.tensor): video clip must be normalized. Size is (C, T, H, W)
+        """
+        return normalize(clip, self.mean, self.std, self.inplace)
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
+class ToTensorVideo:
+    """
+    Convert tensor data type from uint8 to float, divide value by 255.0 and
+    permute the dimensions of clip tensor
+    """
+    def __init__(self):
+        pass
+    def __call__(self, clip):
+        """
+        Args:
+            clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
+        Return:
+            clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
+        """
+        return to_tensor(clip)
+    def __repr__(self) -> str:
+        return self.__class__.__name__
+class RandomHorizontalFlipVideo:
+    """
+    Flip the video clip along the horizontal direction with a given probability
+    Args:
+        p (float): probability of the clip being flipped. Default value is 0.5
+    """
+    def __init__(self, p=0.5):
+        self.p = p
+    def __call__(self, clip):
+        """
+        Args:
+            clip (torch.tensor): Size is (T, C, H, W)
+        Return:
+            clip (torch.tensor): Size is (T, C, H, W)
+        """
+        if random.random() < self.p:
+            clip = hflip(clip)
+        return clip
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(p={self.p})"
+#  ------------------------------------------------------------
+#  ---------------------  Sampling  ---------------------------
+#  ------------------------------------------------------------
+class TemporalRandomCrop(object):
+	"""Temporally crop the given frame indices at a random location.
+	Args:
+		size (int): Desired length of frames will be seen in the model.
+	"""
+	def __init__(self, size):
+		self.size = size
+	def __call__(self, total_frames):
+		rand_end = max(0, total_frames - self.size - 1)
+		begin_index = random.randint(0, rand_end)
+		end_index = min(begin_index + self.size, total_frames)
+		return begin_index, end_index
+if __name__ == '__main__':
+    from torchvision import transforms
+    import torchvision.io as io
+    import numpy as np
+    from torchvision.utils import save_image
+    import os
+    vframes, aframes, info = io.read_video(
+    filename='./v_Archery_g01_c03.avi',
+    pts_unit='sec',
+    output_format='TCHW'
+    )
+    trans = transforms.Compose([
+        ToTensorVideo(),
+        RandomHorizontalFlipVideo(),
+        UCFCenterCropVideo(512),
+        # NormalizeVideo(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
+        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
+    ])
+    target_video_len = 32
+    frame_interval = 1
+    total_frames = len(vframes)
+    print(total_frames)
+    temporal_sample = TemporalRandomCrop(target_video_len * frame_interval)
+    # Sampling video frames
+    start_frame_ind, end_frame_ind = temporal_sample(total_frames)
+    # print(start_frame_ind)
+    # print(end_frame_ind)
+    assert end_frame_ind - start_frame_ind >= target_video_len
+    frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, target_video_len, dtype=int)
+    select_vframes = vframes[frame_indice]
+    select_vframes_trans = trans(select_vframes)
+    select_vframes_trans_int = ((select_vframes_trans * 0.5 + 0.5) * 255).to(dtype=torch.uint8)
+    io.write_video('./test.avi', select_vframes_trans_int.permute(0, 2, 3, 1), fps=8)
+    for i in range(target_video_len):
+        save_image(select_vframes_trans[i], os.path.join('./test000', '%04d.png' % i), normalize=True, value_range=(-1, 1))

foleycrafter/models/adapters/attention_processor.py ADDED Viewed

	@@ -0,0 +1,653 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Union
+from einops import rearrange, repeat
+from diffusers.utils import logging
+from foleycrafter.models.adapters.ip_adapter import MLPProjModel
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+class AttnProcessor(nn.Module):
+    r"""
+    Default processor for performing attention-related computations.
+
+    The constructor accepts `hidden_size` and `cross_attention_dim` but neither
+    stores nor uses them; the processor holds no parameters of its own.
+    """
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+    ):
+        super().__init__()
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        # Runs attention using the projections and helpers exposed by `attn`.
+        # With `encoder_hidden_states=None` the keys/values come from
+        # `hidden_states` itself (self-attention).
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            # Flatten (batch, channel, height, width) to (batch, height * width, channel).
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        # The sequence length is taken from whichever tensor supplies keys/values.
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            # group_norm is applied with the feature axis moved to dim 1, then moved back.
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            # Restore the 4D layout that was flattened above.
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class IPAttnProcessor(nn.Module):
+    r"""
+    Attention processor for IP-Adapter.
+
+    The last `num_tokens` entries of `encoder_hidden_states` are treated as the
+    image-prompt tokens: they get their own key/value projections (`to_k_ip`,
+    `to_v_ip`) and their attention output is added, weighted by `scale`, to the
+    output computed from the remaining tokens.
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        scale (`float`, defaults to 1.0):
+            the weight scale of image prompt.
+        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
+            The context length of the image features.
+    """
+    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.num_tokens = num_tokens
+        # Separate key/value projections for the image-prompt tokens.
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        # NOTE(review): `ip_hidden_states` is only assigned when
+        # `encoder_hidden_states` is provided; calling this processor with
+        # `encoder_hidden_states=None` fails at the `to_k_ip` projection below.
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            # Flatten (batch, channel, height, width) to (batch, height * width, channel).
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        # Here the sequence length still includes the trailing image-prompt tokens.
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            # get encoder_hidden_states, ip_hidden_states
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states, ip_hidden_states = (
+                encoder_hidden_states[:, :end_pos, :],
+                encoder_hidden_states[:, end_pos:, :],
+            )
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # for ip-adapter
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+        ip_key = attn.head_to_batch_dim(ip_key)
+        ip_value = attn.head_to_batch_dim(ip_value)
+        # The image-prompt branch is computed without an attention mask.
+        ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
+        # Kept on the instance so the image-prompt attention can be inspected after the call.
+        self.attn_map = ip_attention_probs
+        ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
+        ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            # Restore the 4D layout that was flattened above.
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class AttnProcessor2_0(torch.nn.Module):
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+
+    The constructor accepts `hidden_size` and `cross_attention_dim` but neither
+    stores nor uses them. It raises `ImportError` when
+    `torch.nn.functional.scaled_dot_product_attention` is not available.
+    """
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+    ):
+        super().__init__()
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        # Runs attention through F.scaled_dot_product_attention using the
+        # projections exposed by `attn`. With `encoder_hidden_states=None` the
+        # keys/values come from `hidden_states` itself (self-attention).
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            # Flatten (batch, channel, height, width) to (batch, height * width, channel).
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        # The sequence length is taken from whichever tensor supplies keys/values.
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        # The per-head width is derived from the key projection's output width.
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        # Split into heads: (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim).
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, heads * head_dim).
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            # Restore the 4D layout that was flattened above.
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class AttnProcessor2_0WithProjection(torch.nn.Module):
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+
+    In addition to the attention computation, this variant owns a `visual_proj`
+    linear layer (1024 -> 768). The layer is created in the constructor, but the
+    call that would apply it to `encoder_hidden_states` is commented out in
+    `__call__`, so it currently takes no part in the computation.
+    """
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+    ):
+        super().__init__()
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+        # Fixed input/output widths of the (currently unused) projection layer.
+        self.before_proj_size = 1024
+        self.after_proj_size = 768
+        self.visual_proj = nn.Linear(self.before_proj_size, self.after_proj_size)
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        # Same computation as a plain scaled-dot-product attention processor;
+        # `self.visual_proj` is not applied (see the commented-out line below).
+        residual = hidden_states
+        # encoder_hidden_states = self.visual_proj(encoder_hidden_states)
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            # Flatten (batch, channel, height, width) to (batch, height * width, channel).
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        # The sequence length is taken from whichever tensor supplies keys/values.
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        # The per-head width is derived from the key projection's output width.
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        # Split into heads: (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim).
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, heads * head_dim).
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            # Restore the 4D layout that was flattened above.
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class IPAttnProcessor2_0(torch.nn.Module):
+    r"""
+    Attention processor for IP-Adapater for PyTorch 2.0.
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        scale (`float`, defaults to 1.0):
+            the weight scale of image prompt.
+        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
+            The context length of the image features.
+    """
+    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
+        super().__init__()
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.num_tokens = num_tokens
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            # get encoder_hidden_states, ip_hidden_states
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states, ip_hidden_states = (
+                encoder_hidden_states[:, :end_pos, :],
+                encoder_hidden_states[:, end_pos:, :],
+            )
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # for ip-adapter
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+        ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        ip_hidden_states = F.scaled_dot_product_attention(
+            query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+        )
+        with torch.no_grad():
+            self.attn_map = query @ ip_key.transpose(-2, -1).softmax(dim=-1)
+            #print(self.attn_map.shape)
+        ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        ip_hidden_states = ip_hidden_states.to(query.dtype)
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+## for controlnet
+class CNAttnProcessor:
+    r"""
+    Default processor for performing attention-related computations.
+
+    When `encoder_hidden_states` is given, its last `num_tokens` entries are
+    dropped before keys and values are computed, so only the leading tokens
+    take part in the attention.
+    """
+    def __init__(self, num_tokens=4):
+        # Number of trailing context tokens to discard in __call__.
+        self.num_tokens = num_tokens
+    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            # Flatten (batch, channel, height, width) to (batch, height * width, channel).
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        # Here the sequence length is read before the trailing tokens are dropped.
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states = encoder_hidden_states[:, :end_pos]  # only use text
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            # Restore the 4D layout that was flattened above.
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class CNAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+
+    When `encoder_hidden_states` is given, its last `num_tokens` entries are
+    dropped before keys and values are computed. The constructor raises
+    `ImportError` when `torch.nn.functional.scaled_dot_product_attention` is
+    not available.
+    """
+    def __init__(self, num_tokens=4):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+        # Number of trailing context tokens to discard in __call__.
+        self.num_tokens = num_tokens
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            # Flatten (batch, channel, height, width) to (batch, height * width, channel).
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        # Here the sequence length is read before the trailing tokens are dropped.
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states = encoder_hidden_states[:, :end_pos]  # only use text
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        # The per-head width is derived from the key projection's output width.
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        # Split into heads: (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim).
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, heads * head_dim).
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            # Restore the 4D layout that was flattened above.
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states

foleycrafter/models/adapters/ip_adapter.py ADDED Viewed

	@@ -0,0 +1,217 @@

+import torch
+import torch.nn as nn
+import numpy as np
+import os
+from typing import List
+from diffusers import StableDiffusionPipeline
+from diffusers.pipelines.controlnet import MultiControlNetModel
+from PIL import Image
+from safetensors import safe_open
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+from foleycrafter.models.adapters.resampler import Resampler
+from foleycrafter.models.adapters.utils import is_torch2_available
+class IPAdapter(torch.nn.Module):
+    """IP-Adapter"""
+    def __init__(self, unet, image_proj_model, adapter_modules, ckpt_path=None):
+        super().__init__()
+        self.unet = unet
+        self.image_proj_model = image_proj_model
+        self.adapter_modules = adapter_modules
+        if ckpt_path is not None:
+            self.load_from_checkpoint(ckpt_path)
+    def forward(self, noisy_latents, timesteps, encoder_hidden_states, image_embeds):
+        ip_tokens = self.image_proj_model(image_embeds)
+        encoder_hidden_states = torch.cat([encoder_hidden_states, ip_tokens], dim=1)
+        # Predict the noise residual
+        noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample
+        return noise_pred
+    def load_from_checkpoint(self, ckpt_path: str):
+        # Calculate original checksums
+        orig_ip_proj_sum = torch.sum(torch.stack([torch.sum(p) for p in self.image_proj_model.parameters()]))
+        orig_adapter_sum = torch.sum(torch.stack([torch.sum(p) for p in self.adapter_modules.parameters()]))
+        state_dict = torch.load(ckpt_path, map_location="cpu")
+        # Load state dict for image_proj_model and adapter_modules
+        self.image_proj_model.load_state_dict(state_dict["image_proj"], strict=True)
+        self.adapter_modules.load_state_dict(state_dict["ip_adapter"], strict=True)
+        # Calculate new checksums
+        new_ip_proj_sum = torch.sum(torch.stack([torch.sum(p) for p in self.image_proj_model.parameters()]))
+        new_adapter_sum = torch.sum(torch.stack([torch.sum(p) for p in self.adapter_modules.parameters()]))
+        # Verify if the weights have changed
+        assert orig_ip_proj_sum != new_ip_proj_sum, "Weights of image_proj_model did not change!"
+        assert orig_adapter_sum != new_adapter_sum, "Weights of adapter_modules did not change!"
+        print(f"Successfully loaded weights from checkpoint {ckpt_path}")
+class VideoProjModel(torch.nn.Module):
+    """Projection Model"""
+    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=1, video_frame=50):
+        super().__init__()
+        self.cross_attention_dim = cross_attention_dim
+        self.clip_extra_context_tokens = clip_extra_context_tokens
+        self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+        self.video_frame = video_frame
+    def forward(self, image_embeds):
+        embeds = image_embeds
+        clip_extra_context_tokens = self.proj(embeds)
+        clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
+        return clip_extra_context_tokens
+class ImageProjModel(torch.nn.Module):
+    """Projection Model"""
+    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
+        super().__init__()
+        self.cross_attention_dim = cross_attention_dim
+        self.clip_extra_context_tokens = clip_extra_context_tokens
+        self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+    def forward(self, image_embeds):
+        embeds = image_embeds
+        clip_extra_context_tokens = self.proj(embeds).reshape(
+            -1, self.clip_extra_context_tokens, self.cross_attention_dim
+        )
+        clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
+        return clip_extra_context_tokens
+class MLPProjModel(torch.nn.Module):
+    """SD model with image prompt"""
+    def zero_initialize(module):
+        for param in module.parameters():
+            param.data.zero_()
+    def zero_initialize_last_layer(module):
+        last_layer = None
+        for module_name, layer in module.named_modules():
+            if isinstance(layer, torch.nn.Linear):
+                last_layer = layer
+        if last_layer is not None:
+            last_layer.weight.data.zero_()
+            last_layer.bias.data.zero_()
+    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024):
+        super().__init__()
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(clip_embeddings_dim, clip_embeddings_dim),
+            torch.nn.GELU(),
+            torch.nn.Linear(clip_embeddings_dim, cross_attention_dim),
+            torch.nn.LayerNorm(cross_attention_dim)
+        )
+        # zero initialize the last layer
+        # self.zero_initialize_last_layer()
+    def forward(self, image_embeds):
+        clip_extra_context_tokens = self.proj(image_embeds)
+        return clip_extra_context_tokens
+class V2AMapperMLP(torch.nn.Module):
+    def __init__(self, cross_attention_dim=512, clip_embeddings_dim=512, mult=4):
+        super().__init__()
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(clip_embeddings_dim, clip_embeddings_dim * mult),
+            torch.nn.GELU(),
+            torch.nn.Linear(clip_embeddings_dim * mult, cross_attention_dim),
+            torch.nn.LayerNorm(cross_attention_dim)
+        )
+    def forward(self, image_embeds):
+        clip_extra_context_tokens = self.proj(image_embeds)
+        return clip_extra_context_tokens
+class TimeProjModel(torch.nn.Module):
+    def __init__(self, positive_len, out_dim, feature_type="text-only", frame_nums:int=64):
+        super().__init__()
+        self.positive_len = positive_len
+        self.out_dim = out_dim
+        self.position_dim = frame_nums
+        if isinstance(out_dim, tuple):
+            out_dim = out_dim[0]
+        if feature_type == "text-only":
+            self.linears = nn.Sequential(
+                nn.Linear(self.positive_len + self.position_dim, 512),
+                nn.SiLU(),
+                nn.Linear(512, 512),
+                nn.SiLU(),
+                nn.Linear(512, out_dim),
+            )
+            self.null_positive_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))
+        elif feature_type == "text-image":
+            self.linears_text = nn.Sequential(
+                nn.Linear(self.positive_len + self.position_dim, 512),
+                nn.SiLU(),
+                nn.Linear(512, 512),
+                nn.SiLU(),
+                nn.Linear(512, out_dim),
+            )
+            self.linears_image = nn.Sequential(
+                nn.Linear(self.positive_len + self.position_dim, 512),
+                nn.SiLU(),
+                nn.Linear(512, 512),
+                nn.SiLU(),
+                nn.Linear(512, out_dim),
+            )
+            self.null_text_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))
+            self.null_image_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))
+        # self.null_position_feature = torch.nn.Parameter(torch.zeros([self.position_dim]))
+    def forward(
+        self,
+        boxes,
+        masks,
+        positive_embeddings=None,
+    ):
+        masks = masks.unsqueeze(-1)
+        # # embedding position (it may includes padding as placeholder)
+        # xyxy_embedding = self.fourier_embedder(boxes)  # B*N*4 -> B*N*C
+        # # learnable null embedding
+        # xyxy_null = self.null_position_feature.view(1, 1, -1)
+        # # replace padding with learnable null embedding
+        # xyxy_embedding = xyxy_embedding * masks + (1 - masks) * xyxy_null
+        time_embeds = boxes
+        # positionet with text only information
+        if positive_embeddings is not None:
+            # learnable null embedding
+            positive_null = self.null_positive_feature.view(1, 1, -1)
+            # replace padding with learnable null embedding
+            positive_embeddings = positive_embeddings * masks + (1 - masks) * positive_null
+            objs = self.linears(torch.cat([positive_embeddings, time_embeds], dim=-1))
+        # positionet with text and image infomation
+        else:
+            raise NotImplementedError
+        return objs

foleycrafter/models/adapters/resampler.py ADDED Viewed

	@@ -0,0 +1,158 @@

+# modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
+# and https://github.com/lucidrains/imagen-pytorch/blob/main/imagen_pytorch/imagen_pytorch.py
+import math
+import torch
+import torch.nn as nn
+from einops import rearrange
+from einops.layers.torch import Rearrange
+# FFN
+def FeedForward(dim, mult=4):
+    inner_dim = int(dim * mult)
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        nn.GELU(),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+def reshape_tensor(x, heads):
+    bs, length, width = x.shape
+    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
+    x = x.view(bs, length, heads, -1)
+    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+    x = x.transpose(1, 2)
+    # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
+    x = x.reshape(bs, heads, length, -1)
+    return x
+class PerceiverAttention(nn.Module):
+    def __init__(self, *, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.dim_head = dim_head
+        self.heads = heads
+        inner_dim = dim_head * heads
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, n1, D)
+            latent (torch.Tensor): latent features
+                shape (b, n2, D)
+        """
+        x = self.norm1(x)
+        latents = self.norm2(latents)
+        b, l, _ = latents.shape
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)
+        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+        q = reshape_tensor(q, self.heads)
+        k = reshape_tensor(k, self.heads)
+        v = reshape_tensor(v, self.heads)
+        # attention
+        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        out = weight @ v
+        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+        return self.to_out(out)
+class Resampler(nn.Module):
+    def __init__(
+        self,
+        dim=1024,
+        depth=8,
+        dim_head=64,
+        heads=16,
+        num_queries=8,
+        embedding_dim=768,
+        output_dim=1024,
+        ff_mult=4,
+        max_seq_len: int = 257,  # CLIP tokens + CLS token
+        apply_pos_emb: bool = False,
+        num_latents_mean_pooled: int = 0,  # number of latents derived from mean pooled representation of the sequence
+    ):
+        super().__init__()
+        self.pos_emb = nn.Embedding(max_seq_len, embedding_dim) if apply_pos_emb else None
+        self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
+        self.proj_in = nn.Linear(embedding_dim, dim)
+        self.proj_out = nn.Linear(dim, output_dim)
+        self.norm_out = nn.LayerNorm(output_dim)
+        self.to_latents_from_mean_pooled_seq = (
+            nn.Sequential(
+                nn.LayerNorm(dim),
+                nn.Linear(dim, dim * num_latents_mean_pooled),
+                Rearrange("b (n d) -> b n d", n=num_latents_mean_pooled),
+            )
+            if num_latents_mean_pooled > 0
+            else None
+        )
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                        FeedForward(dim=dim, mult=ff_mult),
+                    ]
+                )
+            )
+    def forward(self, x):
+        if self.pos_emb is not None:
+            n, device = x.shape[1], x.device
+            pos_emb = self.pos_emb(torch.arange(n, device=device))
+            x = x + pos_emb
+        latents = self.latents.repeat(x.size(0), 1, 1)
+        x = self.proj_in(x)
+        if self.to_latents_from_mean_pooled_seq:
+            meanpooled_seq = masked_mean(x, dim=1, mask=torch.ones(x.shape[:2], device=x.device, dtype=torch.bool))
+            meanpooled_latents = self.to_latents_from_mean_pooled_seq(meanpooled_seq)
+            latents = torch.cat((meanpooled_latents, latents), dim=-2)
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+def masked_mean(t, *, dim, mask=None):
+    if mask is None:
+        return t.mean(dim=dim)
+    denom = mask.sum(dim=dim, keepdim=True)
+    mask = rearrange(mask, "b n -> b n 1")
+    masked_t = t.masked_fill(~mask, 0.0)
+    return masked_t.sum(dim=dim) / denom.clamp(min=1e-5)

foleycrafter/models/adapters/transformer.py ADDED Viewed

	@@ -0,0 +1,327 @@

+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from typing import Any, Optional, Tuple, Union
+class Attention(nn.Module):
+    """Multi-headed self-attention from the 'Attention Is All You Need' paper.
+    Queries, keys and values are all projected from the same ``hidden_states``; the head width is set independently of ``hidden_size`` via ``attention_head_dim``.
+    """
+    def __init__(self, hidden_size, num_attention_heads, attention_head_dim, attention_dropout=0.0):
+        super().__init__()
+        self.embed_dim = hidden_size
+        self.num_heads = num_attention_heads
+        self.head_dim = attention_head_dim
+        # Standard 1/sqrt(head_dim) scaling, applied to the queries in forward().
+        self.scale = self.head_dim**-0.5
+        self.dropout = attention_dropout
+        # Total projected width across all heads; may differ from embed_dim.
+        self.inner_dim = self.head_dim * self.num_heads
+        self.k_proj = nn.Linear(self.embed_dim, self.inner_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.inner_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.inner_dim)
+        self.out_proj = nn.Linear(self.inner_dim, self.embed_dim)
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        """Reshape ``(bsz, seq_len, inner_dim)`` into a contiguous ``(bsz, num_heads, seq_len, head_dim)`` tensor."""
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel
+        Args:
+            hidden_states: Input of shape ``(bsz, tgt_len, embed_dim)``.
+            attention_mask: Optional additive mask of shape ``(bsz, 1, tgt_len, src_len)``.
+            causal_attention_mask: Optional additive mask of the same shape, applied before ``attention_mask``.
+            output_attentions: When true, also return the post-softmax weights.
+        Returns:
+            ``(attn_output, attn_weights)`` where ``attn_output`` has shape ``(bsz, tgt_len, embed_dim)`` and ``attn_weights`` is ``(bsz, num_heads, tgt_len, src_len)`` or ``None``.
+        Raises:
+            ValueError: If a mask or an intermediate tensor has an unexpected shape.
+        """
+        bsz, tgt_len, embed_dim = hidden_states.size()
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scale
+        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+        # Fold the head axis into the batch axis so torch.bmm can be used.
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        # apply the causal_attention_mask first
+        if causal_attention_mask is not None:
+            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
+                    f" {causal_attention_mask.size()}"
+                )
+            # Unfold heads so the (bsz, 1, tgt_len, src_len) mask broadcasts over them, then fold back.
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+        # Dropout is active only in training mode.
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.bmm(attn_probs, value_states)
+        # NOTE(review): the check below is on the 3-D folded shape, while the message prints a 4-D shape.
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        # Unfold heads and merge them back into the channel axis before the output projection.
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, tgt_len, self.inner_dim)
+        attn_output = self.out_proj(attn_output)
+        return attn_output, attn_weights_reshaped
+class MLP(nn.Module):
+    def __init__(self, hidden_size, intermediate_size, mult=4):
+        super().__init__()
+        self.activation_fn = nn.SiLU()
+        self.fc1 = nn.Linear(hidden_size, intermediate_size * mult)
+        self.fc2 = nn.Linear(intermediate_size * mult, hidden_size)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+class Transformer(nn.Module):
+    def __init__(self, depth=12):
+        super().__init__()
+        self.layers = nn.ModuleList([TransformerBlock() for _ in range(depth)])
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor=None,
+        causal_attention_mask: torch.Tensor=None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        for layer in self.layers:
+            hidden_states = layer(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                causal_attention_mask=causal_attention_mask,
+                output_attentions=output_attentions,
+            )
+        return hidden_states
+class TransformerBlock(nn.Module):
+    def __init__(self, hidden_size=512, num_attention_heads=12, attention_head_dim=64, attention_dropout=0.0, dropout=0.0, eps=1e-5):
+        super().__init__()
+        self.embed_dim = hidden_size
+        self.self_attn = Attention(hidden_size=hidden_size, num_attention_heads=num_attention_heads, attention_head_dim=attention_head_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=eps)
+        self.mlp = MLP(hidden_size=hidden_size, intermediate_size=hidden_size)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor=None,
+        causal_attention_mask: torch.Tensor=None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs[0]
+class DiffusionTransformerBlock(nn.Module):
+    def __init__(self, hidden_size=512, num_attention_heads=12, attention_head_dim=64, attention_dropout=0.0, dropout=0.0, eps=1e-5):
+        super().__init__()
+        self.embed_dim = hidden_size
+        self.self_attn = Attention(hidden_size=hidden_size, num_attention_heads=num_attention_heads, attention_head_dim=attention_head_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=eps)
+        self.mlp = MLP(hidden_size=hidden_size, intermediate_size=hidden_size)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=eps)
+        self.output_token = nn.Parameter(torch.randn(1, hidden_size))
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor=None,
+        causal_attention_mask: torch.Tensor=None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        output_token = self.output_token.unsqueeze(0).repeat(hidden_states.shape[0], 1, 1)
+        hidden_states = torch.cat([output_token, hidden_states], dim=1)
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs[0][:,0:1,...]
+class V2AMapperMLP(nn.Module):
+    def __init__(self, input_dim=512, output_dim=512, expansion_rate=4):
+        super().__init__()
+        self.linear = nn.Linear(input_dim, input_dim * expansion_rate)
+        self.silu = nn.SiLU()
+        self.layer_norm = nn.LayerNorm(input_dim * expansion_rate)
+        self.linear2 = nn.Linear(input_dim * expansion_rate, output_dim)
+    def forward(self, x):
+        x = self.linear(x)
+        x = self.silu(x)
+        x = self.layer_norm(x)
+        x = self.linear2(x)
+        return x
+class ImageProjModel(torch.nn.Module):
+    """Projection Model"""
+    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
+        super().__init__()
+        self.cross_attention_dim = cross_attention_dim
+        self.clip_extra_context_tokens = clip_extra_context_tokens
+        self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+        self.zero_initialize_last_layer()
+    def zero_initialize_last_layer(module):
+        last_layer = None
+        for module_name, layer in module.named_modules():
+            if isinstance(layer, torch.nn.Linear):
+                last_layer = layer
+        if last_layer is not None:
+            last_layer.weight.data.zero_()
+            last_layer.bias.data.zero_()
+    def forward(self, image_embeds):
+        embeds = image_embeds
+        clip_extra_context_tokens = self.proj(embeds).reshape(
+            -1, self.clip_extra_context_tokens, self.cross_attention_dim
+        )
+        clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
+        return clip_extra_context_tokens
+class VisionAudioAdapter(torch.nn.Module):
+    def __init__(
+            self,
+            embedding_size=768,
+            expand_dim=4,
+            token_num=4,
+        ):
+        super().__init__()
+        self.mapper = V2AMapperMLP(
+            embedding_size,
+            embedding_size,
+            expansion_rate=expand_dim,
+        )
+        self.proj = ImageProjModel(
+            cross_attention_dim=embedding_size,
+            clip_embeddings_dim=embedding_size,
+            clip_extra_context_tokens=token_num,
+        )
+    def forward(self, image_embeds):
+        image_embeds = self.mapper(image_embeds)
+        image_embeds = self.proj(image_embeds)
+        return image_embeds

foleycrafter/models/adapters/utils.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import torch
+import torch.nn.functional as F
+import numpy as np
+from PIL import Image
+attn_maps = {}
+def hook_fn(name):
+    def forward_hook(module, input, output):
+        if hasattr(module.processor, "attn_map"):
+            attn_maps[name] = module.processor.attn_map
+            del module.processor.attn_map
+    return forward_hook
+def register_cross_attention_hook(unet):
+    for name, module in unet.named_modules():
+        if name.split('.')[-1].startswith('attn2'):
+            module.register_forward_hook(hook_fn(name))
+    return unet
+def upscale(attn_map, target_size):
+    attn_map = torch.mean(attn_map, dim=0)
+    attn_map = attn_map.permute(1,0)
+    temp_size = None
+    for i in range(0,5):
+        scale = 2 ** i
+        if ( target_size[0] // scale ) * ( target_size[1] // scale) == attn_map.shape[1]*64:
+            temp_size = (target_size[0]//(scale*8), target_size[1]//(scale*8))
+            break
+    assert temp_size is not None, "temp_size cannot is None"
+    attn_map = attn_map.view(attn_map.shape[0], *temp_size)
+    attn_map = F.interpolate(
+        attn_map.unsqueeze(0).to(dtype=torch.float32),
+        size=target_size,
+        mode='bilinear',
+        align_corners=False
+    )[0]
+    attn_map = torch.softmax(attn_map, dim=0)
+    return attn_map
+def get_net_attn_map(image_size, batch_size=2, instance_or_negative=False, detach=True):
+    idx = 0 if instance_or_negative else 1
+    net_attn_maps = []
+    for name, attn_map in attn_maps.items():
+        attn_map = attn_map.cpu() if detach else attn_map
+        attn_map = torch.chunk(attn_map, batch_size)[idx].squeeze()
+        attn_map = upscale(attn_map, image_size)
+        net_attn_maps.append(attn_map)
+    net_attn_maps = torch.mean(torch.stack(net_attn_maps,dim=0),dim=0)
+    return net_attn_maps
+def attnmaps2images(net_attn_maps):
+    #total_attn_scores = 0
+    images = []
+    for attn_map in net_attn_maps:
+        attn_map = attn_map.cpu().numpy()
+        #total_attn_scores += attn_map.mean().item()
+        normalized_attn_map = (attn_map - np.min(attn_map)) / (np.max(attn_map) - np.min(attn_map)) * 255
+        normalized_attn_map = normalized_attn_map.astype(np.uint8)
+        #print("norm: ", normalized_attn_map.shape)
+        image = Image.fromarray(normalized_attn_map)
+        #image = fix_save_attn_map(attn_map)
+        images.append(image)
+    #print(total_attn_scores)
+    return images
+def is_torch2_available():
+    return hasattr(F, "scaled_dot_product_attention")

foleycrafter/models/auffusion/attention.py ADDED Viewed

	@@ -0,0 +1,669 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional
+import torch
+import torch.nn.functional as F
+from torch import nn
+from diffusers.utils import USE_PEFT_BACKEND
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from diffusers.models.activations import GEGLU, GELU, ApproximateGELU
+from diffusers.models.embeddings import SinusoidalPositionalEmbedding
+from diffusers.models.lora import LoRACompatibleLinear
+from diffusers.models.normalization import\
+        AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm
+from foleycrafter.models.auffusion.attention_processor import Attention
+def _chunked_feed_forward(
+    ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int, lora_scale: Optional[float] = None
+):
+    # "feed_forward_chunk_size" can be used to save memory
+    if hidden_states.shape[chunk_dim] % chunk_size != 0:
+        raise ValueError(
+            f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
+        )
+    num_chunks = hidden_states.shape[chunk_dim] // chunk_size
+    if lora_scale is None:
+        ff_output = torch.cat(
+            [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
+            dim=chunk_dim,
+        )
+    else:
+        # TOOD(Patrick): LoRA scale can be removed once PEFT refactor is complete
+        ff_output = torch.cat(
+            [ff(hid_slice, scale=lora_scale) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
+            dim=chunk_dim,
+        )
+    return ff_output
+@maybe_allow_in_graph
+class GatedSelfAttentionDense(nn.Module):
+    r"""
+    A gated self-attention dense layer that combines visual features and object features.
+    Parameters:
+        query_dim (`int`): The number of channels in the query.
+        context_dim (`int`): The number of channels in the context.
+        n_heads (`int`): The number of heads to use for attention.
+        d_head (`int`): The number of channels in each head.
+    """
+    def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
+        super().__init__()
+        # we need a linear projection since we need cat visual feature and obj feature
+        self.linear = nn.Linear(context_dim, query_dim)
+        self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
+        self.ff = FeedForward(query_dim, activation_fn="geglu")
+        self.norm1 = nn.LayerNorm(query_dim)
+        self.norm2 = nn.LayerNorm(query_dim)
+        self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
+        self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))
+        self.enabled = True
+    def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
+        if not self.enabled:
+            return x
+        n_visual = x.shape[1]
+        objs = self.linear(objs)
+        x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :]
+        x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
+        return x
+@maybe_allow_in_graph
+class BasicTransformerBlock(nn.Module):
+    r"""
+    A basic Transformer block.
+    The block applies, in order: a first attention layer (`attn1`), an optional second attention layer (`attn2`)
+    and a feed-forward layer (`ff`). Each sub-layer is preceded by a normalization whose kind is selected by
+    `norm_type`, and each sub-layer output is added back to its input (residual connection).
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm (`int`, *optional*):
+            The number of diffusion steps used during training. See `Transformer2DModel`. Required when `norm_type`
+            is `"ada_norm"` or `"ada_norm_zero"`.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Configure if the attentions should contain a bias parameter.
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, *optional*):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The normalization layer to use. The values handled by this block are `"layer_norm"`, `"ada_norm"`,
+            `"ada_norm_zero"`, `"ada_norm_single"` and `"ada_norm_continuous"`.
+        norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon passed to the normalization layers.
+        final_dropout (`bool` *optional*, defaults to False):
+            Whether to apply a final dropout after the last feed-forward layer.
+        attention_type (`str`, *optional*, defaults to `"default"`):
+            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+        positional_embeddings (`str`, *optional*, defaults to `None`):
+            The type of positional embeddings to apply to. Only `"sinusoidal"` creates an embedding layer.
+        num_positional_embeddings (`int`, *optional*, defaults to `None`):
+            The maximum number of positional embeddings to apply. Required when `positional_embeddings` is set.
+        ada_norm_continous_conditioning_embedding_dim (`int`, *optional*):
+            Forwarded as the second argument of `AdaLayerNormContinuous` when `norm_type` is
+            `"ada_norm_continuous"`.
+        ada_norm_bias (*optional*):
+            Forwarded to `AdaLayerNormContinuous` when `norm_type` is `"ada_norm_continuous"`.
+        ff_inner_dim (`int`, *optional*): Forwarded to `FeedForward` as `inner_dim`.
+        ff_bias (`bool`, defaults to `True`): Forwarded to `FeedForward` as `bias`.
+        attention_out_bias (`bool`, defaults to `True`): Forwarded to both `Attention` layers as `out_bias`.
+    """
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
+        norm_eps: float = 1e-5,
+        final_dropout: bool = False,
+        attention_type: str = "default",
+        positional_embeddings: Optional[str] = None,
+        num_positional_embeddings: Optional[int] = None,
+        ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
+        ada_norm_bias: Optional[int] = None,
+        ff_inner_dim: Optional[int] = None,
+        ff_bias: bool = True,
+        attention_out_bias: bool = True,
+    ):
+        super().__init__()
+        self.only_cross_attention = only_cross_attention
+        # Flags consulted by forward() to pick the normalization path. At most one of them is True; an
+        # unrecognised `norm_type` leaves all of them False and forward() then raises "Incorrect norm used".
+        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+        self.use_layer_norm = norm_type == "layer_norm"
+        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
+        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+            )
+        if positional_embeddings and (num_positional_embeddings is None):
+            raise ValueError(
+                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
+            )
+        # Any value other than "sinusoidal" silently results in no positional embedding.
+        if positional_embeddings == "sinusoidal":
+            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+        else:
+            self.pos_embed = None
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        if self.use_ada_layer_norm:
+            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+        elif self.use_ada_layer_norm_zero:
+            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
+        elif self.use_ada_layer_norm_continuous:
+            self.norm1 = AdaLayerNormContinuous(
+                dim,
+                ada_norm_continous_conditioning_embedding_dim,
+                norm_elementwise_affine,
+                norm_eps,
+                ada_norm_bias,
+                "rms_norm",
+            )
+        else:
+            # Also taken for "ada_norm_single": the scale/shift modulation is applied in forward().
+            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+        # attn1 attends to `encoder_hidden_states` only when `only_cross_attention` is set (and
+        # `double_self_attention` is not); otherwise it is built without a cross-attention dimension.
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if (only_cross_attention and not double_self_attention) else None,
+            upcast_attention=upcast_attention,
+            out_bias=attention_out_bias,
+        )
+        # 2. Cross-Attn
+        if cross_attention_dim is not None or double_self_attention:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            if self.use_ada_layer_norm:
+                self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
+            elif self.use_ada_layer_norm_continuous:
+                self.norm2 = AdaLayerNormContinuous(
+                    dim,
+                    ada_norm_continous_conditioning_embedding_dim,
+                    norm_elementwise_affine,
+                    norm_eps,
+                    ada_norm_bias,
+                    "rms_norm",
+                )
+            else:
+                # positional arguments are (normalized_shape, eps, elementwise_affine)
+                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+            self.attn2 = Attention(
+                query_dim=dim,
+                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+                out_bias=attention_out_bias,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            self.norm2 = None
+            self.attn2 = None
+        # 3. Feed-forward
+        # NOTE(review): no `norm3` is created for "ada_norm_single"; forward() normalizes the feed-forward input
+        # with `norm2` in that mode, which is `None` when neither `cross_attention_dim` nor
+        # `double_self_attention` is given.
+        if self.use_ada_layer_norm_continuous:
+            self.norm3 = AdaLayerNormContinuous(
+                dim,
+                ada_norm_continous_conditioning_embedding_dim,
+                norm_elementwise_affine,
+                norm_eps,
+                ada_norm_bias,
+                "layer_norm",
+            )
+        elif not self.use_ada_layer_norm_single:
+            self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn=activation_fn,
+            final_dropout=final_dropout,
+            inner_dim=ff_inner_dim,
+            bias=ff_bias,
+        )
+        # 4. Fuser
+        # NOTE(review): `self.fuser` only exists for the gated attention types, but forward() calls it whenever
+        # `cross_attention_kwargs` contains a "gligen" entry.
+        if attention_type == "gated" or attention_type == "gated-text-image":
+            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+        # 5. Scale-shift for PixArt-Alpha.
+        if self.use_ada_layer_norm_single:
+            # six rows: shift/scale/gate for the attention branch and shift/scale/gate for the feed-forward branch
+            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+        """
+        Configure chunked execution of the feed-forward layer.
+        Args:
+            chunk_size (`int`, *optional*): Size of each chunk; `None` makes forward() call the feed-forward layer
+                on the whole tensor at once.
+            dim (`int`, defaults to 0): The dimension along which the feed-forward input is chunked.
+        """
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        self._chunk_dim = dim
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> torch.FloatTensor:
+        """
+        Run the block: normalization + `attn1`, optional GLIGEN fuser, optional normalization + `attn2`, then
+        normalization + feed-forward, each with a residual connection.
+        Args:
+            hidden_states: Input tensor; its first dimension is used as the batch size.
+            attention_mask: Passed to `attn1` as `attention_mask`.
+            encoder_hidden_states: Passed to `attn2`, and to `attn1` only when `only_cross_attention` is set.
+            encoder_attention_mask: Passed to `attn2` as `attention_mask`.
+            timestep: Passed to `norm1`/`norm2` for "ada_norm", to `norm1` for "ada_norm_zero"; for
+                "ada_norm_single" it is reshaped to `(batch_size, 6, -1)` and added to `scale_shift_table`.
+            cross_attention_kwargs: Optional dict. `"scale"` is read as the LoRA scale for the feed-forward layer,
+                `"gligen"` is popped and its `"objs"` entry is given to the fuser; the remaining entries
+                (including `"scale"`) are forwarded to both attention layers. The caller's dict is not mutated.
+            class_labels: Passed to `norm1` for "ada_norm_zero" only.
+            added_cond_kwargs: Must contain `"pooled_text_emb"` when `norm_type` is "ada_norm_continuous".
+        Returns:
+            The transformed hidden states.
+        Raises:
+            ValueError: If none of the normalization flags set in `__init__` is active.
+        """
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Self-Attention
+        batch_size = hidden_states.shape[0]
+        if self.use_ada_layer_norm:
+            norm_hidden_states = self.norm1(hidden_states, timestep)
+        elif self.use_ada_layer_norm_zero:
+            # AdaLayerNormZero also returns the gate for the attention output and the shift/scale/gate that are
+            # applied around the feed-forward layer further down.
+            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+            )
+        elif self.use_layer_norm:
+            norm_hidden_states = self.norm1(hidden_states)
+        elif self.use_ada_layer_norm_continuous:
+            norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
+        elif self.use_ada_layer_norm_single:
+            # Split the learned table plus the timestep embedding into the six modulation tensors.
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+            ).chunk(6, dim=1)
+            norm_hidden_states = self.norm1(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+            norm_hidden_states = norm_hidden_states.squeeze(1)
+        else:
+            raise ValueError("Incorrect norm used")
+        if self.pos_embed is not None:
+            norm_hidden_states = self.pos_embed(norm_hidden_states)
+        # 1. Retrieve lora scale.
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        # 2. Prepare GLIGEN inputs
+        # work on a copy so that popping "gligen" does not mutate the caller's dict
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+        attn_output = self.attn1(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
+        if self.use_ada_layer_norm_zero:
+            attn_output = gate_msa.unsqueeze(1) * attn_output
+        elif self.use_ada_layer_norm_single:
+            attn_output = gate_msa * attn_output
+        hidden_states = attn_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+        # 2.5 GLIGEN Control
+        if gligen_kwargs is not None:
+            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
+        # 3. Cross-Attention
+        if self.attn2 is not None:
+            if self.use_ada_layer_norm:
+                norm_hidden_states = self.norm2(hidden_states, timestep)
+            elif self.use_ada_layer_norm_zero or self.use_layer_norm:
+                norm_hidden_states = self.norm2(hidden_states)
+            elif self.use_ada_layer_norm_single:
+                # For PixArt norm2 isn't applied here:
+                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+                norm_hidden_states = hidden_states
+            elif self.use_ada_layer_norm_continuous:
+                norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
+            else:
+                raise ValueError("Incorrect norm")
+            if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
+                norm_hidden_states = self.pos_embed(norm_hidden_states)
+            attn_output = self.attn2(
+                norm_hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                **cross_attention_kwargs,
+            )
+            hidden_states = attn_output + hidden_states
+        # 4. Feed-forward
+        if self.use_ada_layer_norm_continuous:
+            norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
+        elif not self.use_ada_layer_norm_single:
+            norm_hidden_states = self.norm3(hidden_states)
+        if self.use_ada_layer_norm_zero:
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        if self.use_ada_layer_norm_single:
+            # "ada_norm_single" has no norm3: norm2 (skipped before attn2 above) is applied here instead
+            norm_hidden_states = self.norm2(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+        if self._chunk_size is not None:
+            # "feed_forward_chunk_size" can be used to save memory
+            ff_output = _chunked_feed_forward(
+                self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size, lora_scale=lora_scale
+            )
+        else:
+            ff_output = self.ff(norm_hidden_states, scale=lora_scale)
+        if self.use_ada_layer_norm_zero:
+            ff_output = gate_mlp.unsqueeze(1) * ff_output
+        elif self.use_ada_layer_norm_single:
+            ff_output = gate_mlp * ff_output
+        hidden_states = ff_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+        return hidden_states
+@maybe_allow_in_graph
+class TemporalBasicTransformerBlock(nn.Module):
+    r"""
+    A basic Transformer block for video like data.
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        time_mix_inner_dim (`int`): The number of channels for temporal attention.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+    """
+    def __init__(
+        self,
+        dim: int,
+        time_mix_inner_dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        cross_attention_dim: Optional[int] = None,
+    ):
+        super().__init__()
+        self.is_res = dim == time_mix_inner_dim
+        self.norm_in = nn.LayerNorm(dim)
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        self.norm_in = nn.LayerNorm(dim)
+        self.ff_in = FeedForward(
+            dim,
+            dim_out=time_mix_inner_dim,
+            activation_fn="geglu",
+        )
+        self.norm1 = nn.LayerNorm(time_mix_inner_dim)
+        self.attn1 = Attention(
+            query_dim=time_mix_inner_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            cross_attention_dim=None,
+        )
+        # 2. Cross-Attn
+        if cross_attention_dim is not None:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            self.norm2 = nn.LayerNorm(time_mix_inner_dim)
+            self.attn2 = Attention(
+                query_dim=time_mix_inner_dim,
+                cross_attention_dim=cross_attention_dim,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            self.norm2 = None
+            self.attn2 = None
+        # 3. Feed-forward
+        self.norm3 = nn.LayerNorm(time_mix_inner_dim)
+        self.ff = FeedForward(time_mix_inner_dim, activation_fn="geglu")
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = None
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        # chunk dim should be hardcoded to 1 to have better speed vs. memory trade-off
+        self._chunk_dim = 1
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        num_frames: int,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Self-Attention
+        batch_size = hidden_states.shape[0]
+        batch_frames, seq_length, channels = hidden_states.shape
+        batch_size = batch_frames // num_frames
+        hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, seq_length, channels)
+        hidden_states = hidden_states.permute(0, 2, 1, 3)
+        hidden_states = hidden_states.reshape(batch_size * seq_length, num_frames, channels)
+        residual = hidden_states
+        hidden_states = self.norm_in(hidden_states)
+        if self._chunk_size is not None:
+            hidden_states = _chunked_feed_forward(self.ff_in, hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            hidden_states = self.ff_in(hidden_states)
+        if self.is_res:
+            hidden_states = hidden_states + residual
+        norm_hidden_states = self.norm1(hidden_states)
+        attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None)
+        hidden_states = attn_output + hidden_states
+        # 3. Cross-Attention
+        if self.attn2 is not None:
+            norm_hidden_states = self.norm2(hidden_states)
+            attn_output = self.attn2(norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
+            hidden_states = attn_output + hidden_states
+        # 4. Feed-forward
+        norm_hidden_states = self.norm3(hidden_states)
+        if self._chunk_size is not None:
+            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            ff_output = self.ff(norm_hidden_states)
+        if self.is_res:
+            hidden_states = ff_output + hidden_states
+        else:
+            hidden_states = ff_output
+        hidden_states = hidden_states[None, :].reshape(batch_size, seq_length, num_frames, channels)
+        hidden_states = hidden_states.permute(0, 2, 1, 3)
+        hidden_states = hidden_states.reshape(batch_size * num_frames, seq_length, channels)
+        return hidden_states
+class SkipFFTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        kv_input_dim: int,
+        kv_input_dim_proj_use_bias: bool,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        attention_out_bias: bool = True,
+    ):
+        super().__init__()
+        if kv_input_dim != dim:
+            self.kv_mapper = nn.Linear(kv_input_dim, dim, kv_input_dim_proj_use_bias)
+        else:
+            self.kv_mapper = None
+        self.norm1 = RMSNorm(dim, 1e-06)
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim,
+            out_bias=attention_out_bias,
+        )
+        self.norm2 = RMSNorm(dim, 1e-06)
+        self.attn2 = Attention(
+            query_dim=dim,
+            cross_attention_dim=cross_attention_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            out_bias=attention_out_bias,
+        )
+    def forward(self, hidden_states, encoder_hidden_states, cross_attention_kwargs):
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+        if self.kv_mapper is not None:
+            encoder_hidden_states = self.kv_mapper(F.silu(encoder_hidden_states))
+        norm_hidden_states = self.norm1(hidden_states)
+        attn_output = self.attn1(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            **cross_attention_kwargs,
+        )
+        hidden_states = attn_output + hidden_states
+        norm_hidden_states = self.norm2(hidden_states)
+        attn_output = self.attn2(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            **cross_attention_kwargs,
+        )
+        hidden_states = attn_output + hidden_states
+        return hidden_states
+class FeedForward(nn.Module):
+    r"""
+    A feed-forward layer.
+    Parameters:
+        dim (`int`): The number of channels in the input.
+        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
+        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+    def __init__(
+        self,
+        dim: int,
+        dim_out: Optional[int] = None,
+        mult: int = 4,
+        dropout: float = 0.0,
+        activation_fn: str = "geglu",
+        final_dropout: bool = False,
+        inner_dim=None,
+        bias: bool = True,
+    ):
+        super().__init__()
+        if inner_dim is None:
+            inner_dim = int(dim * mult)
+        dim_out = dim_out if dim_out is not None else dim
+        linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear
+        if activation_fn == "gelu":
+            act_fn = GELU(dim, inner_dim, bias=bias)
+        if activation_fn == "gelu-approximate":
+            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
+        elif activation_fn == "geglu":
+            act_fn = GEGLU(dim, inner_dim, bias=bias)
+        elif activation_fn == "geglu-approximate":
+            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
+        self.net = nn.ModuleList([])
+        # project in
+        self.net.append(act_fn)
+        # project dropout
+        self.net.append(nn.Dropout(dropout))
+        # project out
+        self.net.append(linear_cls(inner_dim, dim_out, bias=bias))
+        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
+        if final_dropout:
+            self.net.append(nn.Dropout(dropout))
+    def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
+        compatible_cls = (GEGLU,) if USE_PEFT_BACKEND else (GEGLU, LoRACompatibleLinear)
+        for module in self.net:
+            if isinstance(module, compatible_cls):
+                hidden_states = module(hidden_states, scale)
+            else:
+                hidden_states = module(hidden_states)
+        return hidden_states

foleycrafter/models/auffusion/attention_processor.py ADDED Viewed

The diff for this file is too large to render. See raw diff

foleycrafter/models/auffusion/dual_transformer_2d.py ADDED Viewed

	@@ -0,0 +1,156 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+from torch import nn
+from foleycrafter.models.auffusion.transformer_2d \
+    import Transformer2DModel, Transformer2DModelOutput
+class DualTransformer2DModel(nn.Module):
+    """
+    Dual transformer wrapper that combines two `Transformer2DModel`s for mixed inference.
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            Pass if the input is continuous. The number of channels in the input and output.
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
+        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
+            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
+            `ImagePositionalEmbeddings`.
+        num_vector_embeds (`int`, *optional*):
+            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
+            Includes the class for the masked latent pixel.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
+            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
+            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
+            up to but not more than steps than `num_embeds_ada_norm`.
+        attention_bias (`bool`, *optional*):
+            Configure if the TransformerBlocks' attention should contain a bias parameter.
+    """
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        num_vector_embeds: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+    ):
+        super().__init__()
+        self.transformers = nn.ModuleList(
+            [
+                Transformer2DModel(
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                    in_channels=in_channels,
+                    num_layers=num_layers,
+                    dropout=dropout,
+                    norm_num_groups=norm_num_groups,
+                    cross_attention_dim=cross_attention_dim,
+                    attention_bias=attention_bias,
+                    sample_size=sample_size,
+                    num_vector_embeds=num_vector_embeds,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                )
+                for _ in range(2)
+            ]
+        )
+        # Variables that can be set by a pipeline:
+        # The ratio of transformer1 to transformer2's output states to be combined during inference
+        self.mix_ratio = 0.5
+        # The shape of `encoder_hidden_states` is expected to be
+        # `(batch_size, condition_lengths[0]+condition_lengths[1], num_features)`
+        self.condition_lengths = [77, 257]
+        # Which transformer to use to encode which condition.
+        # E.g. `(1, 0)` means that we'll use `transformers[1](conditions[0])` and `transformers[0](conditions[1])`
+        self.transformer_index_for_condition = [1, 0]
+    def forward(
+        self,
+        hidden_states,
+        encoder_hidden_states,
+        timestep=None,
+        attention_mask=None,
+        cross_attention_kwargs=None,
+        return_dict: bool = True,
+    ):
+        """
+        Args:
+            hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
+                When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
+                hidden_states.
+            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.long`, *optional*):
+                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
+            attention_mask (`torch.FloatTensor`, *optional*):
+                Optional attention mask to be applied in Attention.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+        Returns:
+            [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
+            [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+        input_states = hidden_states
+        encoded_states = []
+        tokens_start = 0
+        # attention_mask is not used yet
+        for i in range(2):
+            # for each of the two transformers, pass the corresponding condition tokens
+            condition_state = encoder_hidden_states[:, tokens_start : tokens_start + self.condition_lengths[i]]
+            transformer_index = self.transformer_index_for_condition[i]
+            encoded_state = self.transformers[transformer_index](
+                input_states,
+                encoder_hidden_states=condition_state,
+                timestep=timestep,
+                cross_attention_kwargs=cross_attention_kwargs,
+                return_dict=False,
+            )[0]
+            encoded_states.append(encoded_state - input_states)
+            tokens_start += self.condition_lengths[i]
+        output_states = encoded_states[0] * self.mix_ratio + encoded_states[1] * (1 - self.mix_ratio)
+        output_states = output_states + input_states
+        if not return_dict:
+            return (output_states,)
+        return Transformer2DModelOutput(sample=output_states)

foleycrafter/models/auffusion/loaders/ip_adapter.py ADDED Viewed

	@@ -0,0 +1,520 @@

+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+import torch
+from huggingface_hub.utils import validate_hf_hub_args
+from safetensors import safe_open
+from diffusers.models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT
+from diffusers.utils import (
+    _get_model_file,
+    is_accelerate_available,
+    is_torch_version,
+    is_transformers_available,
+    logging,
+)
+if is_transformers_available():
+    from transformers import (
+        CLIPImageProcessor,
+        CLIPVisionModelWithProjection,
+    )
+    from diffusers.models.attention_processor import (
+        IPAdapterAttnProcessor,
+    )
+from foleycrafter.models.auffusion.attention_processor import IPAdapterAttnProcessor2_0, VPTemporalAdapterAttnProcessor2_0
+logger = logging.get_logger(__name__)
+class IPAdapterMixin:
+    """Mixin that adds IP-Adapter loading, scaling and unloading to a pipeline.
+    The host class must expose its UNet either as `self.unet` or under the attribute named by `self.unet_name`,
+    and must provide `register_modules`, `register_to_config`, `device`, `dtype` and `config`.
+    """
+    @validate_hf_hub_args
+    def load_ip_adapter(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[str, List[str], Dict[str, torch.Tensor]],
+        subfolder: Union[str, List[str]],
+        weight_name: Union[str, List[str]],
+        image_encoder_folder: Optional[str] = "image_encoder",
+        **kwargs,
+    ):
+        """
+        Load one or several IP-Adapter checkpoints and hand them to the UNet's `_load_ip_adapter_weights`.
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `List[str]` or `os.PathLike` or `List[os.PathLike]` or `dict` or `List[dict]`):
+                Can be either:
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      with [`ModelMixin.save_pretrained`].
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+                A single value is reused for every entry of `weight_name`.
+            subfolder (`str` or `List[str]`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+                If a list is passed, it should have the same length as `weight_name`. A single value is reused for
+                every entry of `weight_name`.
+            weight_name (`str` or `List[str]`):
+                The name of the weight file to load. If a list is passed, one adapter is loaded per entry, and
+                `pretrained_model_name_or_path_or_dict` and `subfolder` must either be single values or lists of
+                the same length.
+            image_encoder_folder (`str`, *optional*, defaults to `image_encoder`):
+                The subfolder location of the image encoder within a larger model repository on the Hub or locally.
+                Pass `None` to not load the image encoder. If the image encoder is located in a folder inside `subfolder`,
+                you only need to pass the name of the folder that contains image encoder weights, e.g. `image_encoder_folder="image_encoder"`.
+                If the image encoder is located in a folder other than `subfolder`, you should pass the path to the folder that contains image encoder weights,
+                for example, `image_encoder_folder="different_subfolder/image_encoder"`.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
+                incompletely downloaded files are deleted.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
+                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
+                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
+                argument to `True` will raise an error.
+        Raises:
+            ValueError: If the list arguments have different lengths, if a state dict does not consist of exactly
+                the `image_proj` and `ip_adapter` entries, or if an image encoder has to be loaded while the
+                adapter was passed as a state dict.
+            NotImplementedError: If `low_cpu_mem_usage=True` is requested with torch < 1.9.0.
+        """
+        # Normalize every argument to a list so a single adapter and several adapters share one code path.
+        if not isinstance(weight_name, list):
+            weight_name = [weight_name]
+        if not isinstance(pretrained_model_name_or_path_or_dict, list):
+            pretrained_model_name_or_path_or_dict = [pretrained_model_name_or_path_or_dict]
+        if len(pretrained_model_name_or_path_or_dict) == 1:
+            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict * len(weight_name)
+        if not isinstance(subfolder, list):
+            subfolder = [subfolder]
+        if len(subfolder) == 1:
+            subfolder = subfolder * len(weight_name)
+        if len(weight_name) != len(pretrained_model_name_or_path_or_dict):
+            raise ValueError("`weight_name` and `pretrained_model_name_or_path_or_dict` must have the same length.")
+        if len(weight_name) != len(subfolder):
+            raise ValueError("`weight_name` and `subfolder` must have the same length.")
+        # Download/caching options forwarded to `_get_model_file`.
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
+        if low_cpu_mem_usage and not is_accelerate_available():
+            low_cpu_mem_usage = False
+            logger.warning(
+                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                " install accelerate\n```\n."
+            )
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+        state_dicts = []
+        # Use dedicated per-adapter names instead of rebinding the argument lists while iterating over them.
+        for model_source, adapter_weight_name, adapter_subfolder in zip(
+            pretrained_model_name_or_path_or_dict, weight_name, subfolder
+        ):
+            if not isinstance(model_source, dict):
+                model_file = _get_model_file(
+                    model_source,
+                    weights_name=adapter_weight_name,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    resume_download=resume_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=adapter_subfolder,
+                    user_agent=user_agent,
+                )
+                if adapter_weight_name.endswith(".safetensors"):
+                    # Split the flat safetensors keys into the two nested groups by their prefix.
+                    state_dict = {"image_proj": {}, "ip_adapter": {}}
+                    with safe_open(model_file, framework="pt", device="cpu") as f:
+                        for key in f.keys():
+                            if key.startswith("image_proj."):
+                                state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                            elif key.startswith("ip_adapter."):
+                                state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+                else:
+                    # NOTE(security): `torch.load` unpickles the file; only load checkpoints from trusted sources.
+                    state_dict = torch.load(model_file, map_location="cpu")
+            else:
+                state_dict = model_source
+            # Compare as a set so a checkpoint saved with the two entries in the other order is still accepted.
+            if set(state_dict.keys()) != {"image_proj", "ip_adapter"}:
+                raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.")
+            state_dicts.append(state_dict)
+            # load CLIP image encoder here if it has not been registered to the pipeline yet
+            if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
+                if image_encoder_folder is not None:
+                    if not isinstance(model_source, dict):
+                        logger.info(f"loading image_encoder from {model_source}")
+                        # A bare folder name is resolved inside the adapter's subfolder; a path is used as given.
+                        if image_encoder_folder.count("/") == 0:
+                            image_encoder_subfolder = Path(adapter_subfolder, image_encoder_folder).as_posix()
+                        else:
+                            image_encoder_subfolder = Path(image_encoder_folder).as_posix()
+                        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+                            model_source,
+                            subfolder=image_encoder_subfolder,
+                            low_cpu_mem_usage=low_cpu_mem_usage,
+                        ).to(self.device, dtype=self.dtype)
+                        self.register_modules(image_encoder=image_encoder)
+                    else:
+                        raise ValueError(
+                            "`image_encoder` cannot be loaded because `pretrained_model_name_or_path_or_dict` is a state dict."
+                        )
+                else:
+                    logger.warning(
+                        "image_encoder is not loaded since `image_encoder_folder=None` passed. You will not be able to use `ip_adapter_image` when calling the pipeline with IP-Adapter."
+                        "Use `ip_adapter_image_embeds` to pass pre-generated image embedding instead."
+                    )
+            # create feature extractor if it has not been registered to the pipeline yet
+            if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
+                feature_extractor = CLIPImageProcessor()
+                self.register_modules(feature_extractor=feature_extractor)
+        # load ip-adapter into unet
+        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
+        unet._load_ip_adapter_weights(state_dicts)
+    def set_ip_adapter_scale(self, scale):
+        """
+        Sets the conditioning scale between text and image.
+        Args:
+            scale: Either a single value, which is repeated for every adapter of each matching attention
+                processor, or a list with one value per adapter.
+        Raises:
+            ValueError: If a list is passed whose length differs from the number of adapters of a processor.
+        Example:
+        ```py
+        pipeline.set_ip_adapter_scale(0.5)
+        ```
+        """
+        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
+        for attn_processor in unet.attn_processors.values():
+            if not isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
+                continue
+            num_adapters = len(attn_processor.scale)
+            # Build a fresh list for every processor: a scalar is sized per processor, and processors never
+            # end up sharing (and mutating) one list object.
+            processor_scale = list(scale) if isinstance(scale, list) else [scale] * num_adapters
+            if num_adapters != len(processor_scale):
+                raise ValueError(
+                    f"`scale` should be a list of same length as the number if ip-adapters "
+                    f"Expected {num_adapters} but got {len(processor_scale)}."
+                )
+            attn_processor.scale = processor_scale
+    def unload_ip_adapter(self):
+        """
+        Unloads the IP Adapter weights
+        Clears the image encoder (and, when the pipeline has no `safety_checker` attribute, the feature
+        extractor), removes the UNet's `encoder_hid_proj` and restores the default attention processors.
+        Examples:
+        ```python
+        >>> # Assuming `pipeline` is already loaded with the IP Adapter weights.
+        >>> pipeline.unload_ip_adapter()
+        >>> ...
+        ```
+        """
+        # remove CLIP image encoder
+        if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is not None:
+            self.image_encoder = None
+            self.register_to_config(image_encoder=[None, None])
+        # remove feature extractor only when the pipeline has no `safety_checker` attribute, as a
+        # safety checker uses the feature_extractor later
+        if not hasattr(self, "safety_checker"):
+            if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is not None:
+                self.feature_extractor = None
+                self.register_to_config(feature_extractor=[None, None])
+        # Resolve the UNet the same way the other methods do, so pipelines that only define `unet_name` work.
+        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
+        # remove hidden encoder
+        unet.encoder_hid_proj = None
+        # NOTE(review): this writes to the pipeline's own `config`, not to the UNet's config - confirm intended.
+        self.config.encoder_hid_dim_type = None
+        # restore original Unet attention processors layers
+        unet.set_default_attn_processor()
+class VPAdapterMixin:
+    """Mixin that loads adapter checkpoints through the UNet's `_load_ip_adapter_weights_VPAdapter`.
+    It mirrors the IP-Adapter loading interface (`load_ip_adapter`, `set_ip_adapter_scale`,
+    `unload_ip_adapter`) but rescales `VPTemporalAdapterAttnProcessor2_0` processors. The host class must
+    expose its UNet either as `self.unet` or under the attribute named by `self.unet_name`, and must provide
+    `register_modules`, `register_to_config`, `device`, `dtype` and `config`.
+    """
+    @validate_hf_hub_args
+    def load_ip_adapter(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[str, List[str], Dict[str, torch.Tensor]],
+        subfolder: Union[str, List[str]],
+        weight_name: Union[str, List[str]],
+        image_encoder_folder: Optional[str] = "image_encoder",
+        **kwargs,
+    ):
+        """
+        Load one or several adapter checkpoints and hand them to the UNet's `_load_ip_adapter_weights_VPAdapter`.
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `List[str]` or `os.PathLike` or `List[os.PathLike]` or `dict` or `List[dict]`):
+                Can be either:
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      with [`ModelMixin.save_pretrained`].
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+                A single value is reused for every entry of `weight_name`.
+            subfolder (`str` or `List[str]`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+                If a list is passed, it should have the same length as `weight_name`. A single value is reused for
+                every entry of `weight_name`.
+            weight_name (`str` or `List[str]`):
+                The name of the weight file to load. If a list is passed, one adapter is loaded per entry, and
+                `pretrained_model_name_or_path_or_dict` and `subfolder` must either be single values or lists of
+                the same length.
+            image_encoder_folder (`str`, *optional*, defaults to `image_encoder`):
+                The subfolder location of the image encoder within a larger model repository on the Hub or locally.
+                Pass `None` to not load the image encoder. If the image encoder is located in a folder inside `subfolder`,
+                you only need to pass the name of the folder that contains image encoder weights, e.g. `image_encoder_folder="image_encoder"`.
+                If the image encoder is located in a folder other than `subfolder`, you should pass the path to the folder that contains image encoder weights,
+                for example, `image_encoder_folder="different_subfolder/image_encoder"`.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
+                incompletely downloaded files are deleted.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
+                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
+                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
+                argument to `True` will raise an error.
+        Raises:
+            ValueError: If the list arguments have different lengths, if a state dict does not consist of exactly
+                the `image_proj` and `ip_adapter` entries, or if an image encoder has to be loaded while the
+                adapter was passed as a state dict.
+            NotImplementedError: If `low_cpu_mem_usage=True` is requested with torch < 1.9.0.
+        """
+        # Normalize every argument to a list so a single adapter and several adapters share one code path.
+        if not isinstance(weight_name, list):
+            weight_name = [weight_name]
+        if not isinstance(pretrained_model_name_or_path_or_dict, list):
+            pretrained_model_name_or_path_or_dict = [pretrained_model_name_or_path_or_dict]
+        if len(pretrained_model_name_or_path_or_dict) == 1:
+            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict * len(weight_name)
+        if not isinstance(subfolder, list):
+            subfolder = [subfolder]
+        if len(subfolder) == 1:
+            subfolder = subfolder * len(weight_name)
+        if len(weight_name) != len(pretrained_model_name_or_path_or_dict):
+            raise ValueError("`weight_name` and `pretrained_model_name_or_path_or_dict` must have the same length.")
+        if len(weight_name) != len(subfolder):
+            raise ValueError("`weight_name` and `subfolder` must have the same length.")
+        # Download/caching options forwarded to `_get_model_file`.
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
+        if low_cpu_mem_usage and not is_accelerate_available():
+            low_cpu_mem_usage = False
+            logger.warning(
+                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                " install accelerate\n```\n."
+            )
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+        state_dicts = []
+        # Use dedicated per-adapter names instead of rebinding the argument lists while iterating over them.
+        for model_source, adapter_weight_name, adapter_subfolder in zip(
+            pretrained_model_name_or_path_or_dict, weight_name, subfolder
+        ):
+            if not isinstance(model_source, dict):
+                model_file = _get_model_file(
+                    model_source,
+                    weights_name=adapter_weight_name,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    resume_download=resume_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=adapter_subfolder,
+                    user_agent=user_agent,
+                )
+                if adapter_weight_name.endswith(".safetensors"):
+                    # Split the flat safetensors keys into the two nested groups by their prefix.
+                    state_dict = {"image_proj": {}, "ip_adapter": {}}
+                    with safe_open(model_file, framework="pt", device="cpu") as f:
+                        for key in f.keys():
+                            if key.startswith("image_proj."):
+                                state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                            elif key.startswith("ip_adapter."):
+                                state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+                else:
+                    # NOTE(security): `torch.load` unpickles the file; only load checkpoints from trusted sources.
+                    state_dict = torch.load(model_file, map_location="cpu")
+            else:
+                state_dict = model_source
+            # Compare as a set so a checkpoint saved with the two entries in the other order is still accepted.
+            if set(state_dict.keys()) != {"image_proj", "ip_adapter"}:
+                raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.")
+            state_dicts.append(state_dict)
+            # load CLIP image encoder here if it has not been registered to the pipeline yet
+            if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
+                if image_encoder_folder is not None:
+                    if not isinstance(model_source, dict):
+                        logger.info(f"loading image_encoder from {model_source}")
+                        # A bare folder name is resolved inside the adapter's subfolder; a path is used as given.
+                        if image_encoder_folder.count("/") == 0:
+                            image_encoder_subfolder = Path(adapter_subfolder, image_encoder_folder).as_posix()
+                        else:
+                            image_encoder_subfolder = Path(image_encoder_folder).as_posix()
+                        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+                            model_source,
+                            subfolder=image_encoder_subfolder,
+                            low_cpu_mem_usage=low_cpu_mem_usage,
+                        ).to(self.device, dtype=self.dtype)
+                        self.register_modules(image_encoder=image_encoder)
+                    else:
+                        raise ValueError(
+                            "`image_encoder` cannot be loaded because `pretrained_model_name_or_path_or_dict` is a state dict."
+                        )
+                else:
+                    logger.warning(
+                        "image_encoder is not loaded since `image_encoder_folder=None` passed. You will not be able to use `ip_adapter_image` when calling the pipeline with IP-Adapter."
+                        "Use `ip_adapter_image_embeds` to pass pre-generated image embedding instead."
+                    )
+            # create feature extractor if it has not been registered to the pipeline yet
+            if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
+                feature_extractor = CLIPImageProcessor()
+                self.register_modules(feature_extractor=feature_extractor)
+        # load the adapter weights into the unet
+        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
+        unet._load_ip_adapter_weights_VPAdapter(state_dicts)
+    def set_ip_adapter_scale(self, scale):
+        """
+        Sets the conditioning scale between text and image.
+        Args:
+            scale: Either a single value, which is repeated for every adapter of each matching attention
+                processor, or a list with one value per adapter.
+        Raises:
+            ValueError: If a list is passed whose length differs from the number of adapters of a processor.
+        Example:
+        ```py
+        pipeline.set_ip_adapter_scale(0.5)
+        ```
+        """
+        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
+        for attn_processor in unet.attn_processors.values():
+            if not isinstance(attn_processor, (IPAdapterAttnProcessor, VPTemporalAdapterAttnProcessor2_0)):
+                continue
+            num_adapters = len(attn_processor.scale)
+            # Build a fresh list for every processor: a scalar is sized per processor, and processors never
+            # end up sharing (and mutating) one list object.
+            processor_scale = list(scale) if isinstance(scale, list) else [scale] * num_adapters
+            if num_adapters != len(processor_scale):
+                raise ValueError(
+                    f"`scale` should be a list of same length as the number if ip-adapters "
+                    f"Expected {num_adapters} but got {len(processor_scale)}."
+                )
+            attn_processor.scale = processor_scale
+    def unload_ip_adapter(self):
+        """
+        Unloads the IP Adapter weights
+        Clears the image encoder (and, when the pipeline has no `safety_checker` attribute, the feature
+        extractor), removes the UNet's `encoder_hid_proj` and restores the default attention processors.
+        Examples:
+        ```python
+        >>> # Assuming `pipeline` is already loaded with the IP Adapter weights.
+        >>> pipeline.unload_ip_adapter()
+        >>> ...
+        ```
+        """
+        # remove CLIP image encoder
+        if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is not None:
+            self.image_encoder = None
+            self.register_to_config(image_encoder=[None, None])
+        # remove feature extractor only when the pipeline has no `safety_checker` attribute, as a
+        # safety checker uses the feature_extractor later
+        if not hasattr(self, "safety_checker"):
+            if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is not None:
+                self.feature_extractor = None
+                self.register_to_config(feature_extractor=[None, None])
+        # Resolve the UNet the same way the other methods do, so pipelines that only define `unet_name` work.
+        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
+        # remove hidden encoder
+        unet.encoder_hid_proj = None
+        # NOTE(review): this writes to the pipeline's own `config`, not to the UNet's config - confirm intended.
+        self.config.encoder_hid_dim_type = None
+        # restore original Unet attention processors layers
+        unet.set_default_attn_processor()

foleycrafter/models/auffusion/loaders/unet.py ADDED Viewed

	@@ -0,0 +1,1100 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import os
+from collections import defaultdict
+from contextlib import nullcontext
+from functools import partial
+from typing import Callable, Dict, List, Optional, Union, Tuple
+import safetensors
+import torch
+import torch.nn.functional as F
+from huggingface_hub.utils import validate_hf_hub_args
+from torch import nn
+from diffusers.models.embeddings import ImageProjection, MLPProjection, Resampler
+from diffusers.models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    _get_model_file,
+    delete_adapter_layers,
+    is_accelerate_available,
+    logging,
+    is_torch_version,
+    set_adapter_layers,
+    set_weights_and_activate_adapters,
+)
+from diffusers.loaders.utils import AttnProcsLayers
+from foleycrafter.models.adapters.ip_adapter import VideoProjModel
+from foleycrafter.models.auffusion.attention_processor import IPAdapterAttnProcessor2_0, VPTemporalAdapterAttnProcessor2_0, AttnProcessor2_0
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+    from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module
+logger = logging.get_logger(__name__)
+class VPAdapterImageProjection(nn.Module):
+    """Apply one projection layer per adapter to the matching entry of `image_embeds`.
+    Args:
+        IPAdapterImageProjectionLayers: The projection modules, one per adapter. They are stored in an
+            `nn.ModuleList` under `image_projection_layers`.
+    """
+    def __init__(self, IPAdapterImageProjectionLayers: Union[List[nn.Module], Tuple[nn.Module]]):
+        super().__init__()
+        self.image_projection_layers = nn.ModuleList(IPAdapterImageProjectionLayers)
+    def forward(self, image_embeds: List[torch.FloatTensor]):
+        """
+        Project every embedding with its own layer.
+        Args:
+            image_embeds: Either a list holding one tensor per projection layer, or (legacy form) a single
+                tensor, which is wrapped into a one-element list after a new axis is inserted at position 1.
+        Returns:
+            A list with one projected tensor per adapter. Each result is reshaped so that its two leading
+            axes are the two leading axes the input had after `squeeze(1)`.
+        Raises:
+            ValueError: If the number of embeddings differs from the number of projection layers.
+        """
+        # currently, we accept `image_embeds` as
+        #  1. a tensor (legacy) with shape [batch_size, embed_dim] or [batch_size, sequence_length, embed_dim]
+        #  2. list of `n` tensors where `n` is number of ip-adapters, each tensor can have shape [batch_size, num_images, embed_dim] or [batch_size, num_images, sequence_length, embed_dim]
+        # NOTE(review): the shapes above are carried over from the original comment - confirm against callers.
+        if not isinstance(image_embeds, list):
+            image_embeds = [image_embeds.unsqueeze(1)]
+        if len(image_embeds) != len(self.image_projection_layers):
+            raise ValueError(
+                f"image_embeds must have the same length as image_projection_layers, got {len(image_embeds)} and {len(self.image_projection_layers)}"
+            )
+        projected_image_embeds = []
+        for image_embed, image_projection_layer in zip(image_embeds, self.image_projection_layers):
+            # `squeeze(1)` drops axis 1 only when it has size 1 and is a no-op otherwise.
+            image_embed = image_embed.squeeze(1)
+            batch_size, num_images = image_embed.shape[0], image_embed.shape[1]
+            # Fold the two leading axes together so the layer sees one flat batch ...
+            image_embed = image_embed.reshape((batch_size * num_images,) + image_embed.shape[2:])
+            image_embed = image_projection_layer(image_embed)
+            # ... and unfold them again on the projected output.
+            image_embed = image_embed.reshape((batch_size, num_images) + image_embed.shape[1:])
+            projected_image_embeds.append(image_embed)
+        return projected_image_embeds
+class MultiIPAdapterImageProjection(nn.Module):
+    def __init__(self, IPAdapterImageProjectionLayers: Union[List[nn.Module], Tuple[nn.Module]]):
+        super().__init__()
+        self.image_projection_layers = nn.ModuleList(IPAdapterImageProjectionLayers)
+    def forward(self, image_embeds: List[torch.FloatTensor]):
+        projected_image_embeds = []
+        # currently, we accept `image_embeds` as
+        #  1. a tensor (deprecated) with shape [batch_size, embed_dim] or [batch_size, sequence_length, embed_dim]
+        #  2. list of `n` tensors where `n` is number of ip-adapters, each tensor can hae shape [batch_size, num_images, embed_dim] or [batch_size, num_images, sequence_length, embed_dim]
+        if not isinstance(image_embeds, list):
+            deprecation_message = (
+                "You have passed a tensor as `image_embeds`.This is deprecated and will be removed in a future release."
+                " Please make sure to update your script to pass `image_embeds` as a list of tensors to supress this warning."
+            )
+            image_embeds = [image_embeds.unsqueeze(1)]
+        if len(image_embeds) != len(self.image_projection_layers):
+            raise ValueError(
+                f"image_embeds must have the same length as image_projection_layers, got {len(image_embeds)} and {len(self.image_projection_layers)}"
+            )
+        for image_embed, image_projection_layer in zip(image_embeds, self.image_projection_layers):
+            batch_size, num_images = image_embed.shape[0], image_embed.shape[1]
+            image_embed = image_embed.reshape((batch_size * num_images,) + image_embed.shape[2:])
+            image_embed = image_projection_layer(image_embed)
+            image_embed = image_embed.reshape((batch_size, num_images) + image_embed.shape[1:])
+            projected_image_embeds.append(image_embed)
+        return projected_image_embeds
+# Key prefixes that mark which component a LoRA state-dict entry belongs to.
+TEXT_ENCODER_NAME = "text_encoder"
+UNET_NAME = "unet"
+# Default file names for LoRA weights (pickle and safetensors variants).
+LORA_WEIGHT_NAME = "pytorch_lora_weights.bin"
+LORA_WEIGHT_NAME_SAFE = "pytorch_lora_weights.safetensors"
+# Default file names for Custom Diffusion weights (pickle and safetensors variants).
+CUSTOM_DIFFUSION_WEIGHT_NAME = "pytorch_custom_diffusion_weights.bin"
+CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE = "pytorch_custom_diffusion_weights.safetensors"
+class UNet2DConditionLoadersMixin:
+    """
+    Load LoRA layers into a [`UNet2DConditionModel`].
+    """
+    text_encoder_name = TEXT_ENCODER_NAME
+    unet_name = UNET_NAME
+    @validate_hf_hub_args
+    def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
+        r"""
+        Load pretrained attention processor layers into [`UNet2DConditionModel`]. Attention processor layers have to be
+        defined in
+        [`attention_processor.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py)
+        and be a `torch.nn.Module` class.
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                Can be either:
+                    - A string, the model id (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a directory (for example `./my_model_directory`) containing the model weights saved
+                      with [`ModelMixin.save_pretrained`].
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
+                incompletely downloaded files are deleted.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
+                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
+                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
+                argument to `True` will raise an error.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+            mirror (`str`, *optional*):
+                Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
+                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
+                information.
+            weight_name (`str`, *optional*):
+                Name of the weights file to load. When omitted, `pytorch_lora_weights.safetensors` is used for the
+                safetensors attempt and `pytorch_lora_weights.bin` for the non-safetensors attempt.
+            use_safetensors (`bool`, *optional*):
+                When left unset, safetensors weights are tried first and an `IOError` falls back to the
+                non-safetensors file. When set to `True`, that `IOError` is re-raised instead.
+            network_alphas (`dict`, *optional*):
+                Alpha values forwarded as `network_alpha` to the LoRA layers created here. Same meaning as the
+                `--network_alpha` option of the kohya-ss trainer script.
+        Raises:
+            ValueError: if the weights are neither LoRA nor Custom Diffusion weights (and the PEFT backend is not in
+                use), if keys are left over after grouping, or if a LoRA target is not a `LoRACompatibleConv` /
+                `LoRACompatibleLinear` module.
+        Example:
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.unet.load_attn_procs(
+            "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic"
+        )
+        ```
+        """
+        from diffusers.models.attention_processor import CustomDiffusionAttnProcessor
+        from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear, LoRAConv2dLayer, LoRALinearLayer
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+        weight_name = kwargs.pop("weight_name", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
+        # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
+        # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
+        network_alphas = kwargs.pop("network_alphas", None)
+        # Optional owning pipeline; only used further down to remove and re-apply accelerate offload hooks.
+        _pipeline = kwargs.pop("_pipeline", None)
+        # Remember whether the caller supplied alphas before `network_alphas` is reassigned below.
+        is_network_alphas_none = network_alphas is None
+        # No explicit preference: try safetensors first but allow falling back to pickle weights.
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+        if low_cpu_mem_usage and not is_accelerate_available():
+            low_cpu_mem_usage = False
+            logger.warning(
+                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                " install accelerate\n```\n."
+            )
+        model_file = None
+        if not isinstance(pretrained_model_name_or_path_or_dict, dict):
+            # Let's first try to load .safetensors weights
+            if (use_safetensors and weight_name is None) or (
+                weight_name is not None and weight_name.endswith(".safetensors")
+            ):
+                try:
+                    model_file = _get_model_file(
+                        pretrained_model_name_or_path_or_dict,
+                        weights_name=weight_name or LORA_WEIGHT_NAME_SAFE,
+                        cache_dir=cache_dir,
+                        force_download=force_download,
+                        resume_download=resume_download,
+                        proxies=proxies,
+                        local_files_only=local_files_only,
+                        token=token,
+                        revision=revision,
+                        subfolder=subfolder,
+                        user_agent=user_agent,
+                    )
+                    state_dict = safetensors.torch.load_file(model_file, device="cpu")
+                except IOError as e:
+                    if not allow_pickle:
+                        raise e
+                    # try loading non-safetensors weights
+                    pass
+            # `model_file` is still None when safetensors loading was skipped or its lookup failed.
+            if model_file is None:
+                model_file = _get_model_file(
+                    pretrained_model_name_or_path_or_dict,
+                    weights_name=weight_name or LORA_WEIGHT_NAME,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    resume_download=resume_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=subfolder,
+                    user_agent=user_agent,
+                )
+                # NOTE(review): `torch.load` unpickles the file, so only load weights from trusted sources.
+                state_dict = torch.load(model_file, map_location="cpu")
+        else:
+            # The caller passed an in-memory state dict directly.
+            state_dict = pretrained_model_name_or_path_or_dict
+        # fill attn processors
+        lora_layers_list = []
+        # Legacy (non-PEFT) LoRA: every key mentions "lora" or is an `.alpha` entry.
+        is_lora = all(("lora" in k or k.endswith(".alpha")) for k in state_dict.keys()) and not USE_PEFT_BACKEND
+        is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys())
+        if is_lora:
+            # correct keys
+            state_dict, network_alphas = self.convert_state_dict_legacy_attn_format(state_dict, network_alphas)
+            if network_alphas is not None:
+                network_alphas_keys = list(network_alphas.keys())
+                used_network_alphas_keys = set()
+            lora_grouped_dict = defaultdict(dict)
+            mapped_network_alphas = {}
+            all_keys = list(state_dict.keys())
+            for key in all_keys:
+                value = state_dict.pop(key)
+                # The last three dotted components form the per-layer sub key; the rest is the target module path.
+                attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
+                lora_grouped_dict[attn_processor_key][sub_key] = value
+                # Create another `mapped_network_alphas` dictionary so that we can properly map them.
+                if network_alphas is not None:
+                    for k in network_alphas_keys:
+                        if k.replace(".alpha", "") in key:
+                            mapped_network_alphas.update({attn_processor_key: network_alphas.get(k)})
+                            used_network_alphas_keys.add(k)
+            if not is_network_alphas_none:
+                # Every alpha supplied by the caller must have matched at least one weight key.
+                if len(set(network_alphas_keys) - used_network_alphas_keys) > 0:
+                    raise ValueError(
+                        f"The `network_alphas` has to be empty at this point but has the following keys \n\n {', '.join(network_alphas.keys())}"
+                    )
+            if len(state_dict) > 0:
+                raise ValueError(
+                    f"The `state_dict` has to be empty at this point but has the following keys \n\n {', '.join(state_dict.keys())}"
+                )
+            for key, value_dict in lora_grouped_dict.items():
+                # Walk the dotted path from this model down to the module that receives the LoRA layer.
+                attn_processor = self
+                for sub_key in key.split("."):
+                    attn_processor = getattr(attn_processor, sub_key)
+                # Process non-attention layers, which don't have to_{k,v,q,out_proj}_lora layers
+                # or add_{k,v,q,out_proj}_proj_lora layers.
+                # The LoRA rank is read from the first dimension of the down-projection weight.
+                rank = value_dict["lora.down.weight"].shape[0]
+                if isinstance(attn_processor, LoRACompatibleConv):
+                    in_features = attn_processor.in_channels
+                    out_features = attn_processor.out_channels
+                    kernel_size = attn_processor.kernel_size
+                    ctx = init_empty_weights if low_cpu_mem_usage else nullcontext
+                    with ctx():
+                        lora = LoRAConv2dLayer(
+                            in_features=in_features,
+                            out_features=out_features,
+                            rank=rank,
+                            kernel_size=kernel_size,
+                            stride=attn_processor.stride,
+                            padding=attn_processor.padding,
+                            network_alpha=mapped_network_alphas.get(key),
+                        )
+                elif isinstance(attn_processor, LoRACompatibleLinear):
+                    ctx = init_empty_weights if low_cpu_mem_usage else nullcontext
+                    with ctx():
+                        lora = LoRALinearLayer(
+                            attn_processor.in_features,
+                            attn_processor.out_features,
+                            rank,
+                            mapped_network_alphas.get(key),
+                        )
+                else:
+                    raise ValueError(f"Module {key} is not a LoRACompatibleConv or LoRACompatibleLinear module.")
+                # Strip the "lora." prefix so the keys line up with the LoRA layer's own state dict.
+                value_dict = {k.replace("lora.", ""): v for k, v in value_dict.items()}
+                lora_layers_list.append((attn_processor, lora))
+                if low_cpu_mem_usage:
+                    # Device and dtype are taken from the first loaded tensor of this group.
+                    device = next(iter(value_dict.values())).device
+                    dtype = next(iter(value_dict.values())).dtype
+                    load_model_dict_into_meta(lora, value_dict, device=device, dtype=dtype)
+                else:
+                    lora.load_state_dict(value_dict)
+        elif is_custom_diffusion:
+            attn_processors = {}
+            custom_diffusion_grouped_dict = defaultdict(dict)
+            for key, value in state_dict.items():
+                if len(value) == 0:
+                    # Empty entry: this processor has no trained weights.
+                    custom_diffusion_grouped_dict[key] = {}
+                else:
+                    # `to_out` keys keep three trailing components as the sub key, all other keys keep two.
+                    if "to_out" in key:
+                        attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
+                    else:
+                        attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:])
+                    custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value
+            for key, value_dict in custom_diffusion_grouped_dict.items():
+                if len(value_dict) == 0:
+                    attn_processors[key] = CustomDiffusionAttnProcessor(
+                        train_kv=False, train_q_out=False, hidden_size=None, cross_attention_dim=None
+                    )
+                else:
+                    # Both sizes are read from the shape of the stored key-projection weight.
+                    cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[1]
+                    hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[0]
+                    train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False
+                    attn_processors[key] = CustomDiffusionAttnProcessor(
+                        train_kv=True,
+                        train_q_out=train_q_out,
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
+                    )
+                    attn_processors[key].load_state_dict(value_dict)
+        elif USE_PEFT_BACKEND:
+            # In that case we have nothing to do as loading the adapter weights is already handled above by `set_peft_model_state_dict`
+            # on the Unet
+            pass
+        else:
+            raise ValueError(
+                f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training."
+            )
+        # <Unsafe code
+        # We can be sure that the following works as it just sets attention processors, lora layers and puts all in the same dtype
+        # Now we remove any existing accelerate hooks so they can be re-applied once the new layers are in place.
+        is_model_cpu_offload = False
+        is_sequential_cpu_offload = False
+        # For PEFT backend the Unet is already offloaded at this stage as it is handled inside `lora_lora_weights_into_unet`
+        if not USE_PEFT_BACKEND:
+            if _pipeline is not None:
+                for _, component in _pipeline.components.items():
+                    if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
+                        # Both flags are overwritten for every hooked component, so the last one visited decides.
+                        is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
+                        is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+                        logger.info(
+                            "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
+                        )
+                        remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
+            # only custom diffusion needs to set attn processors
+            if is_custom_diffusion:
+                self.set_attn_processor(attn_processors)
+            # set lora layers
+            for target_module, lora_layer in lora_layers_list:
+                target_module.set_lora_layer(lora_layer)
+            # Bring the newly attached layers to the model's own dtype and device.
+            self.to(dtype=self.dtype, device=self.device)
+            # Offload back.
+            if is_model_cpu_offload:
+                _pipeline.enable_model_cpu_offload()
+            elif is_sequential_cpu_offload:
+                _pipeline.enable_sequential_cpu_offload()
+            # Unsafe code />
+    def convert_state_dict_legacy_attn_format(self, state_dict, network_alphas):
+        is_new_lora_format = all(
+            key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys()
+        )
+        if is_new_lora_format:
+            # Strip the `"unet"` prefix.
+            is_text_encoder_present = any(key.startswith(self.text_encoder_name) for key in state_dict.keys())
+            if is_text_encoder_present:
+                warn_message = "The state_dict contains LoRA params corresponding to the text encoder which are not being used here. To use both UNet and text encoder related LoRA params, use [`pipe.load_lora_weights()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights)."
+                logger.warn(warn_message)
+            unet_keys = [k for k in state_dict.keys() if k.startswith(self.unet_name)]
+            state_dict = {k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys}
+        # change processor format to 'pure' LoRACompatibleLinear format
+        if any("processor" in k.split(".") for k in state_dict.keys()):
+            def format_to_lora_compatible(key):
+                if "processor" not in key.split("."):
+                    return key
+                return key.replace(".processor", "").replace("to_out_lora", "to_out.0.lora").replace("_lora", ".lora")
+            state_dict = {format_to_lora_compatible(k): v for k, v in state_dict.items()}
+            if network_alphas is not None:
+                network_alphas = {format_to_lora_compatible(k): v for k, v in network_alphas.items()}
+        return state_dict, network_alphas
+    def save_attn_procs(
+        self,
+        save_directory: Union[str, os.PathLike],
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+        **kwargs,
+    ):
+        r"""
+        Save attention processor layers to a directory so that it can be reloaded with the
+        [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`] method.
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to save an attention processor to (will be created if it doesn't exist).
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful during distributed training and you
+                need to call this function on all processes. In this case, set `is_main_process=True` only on the main
+                process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful during distributed training when you need to
+                replace `torch.save` with another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or with `pickle`.
+        Example:
+        ```py
+        import torch
+        from diffusers import DiffusionPipeline
+        pipeline = DiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            torch_dtype=torch.float16,
+        ).to("cuda")
+        pipeline.unet.load_attn_procs("path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin")
+        pipeline.unet.save_attn_procs("path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin")
+        ```
+        """
+        from diffusers.models.attention_processor import (
+            CustomDiffusionAttnProcessor,
+            CustomDiffusionAttnProcessor2_0,
+            CustomDiffusionXFormersAttnProcessor,
+        )
+        if os.path.isfile(save_directory):
+            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+            return
+        if save_function is None:
+            if safe_serialization:
+                def save_function(weights, filename):
+                    return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"})
+            else:
+                save_function = torch.save
+        os.makedirs(save_directory, exist_ok=True)
+        is_custom_diffusion = any(
+            isinstance(
+                x,
+                (CustomDiffusionAttnProcessor, CustomDiffusionAttnProcessor2_0, CustomDiffusionXFormersAttnProcessor),
+            )
+            for (_, x) in self.attn_processors.items()
+        )
+        if is_custom_diffusion:
+            model_to_save = AttnProcsLayers(
+                {
+                    y: x
+                    for (y, x) in self.attn_processors.items()
+                    if isinstance(
+                        x,
+                        (
+                            CustomDiffusionAttnProcessor,
+                            CustomDiffusionAttnProcessor2_0,
+                            CustomDiffusionXFormersAttnProcessor,
+                        ),
+                    )
+                }
+            )
+            state_dict = model_to_save.state_dict()
+            for name, attn in self.attn_processors.items():
+                if len(attn.state_dict()) == 0:
+                    state_dict[name] = {}
+        else:
+            model_to_save = AttnProcsLayers(self.attn_processors)
+            state_dict = model_to_save.state_dict()
+        if weight_name is None:
+            if safe_serialization:
+                weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE if is_custom_diffusion else LORA_WEIGHT_NAME_SAFE
+            else:
+                weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else LORA_WEIGHT_NAME
+        # Save the model
+        save_function(state_dict, os.path.join(save_directory, weight_name))
+        logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}")
+    def fuse_lora(self, lora_scale=1.0, safe_fusing=False, adapter_names=None):
+        self.lora_scale = lora_scale
+        self._safe_fusing = safe_fusing
+        self.apply(partial(self._fuse_lora_apply, adapter_names=adapter_names))
+    def _fuse_lora_apply(self, module, adapter_names=None):
+        if not USE_PEFT_BACKEND:
+            if hasattr(module, "_fuse_lora"):
+                module._fuse_lora(self.lora_scale, self._safe_fusing)
+            if adapter_names is not None:
+                raise ValueError(
+                    "The `adapter_names` argument is not supported in your environment. Please switch"
+                    " to PEFT backend to use this argument by installing latest PEFT and transformers."
+                    " `pip install -U peft transformers`"
+                )
+        else:
+            from peft.tuners.tuners_utils import BaseTunerLayer
+            merge_kwargs = {"safe_merge": self._safe_fusing}
+            if isinstance(module, BaseTunerLayer):
+                if self.lora_scale != 1.0:
+                    module.scale_layer(self.lora_scale)
+                # For BC with prevous PEFT versions, we need to check the signature
+                # of the `merge` method to see if it supports the `adapter_names` argument.
+                supported_merge_kwargs = list(inspect.signature(module.merge).parameters)
+                if "adapter_names" in supported_merge_kwargs:
+                    merge_kwargs["adapter_names"] = adapter_names
+                elif "adapter_names" not in supported_merge_kwargs and adapter_names is not None:
+                    raise ValueError(
+                        "The `adapter_names` argument is not supported with your PEFT version. Please upgrade"
+                        " to the latest version of PEFT. `pip install -U peft`"
+                    )
+                module.merge(**merge_kwargs)
+    def unfuse_lora(self):
+        self.apply(self._unfuse_lora_apply)
+    def _unfuse_lora_apply(self, module):
+        if not USE_PEFT_BACKEND:
+            if hasattr(module, "_unfuse_lora"):
+                module._unfuse_lora()
+        else:
+            from peft.tuners.tuners_utils import BaseTunerLayer
+            if isinstance(module, BaseTunerLayer):
+                module.unmerge()
+    def set_adapters(
+        self,
+        adapter_names: Union[List[str], str],
+        weights: Optional[Union[List[float], float]] = None,
+    ):
+        """
+        Set the currently active adapters for use in the UNet.
+        Args:
+            adapter_names (`List[str]` or `str`):
+                The names of the adapters to use.
+            adapter_weights (`Union[List[float], float]`, *optional*):
+                The adapter(s) weights to use with the UNet. If `None`, the weights are set to `1.0` for all the
+                adapters.
+        Example:
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights(
+            "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic"
+        )
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.set_adapters(["cinematic", "pixel"], adapter_weights=[0.5, 0.5])
+        ```
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for `set_adapters()`.")
+        adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names
+        if weights is None:
+            weights = [1.0] * len(adapter_names)
+        elif isinstance(weights, float):
+            weights = [weights] * len(adapter_names)
+        if len(adapter_names) != len(weights):
+            raise ValueError(
+                f"Length of adapter names {len(adapter_names)} is not equal to the length of their weights {len(weights)}."
+            )
+        set_weights_and_activate_adapters(self, adapter_names, weights)
+    def disable_lora(self):
+        """
+        Disable the UNet's active LoRA layers.
+        Example:
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights(
+            "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic"
+        )
+        pipeline.disable_lora()
+        ```
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+        set_adapter_layers(self, enabled=False)
+    def enable_lora(self):
+        """
+        Enable the UNet's active LoRA layers.
+        Example:
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights(
+            "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic"
+        )
+        pipeline.enable_lora()
+        ```
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+        set_adapter_layers(self, enabled=True)
+    def delete_adapters(self, adapter_names: Union[List[str], str]):
+        """
+        Remove the LoRA layers of one or more adapters from the UNet.
+        Each adapter is also dropped from `self.peft_config` when that attribute exists.
+        Args:
+            adapter_names (`Union[List[str], str]`):
+                A single adapter name or a list of adapter names to delete.
+        Raises:
+            ValueError: If the PEFT backend is not in use.
+        Example:
+        ```py
+        from diffusers import AutoPipelineForText2Image
+        import torch
+        pipeline = AutoPipelineForText2Image.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights(
+            "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic"
+        )
+        pipeline.delete_adapters("cinematic")
+        ```
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+        # A bare string is treated as a one-element list of names.
+        names = [adapter_names] if isinstance(adapter_names, str) else adapter_names
+        for name in names:
+            delete_adapter_layers(self, name)
+            # Keep the stored adapter config in sync with the removed layers.
+            if hasattr(self, "peft_config"):
+                self.peft_config.pop(name, None)
+    def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_usage=False):
+        """
+        Build an `ImageProjection` layer from an IP-Adapter image-projection state dict.
+        Only the basic IP-Adapter layout, identified by a `"proj.weight"` entry, is handled here.
+        Args:
+            state_dict (`dict`):
+                Image-projection weights taken from an IP-Adapter checkpoint.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `False`):
+                If `True`, create the layer under `accelerate`'s `init_empty_weights` and fill it with
+                `load_model_dict_into_meta`. Falls back to `False` (with a warning) when `accelerate` is missing.
+        Returns:
+            The `ImageProjection` module with the converted weights loaded.
+        Raises:
+            ValueError: If `state_dict` has no `"proj.weight"` entry (unsupported checkpoint layout).
+            NotImplementedError: If low-memory initialization is requested with torch < 1.9.0.
+        """
+        # Fail fast with a clear message: without this check the projection stayed `None` and the load step below
+        # crashed with an opaque `AttributeError`.
+        if "proj.weight" not in state_dict:
+            raise ValueError(
+                "Unsupported IP-Adapter image projection checkpoint: expected a `proj.weight` entry, but got keys"
+                f" {sorted(state_dict.keys())}."
+            )
+        if low_cpu_mem_usage:
+            if is_accelerate_available():
+                from accelerate import init_empty_weights
+            else:
+                low_cpu_mem_usage = False
+                logger.warning(
+                    "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                    " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                    " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                    " install accelerate\n```\n."
+                )
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+        init_context = init_empty_weights if low_cpu_mem_usage else nullcontext
+        # IP-Adapter: the first dimension of `proj.weight` is split across `num_image_text_embeds` embeddings.
+        num_image_text_embeds = 4
+        proj_weight = state_dict["proj.weight"]
+        clip_embeddings_dim = proj_weight.shape[-1]
+        cross_attention_dim = proj_weight.shape[0] // num_image_text_embeds
+        with init_context():
+            image_projection = ImageProjection(
+                cross_attention_dim=cross_attention_dim,
+                image_embed_dim=clip_embeddings_dim,
+                num_image_text_embeds=num_image_text_embeds,
+            )
+        # Rename checkpoint keys (`proj` -> `image_embeds`) before loading them into the new layer.
+        updated_state_dict = {key.replace("proj", "image_embeds"): value for key, value in state_dict.items()}
+        if not low_cpu_mem_usage:
+            image_projection.load_state_dict(updated_state_dict)
+        else:
+            load_model_dict_into_meta(image_projection, updated_state_dict, device=self.device, dtype=self.dtype)
+        return image_projection
+    # def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, multi_frames_condition):
+    #     updated_state_dict = {}
+    #     image_projection = None
+    #     if "proj.weight" in state_dict:
+    #         # IP-Adapter
+    #         # NOTE: adapt for  multi-frame
+    #         num_image_text_embeds = 4
+    #         clip_embeddings_dim = state_dict["proj.weight"].shape[-1]
+    #         cross_attention_dim = state_dict["proj.weight"].shape[0] // 4
+    #         # cross_attention_dim = state_dict["proj.weight"].shape[0]
+    #         if not multi_frames_condition:
+    #             image_projection = ImageProjection(
+    #                 cross_attention_dim=cross_attention_dim,
+    #                 image_embed_dim=clip_embeddings_dim,
+    #                 num_image_text_embeds=num_image_text_embeds,
+    #             )
+    #         else:
+    #             num_image_text_embeds = 50
+    #             cross_attention_dim = state_dict["proj.weight"].shape[0]
+    #             image_projection = VideoProjModel(
+    #                 cross_attention_dim=cross_attention_dim,
+    #                 clip_embeddings_dim=clip_embeddings_dim,
+    #                 clip_extra_context_tokens=1,
+    #                 video_frame=num_image_text_embeds,
+    #             )
+    #         for key, value in state_dict.items():
+    #             if not multi_frames_condition:
+    #                 diffusers_name = key.replace("proj", "image_embeds")
+    #             else:
+    #                 diffusers_name = key
+    #             updated_state_dict[diffusers_name] = value
+    #     elif "proj.3.weight" in state_dict:
+    #         # IP-Adapter Full
+    #         clip_embeddings_dim = state_dict["proj.0.weight"].shape[0]
+    #         cross_attention_dim = state_dict["proj.3.weight"].shape[0]
+    #         image_projection = MLPProjection(
+    #             cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim
+    #         )
+    #         for key, value in state_dict.items():
+    #             diffusers_name = key.replace("proj.0", "ff.net.0.proj")
+    #             diffusers_name = diffusers_name.replace("proj.2", "ff.net.2")
+    #             diffusers_name = diffusers_name.replace("proj.3", "norm")
+    #             updated_state_dict[diffusers_name] = value
+    #     else:
+    #         # IP-Adapter Plus
+    #         num_image_text_embeds = state_dict["latents"].shape[1]
+    #         embed_dims = state_dict["proj_in.weight"].shape[1]
+    #         output_dims = state_dict["proj_out.weight"].shape[0]
+    #         hidden_dims = state_dict["latents"].shape[2]
+    #         heads = state_dict["layers.0.0.to_q.weight"].shape[0] // 64
+    #         image_projection = Resampler(
+    #             embed_dims=embed_dims,
+    #             output_dims=output_dims,
+    #             hidden_dims=hidden_dims,
+    #             heads=heads,
+    #             num_queries=num_image_text_embeds,
+    #         )
+    #         for key, value in state_dict.items():
+    #             diffusers_name = key.replace("0.to", "2.to")
+    #             diffusers_name = diffusers_name.replace("1.0.weight", "3.0.weight")
+    #             diffusers_name = diffusers_name.replace("1.0.bias", "3.0.bias")
+    #             diffusers_name = diffusers_name.replace("1.1.weight", "3.1.net.0.proj.weight")
+    #             diffusers_name = diffusers_name.replace("1.3.weight", "3.1.net.2.weight")
+    #             if "norm1" in diffusers_name:
+    #                 updated_state_dict[diffusers_name.replace("0.norm1", "0")] = value
+    #             elif "norm2" in diffusers_name:
+    #                 updated_state_dict[diffusers_name.replace("0.norm2", "1")] = value
+    #             elif "to_kv" in diffusers_name:
+    #                 v_chunk = value.chunk(2, dim=0)
+    #                 updated_state_dict[diffusers_name.replace("to_kv", "to_k")] = v_chunk[0]
+    #                 updated_state_dict[diffusers_name.replace("to_kv", "to_v")] = v_chunk[1]
+    #             elif "to_out" in diffusers_name:
+    #                 updated_state_dict[diffusers_name.replace("to_out", "to_out.0")] = value
+    #             else:
+    #                 updated_state_dict[diffusers_name] = value
+    #     image_projection.load_state_dict(updated_state_dict)
+    #     return image_projection
+    def _convert_ip_adapter_attn_to_diffusers_VPAdapter(self, state_dicts, low_cpu_mem_usage=False):
+        """
+        Build the attention-processor mapping for this UNet from IP-Adapter state dicts (VPAdapter variant).
+        Processors whose name ends in `attn1.processor`, or contains `motion_modules` or `fuser`, get a plain
+        `AttnProcessor2_0`/`AttnProcessor`. Every other processor becomes a `VPTemporalAdapterAttnProcessor2_0`
+        (or `IPAdapterAttnProcessor` when `F.scaled_dot_product_attention` is unavailable) and is loaded with the
+        `to_k_ip`/`to_v_ip` weights of each entry in `state_dicts`.
+        Args:
+            state_dicts (`list`):
+                One dict per adapter; each is read through its `"image_proj"` and `"ip_adapter"` entries.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `False`):
+                Create processors under `accelerate`'s `init_empty_weights` and fill them with
+                `load_model_dict_into_meta`. Falls back to `False` (with a warning) when `accelerate` is missing.
+        Returns:
+            `dict` mapping each name in `self.attn_processors` to its new processor instance.
+        Raises:
+            NotImplementedError: If low-memory initialization is requested with torch < 1.9.0.
+        """
+        from diffusers.models.attention_processor import (
+            AttnProcessor,
+            IPAdapterAttnProcessor,
+        )
+        if low_cpu_mem_usage:
+            if is_accelerate_available():
+                from accelerate import init_empty_weights
+            else:
+                low_cpu_mem_usage = False
+                logger.warning(
+                    "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                    " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                    " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                    " install accelerate\n```\n."
+                )
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+        # set ip-adapter cross-attention processors & load state_dict
+        attn_procs = {}
+        # Index into the `"ip_adapter"` weights: starts at 1 and advances by 2 for every adapter processor built,
+        # so it depends on the iteration order of `self.attn_processors`.
+        key_id = 1
+        init_context = init_empty_weights if low_cpu_mem_usage else nullcontext
+        for name in self.attn_processors.keys():
+            # `attn1` processors carry no cross-attention dimension and therefore get a plain processor below.
+            cross_attention_dim = None if name.endswith("attn1.processor") else self.config.cross_attention_dim
+            # NOTE(review): `hidden_size` stays unassigned if `name` starts with none of the three prefixes below,
+            # and the block index is read from a single character, so indices >= 10 would not parse as intended.
+            if name.startswith("mid_block"):
+                hidden_size = self.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(self.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = self.config.block_out_channels[block_id]
+            if cross_attention_dim is None or "motion_modules" in name or 'fuser' in name:
+                attn_processor_class = (
+                    AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
+                )
+                attn_procs[name] = attn_processor_class()
+            else:
+                attn_processor_class = (
+                    VPTemporalAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
+                )
+                # One token count per adapter, inferred from which keys its image-projection weights contain.
+                num_image_text_embeds = []
+                for state_dict in state_dicts:
+                    if "proj.weight" in state_dict["image_proj"]:
+                        # IP-Adapter
+                        num_image_text_embeds += [4]
+                    elif "proj.3.weight" in state_dict["image_proj"]:
+                        # IP-Adapter Full Face
+                        num_image_text_embeds += [257]  # 256 CLIP tokens + 1 CLS token
+                    else:
+                        # IP-Adapter Plus
+                        num_image_text_embeds += [state_dict["image_proj"]["latents"].shape[1]]
+                with init_context():
+                    attn_procs[name] = attn_processor_class(
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
+                        scale=1.0,
+                        num_tokens=num_image_text_embeds,
+                    )
+                # Collect the key/value projection weights of adapter `i` under the processor's `to_*_ip.{i}` names.
+                value_dict = {}
+                for i, state_dict in enumerate(state_dicts):
+                    value_dict.update({f"to_k_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_k_ip.weight"]})
+                    value_dict.update({f"to_v_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_v_ip.weight"]})
+                if not low_cpu_mem_usage:
+                    attn_procs[name].load_state_dict(value_dict)
+                else:
+                    # Take device and dtype from the first checkpoint tensor collected above.
+                    device = next(iter(value_dict.values())).device
+                    dtype = next(iter(value_dict.values())).dtype
+                    load_model_dict_into_meta(attn_procs[name], value_dict, device=device, dtype=dtype)
+                key_id += 2
+        return attn_procs
+    def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=False):
+        """
+        Build the attention-processor mapping for this UNet from IP-Adapter state dicts.
+        Processors whose name ends in `attn1.processor`, or contains `motion_modules` or `fuser`, get a plain
+        `AttnProcessor2_0`/`AttnProcessor`. Every other processor becomes an `IPAdapterAttnProcessor2_0`
+        (or `IPAdapterAttnProcessor` when `F.scaled_dot_product_attention` is unavailable) and is loaded with the
+        `to_k_ip`/`to_v_ip` weights of each entry in `state_dicts`.
+        Args:
+            state_dicts (`list`):
+                One dict per adapter; each is read through its `"image_proj"` and `"ip_adapter"` entries.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `False`):
+                Create processors under `accelerate`'s `init_empty_weights` and fill them with
+                `load_model_dict_into_meta`. Falls back to `False` (with a warning) when `accelerate` is missing.
+        Returns:
+            `dict` mapping each name in `self.attn_processors` to its new processor instance.
+        Raises:
+            NotImplementedError: If low-memory initialization is requested with torch < 1.9.0.
+        """
+        from diffusers.models.attention_processor import (
+            AttnProcessor,
+            IPAdapterAttnProcessor,
+        )
+        if low_cpu_mem_usage:
+            if is_accelerate_available():
+                from accelerate import init_empty_weights
+            else:
+                low_cpu_mem_usage = False
+                logger.warning(
+                    "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                    " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                    " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                    " install accelerate\n```\n."
+                )
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+        # set ip-adapter cross-attention processors & load state_dict
+        attn_procs = {}
+        # Index into the `"ip_adapter"` weights: starts at 1 and advances by 2 for every adapter processor built,
+        # so it depends on the iteration order of `self.attn_processors`.
+        key_id = 1
+        init_context = init_empty_weights if low_cpu_mem_usage else nullcontext
+        for name in self.attn_processors.keys():
+            # `attn1` processors carry no cross-attention dimension and therefore get a plain processor below.
+            cross_attention_dim = None if name.endswith("attn1.processor") else self.config.cross_attention_dim
+            # NOTE(review): `hidden_size` stays unassigned if `name` starts with none of the three prefixes below,
+            # and the block index is read from a single character, so indices >= 10 would not parse as intended.
+            if name.startswith("mid_block"):
+                hidden_size = self.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(self.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = self.config.block_out_channels[block_id]
+            if cross_attention_dim is None or "motion_modules" in name or 'fuser' in name:
+                attn_processor_class = (
+                    AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
+                )
+                attn_procs[name] = attn_processor_class()
+            else:
+                attn_processor_class = (
+                    IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
+                )
+                # One token count per adapter, inferred from which keys its image-projection weights contain.
+                num_image_text_embeds = []
+                for state_dict in state_dicts:
+                    if "proj.weight" in state_dict["image_proj"]:
+                        # IP-Adapter
+                        num_image_text_embeds += [4]
+                    elif "proj.3.weight" in state_dict["image_proj"]:
+                        # IP-Adapter Full Face
+                        num_image_text_embeds += [257]  # 256 CLIP tokens + 1 CLS token
+                    else:
+                        # IP-Adapter Plus
+                        num_image_text_embeds += [state_dict["image_proj"]["latents"].shape[1]]
+                with init_context():
+                    attn_procs[name] = attn_processor_class(
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
+                        scale=1.0,
+                        num_tokens=num_image_text_embeds,
+                    )
+                # Collect the key/value projection weights of adapter `i` under the processor's `to_*_ip.{i}` names.
+                value_dict = {}
+                for i, state_dict in enumerate(state_dicts):
+                    value_dict.update({f"to_k_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_k_ip.weight"]})
+                    value_dict.update({f"to_v_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_v_ip.weight"]})
+                if not low_cpu_mem_usage:
+                    attn_procs[name].load_state_dict(value_dict)
+                else:
+                    # Take device and dtype from the first checkpoint tensor collected above.
+                    device = next(iter(value_dict.values())).device
+                    dtype = next(iter(value_dict.values())).dtype
+                    load_model_dict_into_meta(attn_procs[name], value_dict, device=device, dtype=dtype)
+                key_id += 2
+        return attn_procs
+    def _load_ip_adapter_weights(self, state_dicts, low_cpu_mem_usage=False):
+        """
+        Install IP-Adapter attention processors and image-projection layers on this model.
+        Args:
+            state_dicts (`list`):
+                One dict per adapter; the `"image_proj"` entry of each feeds its projection layer.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `False`):
+                Forwarded unchanged to both conversion helpers.
+        """
+        # Attention processors are swapped in first, before any projection layer is built.
+        self.set_attn_processor(
+            self._convert_ip_adapter_attn_to_diffusers(state_dicts, low_cpu_mem_usage=low_cpu_mem_usage)
+        )
+        # convert IP-Adapter Image Projection layers to diffusers, one per adapter
+        projections = [
+            self._convert_ip_adapter_image_proj_to_diffusers(
+                adapter_state["image_proj"], low_cpu_mem_usage=low_cpu_mem_usage
+            )
+            for adapter_state in state_dicts
+        ]
+        self.encoder_hid_proj = MultiIPAdapterImageProjection(projections)
+        self.config.encoder_hid_dim_type = "ip_image_proj"
+        # Re-apply the model's own dtype and device so the newly attached modules match it.
+        self.to(dtype=self.dtype, device=self.device)
+    def _load_ip_adapter_weights_VPAdapter(self, state_dicts, low_cpu_mem_usage=False):
+        """
+        Install VPAdapter attention processors and image-projection layers on this model.
+        Args:
+            state_dicts (`list`):
+                One dict per adapter; the `"image_proj"` entry of each feeds its projection layer.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `False`):
+                Forwarded unchanged to both conversion helpers.
+        """
+        # Attention processors are swapped in first, before any projection layer is built.
+        self.set_attn_processor(
+            self._convert_ip_adapter_attn_to_diffusers_VPAdapter(state_dicts, low_cpu_mem_usage=low_cpu_mem_usage)
+        )
+        # convert IP-Adapter Image Projection layers to diffusers, one per adapter
+        projections = [
+            self._convert_ip_adapter_image_proj_to_diffusers(
+                adapter_state["image_proj"], low_cpu_mem_usage=low_cpu_mem_usage
+            )
+            for adapter_state in state_dicts
+        ]
+        self.encoder_hid_proj = VPAdapterImageProjection(projections)
+        self.config.encoder_hid_dim_type = "ip_image_proj"
+        # Re-apply the model's own dtype and device so the newly attached modules match it.
+        self.to(dtype=self.dtype, device=self.device)

foleycrafter/models/auffusion/resnet.py ADDED Viewed

	@@ -0,0 +1,685 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+# `TemporalConvLayer` Copyright 2023 Alibaba DAMO-VILAB, The ModelScope Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import partial
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusers.utils import USE_PEFT_BACKEND
+from diffusers.models.activations import get_activation
+from diffusers.models.downsampling import (  # noqa
+    Downsample1D,
+    Downsample2D,
+    FirDownsample2D,
+    KDownsample2D,
+    downsample_2d,
+)
+from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
+from diffusers.models.normalization import AdaGroupNorm
+from diffusers.models.upsampling import (  # noqa
+    FirUpsample2D,
+    KUpsample2D,
+    Upsample1D,
+    Upsample2D,
+    upfirdn2d_native,
+    upsample_2d,
+)
+from foleycrafter.models.auffusion.attention_processor import SpatialNorm
+class ResnetBlock2D(nn.Module):
+    r"""
+    A Resnet block.
+    Parameters:
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, default to be `None`):
+            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
+        conv_shortcut (`bool`, *optional*, default to `False`): Stored as `use_conv_shortcut`.
+        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
+        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
+        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
+        groups_out (`int`, *optional*, default to None):
+            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
+        pre_norm (`bool`, *optional*, default to `True`): Accepted for compatibility; the block always stores `True`.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
+        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
+        skip_time_act (`bool`, *optional*, default to `False`):
+            If `True`, the activation is not applied to the timestep embedding before it is projected.
+        time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
+            By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" or
+            "ada_group" for a stronger conditioning with scale and shift.
+        kernel (`str`, *optional*, default to None): Selects the resampling op used when `up` or `down` is set:
+            `"fir"` uses `upsample_2d`/`downsample_2d`, `"sde_vp"` uses nearest interpolation / average pooling, and
+            any other value uses `Upsample2D`/`Downsample2D`.
+        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
+        use_in_shortcut (`bool`, *optional*, default to `None`):
+            If `True`, add a 1x1 nn.conv2d layer for skip-connection. If `None`, the layer is added whenever
+            `in_channels` differs from the block's output channels.
+        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
+        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
+        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
+            `conv_shortcut` output.
+        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
+            If None, same as `out_channels`.
+    """
+    def __init__(
+        self,
+        *,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        conv_shortcut: bool = False,
+        dropout: float = 0.0,
+        temb_channels: int = 512,
+        groups: int = 32,
+        groups_out: Optional[int] = None,
+        pre_norm: bool = True,
+        eps: float = 1e-6,
+        non_linearity: str = "swish",
+        skip_time_act: bool = False,
+        time_embedding_norm: str = "default",  # default, scale_shift, ada_group, spatial
+        kernel: Optional[str] = None,
+        output_scale_factor: float = 1.0,
+        use_in_shortcut: Optional[bool] = None,
+        up: bool = False,
+        down: bool = False,
+        conv_shortcut_bias: bool = True,
+        conv_2d_out_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        # `pre_norm` is kept in the signature for compatibility; the stored flag has always been forced to True.
+        self.pre_norm = True
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.up = up
+        self.down = down
+        self.output_scale_factor = output_scale_factor
+        self.time_embedding_norm = time_embedding_norm
+        self.skip_time_act = skip_time_act
+        # With the PEFT backend plain torch layers are used; otherwise the LoRA-compatible wrappers, whose forward
+        # takes an extra `scale` argument.
+        linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear
+        conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv
+        if groups_out is None:
+            groups_out = groups
+        if self.time_embedding_norm == "ada_group":
+            self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps)
+        elif self.time_embedding_norm == "spatial":
+            self.norm1 = SpatialNorm(in_channels, temb_channels)
+        else:
+            self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+        self.conv1 = conv_cls(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if temb_channels is not None:
+            if self.time_embedding_norm == "default":
+                self.time_emb_proj = linear_cls(temb_channels, out_channels)
+            elif self.time_embedding_norm == "scale_shift":
+                # Twice the channels: the projection is later split into a scale half and a shift half.
+                self.time_emb_proj = linear_cls(temb_channels, 2 * out_channels)
+            elif self.time_embedding_norm in ("ada_group", "spatial"):
+                # These modes feed `temb` straight into the normalization layers instead.
+                self.time_emb_proj = None
+            else:
+                raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
+        else:
+            self.time_emb_proj = None
+        if self.time_embedding_norm == "ada_group":
+            self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps)
+        elif self.time_embedding_norm == "spatial":
+            self.norm2 = SpatialNorm(out_channels, temb_channels)
+        else:
+            self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
+        self.dropout = torch.nn.Dropout(dropout)
+        conv_2d_out_channels = conv_2d_out_channels or out_channels
+        self.conv2 = conv_cls(out_channels, conv_2d_out_channels, kernel_size=3, stride=1, padding=1)
+        self.nonlinearity = get_activation(non_linearity)
+        self.upsample = self.downsample = None
+        if self.up:
+            if kernel == "fir":
+                fir_kernel = (1, 3, 3, 1)
+                self.upsample = lambda x: upsample_2d(x, kernel=fir_kernel)
+            elif kernel == "sde_vp":
+                self.upsample = partial(F.interpolate, scale_factor=2.0, mode="nearest")
+            else:
+                self.upsample = Upsample2D(in_channels, use_conv=False)
+        elif self.down:
+            if kernel == "fir":
+                fir_kernel = (1, 3, 3, 1)
+                self.downsample = lambda x: downsample_2d(x, kernel=fir_kernel)
+            elif kernel == "sde_vp":
+                self.downsample = partial(F.avg_pool2d, kernel_size=2, stride=2)
+            else:
+                self.downsample = Downsample2D(in_channels, use_conv=False, padding=1, name="op")
+        self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = conv_cls(
+                in_channels,
+                conv_2d_out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=conv_shortcut_bias,
+            )
+    def forward(
+        self,
+        input_tensor: torch.FloatTensor,
+        temb: torch.FloatTensor,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        """
+        Apply the residual block.
+        Args:
+            input_tensor (`torch.FloatTensor`): Input feature map; also the source of the skip connection.
+            temb (`torch.FloatTensor`): Timestep embedding (may be `None` when no projection is configured).
+            scale (`float`, *optional*, defaults to `1.0`):
+                LoRA scale passed to the LoRA-compatible layers when the PEFT backend is not in use.
+        Returns:
+            `torch.FloatTensor`: `(skip + residual) / output_scale_factor`.
+        """
+        hidden_states = input_tensor
+        if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
+            hidden_states = self.norm1(hidden_states, temb)
+        else:
+            hidden_states = self.norm1(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        # Skip path and residual path are resampled with the same op so they can be summed at the end.
+        if self.upsample is not None:
+            # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+            if hidden_states.shape[0] >= 64:
+                input_tensor = input_tensor.contiguous()
+                hidden_states = hidden_states.contiguous()
+            if isinstance(self.upsample, Upsample2D):
+                input_tensor = self.upsample(input_tensor, scale=scale)
+                hidden_states = self.upsample(hidden_states, scale=scale)
+            else:
+                input_tensor = self.upsample(input_tensor)
+                hidden_states = self.upsample(hidden_states)
+        elif self.downsample is not None:
+            if isinstance(self.downsample, Downsample2D):
+                input_tensor = self.downsample(input_tensor, scale=scale)
+                hidden_states = self.downsample(hidden_states, scale=scale)
+            else:
+                input_tensor = self.downsample(input_tensor)
+                hidden_states = self.downsample(hidden_states)
+        hidden_states = self.conv1(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv1(hidden_states)
+        if self.time_emb_proj is not None:
+            if not self.skip_time_act:
+                temb = self.nonlinearity(temb)
+            # NOTE: Maybe we can use different prompt in different time
+            if USE_PEFT_BACKEND:
+                temb = self.time_emb_proj(temb)[:, :, None, None]
+            else:
+                temb = self.time_emb_proj(temb, scale)[:, :, None, None]
+        if temb is not None and self.time_embedding_norm == "default":
+            hidden_states = hidden_states + temb
+        if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
+            hidden_states = self.norm2(hidden_states, temb)
+        else:
+            hidden_states = self.norm2(hidden_states)
+        if temb is not None and self.time_embedding_norm == "scale_shift":
+            # Use distinct names: reusing `scale` here would overwrite the LoRA scale that is still needed by
+            # `conv2` and `conv_shortcut` below.
+            time_scale, time_shift = torch.chunk(temb, 2, dim=1)
+            hidden_states = hidden_states * (1 + time_scale) + time_shift
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv2(hidden_states)
+        if self.conv_shortcut is not None:
+            input_tensor = (
+                self.conv_shortcut(input_tensor, scale) if not USE_PEFT_BACKEND else self.conv_shortcut(input_tensor)
+            )
+        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+        return output_tensor
+# unet_rl.py
+def rearrange_dims(tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Insert or drop a singleton axis depending on the rank of `tensor`.
+    A rank-2 tensor gains a trailing axis, a rank-3 tensor gains an axis at position 2, and a rank-4 tensor has
+    axis 2 removed by taking its index 0.
+    Raises:
+        ValueError: If the rank of `tensor` is not 2, 3 or 4.
+    """
+    # Use the rank, not `len(tensor)`: the latter is the size of the first axis and fails on 0-d tensors.
+    ndim = len(tensor.shape)
+    if ndim == 2:
+        return tensor[:, :, None]
+    if ndim == 3:
+        return tensor[:, :, None, :]
+    if ndim == 4:
+        return tensor[:, :, 0, :]
+    raise ValueError(f"`len(tensor.shape)`: {ndim} has to be 2, 3 or 4.")
+class Conv1dBlock(nn.Module):
+    """
+    Conv1d --> GroupNorm --> activation (Mish by default)
+    Parameters:
+        inp_channels (`int`): Number of input channels.
+        out_channels (`int`): Number of output channels.
+        kernel_size (`int` or `tuple`): Size of the convolving kernel; the padding is `kernel_size // 2`.
+        n_groups (`int`, default `8`): Number of groups to separate the channels into.
+        activation (`str`, defaults to `mish`): Name of the activation function.
+    """
+    def __init__(
+        self,
+        inp_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int]],
+        n_groups: int = 8,
+        activation: str = "mish",
+    ):
+        super().__init__()
+        half_kernel = kernel_size // 2
+        self.conv1d = nn.Conv1d(inp_channels, out_channels, kernel_size, padding=half_kernel)
+        self.group_norm = nn.GroupNorm(n_groups, out_channels)
+        self.mish = get_activation(activation)
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        # `rearrange_dims` is applied on both sides of the group norm, to the conv output and to the normed result.
+        expanded = rearrange_dims(self.conv1d(inputs))
+        normed = rearrange_dims(self.group_norm(expanded))
+        return self.mish(normed)
+# unet_rl.py
+class ResidualTemporalBlock1D(nn.Module):
+    """
+    Residual 1D block built from two `Conv1dBlock`s with an additive time-embedding term.
+    Parameters:
+        inp_channels (`int`): Number of input channels.
+        out_channels (`int`): Number of output channels.
+        embed_dim (`int`): Embedding dimension.
+        kernel_size (`int` or `tuple`): Size of the convolving kernel.
+        activation (`str`, defaults `mish`): Activation applied to the time embedding before its linear projection.
+    """
+    def __init__(
+        self,
+        inp_channels: int,
+        out_channels: int,
+        embed_dim: int,
+        kernel_size: Union[int, Tuple[int, int]] = 5,
+        activation: str = "mish",
+    ):
+        super().__init__()
+        self.conv_in = Conv1dBlock(inp_channels, out_channels, kernel_size)
+        self.conv_out = Conv1dBlock(out_channels, out_channels, kernel_size)
+        self.time_emb_act = get_activation(activation)
+        self.time_emb = nn.Linear(embed_dim, out_channels)
+        # A 1x1 convolution matches channel counts on the skip path; identity when they already agree.
+        if inp_channels != out_channels:
+            self.residual_conv = nn.Conv1d(inp_channels, out_channels, 1)
+        else:
+            self.residual_conv = nn.Identity()
+    def forward(self, inputs: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            inputs : [ batch_size x inp_channels x horizon ]
+            t : [ batch_size x embed_dim ]
+        returns:
+            out : [ batch_size x out_channels x horizon ]
+        """
+        time_embedding = self.time_emb(self.time_emb_act(t))
+        # `rearrange_dims` adds a trailing axis to the projected embedding before the sum.
+        hidden = self.conv_in(inputs) + rearrange_dims(time_embedding)
+        hidden = self.conv_out(hidden)
+        return hidden + self.residual_conv(inputs)
+class TemporalConvLayer(nn.Module):
+    """
+    Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from:
+    https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016
+    Parameters:
+        in_dim (`int`): Number of input channels.
+        out_dim (`int`): Number of output channels.
+        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
+    """
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: Optional[int] = None,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+    ):
+        super().__init__()
+        out_dim = out_dim or in_dim
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        # conv layers
+        self.conv1 = nn.Sequential(
+            nn.GroupNorm(norm_num_groups, in_dim),
+            nn.SiLU(),
+            nn.Conv3d(in_dim, out_dim, (3, 1, 1), padding=(1, 0, 0)),
+        )
+        self.conv2 = nn.Sequential(
+            nn.GroupNorm(norm_num_groups, out_dim),
+            nn.SiLU(),
+            nn.Dropout(dropout),
+            nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
+        )
+        self.conv3 = nn.Sequential(
+            nn.GroupNorm(norm_num_groups, out_dim),
+            nn.SiLU(),
+            nn.Dropout(dropout),
+            nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
+        )
+        self.conv4 = nn.Sequential(
+            nn.GroupNorm(norm_num_groups, out_dim),
+            nn.SiLU(),
+            nn.Dropout(dropout),
+            nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
+        )
+        # zero out the last layer params,so the conv block is identity
+        nn.init.zeros_(self.conv4[-1].weight)
+        nn.init.zeros_(self.conv4[-1].bias)
+    def forward(self, hidden_states: torch.Tensor, num_frames: int = 1) -> torch.Tensor:
+        hidden_states = (
+            hidden_states[None, :].reshape((-1, num_frames) + hidden_states.shape[1:]).permute(0, 2, 1, 3, 4)
+        )
+        identity = hidden_states
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+        hidden_states = self.conv3(hidden_states)
+        hidden_states = self.conv4(hidden_states)
+        hidden_states = identity + hidden_states
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape(
+            (hidden_states.shape[0] * hidden_states.shape[2], -1) + hidden_states.shape[3:]
+        )
+        return hidden_states
+class TemporalResnetBlock(nn.Module):
+    r"""
+    A Resnet block.
+    Parameters:
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, default to be `None`):
+            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
+        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        temb_channels: int = 512,
+        eps: float = 1e-6,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        kernel_size = (3, 1, 1)
+        padding = [k // 2 for k in kernel_size]
+        self.norm1 = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=eps, affine=True)
+        self.conv1 = nn.Conv3d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=padding,
+        )
+        if temb_channels is not None:
+            self.time_emb_proj = nn.Linear(temb_channels, out_channels)
+        else:
+            self.time_emb_proj = None
+        self.norm2 = torch.nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=eps, affine=True)
+        self.dropout = torch.nn.Dropout(0.0)
+        self.conv2 = nn.Conv3d(
+            out_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=padding,
+        )
+        self.nonlinearity = get_activation("silu")
+        self.use_in_shortcut = self.in_channels != out_channels
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = nn.Conv3d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            )
+    def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor:
+        hidden_states = input_tensor
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+        if self.time_emb_proj is not None:
+            temb = self.nonlinearity(temb)
+            temb = self.time_emb_proj(temb)[:, :, :, None, None]
+            temb = temb.permute(0, 2, 1, 3, 4)
+            hidden_states = hidden_states + temb
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+        output_tensor = input_tensor + hidden_states
+        return output_tensor
+# VideoResBlock
+class SpatioTemporalResBlock(nn.Module):
+    r"""
+    A SpatioTemporal Resnet block.
+    Parameters:
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, default to be `None`):
+            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
+        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the spatial resenet.
+        temporal_eps (`float`, *optional*, defaults to `eps`): The epsilon to use for the temporal resnet.
+        merge_factor (`float`, *optional*, defaults to `0.5`): The merge factor to use for the temporal mixing.
+        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
+            The merge strategy to use for the temporal mixing.
+        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
+            If `True`, switch the spatial and temporal mixing.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        temb_channels: int = 512,
+        eps: float = 1e-6,
+        temporal_eps: Optional[float] = None,
+        merge_factor: float = 0.5,
+        merge_strategy="learned_with_images",
+        switch_spatial_to_temporal_mix: bool = False,
+    ):
+        super().__init__()
+        self.spatial_res_block = ResnetBlock2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            eps=eps,
+        )
+        self.temporal_res_block = TemporalResnetBlock(
+            in_channels=out_channels if out_channels is not None else in_channels,
+            out_channels=out_channels if out_channels is not None else in_channels,
+            temb_channels=temb_channels,
+            eps=temporal_eps if temporal_eps is not None else eps,
+        )
+        self.time_mixer = AlphaBlender(
+            alpha=merge_factor,
+            merge_strategy=merge_strategy,
+            switch_spatial_to_temporal_mix=switch_spatial_to_temporal_mix,
+        )
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ):
+        num_frames = image_only_indicator.shape[-1]
+        hidden_states = self.spatial_res_block(hidden_states, temb)
+        batch_frames, channels, height, width = hidden_states.shape
+        batch_size = batch_frames // num_frames
+        hidden_states_mix = (
+            hidden_states[None, :].reshape(batch_size, num_frames, channels, height, width).permute(0, 2, 1, 3, 4)
+        )
+        hidden_states = (
+            hidden_states[None, :].reshape(batch_size, num_frames, channels, height, width).permute(0, 2, 1, 3, 4)
+        )
+        if temb is not None:
+            temb = temb.reshape(batch_size, num_frames, -1)
+        hidden_states = self.temporal_res_block(hidden_states, temb)
+        hidden_states = self.time_mixer(
+            x_spatial=hidden_states_mix,
+            x_temporal=hidden_states,
+            image_only_indicator=image_only_indicator,
+        )
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape(batch_frames, channels, height, width)
+        return hidden_states
+class AlphaBlender(nn.Module):
+    r"""
+    A module to blend spatial and temporal features.
+    Parameters:
+        alpha (`float`): The initial value of the blending factor.
+        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
+            The merge strategy to use for the temporal mixing.
+        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
+            If `True`, switch the spatial and temporal mixing.
+    """
+    strategies = ["learned", "fixed", "learned_with_images"]
+    def __init__(
+        self,
+        alpha: float,
+        merge_strategy: str = "learned_with_images",
+        switch_spatial_to_temporal_mix: bool = False,
+    ):
+        super().__init__()
+        self.merge_strategy = merge_strategy
+        self.switch_spatial_to_temporal_mix = switch_spatial_to_temporal_mix  # For TemporalVAE
+        if merge_strategy not in self.strategies:
+            raise ValueError(f"merge_strategy needs to be in {self.strategies}")
+        if self.merge_strategy == "fixed":
+            self.register_buffer("mix_factor", torch.Tensor([alpha]))
+        elif self.merge_strategy == "learned" or self.merge_strategy == "learned_with_images":
+            self.register_parameter("mix_factor", torch.nn.Parameter(torch.Tensor([alpha])))
+        else:
+            raise ValueError(f"Unknown merge strategy {self.merge_strategy}")
+    def get_alpha(self, image_only_indicator: torch.Tensor, ndims: int) -> torch.Tensor:
+        if self.merge_strategy == "fixed":
+            alpha = self.mix_factor
+        elif self.merge_strategy == "learned":
+            alpha = torch.sigmoid(self.mix_factor)
+        elif self.merge_strategy == "learned_with_images":
+            if image_only_indicator is None:
+                raise ValueError("Please provide image_only_indicator to use learned_with_images merge strategy")
+            alpha = torch.where(
+                image_only_indicator.bool(),
+                torch.ones(1, 1, device=image_only_indicator.device),
+                torch.sigmoid(self.mix_factor)[..., None],
+            )
+            # (batch, channel, frames, height, width)
+            if ndims == 5:
+                alpha = alpha[:, None, :, None, None]
+            # (batch*frames, height*width, channels)
+            elif ndims == 3:
+                alpha = alpha.reshape(-1)[:, None, None]
+            else:
+                raise ValueError(f"Unexpected ndims {ndims}. Dimensions should be 3 or 5")
+        else:
+            raise NotImplementedError
+        return alpha
+    def forward(
+        self,
+        x_spatial: torch.Tensor,
+        x_temporal: torch.Tensor,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        alpha = self.get_alpha(image_only_indicator, x_spatial.ndim)
+        alpha = alpha.to(x_spatial.dtype)
+        if self.switch_spatial_to_temporal_mix:
+            alpha = 1.0 - alpha
+        x = alpha * x_spatial + (1.0 - alpha) * x_temporal
+        return x

foleycrafter/models/auffusion/transformer_2d.py ADDED Viewed

	@@ -0,0 +1,460 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+import torch
+import torch.nn.functional as F
+from torch import nn
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.embeddings import ImagePositionalEmbeddings
+from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, is_torch_version
+from diffusers.models.embeddings import PatchEmbed, PixArtAlphaTextProjection
+from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import AdaLayerNormSingle
+from foleycrafter.models.auffusion.attention import BasicTransformerBlock
+@dataclass
+class Transformer2DModelOutput(BaseOutput):
+    """
+    The output of [`Transformer2DModel`].
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
+            The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
+            distributions for the unnoised latent pixels.
+    """
+    # Sole field of this output container; its expected shape is described under `Args` above.
+    sample: torch.FloatTensor
+class Transformer2DModel(ModelMixin, ConfigMixin):
+    """
+    A 2D Transformer model for image-like data.
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            The number of channels in the input and output (specify if the input is **continuous**).
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+            This is fixed during training since it is used to learn a number of position embeddings.
+        num_vector_embeds (`int`, *optional*):
+            The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
+            Includes the class for the masked latent pixel.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*):
+            The number of diffusion steps used during training. Pass if at least one of the norm_layers is
+            `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
+            added to the hidden states.
+            During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
+        attention_bias (`bool`, *optional*):
+            Configure if the `TransformerBlocks` attention should contain a bias parameter.
+    """
+    _supports_gradient_checkpointing = True
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        num_vector_embeds: Optional[int] = None,
+        patch_size: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_type: str = "layer_norm",
+        norm_elementwise_affine: bool = True,
+        norm_eps: float = 1e-5,
+        attention_type: str = "default",
+        caption_channels: int = None,
+    ):
+        # Builds, in order: the input layers for one of three input modes (continuous, vectorized or patched),
+        # `num_layers` transformer blocks, the matching output layers, and the optional `ada_norm_single` /
+        # caption-projection modules.
+        super().__init__()
+        self.use_linear_projection = use_linear_projection
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        # Width of the token embeddings processed by the transformer blocks.
+        inner_dim = num_attention_heads * attention_head_dim
+        # Plain torch layers when USE_PEFT_BACKEND is set, the LoRA-compatible variants otherwise.
+        conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv
+        linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear
+        # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
+        # Define whether input is continuous or discrete depending on configuration
+        self.is_input_continuous = (in_channels is not None) and (patch_size is None)
+        self.is_input_vectorized = num_vector_embeds is not None
+        self.is_input_patches = in_channels is not None and patch_size is not None
+        # Configs that set `num_embeds_ada_norm` but keep the default `norm_type` are switched to `ada_norm`
+        # after a deprecation warning.
+        if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
+            deprecation_message = (
+                f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
+                " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
+                " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
+                " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
+                " would be very nice if you could open a Pull request for the `transformer/config.json` file"
+            )
+            deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
+            norm_type = "ada_norm"
+        # Reject configurations that select more than one input mode, or none at all.
+        if self.is_input_continuous and self.is_input_vectorized:
+            raise ValueError(
+                f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
+                " sure that either `in_channels` or `num_vector_embeds` is None."
+            )
+        elif self.is_input_vectorized and self.is_input_patches:
+            raise ValueError(
+                f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
+                " sure that either `num_vector_embeds` or `num_patches` is None."
+            )
+        elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
+            raise ValueError(
+                f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
+                f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
+            )
+        # 2. Define input layers
+        if self.is_input_continuous:
+            self.in_channels = in_channels
+            self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+            if use_linear_projection:
+                self.proj_in = linear_cls(in_channels, inner_dim)
+            else:
+                self.proj_in = conv_cls(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        elif self.is_input_vectorized:
+            # NOTE: `assert` statements are skipped when Python runs with `-O`.
+            assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
+            assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
+            self.height = sample_size
+            self.width = sample_size
+            self.num_vector_embeds = num_vector_embeds
+            self.num_latent_pixels = self.height * self.width
+            self.latent_image_embedding = ImagePositionalEmbeddings(
+                num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
+            )
+        elif self.is_input_patches:
+            assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
+            self.height = sample_size
+            self.width = sample_size
+            self.patch_size = patch_size
+            interpolation_scale = self.config.sample_size // 64  # => 64 (= 512 pixart) has interpolation scale 1
+            # Sample sizes below 64 would floor to 0, so the scale is clamped to at least 1.
+            interpolation_scale = max(interpolation_scale, 1)
+            self.pos_embed = PatchEmbed(
+                height=sample_size,
+                width=sample_size,
+                patch_size=patch_size,
+                in_channels=in_channels,
+                embed_dim=inner_dim,
+                interpolation_scale=interpolation_scale,
+            )
+        # 3. Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                # NOTE: remember to change
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                    only_cross_attention=only_cross_attention,
+                    double_self_attention=double_self_attention,
+                    upcast_attention=upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                    norm_eps=norm_eps,
+                    attention_type=attention_type,
+                )
+                # `d` is unused; every block is built with the same arguments.
+                for d in range(num_layers)
+            ]
+        )
+        # 4. Define output layers
+        self.out_channels = in_channels if out_channels is None else out_channels
+        if self.is_input_continuous:
+            # TODO: should use out_channels for continuous projections
+            if use_linear_projection:
+                self.proj_out = linear_cls(inner_dim, in_channels)
+            else:
+                self.proj_out = conv_cls(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+        elif self.is_input_vectorized:
+            self.norm_out = nn.LayerNorm(inner_dim)
+            # One output class fewer than `num_vector_embeds` (presumably the masked class is excluded; confirm).
+            self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
+        elif self.is_input_patches and norm_type != "ada_norm_single":
+            self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+            self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
+            self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
+        elif self.is_input_patches and norm_type == "ada_norm_single":
+            self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+            self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
+            self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
+        # 5. PixArt-Alpha blocks.
+        self.adaln_single = None
+        self.use_additional_conditions = False
+        if norm_type == "ada_norm_single":
+            self.use_additional_conditions = self.config.sample_size == 128
+            # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
+            # additional conditions until we find better name
+            self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions)
+        # Optional projection of caption features to `inner_dim`; only built when `caption_channels` is given.
+        self.caption_projection = None
+        if caption_channels is not None:
+            self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
+        self.gradient_checkpointing = False
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Dict[str, torch.Tensor] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ):
+        """
+        The [`Transformer2DModel`] forward method.
+        Args:
+            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
+                Input `hidden_states`.
+            encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+                `AdaLayerZeroNorm`.
+            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            attention_mask ( `torch.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            encoder_attention_mask ( `torch.Tensor`, *optional*):
+                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+                    * Mask `(batch, sequence_length)` True = keep, False = discard.
+                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+        # Retrieve lora scale.
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        # 1. Input
+        if self.is_input_continuous:
+            batch, _, height, width = hidden_states.shape
+            inner_dim = hidden_states.shape[1]
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+            if not self.use_linear_projection:
+                hidden_states = (
+                    self.proj_in(hidden_states, scale=lora_scale)
+                    if not USE_PEFT_BACKEND
+                    else self.proj_in(hidden_states)
+                )
+                inner_dim = hidden_states.shape[1]
+                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+            else:
+                inner_dim = hidden_states.shape[1]
+                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+                hidden_states = (
+                    self.proj_in(hidden_states, scale=lora_scale)
+                    if not USE_PEFT_BACKEND
+                    else self.proj_in(hidden_states)
+                )
+        elif self.is_input_vectorized:
+            hidden_states = self.latent_image_embedding(hidden_states)
+        elif self.is_input_patches:
+            height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
+            self.height, self.width = height, width
+            hidden_states = self.pos_embed(hidden_states)
+            if self.adaln_single is not None:
+                if self.use_additional_conditions and added_cond_kwargs is None:
+                    raise ValueError(
+                        "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
+                    )
+                batch_size = hidden_states.shape[0]
+                timestep, embedded_timestep = self.adaln_single(
+                    timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+                )
+        if self.caption_projection is not None:
+            batch_size = hidden_states.shape[0]
+            encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+        # 2. Blocks
+        for block in self.transformer_blocks:
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+                    return custom_forward
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    timestep,
+                    cross_attention_kwargs,
+                    class_labels,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    timestep=timestep,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=class_labels,
+                )
+        # 3. Output
+        if self.is_input_continuous:
+            if not self.use_linear_projection:
+                hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+                hidden_states = (
+                    self.proj_out(hidden_states, scale=lora_scale)
+                    if not USE_PEFT_BACKEND
+                    else self.proj_out(hidden_states)
+                )
+            else:
+                hidden_states = (
+                    self.proj_out(hidden_states, scale=lora_scale)
+                    if not USE_PEFT_BACKEND
+                    else self.proj_out(hidden_states)
+                )
+                hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+            output = hidden_states + residual
+        elif self.is_input_vectorized:
+            hidden_states = self.norm_out(hidden_states)
+            logits = self.out(hidden_states)
+            # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
+            logits = logits.permute(0, 2, 1)
+            # log(p(x_0))
+            output = F.log_softmax(logits.double(), dim=1).float()
+        if self.is_input_patches:
+            if self.config.norm_type != "ada_norm_single":
+                conditioning = self.transformer_blocks[0].norm1.emb(
+                    timestep, class_labels, hidden_dtype=hidden_states.dtype
+                )
+                shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
+                hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+                hidden_states = self.proj_out_2(hidden_states)
+            elif self.config.norm_type == "ada_norm_single":
+                shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
+                hidden_states = self.norm_out(hidden_states)
+                # Modulation
+                hidden_states = hidden_states * (1 + scale) + shift
+                hidden_states = self.proj_out(hidden_states)
+                hidden_states = hidden_states.squeeze(1)
+            # unpatchify
+            if self.adaln_single is None:
+                height = width = int(hidden_states.shape[1] ** 0.5)
+            hidden_states = hidden_states.reshape(
+                shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
+            )
+            hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+            output = hidden_states.reshape(
+                shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
+            )
+        if not return_dict:
+            return (output,)
+        return Transformer2DModelOutput(sample=output)

foleycrafter/models/auffusion/unet_2d_blocks.py ADDED Viewed

The diff for this file is too large to render. See raw diff

foleycrafter/models/auffusion_unet.py ADDED Viewed

	@@ -0,0 +1,1260 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.utils.import_utils import is_xformers_available, is_torch_version
+from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from diffusers.models.activations import get_activation
+# from diffusers import StableDiffusionGLIGENPipeline
+from diffusers.models.attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+    XFormersAttnProcessor,
+)
+from diffusers.models.embeddings import (
+    GaussianFourierProjection,
+    ImageHintTimeEmbedding,
+    ImageProjection,
+    ImageTimeEmbedding,
+    PositionNet,
+    TextImageProjection,
+    TextImageTimeEmbedding,
+    TextTimeEmbedding,
+    TimestepEmbedding,
+    Timesteps,
+)
+from diffusers.models.modeling_utils import ModelMixin
+from foleycrafter.models.auffusion.unet_2d_blocks import (
+    UNetMidBlock2D,
+    UNetMidBlock2DCrossAttn,
+    UNetMidBlock2DSimpleCrossAttn,
+    get_down_block,
+    get_up_block,
+)
+from foleycrafter.models.auffusion.attention_processor\
+    import AttnProcessor2_0
+from foleycrafter.models.adapters.ip_adapter import TimeProjModel
+from foleycrafter.models.auffusion.loaders.unet import UNet2DConditionLoadersMixin
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@dataclass
+class UNet2DConditionOutput(BaseOutput):
+    """
+    The output of [`UNet2DConditionModel`].
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
+            The field default is `None`.
+    """
+    # NOTE: the default is `None` even though the annotation is not `Optional`.
+    sample: torch.FloatTensor = None
+class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+    r"""
+    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
+    shaped output.
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
+    for all models (such as downloading or saving).
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
+            Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
+            `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
+            The tuple of upsample blocks to use.
+        only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
+            Whether to include self-attention in the basic transformer blocks, see
+            [`~models.attention.BasicTransformerBlock`].
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+            If `None`, normalization and activation layers are skipped in post-processing.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
+            The dimension of the cross attention features.
+        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        reverse_transformer_layers_per_block (`Tuple[Tuple]`, *optional*, defaults to `None`):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
+            blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        encoder_hid_dim (`int`, *optional*, defaults to None):
+            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+            dimension to `cross_attention_dim`.
+        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
+            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
+            embeddings of dimension `cross_attention_dim` according to `encoder_hid_dim_type`.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        num_attention_heads (`int`, *optional*):
+            The number of attention heads. If not defined, defaults to `attention_head_dim`
+        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
+            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
+        class_embed_type (`str`, *optional*, defaults to `None`):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
+            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+        addition_embed_type (`str`, *optional*, defaults to `None`):
+            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
+            "text". "text" will use the `TextTimeEmbedding` layer.
+        addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
+            Dimension for the timestep embeddings.
+        num_class_embeds (`int`, *optional*, defaults to `None`):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+            class conditioning with `class_embed_type` equal to `None`.
+        time_embedding_type (`str`, *optional*, defaults to `positional`):
+            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
+        time_embedding_dim (`int`, *optional*, defaults to `None`):
+            An optional override for the dimension of the projected time embedding.
+        time_embedding_act_fn (`str`, *optional*, defaults to `None`):
+            Optional activation function to use only once on the time embeddings before they are passed to the rest of
+            the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
+        timestep_post_act (`str`, *optional*, defaults to `None`):
+            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
+        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
+            The dimension of `cond_proj` layer in the timestep embedding.
+        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
+        conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
+        projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
+            `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
+        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
+            embeddings with the class embeddings.
+        mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
+            Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
+            `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
+            `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
+            otherwise.
+    """
+    _supports_gradient_checkpointing = True
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
+        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: Union[int, Tuple[int]] = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        dropout: float = 0.0,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: Union[int, Tuple[int]] = 1280,
+        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
+        reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
+        encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        resnet_skip_time_act: bool = False,
+        resnet_out_scale_factor: int = 1.0,
+        time_embedding_type: str = "positional",
+        time_embedding_dim: Optional[int] = None,
+        time_embedding_act_fn: Optional[str] = None,
+        timestep_post_act: Optional[str] = None,
+        time_cond_proj_dim: Optional[int] = None,
+        conv_in_kernel: int = 3,
+        conv_out_kernel: int = 3,
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        attention_type: str = "default",
+        class_embeddings_concat: bool = False,
+        mid_block_only_cross_attention: Optional[bool] = None,
+        cross_attention_norm: Optional[str] = None,
+        addition_embed_type_num_heads=64,
+        # param for joint
+        video_feature_dim: tuple=(320, 640, 1280, 1280),
+        video_cross_attn_dim: int=1024,
+        video_frame_nums: int=16,
+    ):
+        super().__init__()
+        self.sample_size = sample_size
+        if num_attention_heads is not None:
+            raise ValueError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+            )
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
+            )
+        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+            )
+        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+            )
+        if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
+            for layer_number_per_block in transformer_layers_per_block:
+                if isinstance(layer_number_per_block, list):
+                    raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
+        # input
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2d(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+        # time
+        if time_embedding_type == "fourier":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
+            if time_embed_dim % 2 != 0:
+                raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
+            self.time_proj = GaussianFourierProjection(
+                time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
+            )
+            timestep_input_dim = time_embed_dim
+        elif time_embedding_type == "positional":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
+            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            timestep_input_dim = block_out_channels[0]
+        else:
+            raise ValueError(
+                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
+            )
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+            post_act_fn=timestep_post_act,
+            cond_proj_dim=time_cond_proj_dim,
+        )
+        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
+            encoder_hid_dim_type = "text_proj"
+            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
+            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
+        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
+            )
+        if encoder_hid_dim_type == "text_proj":
+            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
+        elif encoder_hid_dim_type == "text_image_proj":
+            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `encoder_hid_dim_type == "text_image_proj"` (Kandinsky 2.1)
+            self.encoder_hid_proj = TextImageProjection(
+                text_embed_dim=encoder_hid_dim,
+                image_embed_dim=cross_attention_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2
+            self.encoder_hid_proj = ImageProjection(
+                image_embed_dim=encoder_hid_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
+            )
+        else:
+            self.encoder_hid_proj = None
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        elif class_embed_type == "projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
+            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
+            # 2. it projects from an arbitrary input dimension.
+            #
+            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
+            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
+            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
+            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif class_embed_type == "simple_projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+        if addition_embed_type == "text":
+            if encoder_hid_dim is not None:
+                text_time_embedding_from_dim = encoder_hid_dim
+            else:
+                text_time_embedding_from_dim = cross_attention_dim
+            self.add_embedding = TextTimeEmbedding(
+                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
+            )
+        elif addition_embed_type == "text_image":
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
+            self.add_embedding = TextImageTimeEmbedding(
+                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+            )
+        elif addition_embed_type == "text_time":
+            self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
+            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif addition_embed_type == "image":
+            # Kandinsky 2.2
+            self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type == "image_hint":
+            # Kandinsky 2.2 ControlNet
+            self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type is not None:
+            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
+        if time_embedding_act_fn is None:
+            self.time_embed_act = None
+        else:
+            self.time_embed_act = get_activation(time_embedding_act_fn)
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+        if isinstance(only_cross_attention, bool):
+            if mid_block_only_cross_attention is None:
+                mid_block_only_cross_attention = only_cross_attention
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+        if mid_block_only_cross_attention is None:
+            mid_block_only_cross_attention = False
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+        if isinstance(layers_per_block, int):
+            layers_per_block = [layers_per_block] * len(down_block_types)
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+        if class_embeddings_concat:
+            # The time embeddings are concatenated with the class embeddings. The dimension of the
+            # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
+            # regular time embeddings
+            blocks_time_embed_dim = time_embed_dim * 2
+        else:
+            blocks_time_embed_dim = time_embed_dim
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block[i],
+                transformer_layers_per_block=transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim[i],
+                num_attention_heads=num_attention_heads[i],
+                downsample_padding=downsample_padding,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
+            )
+            self.down_blocks.append(down_block)
+        # mid
+        if mid_block_type == "UNetMidBlock2DCrossAttn":
+            self.mid_block = UNetMidBlock2DCrossAttn(
+                transformer_layers_per_block=transformer_layers_per_block[-1],
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim[-1],
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+                attention_type=attention_type,
+            )
+        elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
+            self.mid_block = UNetMidBlock2DSimpleCrossAttn(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                cross_attention_dim=cross_attention_dim[-1],
+                attention_head_dim=attention_head_dim[-1],
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                skip_time_act=resnet_skip_time_act,
+                only_cross_attention=mid_block_only_cross_attention,
+                cross_attention_norm=cross_attention_norm,
+            )
+        elif mid_block_type == "UNetMidBlock2D":
+            self.mid_block = UNetMidBlock2D(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                num_layers=0,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                add_attention=False,
+            )
+        elif mid_block_type is None:
+            self.mid_block = None
+        else:
+            raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+        reversed_layers_per_block = list(reversed(layers_per_block))
+        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+        reversed_transformer_layers_per_block = (
+            list(reversed(transformer_layers_per_block))
+            if reverse_transformer_layers_per_block is None
+            else reverse_transformer_layers_per_block
+        )
+        only_cross_attention = list(reversed(only_cross_attention))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=reversed_layers_per_block[i] + 1,
+                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resolution_idx=i,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=reversed_cross_attention_dim[i],
+                num_attention_heads=reversed_num_attention_heads[i],
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+        # out
+        if norm_num_groups is not None:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
+            )
+            self.conv_act = get_activation(act_fn)
+        else:
+            self.conv_norm_out = None
+            self.conv_act = None
+        conv_out_padding = (conv_out_kernel - 1) // 2
+        self.conv_out = nn.Conv2d(
+            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
+        )
+        if attention_type in ["gated", "gated-text-image"]:
+            positive_len = 768
+            if isinstance(cross_attention_dim, int):
+                positive_len = cross_attention_dim
+            elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
+                positive_len = cross_attention_dim[0]
+            feature_type = "text-only" if attention_type == "gated" else "text-image"
+            self.position_net = TimeProjModel(
+                positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
+            )
+        # additional settings
+        self.video_feature_dim    = video_feature_dim
+        self.cross_attention_dim  = cross_attention_dim
+        self.video_cross_attn_dim = video_cross_attn_dim
+        self.video_frame_nums     = video_frame_nums
+        self.multi_frames_condition = False
+    def load_attention(self):
+        attn_dict = {}
+        for name in self.attn_processors.keys():
+            # if self-attention, save feature
+            if name.endswith("attn1.processor"):
+                if is_xformers_available():
+                    attn_dict[name] = XFormersAttnProcessor()
+                else:
+                    attn_dict[name] = AttnProcessor()
+            else:
+                attn_dict[name] = AttnProcessor2_0()
+        self.set_attn_processor(attn_dict)
+    def get_writer_feature(self):
+        return self.attn_feature_writer.get_cross_attention_feature()
+    def clear_writer_feature(self):
+        self.attn_feature_writer.clear_cross_attention_feature()
+    def disable_feature_adapters(self):
+        raise NotImplementedError
+    def set_reader_feature(self, features:list):
+        return self.attn_feature_reader.set_cross_attention_feature(features)
+    @property
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+            return processors
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+        return processors
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
+        r"""
+        Sets the attention processor to use to compute attention.
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+        """
+        count = len(self.attn_processors.keys())
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor, _remove_lora=_remove_lora)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+        self.set_attn_processor(processor, _remove_lora=True)
+    def set_attention_slice(self, slice_size):
+        r"""
+        Enable sliced attention computation.
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+        num_sliceable_layers = len(sliceable_head_dims)
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+    def enable_freeu(self, s1, s2, b1, b2):
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+    def disable_freeu(self):
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
+                    setattr(upsample_block, k, None)
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+        <Tip warning={true}>
+        This API is 🧪 experimental.
+        </Tip>
+        """
+        self.original_attn_processors = None
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+        self.original_attn_processors = self.attn_processors
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+        <Tip warning={true}>
+        This API is 🧪 experimental.
+        </Tip>
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet2DConditionOutput, Tuple]:
+        # import ipdb; ipdb.set_trace()
+        r"""
+        The [`UNet2DConditionModel`] forward method.
+        Args:
+            sample (`torch.FloatTensor`):
+                The noisy input tensor with the following shape `(batch, channel, height, width)`.
+            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.FloatTensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
+                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+                through the `self.time_embedding` layer to obtain the timestep embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+                A tuple of tensors that if specified are added to the residuals of down unet blocks.
+            mid_block_additional_residual: (`torch.Tensor`, *optional*):
+                A tensor that if specified is added to the residual of the middle unet block.
+            encoder_attention_mask (`torch.Tensor`):
+                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+                which adds large negative values to the attention scores corresponding to "discard" tokens.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added to UNet long skip connections from down blocks to up blocks for
+                example from ControlNet side model(s)
+            mid_block_additional_residual (`torch.Tensor`, *optional*):
+                additional residual to be added to UNet mid block output, for example from ControlNet side model
+            down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
+                a `tuple` is returned where the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+        for dim in sample.shape[-2:]:
+            if dim % default_overall_up_factor != 0:
+                # Forward upsample size to force interpolation output size.
+                forward_upsample_size = True
+                break
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+        t_emb = self.time_proj(timesteps)
+        # `Timesteps` does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+                # `Timesteps` does not contain any weights and will always return f32 tensors
+                # there might be better ways to encapsulate this.
+                class_labels = class_labels.to(dtype=sample.dtype)
+            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+            if self.config.class_embeddings_concat:
+                emb = torch.cat([emb, class_emb], dim=-1)
+            else:
+                emb = emb + class_emb
+        if self.config.addition_embed_type == "text":
+            aug_emb = self.add_embedding(encoder_hidden_states)
+        elif self.config.addition_embed_type == "text_image":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+            aug_emb = self.add_embedding(text_embs, image_embs)
+        elif self.config.addition_embed_type == "text_time":
+            # SDXL - style
+            if "text_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                )
+            text_embeds = added_cond_kwargs.get("text_embeds")
+            if "time_ids" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                )
+            time_ids = added_cond_kwargs.get("time_ids")
+            time_embeds = self.add_time_proj(time_ids.flatten())
+            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+            add_embeds = add_embeds.to(emb.dtype)
+            aug_emb = self.add_embedding(add_embeds)
+        elif self.config.addition_embed_type == "image":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            aug_emb = self.add_embedding(image_embs)
+        elif self.config.addition_embed_type == "image_hint":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            hint = added_cond_kwargs.get("hint")
+            aug_emb, hint = self.add_embedding(image_embs, hint)
+            sample = torch.cat([sample, hint], dim=1)
+        emb = emb + aug_emb if aug_emb is not None else emb
+        if self.time_embed_act is not None:
+            emb = self.time_embed_act(emb)
+        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+            # Kadinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            image_embeds = self.encoder_hid_proj(image_embeds)
+            if isinstance(image_embeds, list):
+                image_embeds = [image_embed.to(encoder_hidden_states.dtype) for image_embed in image_embeds]
+            else:
+                image_embeds = image_embeds.to(encoder_hidden_states.dtype)
+            encoder_hidden_states = (encoder_hidden_states, image_embeds)
+            # encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1)
+        # import ipdb; ipdb.set_trace()
+        # 2. pre-process
+        sample = self.conv_in(sample)
+        # 2.5 GLIGEN position net
+        if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
+            cross_attention_kwargs = cross_attention_kwargs.copy()
+            gligen_args = cross_attention_kwargs.pop("gligen")
+            cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
+        # 3. down
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+        # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
+        is_adapter = down_intrablock_additional_residuals is not None
+        # maintain backward compatibility for legacy usage, where
+        #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
+        #       but can only use one or the other
+        if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
+            deprecate(
+                "T2I should not use down_block_additional_residuals",
+                "1.3.0",
+                "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
+                       and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
+                       for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
+                standard_warn=False,
+            )
+            down_intrablock_additional_residuals = down_block_additional_residuals
+            is_adapter = True
+        # import ipdb; ipdb.set_trace()
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                # For t2i-adapter CrossAttnDownBlock2D
+                additional_residuals = {}
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    **additional_residuals,
+                )
+                # import ipdb; ipdb.set_trace()
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    sample += down_intrablock_additional_residuals.pop(0)
+            down_block_res_samples += res_samples
+        if is_controlnet:
+            new_down_block_res_samples = ()
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+            down_block_res_samples = new_down_block_res_samples
+        # 4. mid
+        if self.mid_block is not None:
+            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                sample = self.mid_block(sample, emb)
+            # To support T2I-Adapter-XL
+            if (
+                is_adapter
+                and len(down_intrablock_additional_residuals) > 0
+                and sample.shape == down_intrablock_additional_residuals[0].shape
+            ):
+                sample += down_intrablock_additional_residuals.pop(0)
+        if is_controlnet:
+            sample = sample + mid_block_additional_residual
+        # import ipdb; ipdb.set_trace()
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    scale=lora_scale,
+                )
+        # import ipdb; ipdb.set_trace()
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+        if not return_dict:
+            return (sample,)
+        # import ipdb; ipdb.set_trace()
+        return UNet2DConditionOutput(sample=sample)

foleycrafter/models/specvqgan/data/greatesthit.py ADDED Viewed

	@@ -0,0 +1,993 @@

+from matplotlib import collections
+import json
+import os
+import copy
+import matplotlib.pyplot as plt
+import torch
+from torchvision import transforms
+import numpy as np
+from tqdm import tqdm
+from random import sample
+import torchaudio
+import logging
+import collections
+from glob import glob
+import sys
+import albumentations
+import soundfile
+sys.path.insert(0, '.')  # nopep8
+from train import instantiate_from_config
+from foleycrafter.models.specvqgan.data.transforms import *
+# NOTE(review): presumably pins torchaudio's I/O backend to sox_io — confirm it is still supported by the installed torchaudio.
+torchaudio.set_audio_backend("sox_io")
+logger = logging.getLogger(f'main.{__name__}')
+# Audio sample rate: clip start indices are in samples and are turned into frame indices via FPS * start_idx / SR.
+SR = 22050
+# Video frame rate used for the sample-index -> frame-index conversion.
+FPS = 15
+# NOTE(review): not referenced in the visible part of this file — confirm it is used further down.
+MAX_SAMPLE_ITER = 10
+def non_negative(x): return int(np.round(max(0, x), 0))
+def rms(x): return np.sqrt(np.mean(x**2))
+def get_GH_data_identifier(video_name, start_idx, split='_'):
+    if isinstance(start_idx, str):
+        return video_name + split + start_idx
+    elif isinstance(start_idx, int):
+        return video_name + split + str(start_idx)
+    else:
+        raise NotImplementedError
+class Crop(object):
+    def __init__(self, cropped_shape=None, random_crop=False):
+        self.cropped_shape = cropped_shape
+        if cropped_shape is not None:
+            mel_num, spec_len = cropped_shape
+            if random_crop:
+                self.cropper = albumentations.RandomCrop
+            else:
+                self.cropper = albumentations.CenterCrop
+            self.preprocessor = albumentations.Compose([self.cropper(mel_num, spec_len)])
+        else:
+            self.preprocessor = lambda **kwargs: kwargs
+    def __call__(self, item):
+        item['image'] = self.preprocessor(image=item['image'])['image']
+        if 'cond_image' in item.keys():
+            item['cond_image'] = self.preprocessor(image=item['cond_image'])['image']
+        return item
+class CropImage(Crop):
+    def __init__(self, *crop_args):
+        super().__init__(*crop_args)
+class CropFeats(Crop):
+    def __init__(self, *crop_args):
+        super().__init__(*crop_args)
+    def __call__(self, item):
+        item['feature'] = self.preprocessor(image=item['feature'])['image']
+        return item
+class CropCoords(Crop):
+    def __init__(self, *crop_args):
+        super().__init__(*crop_args)
+    def __call__(self, item):
+        item['coord'] = self.preprocessor(image=item['coord'])['image']
+        return item
+class ResampleFrames(object):
+    def __init__(self, feat_sample_size, times_to_repeat_after_resample=None):
+        self.feat_sample_size = feat_sample_size
+        self.times_to_repeat_after_resample = times_to_repeat_after_resample
+    def __call__(self, item):
+        feat_len = item['feature'].shape[0]
+        ## resample
+        assert feat_len >= self.feat_sample_size
+        # evenly spaced points (abcdefghkl -> aoooofoooo)
+        idx = np.linspace(0, feat_len, self.feat_sample_size, dtype=np.int, endpoint=False)
+        # xoooo xoooo -> ooxoo ooxoo
+        shift = feat_len // (self.feat_sample_size + 1)
+        idx = idx + shift
+        ## repeat after resampling (abc -> aaaabbbbcccc)
+        if self.times_to_repeat_after_resample is not None and self.times_to_repeat_after_resample > 1:
+            idx = np.repeat(idx, self.times_to_repeat_after_resample)
+        item['feature'] = item['feature'][idx, :]
+        return item
+class GreatestHitSpecs(torch.utils.data.Dataset):
+    def __init__(self, split, spec_dir_path, spec_len, random_crop, mel_num,
+                spec_crop_len, L=2.0, rand_shift=False, spec_transforms=None, splits_path='./data',
+                meta_path='./data/info_r2plus1d_dim1024_15fps.json'):
+        super().__init__()
+        self.split = split
+        self.specs_dir = spec_dir_path
+        self.spec_transforms = spec_transforms
+        self.splits_path = splits_path
+        self.meta_path = meta_path
+        self.spec_len = spec_len
+        self.rand_shift = rand_shift
+        self.L = L
+        self.spec_take_first = int(math.ceil(860 * (L / 10.) / 32) * 32)
+        self.spec_take_first = 860 if self.spec_take_first > 860 else self.spec_take_first
+        greatesthit_meta = json.load(open(self.meta_path, 'r'))
+        unique_classes = sorted(list(set(ht for ht in greatesthit_meta['hit_type'])))
+        self.label2target = {label: target for target, label in enumerate(unique_classes)}
+        self.target2label = {target: label for label, target in self.label2target.items()}
+        self.video_idx2label = {
+            get_GH_data_identifier(greatesthit_meta['video_name'][i], greatesthit_meta['start_idx'][i]):
+            greatesthit_meta['hit_type'][i] for i in range(len(greatesthit_meta['video_name']))
+        }
+        self.available_video_hit = list(self.video_idx2label.keys())
+        self.video_idx2path = {
+            vh: os.path.join(self.specs_dir,
+                vh.replace('_', '_denoised_') + '_' + self.video_idx2label[vh].replace(' ', '_') +'_mel.npy')
+            for vh in self.available_video_hit
+        }
+        self.video_idx2idx = {
+            get_GH_data_identifier(greatesthit_meta['video_name'][i], greatesthit_meta['start_idx'][i]):
+            i for i in range(len(greatesthit_meta['video_name']))
+        }
+        split_clip_ids_path = os.path.join(splits_path, f'greatesthit_{split}.json')
+        if not os.path.exists(split_clip_ids_path):
+            raise NotImplementedError()
+        clip_video_hit = json.load(open(split_clip_ids_path, 'r'))
+        self.dataset = clip_video_hit
+        spec_crop_len = self.spec_take_first if self.spec_take_first <= spec_crop_len else spec_crop_len
+        self.spec_transforms = transforms.Compose([
+            CropImage([mel_num, spec_crop_len], random_crop),
+            # transforms.RandomApply([FrequencyMasking(freq_mask_param=20)], p=0),
+            # transforms.RandomApply([TimeMasking(time_mask_param=int(32 * self.L))], p=0)
+        ])
+        self.video2indexes = {}
+        for video_idx in self.dataset:
+            video, start_idx = video_idx.split('_')
+            if video not in self.video2indexes.keys():
+                self.video2indexes[video] = []
+            self.video2indexes[video].append(start_idx)
+        for video in self.video2indexes.keys():
+            if len(self.video2indexes[video]) == 1: # given video contains only one hit
+                self.dataset.remove(
+                    get_GH_data_identifier(video, self.video2indexes[video][0])
+                )
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        item = {}
+        video_idx = self.dataset[idx]
+        spec_path = self.video_idx2path[video_idx]
+        spec = np.load(spec_path) # (80, 860)
+        if self.rand_shift:
+            shift = random.uniform(0, 0.5)
+            spec_shift = int(shift * spec.shape[1] // 10)
+            # Since only the first second is used
+            spec = np.roll(spec, -spec_shift, 1)
+        # concat spec outside dataload
+        item['image'] = 2 * spec - 1 # (80, 860)
+        item['image'] = item['image'][:, :self.spec_take_first]
+        item['file_path'] = spec_path
+        item['label'] = self.video_idx2label[video_idx]
+        item['target'] = self.label2target[item['label']]
+        if self.spec_transforms is not None:
+            item = self.spec_transforms(item)
+        return item
+class GreatestHitSpecsTrain(GreatestHitSpecs):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class GreatestHitSpecsValidation(GreatestHitSpecs):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('val', **specs_dataset_cfg)
+class GreatestHitSpecsTest(GreatestHitSpecs):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)
+class GreatestHitWave(torch.utils.data.Dataset):
+    def __init__(self, split, wav_dir, random_crop, mel_num, spec_crop_len, spec_len,
+                L=2.0, splits_path='./data', rand_shift=True,
+                data_path='data/greatesthit/greatesthit-process-resized'):
+        super().__init__()
+        self.split = split
+        self.wav_dir = wav_dir
+        self.splits_path = splits_path
+        self.data_path = data_path
+        self.L = L
+        self.rand_shift = rand_shift
+        split_clip_ids_path = os.path.join(splits_path, f'greatesthit_{split}.json')
+        if not os.path.exists(split_clip_ids_path):
+            raise NotImplementedError()
+        clip_video_hit = json.load(open(split_clip_ids_path, 'r'))
+        video_name = list(set([vidx.split('_')[0] for vidx in clip_video_hit]))
+        self.video_frame_cnt = {v: len(os.listdir(os.path.join(self.data_path, v, 'frames'))) // 2 for v in video_name}
+        self.left_over = int(FPS * L + 1)
+        self.video_audio_path = {v: os.path.join(self.data_path, v, f'audio/{v}_denoised_resampled.wav') for v in video_name}
+        self.dataset = clip_video_hit
+        self.video2indexes = {}
+        for video_idx in self.dataset:
+            video, start_idx = video_idx.split('_')
+            if video not in self.video2indexes.keys():
+                self.video2indexes[video] = []
+            self.video2indexes[video].append(start_idx)
+        for video in self.video2indexes.keys():
+            if len(self.video2indexes[video]) == 1: # given video contains only one hit
+                self.dataset.remove(
+                    get_GH_data_identifier(video, self.video2indexes[video][0])
+                )
+        self.wav_transforms = transforms.Compose([
+            MakeMono(),
+            Padding(target_len=int(SR * self.L)),
+        ])
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        item = {}
+        video_idx = self.dataset[idx]
+        video, start_idx = video_idx.split('_')
+        start_idx = int(start_idx)
+        if self.rand_shift:
+            shift = int(random.uniform(-0.5, 0.5) * SR)
+            start_idx = non_negative(start_idx + shift)
+        wave_path = self.video_audio_path[video]
+        wav, sr = soundfile.read(wave_path, frames=int(SR * self.L), start=start_idx)
+        assert sr == SR
+        wav = self.wav_transforms(wav)
+        item['image'] = wav # (44100,)
+        # item['wav'] = wav
+        item['file_path_wav_'] = wave_path
+        item['label'] = 'None'
+        item['target'] = 'None'
+        return item
+class GreatestHitWaveTrain(GreatestHitWave):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class GreatestHitWaveValidation(GreatestHitWave):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('val', **specs_dataset_cfg)
+class GreatestHitWaveTest(GreatestHitWave):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)
+class CondGreatestHitSpecsCondOnImage(torch.utils.data.Dataset):
+    def __init__(self, split, specs_dir, spec_len, feat_len, feat_depth, feat_crop_len, random_crop, mel_num, spec_crop_len,
+                vqgan_L=10.0, L=1.0, rand_shift=False, spec_transforms=None, frame_transforms=None, splits_path='./data',
+                meta_path='./data/info_r2plus1d_dim1024_15fps.json', frame_path='data/greatesthit/greatesthit_processed',
+                p_outside_cond=0., p_audio_aug=0.5):
+        super().__init__()
+        self.split = split
+        self.specs_dir = specs_dir
+        self.spec_transforms = spec_transforms
+        self.frame_transforms = frame_transforms
+        self.splits_path = splits_path
+        self.meta_path = meta_path
+        self.frame_path = frame_path
+        self.feat_len = feat_len
+        self.feat_depth = feat_depth
+        self.feat_crop_len = feat_crop_len
+        self.spec_len = spec_len
+        self.rand_shift = rand_shift
+        self.L = L
+        self.spec_take_first = int(math.ceil(860 * (vqgan_L / 10.) / 32) * 32)
+        self.spec_take_first = 860 if self.spec_take_first > 860 else self.spec_take_first
+        self.p_outside_cond = torch.tensor(p_outside_cond)
+        greatesthit_meta = json.load(open(self.meta_path, 'r'))
+        unique_classes = sorted(list(set(ht for ht in greatesthit_meta['hit_type'])))
+        self.label2target = {label: target for target, label in enumerate(unique_classes)}
+        self.target2label = {target: label for label, target in self.label2target.items()}
+        self.video_idx2label = {
+            get_GH_data_identifier(greatesthit_meta['video_name'][i], greatesthit_meta['start_idx'][i]):
+            greatesthit_meta['hit_type'][i] for i in range(len(greatesthit_meta['video_name']))
+        }
+        self.available_video_hit = list(self.video_idx2label.keys())
+        self.video_idx2path = {
+            vh: os.path.join(self.specs_dir,
+                vh.replace('_', '_denoised_') + '_' + self.video_idx2label[vh].replace(' ', '_') +'_mel.npy')
+            for vh in self.available_video_hit
+        }
+        for value in self.video_idx2path.values():
+            assert os.path.exists(value)
+        self.video_idx2idx = {
+            get_GH_data_identifier(greatesthit_meta['video_name'][i], greatesthit_meta['start_idx'][i]):
+            i for i in range(len(greatesthit_meta['video_name']))
+        }
+        split_clip_ids_path = os.path.join(splits_path, f'greatesthit_{split}.json')
+        if not os.path.exists(split_clip_ids_path):
+            self.make_split_files()
+        clip_video_hit = json.load(open(split_clip_ids_path, 'r'))
+        self.dataset = clip_video_hit
+        spec_crop_len = self.spec_take_first if self.spec_take_first <= spec_crop_len else spec_crop_len
+        self.spec_transforms = transforms.Compose([
+            CropImage([mel_num, spec_crop_len], random_crop),
+            # transforms.RandomApply([FrequencyMasking(freq_mask_param=20)], p=p_audio_aug),
+            # transforms.RandomApply([TimeMasking(time_mask_param=int(32 * self.L))], p=p_audio_aug)
+        ])
+        if self.frame_transforms == None:
+            self.frame_transforms = transforms.Compose([
+                Resize3D(128),
+                RandomResizedCrop3D(112, scale=(0.5, 1.0)),
+                RandomHorizontalFlip3D(),
+                ColorJitter3D(brightness=0.1, saturation=0.1),
+                ToTensor3D(),
+                Normalize3D(mean=[0.485, 0.456, 0.406],
+                            std=[0.229, 0.224, 0.225]),
+            ])
+        self.video2indexes = {}
+        for video_idx in self.dataset:
+            video, start_idx = video_idx.split('_')
+            if video not in self.video2indexes.keys():
+                self.video2indexes[video] = []
+            self.video2indexes[video].append(start_idx)
+        for video in self.video2indexes.keys():
+            if len(self.video2indexes[video]) == 1: # given video contains only one hit
+                self.dataset.remove(
+                    get_GH_data_identifier(video, self.video2indexes[video][0])
+                )
+        clip_classes = [self.label2target[self.video_idx2label[vh]] for vh in clip_video_hit]
+        class2count = collections.Counter(clip_classes)
+        self.class_counts = torch.tensor([class2count[cls] for cls in range(len(class2count))])
+        if self.L != 1.0:
+            print(split, L)
+            self.validate_data()
+        self.video2indexes = {}
+        for video_idx in self.dataset:
+            video, start_idx = video_idx.split('_')
+            if video not in self.video2indexes.keys():
+                self.video2indexes[video] = []
+            self.video2indexes[video].append(start_idx)
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        item = {}
+        try:
+            video_idx = self.dataset[idx]
+            spec_path = self.video_idx2path[video_idx]
+            spec = np.load(spec_path) # (80, 860)
+            video, start_idx = video_idx.split('_')
+            frame_path = os.path.join(self.frame_path, video, 'frames')
+            start_frame_idx = non_negative(FPS * int(start_idx)/SR)
+            end_frame_idx = non_negative(start_frame_idx + FPS * self.L)
+            if self.rand_shift:
+                shift = random.uniform(0, 0.5)
+                spec_shift = int(shift * spec.shape[1] // 10)
+                # Since only the first second is used
+                spec = np.roll(spec, -spec_shift, 1)
+                start_frame_idx += int(FPS * shift)
+                end_frame_idx += int(FPS * shift)
+            frames = [Image.open(os.path.join(
+                frame_path, f'frame{i+1:0>6d}.jpg')).convert('RGB') for i in
+                range(start_frame_idx, end_frame_idx)]
+            # Sample condition
+            if torch.all(torch.bernoulli(self.p_outside_cond) == 1.):
+                # Sample condition from outside video
+                all_idx = set(list(range(len(self.dataset))))
+                all_idx.remove(idx)
+                cond_video_idx = self.dataset[sample(all_idx, k=1)[0]]
+                cond_video, cond_start_idx = cond_video_idx.split('_')
+            else:
+                cond_video = video
+                video_hits_idx = copy.copy(self.video2indexes[video])
+                video_hits_idx.remove(start_idx)
+                cond_start_idx = sample(video_hits_idx, k=1)[0]
+                cond_video_idx = get_GH_data_identifier(cond_video, cond_start_idx)
+            cond_spec_path = self.video_idx2path[cond_video_idx]
+            cond_spec = np.load(cond_spec_path) # (80, 860)
+            cond_video, cond_start_idx = cond_video_idx.split('_')
+            cond_frame_path = os.path.join(self.frame_path, cond_video, 'frames')
+            cond_start_frame_idx = non_negative(FPS * int(cond_start_idx)/SR)
+            cond_end_frame_idx = non_negative(cond_start_frame_idx + FPS * self.L)
+            if self.rand_shift:
+                cond_shift = random.uniform(0, 0.5)
+                cond_spec_shift = int(cond_shift * cond_spec.shape[1] // 10)
+                # Since only the first second is used
+                cond_spec = np.roll(cond_spec, -cond_spec_shift, 1)
+                cond_start_frame_idx += int(FPS * cond_shift)
+                cond_end_frame_idx += int(FPS * cond_shift)
+            cond_frames = [Image.open(os.path.join(
+                cond_frame_path, f'frame{i+1:0>6d}.jpg')).convert('RGB') for i in
+                range(cond_start_frame_idx, cond_end_frame_idx)]
+            # concat spec outside dataload
+            item['image'] = 2 * spec - 1 # (80, 860)
+            item['cond_image'] = 2 * cond_spec - 1 # (80, 860)
+            item['image'] = item['image'][:, :self.spec_take_first]
+            item['cond_image'] = item['cond_image'][:, :self.spec_take_first]
+            item['file_path_specs_'] = spec_path
+            item['file_path_cond_specs_'] = cond_spec_path
+            if self.frame_transforms is not None:
+                cond_frames = self.frame_transforms(cond_frames)
+                frames = self.frame_transforms(frames)
+            item['feature'] = np.stack(cond_frames + frames, axis=0) # (30 * L, 112, 112, 3)
+            item['file_path_feats_'] = (frame_path, start_frame_idx)
+            item['file_path_cond_feats_'] = (cond_frame_path, cond_start_frame_idx)
+            item['label'] = self.video_idx2label[video_idx]
+            item['target'] = self.label2target[item['label']]
+            if self.spec_transforms is not None:
+                item = self.spec_transforms(item)
+        except Exception:
+            print(sys.exc_info()[2])
+            print('!!!!!!!!!!!!!!!!!!!!', video_idx, cond_video_idx)
+            print('!!!!!!!!!!!!!!!!!!!!', end_frame_idx, cond_end_frame_idx)
+            exit(1)
+        return item
+    def validate_data(self):
+        original_len = len(self.dataset)
+        valid_dataset = []
+        for video_idx in tqdm(self.dataset):
+            video, start_idx = video_idx.split('_')
+            frame_path = os.path.join(self.frame_path, video, 'frames')
+            start_frame_idx = non_negative(FPS * int(start_idx)/SR)
+            end_frame_idx = non_negative(start_frame_idx + FPS * (self.L + 0.6))
+            if os.path.exists(os.path.join(frame_path, f'frame{end_frame_idx:0>6d}.jpg')):
+                valid_dataset.append(video_idx)
+            else:
+                self.video2indexes[video].remove(start_idx)
+        for video_idx in valid_dataset:
+            video, start_idx = video_idx.split('_')
+            if len(self.video2indexes[video]) == 1:
+                valid_dataset.remove(video_idx)
+        if original_len != len(valid_dataset):
+            print(f'Validated dataset with enough frames: {len(valid_dataset)}')
+        self.dataset = valid_dataset
+        split_clip_ids_path = os.path.join(self.splits_path, f'greatesthit_{self.split}_{self.L:.2f}.json')
+        if not os.path.exists(split_clip_ids_path):
+            with open(split_clip_ids_path, 'w') as f:
+                json.dump(valid_dataset, f)
+    def make_split_files(self, ratio=[0.85, 0.1, 0.05]):
+        random.seed(1337)
+        print(f'The split files do not exist @ {self.splits_path}. Calculating the new ones.')
+        # The downloaded videos (some went missing on YouTube and no longer available)
+        available_mel_paths = set(glob(os.path.join(self.specs_dir, '*_mel.npy')))
+        self.available_video_hit = [vh for vh in self.available_video_hit if self.video_idx2path[vh] in available_mel_paths]
+        all_video = list(self.video2indexes.keys())
+        print(f'The number of clips available after download: {len(self.available_video_hit)}')
+        print(f'The number of videos available after download: {len(all_video)}')
+        available_idx = list(range(len(all_video)))
+        random.shuffle(available_idx)
+        assert sum(ratio) == 1.
+        cut_train = int(ratio[0] * len(all_video))
+        cut_test = cut_train + int(ratio[1] * len(all_video))
+        train_idx = available_idx[:cut_train]
+        test_idx = available_idx[cut_train:cut_test]
+        valid_idx = available_idx[cut_test:]
+        train_video = [all_video[i] for i in train_idx]
+        test_video = [all_video[i] for i in test_idx]
+        valid_video = [all_video[i] for i in valid_idx]
+        train_video_hit = []
+        for v in train_video:
+            train_video_hit += [get_GH_data_identifier(v, hit_idx) for hit_idx in self.video2indexes[v]]
+        test_video_hit = []
+        for v in test_video:
+            test_video_hit += [get_GH_data_identifier(v, hit_idx) for hit_idx in self.video2indexes[v]]
+        valid_video_hit = []
+        for v in valid_video:
+            valid_video_hit += [get_GH_data_identifier(v, hit_idx) for hit_idx in self.video2indexes[v]]
+        # mix train and valid for better validation loss
+        mixed = train_video_hit + valid_video_hit
+        random.shuffle(mixed)
+        split = int(len(mixed) * ratio[0] / (ratio[0] + ratio[2]))
+        train_video_hit = mixed[:split]
+        valid_video_hit = mixed[split:]
+        with open(os.path.join(self.splits_path, 'greatesthit_train.json'), 'w') as train_file,\
+             open(os.path.join(self.splits_path, 'greatesthit_test.json'), 'w') as test_file,\
+             open(os.path.join(self.splits_path, 'greatesthit_valid.json'), 'w') as valid_file:
+            json.dump(train_video_hit, train_file)
+            json.dump(test_video_hit, test_file)
+            json.dump(valid_video_hit, valid_file)
+        print(f'Put {len(train_idx)} clips to the train set and saved it to ./data/greatesthit_train.json')
+        print(f'Put {len(test_idx)} clips to the test set and saved it to ./data/greatesthit_test.json')
+        print(f'Put {len(valid_idx)} clips to the valid set and saved it to ./data/greatesthit_valid.json')
+class CondGreatestHitSpecsCondOnImageTrain(CondGreatestHitSpecsCondOnImage):
+    def __init__(self, dataset_cfg):
+        train_transforms = transforms.Compose([
+            Resize3D(256),
+            RandomResizedCrop3D(224, scale=(0.5, 1.0)),
+            RandomHorizontalFlip3D(),
+            ColorJitter3D(brightness=0.1, saturation=0.1),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('train', frame_transforms=train_transforms, **dataset_cfg)
+class CondGreatestHitSpecsCondOnImageValidation(CondGreatestHitSpecsCondOnImage):
+    def __init__(self, dataset_cfg):
+        valid_transforms = transforms.Compose([
+            Resize3D(256),
+            CenterCrop3D(224),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('val', frame_transforms=valid_transforms, **dataset_cfg)
+class CondGreatestHitSpecsCondOnImageTest(CondGreatestHitSpecsCondOnImage):
+    def __init__(self, dataset_cfg):
+        test_transforms = transforms.Compose([
+            Resize3D(256),
+            CenterCrop3D(224),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('test', frame_transforms=test_transforms, **dataset_cfg)
+class CondGreatestHitWaveCondOnImage(torch.utils.data.Dataset):
+    def __init__(self, split, wav_dir, spec_len, random_crop, mel_num, spec_crop_len,
+                L=2.0, frame_transforms=None, splits_path='./data',
+                data_path='data/greatesthit/greatesthit-process-resized',
+                p_outside_cond=0., p_audio_aug=0.5, rand_shift=True):
+        super().__init__()
+        self.split = split
+        self.wav_dir = wav_dir
+        self.frame_transforms = frame_transforms
+        self.splits_path = splits_path
+        self.data_path = data_path
+        self.spec_len = spec_len
+        self.L = L
+        self.rand_shift = rand_shift
+        self.p_outside_cond = torch.tensor(p_outside_cond)
+        split_clip_ids_path = os.path.join(splits_path, f'greatesthit_{split}.json')
+        if not os.path.exists(split_clip_ids_path):
+            raise NotImplementedError()
+        clip_video_hit = json.load(open(split_clip_ids_path, 'r'))
+        video_name = list(set([vidx.split('_')[0] for vidx in clip_video_hit]))
+        self.video_frame_cnt = {v: len(os.listdir(os.path.join(self.data_path, v, 'frames')))//2 for v in video_name}
+        self.left_over = int(FPS * L + 1)
+        self.video_audio_path = {v: os.path.join(self.data_path, v, f'audio/{v}_denoised_resampled.wav') for v in video_name}
+        self.dataset = clip_video_hit
+        self.video2indexes = {}
+        for video_idx in self.dataset:
+            video, start_idx = video_idx.split('_')
+            if video not in self.video2indexes.keys():
+                self.video2indexes[video] = []
+            self.video2indexes[video].append(start_idx)
+        for video in self.video2indexes.keys():
+            if len(self.video2indexes[video]) == 1: # given video contains only one hit
+                self.dataset.remove(
+                    get_GH_data_identifier(video, self.video2indexes[video][0])
+                )
+        self.wav_transforms = transforms.Compose([
+            MakeMono(),
+            Padding(target_len=int(SR * self.L)),
+        ])
+        if self.frame_transforms == None:
+            self.frame_transforms = transforms.Compose([
+                Resize3D(256),
+                RandomResizedCrop3D(224, scale=(0.5, 1.0)),
+                RandomHorizontalFlip3D(),
+                ColorJitter3D(brightness=0.1, saturation=0.1),
+                ToTensor3D(),
+                Normalize3D(mean=[0.485, 0.456, 0.406],
+                            std=[0.229, 0.224, 0.225]),
+            ])
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        item = {}
+        video_idx = self.dataset[idx]
+        video, start_idx = video_idx.split('_')
+        start_idx = int(start_idx)
+        frame_path = os.path.join(self.data_path, video, 'frames')
+        start_frame_idx = non_negative(FPS * int(start_idx)/SR)
+        if self.rand_shift:
+            shift = random.uniform(-0.5, 0.5)
+            start_frame_idx = non_negative(start_frame_idx + int(FPS * shift))
+            start_idx = non_negative(start_idx + int(SR * shift))
+        if start_frame_idx > self.video_frame_cnt[video] - self.left_over:
+            start_frame_idx = self.video_frame_cnt[video] - self.left_over
+            start_idx = non_negative(SR * (start_frame_idx / FPS))
+        end_frame_idx = non_negative(start_frame_idx + FPS * self.L)
+        # target
+        wave_path = self.video_audio_path[video]
+        frames = [Image.open(os.path.join(
+            frame_path, f'frame{i+1:0>6d}')).convert('RGB') for i in
+            range(start_frame_idx, end_frame_idx)]
+        wav, sr = soundfile.read(wave_path, frames=int(SR * self.L), start=start_idx)
+        assert sr == SR
+        wav = self.wav_transforms(wav)
+        # cond
+        if torch.all(torch.bernoulli(self.p_outside_cond) == 1.):
+            all_idx = set(list(range(len(self.dataset))))
+            all_idx.remove(idx)
+            cond_video_idx = self.dataset[sample(all_idx, k=1)[0]]
+            cond_video, cond_start_idx = cond_video_idx.split('_')
+        else:
+            cond_video = video
+            video_hits_idx = copy.copy(self.video2indexes[video])
+            if str(start_idx) in video_hits_idx:
+                video_hits_idx.remove(str(start_idx))
+            cond_start_idx = sample(video_hits_idx, k=1)[0]
+            cond_video_idx = get_GH_data_identifier(cond_video, cond_start_idx)
+        cond_video, cond_start_idx = cond_video_idx.split('_')
+        cond_start_idx = int(cond_start_idx)
+        cond_frame_path = os.path.join(self.data_path, cond_video, 'frames')
+        cond_start_frame_idx = non_negative(FPS * int(cond_start_idx)/SR)
+        cond_wave_path = self.video_audio_path[cond_video]
+        if self.rand_shift:
+            cond_shift = random.uniform(-0.5, 0.5)
+            cond_start_frame_idx = non_negative(cond_start_frame_idx + int(FPS * cond_shift))
+            cond_start_idx = non_negative(cond_start_idx + int(shift * SR))
+        if cond_start_frame_idx > self.video_frame_cnt[cond_video] - self.left_over:
+            cond_start_frame_idx = self.video_frame_cnt[cond_video] - self.left_over
+            cond_start_idx = non_negative(SR * (cond_start_frame_idx / FPS))
+        cond_end_frame_idx = non_negative(cond_start_frame_idx + FPS * self.L)
+        cond_frames = [Image.open(os.path.join(
+                cond_frame_path, f'frame{i+1:0>6d}')).convert('RGB') for i in
+                range(cond_start_frame_idx, cond_end_frame_idx)]
+        cond_wav, _ = soundfile.read(cond_wave_path, frames=int(SR * self.L), start=cond_start_idx)
+        cond_wav = self.wav_transforms(cond_wav)
+        item['image'] = wav # (44100,)
+        item['cond_image'] = cond_wav # (44100,)
+        item['file_path_wav_'] = wave_path
+        item['file_path_cond_wav_'] = cond_wave_path
+        if self.frame_transforms is not None:
+            cond_frames = self.frame_transforms(cond_frames)
+            frames = self.frame_transforms(frames)
+        item['feature'] = np.stack(cond_frames + frames, axis=0) # (30 * L, 112, 112, 3)
+        item['file_path_feats_'] = (frame_path, start_idx)
+        item['file_path_cond_feats_'] = (cond_frame_path, cond_start_idx)
+        item['label'] = 'None'
+        item['target'] = 'None'
+        return item
+    def validate_data(self):
+        raise NotImplementedError()
+    def make_split_files(self, ratio=[0.85, 0.1, 0.05]):
+        random.seed(1337)
+        print(f'The split files do not exist @ {self.splits_path}. Calculating the new ones.')
+        all_video = sorted(os.listdir(self.data_path))
+        print(f'The number of videos available after download: {len(all_video)}')
+        available_idx = list(range(len(all_video)))
+        random.shuffle(available_idx)
+        assert sum(ratio) == 1.
+        cut_train = int(ratio[0] * len(all_video))
+        cut_test = cut_train + int(ratio[1] * len(all_video))
+        train_idx = available_idx[:cut_train]
+        test_idx = available_idx[cut_train:cut_test]
+        valid_idx = available_idx[cut_test:]
+        train_video = [all_video[i] for i in train_idx]
+        test_video = [all_video[i] for i in test_idx]
+        valid_video = [all_video[i] for i in valid_idx]
+        with open(os.path.join(self.splits_path, 'greatesthit_video_train.json'), 'w') as train_file,\
+             open(os.path.join(self.splits_path, 'greatesthit_video_test.json'), 'w') as test_file,\
+             open(os.path.join(self.splits_path, 'greatesthit_video_valid.json'), 'w') as valid_file:
+            json.dump(train_video, train_file)
+            json.dump(test_video, test_file)
+            json.dump(valid_video, valid_file)
+        print(f'Put {len(train_idx)} videos to the train set and saved it to ./data/greatesthit_video_train.json')
+        print(f'Put {len(test_idx)} videos to the test set and saved it to ./data/greatesthit_video_test.json')
+        print(f'Put {len(valid_idx)} videos to the valid set and saved it to ./data/greatesthit_video_valid.json')
+class CondGreatestHitWaveCondOnImageTrain(CondGreatestHitWaveCondOnImage):
+    def __init__(self, dataset_cfg):
+        train_transforms = transforms.Compose([
+            Resize3D(128),
+            RandomResizedCrop3D(112, scale=(0.5, 1.0)),
+            RandomHorizontalFlip3D(),
+            ColorJitter3D(brightness=0.4, saturation=0.4, contrast=0.2, hue=0.1),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('train', frame_transforms=train_transforms, **dataset_cfg)
+class CondGreatestHitWaveCondOnImageValidation(CondGreatestHitWaveCondOnImage):
+    def __init__(self, dataset_cfg):
+        valid_transforms = transforms.Compose([
+            Resize3D(128),
+            CenterCrop3D(112),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('val', frame_transforms=valid_transforms, **dataset_cfg)
+class CondGreatestHitWaveCondOnImageTest(CondGreatestHitWaveCondOnImage):
+    def __init__(self, dataset_cfg):
+        test_transforms = transforms.Compose([
+            Resize3D(128),
+            CenterCrop3D(112),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('test', frame_transforms=test_transforms, **dataset_cfg)
+class GreatestHitWaveCondOnImage(torch.utils.data.Dataset):
+    def __init__(self, split, wav_dir, spec_len, random_crop, mel_num, spec_crop_len,
+                L=2.0, frame_transforms=None, splits_path='./data',
+                data_path='data/greatesthit/greatesthit-process-resized',
+                p_outside_cond=0., p_audio_aug=0.5, rand_shift=True):
+        super().__init__()
+        self.split = split
+        self.wav_dir = wav_dir
+        self.frame_transforms = frame_transforms
+        self.splits_path = splits_path
+        self.data_path = data_path
+        self.spec_len = spec_len
+        self.L = L
+        self.rand_shift = rand_shift
+        self.p_outside_cond = torch.tensor(p_outside_cond)
+        split_clip_ids_path = os.path.join(splits_path, f'greatesthit_{split}.json')
+        if not os.path.exists(split_clip_ids_path):
+            raise NotImplementedError()
+        clip_video_hit = json.load(open(split_clip_ids_path, 'r'))
+        video_name = list(set([vidx.split('_')[0] for vidx in clip_video_hit]))
+        self.video_frame_cnt = {v: len(os.listdir(os.path.join(self.data_path, v, 'frames')))//2 for v in video_name}
+        self.left_over = int(FPS * L + 1)
+        self.video_audio_path = {v: os.path.join(self.data_path, v, f'audio/{v}_denoised_resampled.wav') for v in video_name}
+        self.dataset = clip_video_hit
+        self.video2indexes = {}
+        for video_idx in self.dataset:
+            video, start_idx = video_idx.split('_')
+            if video not in self.video2indexes.keys():
+                self.video2indexes[video] = []
+            self.video2indexes[video].append(start_idx)
+        for video in self.video2indexes.keys():
+            if len(self.video2indexes[video]) == 1: # given video contains only one hit
+                self.dataset.remove(
+                    get_GH_data_identifier(video, self.video2indexes[video][0])
+                )
+        self.wav_transforms = transforms.Compose([
+            MakeMono(),
+            Padding(target_len=int(SR * self.L)),
+        ])
+        if self.frame_transforms == None:
+            self.frame_transforms = transforms.Compose([
+                Resize3D(256),
+                RandomResizedCrop3D(224, scale=(0.5, 1.0)),
+                RandomHorizontalFlip3D(),
+                ColorJitter3D(brightness=0.1, saturation=0.1),
+                ToTensor3D(),
+                Normalize3D(mean=[0.485, 0.456, 0.406],
+                            std=[0.229, 0.224, 0.225]),
+            ])
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        item = {}
+        video_idx = self.dataset[idx]
+        video, start_idx = video_idx.split('_')
+        start_idx = int(start_idx)
+        frame_path = os.path.join(self.data_path, video, 'frames')
+        start_frame_idx = non_negative(FPS * int(start_idx)/SR)
+        if self.rand_shift:
+            shift = random.uniform(-0.5, 0.5)
+            start_frame_idx = non_negative(start_frame_idx + int(FPS * shift))
+            start_idx = non_negative(start_idx + int(SR * shift))
+        if start_frame_idx > self.video_frame_cnt[video] - self.left_over:
+            start_frame_idx = self.video_frame_cnt[video] - self.left_over
+            start_idx = non_negative(SR * (start_frame_idx / FPS))
+        end_frame_idx = non_negative(start_frame_idx + FPS * self.L)
+        # target
+        wave_path = self.video_audio_path[video]
+        frames = [Image.open(os.path.join(
+            frame_path, f'frame{i+1:0>6d}')).convert('RGB') for i in
+            range(start_frame_idx, end_frame_idx)]
+        wav, sr = soundfile.read(wave_path, frames=int(SR * self.L), start=start_idx)
+        assert sr == SR
+        wav = self.wav_transforms(wav)
+        item['image'] = wav # (44100,)
+        item['file_path_wav_'] = wave_path
+        if self.frame_transforms is not None:
+            frames = self.frame_transforms(frames)
+        item['feature'] = torch.stack(frames, dim=0) # (15 * L, 112, 112, 3)
+        item['file_path_feats_'] = (frame_path, start_idx)
+        item['label'] = 'None'
+        item['target'] = 'None'
+        return item
+    def validate_data(self):
+        raise NotImplementedError()
+    def make_split_files(self, ratio=[0.85, 0.1, 0.05]):
+        random.seed(1337)
+        print(f'The split files do not exist @ {self.splits_path}. Calculating the new ones.')
+        all_video = sorted(os.listdir(self.data_path))
+        print(f'The number of videos available after download: {len(all_video)}')
+        available_idx = list(range(len(all_video)))
+        random.shuffle(available_idx)
+        assert sum(ratio) == 1.
+        cut_train = int(ratio[0] * len(all_video))
+        cut_test = cut_train + int(ratio[1] * len(all_video))
+        train_idx = available_idx[:cut_train]
+        test_idx = available_idx[cut_train:cut_test]
+        valid_idx = available_idx[cut_test:]
+        train_video = [all_video[i] for i in train_idx]
+        test_video = [all_video[i] for i in test_idx]
+        valid_video = [all_video[i] for i in valid_idx]
+        with open(os.path.join(self.splits_path, 'greatesthit_video_train.json'), 'w') as train_file,\
+             open(os.path.join(self.splits_path, 'greatesthit_video_test.json'), 'w') as test_file,\
+             open(os.path.join(self.splits_path, 'greatesthit_video_valid.json'), 'w') as valid_file:
+            json.dump(train_video, train_file)
+            json.dump(test_video, test_file)
+            json.dump(valid_video, valid_file)
+        print(f'Put {len(train_idx)} videos to the train set and saved it to ./data/greatesthit_video_train.json')
+        print(f'Put {len(test_idx)} videos to the test set and saved it to ./data/greatesthit_video_test.json')
+        print(f'Put {len(valid_idx)} videos to the valid set and saved it to ./data/greatesthit_video_valid.json')
+class GreatestHitWaveCondOnImageTrain(GreatestHitWaveCondOnImage):
+    def __init__(self, dataset_cfg):
+        train_transforms = transforms.Compose([
+            Resize3D(128),
+            RandomResizedCrop3D(112, scale=(0.5, 1.0)),
+            RandomHorizontalFlip3D(),
+            ColorJitter3D(brightness=0.4, saturation=0.4, contrast=0.2, hue=0.1),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('train', frame_transforms=train_transforms, **dataset_cfg)
+class GreatestHitWaveCondOnImageValidation(GreatestHitWaveCondOnImage):
+    def __init__(self, dataset_cfg):
+        valid_transforms = transforms.Compose([
+            Resize3D(128),
+            CenterCrop3D(112),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('val', frame_transforms=valid_transforms, **dataset_cfg)
+class GreatestHitWaveCondOnImageTest(GreatestHitWaveCondOnImage):
+    def __init__(self, dataset_cfg):
+        test_transforms = transforms.Compose([
+            Resize3D(128),
+            CenterCrop3D(112),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('test', frame_transforms=test_transforms, **dataset_cfg)
+def draw_spec(spec, dest, cmap='magma'):
+    plt.imshow(spec, cmap=cmap, origin='lower')
+    plt.axis('off')
+    plt.savefig(dest, bbox_inches='tight', pad_inches=0., dpi=300)
+    plt.close()
+if __name__ == '__main__':
+    import sys
+    from omegaconf import OmegaConf
+    # cfg = OmegaConf.load('configs/greatesthit_transformer_with_vNet_randshift_2s_GH_vqgan_no_earlystop.yaml')
+    cfg = OmegaConf.load('configs/greatesthit_codebook.yaml')
+    data = instantiate_from_config(cfg.data)
+    data.prepare_data()
+    data.setup()
+    print(len(data.datasets['train']))
+    print(data.datasets['train'][24])

foleycrafter/models/specvqgan/data/impactset.py ADDED Viewed

	@@ -0,0 +1,778 @@

+import json
+import os
+import matplotlib.pyplot as plt
+import torch
+from torchvision import transforms
+import numpy as np
+from tqdm import tqdm
+from random import sample
+import torchaudio
+import logging
+from glob import glob
+import sys
+import soundfile
+import copy
+import csv
+import noisereduce as nr
+sys.path.insert(0, '.')  # nopep8
+from train import instantiate_from_config
+from foleycrafter.models.specvqgan.data.transforms import *
+# Select the torchaudio I/O backend at import time.
+# NOTE(review): set_audio_backend is deprecated in recent torchaudio
+# releases -- confirm against the pinned version.
+torchaudio.set_audio_backend("sox_io")
+logger = logging.getLogger(f'main.{__name__}')
+# Audio sample rate; used to size reads (int(SR * L)) and passed as sr= to the mel transform.
+SR = 22050
+# Video frame rate; used to convert a frame index into seconds.
+FPS = 15
+# NOTE(review): presumably the cap on resampling attempts per item; the
+# visible sampling loops use a literal 10 instead -- confirm intent.
+MAX_SAMPLE_ITER = 10
+def non_negative(x): return int(np.round(max(0, x), 0))
+def rms(x): return np.sqrt(np.mean(x**2))
+def get_GH_data_identifier(video_name, start_idx, split='_'):
+    if isinstance(start_idx, str):
+        return video_name + split + start_idx
+    elif isinstance(start_idx, int):
+        return video_name + split + str(start_idx)
+    else:
+        raise NotImplementedError
+def draw_spec(spec, dest, cmap='magma'):
+    plt.imshow(spec, cmap=cmap, origin='lower')
+    plt.axis('off')
+    plt.savefig(dest, bbox_inches='tight', pad_inches=0., dpi=300)
+    plt.close()
+def convert_to_decibel(arr):
+    ref = 1
+    return 20 * np.log10(abs(arr + 1e-4) / ref)
+class ResampleFrames(object):
+    def __init__(self, feat_sample_size, times_to_repeat_after_resample=None):
+        self.feat_sample_size = feat_sample_size
+        self.times_to_repeat_after_resample = times_to_repeat_after_resample
+    def __call__(self, item):
+        feat_len = item['feature'].shape[0]
+        ## resample
+        assert feat_len >= self.feat_sample_size
+        # evenly spaced points (abcdefghkl -> aoooofoooo)
+        idx = np.linspace(0, feat_len, self.feat_sample_size, dtype=np.int, endpoint=False)
+        # xoooo xoooo -> ooxoo ooxoo
+        shift = feat_len // (self.feat_sample_size + 1)
+        idx = idx + shift
+        ## repeat after resampling (abc -> aaaabbbbcccc)
+        if self.times_to_repeat_after_resample is not None and self.times_to_repeat_after_resample > 1:
+            idx = np.repeat(idx, self.times_to_repeat_after_resample)
+        item['feature'] = item['feature'][idx, :]
+        return item
+class ImpactSetWave(torch.utils.data.Dataset):
+    def __init__(self, split, random_crop, mel_num, spec_crop_len,
+                L=2.0, denoise=False, splits_path='./data',
+                data_path='data/ImpactSet/impactset-proccess-resize'):
+        super().__init__()
+        self.split = split
+        self.splits_path = splits_path
+        self.data_path = data_path
+        self.L = L
+        self.denoise = denoise
+        video_name_split_path = os.path.join(splits_path, f'countixAV_{split}.json')
+        if not os.path.exists(video_name_split_path):
+            self.make_split_files()
+        video_name = json.load(open(video_name_split_path, 'r'))
+        self.video_frame_cnt = {v: len(os.listdir(os.path.join(self.data_path, v, 'frames'))) for v in video_name}
+        self.left_over = int(FPS * L + 1)
+        self.video_audio_path = {v: os.path.join(self.data_path, v, f'audio/{v}_resampled.wav') for v in video_name}
+        self.dataset = video_name
+        self.wav_transforms = transforms.Compose([
+            MakeMono(),
+            Padding(target_len=int(SR * self.L)),
+        ])
+        self.spec_transforms = CropImage([mel_num, spec_crop_len], random_crop)
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        item = {}
+        video = self.dataset[idx]
+        available_frame_idx = self.video_frame_cnt[video] - self.left_over
+        wav = None
+        spec = None
+        max_db = -np.inf
+        wave_path = ''
+        cur_wave_path = self.video_audio_path[video]
+        if self.denoise:
+            cur_wave_path = cur_wave_path.replace('.wav', '_denoised.wav')
+        for _ in range(10):
+            start_idx = torch.randint(0, available_frame_idx, (1,)).tolist()[0]
+            # target
+            start_t = (start_idx + 0.5) / FPS
+            start_audio_idx = non_negative(start_t * SR)
+            cur_wav, _ = soundfile.read(cur_wave_path, frames=int(SR * self.L), start=start_audio_idx)
+            decibel = convert_to_decibel(cur_wav)
+            if float(np.mean(decibel)) > max_db:
+                wav = cur_wav
+                wave_path = cur_wave_path
+                max_db = float(np.mean(decibel))
+            if max_db >= -40:
+                break
+        # print(max_db)
+        wav = self.wav_transforms(wav)
+        item['image'] = wav # (80, 173)
+        # item['wav'] = wav
+        item['file_path_wav_'] = wave_path
+        item['label'] = 'None'
+        item['target'] = 'None'
+        return item
+    def make_split_files(self):
+        raise NotImplementedError
+class ImpactSetWaveTrain(ImpactSetWave):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class ImpactSetWaveValidation(ImpactSetWave):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('val', **specs_dataset_cfg)
+class ImpactSetWaveTest(ImpactSetWave):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)
+class ImpactSetSpec(torch.utils.data.Dataset):
+    def __init__(self, split, random_crop, mel_num, spec_crop_len,
+                L=2.0, denoise=False, splits_path='./data',
+                data_path='data/ImpactSet/impactset-proccess-resize'):
+        super().__init__()
+        self.split = split
+        self.splits_path = splits_path
+        self.data_path = data_path
+        self.L = L
+        self.denoise = denoise
+        video_name_split_path = os.path.join(splits_path, f'countixAV_{split}.json')
+        if not os.path.exists(video_name_split_path):
+            self.make_split_files()
+        video_name = json.load(open(video_name_split_path, 'r'))
+        self.video_frame_cnt = {v: len(os.listdir(os.path.join(self.data_path, v, 'frames'))) for v in video_name}
+        self.left_over = int(FPS * L + 1)
+        self.video_audio_path = {v: os.path.join(self.data_path, v, f'audio/{v}_resampled.wav') for v in video_name}
+        self.dataset = video_name
+        self.wav_transforms = transforms.Compose([
+            MakeMono(),
+            SpectrogramTorchAudio(nfft=1024, hoplen=1024//4, spec_power=1),
+            MelScaleTorchAudio(sr=SR, stft=513, fmin=125, fmax=7600, nmels=80),
+            LowerThresh(1e-5),
+            Log10(),
+            Multiply(20),
+            Subtract(20),
+            Add(100),
+            Divide(100),
+            Clip(0, 1.0),
+            TrimSpec(173),
+        ])
+        self.spec_transforms = CropImage([mel_num, spec_crop_len], random_crop)
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        item = {}
+        video = self.dataset[idx]
+        available_frame_idx = self.video_frame_cnt[video] - self.left_over
+        wav = None
+        spec = None
+        max_rms = -np.inf
+        wave_path = ''
+        cur_wave_path = self.video_audio_path[video]
+        if self.denoise:
+            cur_wave_path = cur_wave_path.replace('.wav', '_denoised.wav')
+        for _ in range(10):
+            start_idx = torch.randint(0, available_frame_idx, (1,)).tolist()[0]
+            # target
+            start_t = (start_idx + 0.5) / FPS
+            start_audio_idx = non_negative(start_t * SR)
+            cur_wav, _ = soundfile.read(cur_wave_path, frames=int(SR * self.L), start=start_audio_idx)
+            if self.wav_transforms is not None:
+                spec_tensor = self.wav_transforms(torch.tensor(cur_wav).float())
+                cur_spec = spec_tensor.numpy()
+            # zeros padding if not enough spec t steps
+            if cur_spec.shape[1] < 173:
+                pad = np.zeros((80, 173), dtype=cur_spec.dtype)
+                pad[:, :cur_spec.shape[1]] = cur_spec
+                cur_spec = pad
+            rms_val = rms(cur_spec)
+            if rms_val > max_rms:
+                wav = cur_wav
+                spec = cur_spec
+                wave_path = cur_wave_path
+                max_rms = rms_val
+            # print(rms_val)
+            if max_rms >= 0.1:
+                break
+        item['image'] = 2 * spec - 1 # (80, 173)
+        # item['wav'] = wav
+        item['file_path_wav_'] = wave_path
+        item['label'] = 'None'
+        item['target'] = 'None'
+        if self.spec_transforms is not None:
+            item = self.spec_transforms(item)
+        return item
+    def make_split_files(self):
+        raise NotImplementedError
+class ImpactSetSpecTrain(ImpactSetSpec):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class ImpactSetSpecValidation(ImpactSetSpec):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('val', **specs_dataset_cfg)
+class ImpactSetSpecTest(ImpactSetSpec):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)
+class ImpactSetWaveTestTime(torch.utils.data.Dataset):
+    def __init__(self, split, random_crop, mel_num, spec_crop_len,
+                L=2.0, denoise=False, splits_path='./data',
+                data_path='data/ImpactSet/impactset-proccess-resize'):
+        super().__init__()
+        self.split = split
+        self.splits_path = splits_path
+        self.data_path = data_path
+        self.L = L
+        self.denoise = denoise
+        self.video_list = glob('data/ImpactSet/RawVideos/StockVideo_sound/*.wav') + [
+            'data/ImpactSet/RawVideos/YouTube-impact-ccl/1_ckbCU5aQs/1_ckbCU5aQs_0013_0016_resize.wav',
+            'data/ImpactSet/RawVideos/YouTube-impact-ccl/GFmuVBiwz6k/GFmuVBiwz6k_0034_0054_resize.wav',
+            'data/ImpactSet/RawVideos/YouTube-impact-ccl/OsPcY316h1M/OsPcY316h1M_0000_0005_resize.wav',
+            'data/ImpactSet/RawVideos/YouTube-impact-ccl/SExIpBIBj_k/SExIpBIBj_k_0009_0019_resize.wav',
+            'data/ImpactSet/RawVideos/YouTube-impact-ccl/S6TkbV4B4QI/S6TkbV4B4QI_0028_0036_resize.wav',
+            'data/ImpactSet/RawVideos/YouTube-impact-ccl/2Ld24pPIn3k/2Ld24pPIn3k_0005_0011_resize.wav',
+            'data/ImpactSet/RawVideos/YouTube-impact-ccl/6d1YS7fdBK4/6d1YS7fdBK4_0007_0019_resize.wav',
+            'data/ImpactSet/RawVideos/YouTube-impact-ccl/JnBsmJgEkiw/JnBsmJgEkiw_0008_0016_resize.wav',
+            'data/ImpactSet/RawVideos/YouTube-impact-ccl/xcUyiXt0gjo/xcUyiXt0gjo_0015_0021_resize.wav',
+            'data/ImpactSet/RawVideos/YouTube-impact-ccl/4DRFJnZjpMM/4DRFJnZjpMM_0000_0010_resize.wav'
+        ] + glob('data/ImpactSet/RawVideos/self_recorded/*_resize.wav')
+        self.wav_transforms = transforms.Compose([
+            MakeMono(),
+            SpectrogramTorchAudio(nfft=1024, hoplen=1024//4, spec_power=1),
+            MelScaleTorchAudio(sr=SR, stft=513, fmin=125, fmax=7600, nmels=80),
+            LowerThresh(1e-5),
+            Log10(),
+            Multiply(20),
+            Subtract(20),
+            Add(100),
+            Divide(100),
+            Clip(0, 1.0),
+            TrimSpec(173),
+        ])
+        self.spec_transforms = CropImage([mel_num, spec_crop_len], random_crop)
+    def __len__(self):
+        return len(self.video_list)
+    def __getitem__(self, idx):
+        item = {}
+        wave_path = self.video_list[idx]
+        wav, _ = soundfile.read(wave_path)
+        start_idx = random.randint(0, min(4, wav.shape[0] - int(SR * self.L)))
+        wav = wav[start_idx:start_idx+int(SR * self.L)]
+        if self.denoise:
+            if len(wav.shape) == 1:
+                wav = wav[None, :]
+            wav = nr.reduce_noise(y=wav, sr=SR, n_fft=1024, hop_length=1024//4)
+            wav = wav.squeeze()
+        if self.wav_transforms is not None:
+            spec_tensor = self.wav_transforms(torch.tensor(wav).float())
+            spec = spec_tensor.numpy()
+        if spec.shape[1] < 173:
+            pad = np.zeros((80, 173), dtype=spec.dtype)
+            pad[:, :spec.shape[1]] = spec
+            spec = pad
+        item['image'] = 2 * spec - 1 # (80, 173)
+        # item['wav'] = wav
+        item['file_path_wav_'] = wave_path
+        item['label'] = 'None'
+        item['target'] = 'None'
+        if self.spec_transforms is not None:
+            item = self.spec_transforms(item)
+        return item
+    def make_split_files(self):
+        raise NotImplementedError
+class ImpactSetWaveTestTimeTrain(ImpactSetWaveTestTime):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class ImpactSetWaveTestTimeValidation(ImpactSetWaveTestTime):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('val', **specs_dataset_cfg)
+class ImpactSetWaveTestTimeTest(ImpactSetWaveTestTime):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)
+class ImpactSetWaveWithSilent(torch.utils.data.Dataset):
+    def __init__(self, split, random_crop, mel_num, spec_crop_len,
+                L=2.0, denoise=False, splits_path='./data',
+                data_path='data/ImpactSet/impactset-proccess-resize'):
+        super().__init__()
+        self.split = split
+        self.splits_path = splits_path
+        self.data_path = data_path
+        self.L = L
+        self.denoise = denoise
+        video_name_split_path = os.path.join(splits_path, f'countixAV_{split}.json')
+        if not os.path.exists(video_name_split_path):
+            self.make_split_files()
+        video_name = json.load(open(video_name_split_path, 'r'))
+        self.video_frame_cnt = {v: len(os.listdir(os.path.join(self.data_path, v, 'frames'))) for v in video_name}
+        self.left_over = int(FPS * L + 1)
+        self.video_audio_path = {v: os.path.join(self.data_path, v, f'audio/{v}_resampled.wav') for v in video_name}
+        self.dataset = video_name
+        self.wav_transforms = transforms.Compose([
+            MakeMono(),
+            Padding(target_len=int(SR * self.L)),
+        ])
+        self.spec_transforms = CropImage([mel_num, spec_crop_len], random_crop)
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        item = {}
+        video = self.dataset[idx]
+        available_frame_idx = self.video_frame_cnt[video] - self.left_over
+        wave_path = self.video_audio_path[video]
+        if self.denoise:
+            wave_path = wave_path.replace('.wav', '_denoised.wav')
+        start_idx = torch.randint(0, available_frame_idx, (1,)).tolist()[0]
+        # target
+        start_t = (start_idx + 0.5) / FPS
+        start_audio_idx = non_negative(start_t * SR)
+        wav, _ = soundfile.read(wave_path, frames=int(SR * self.L), start=start_audio_idx)
+        wav = self.wav_transforms(wav)
+        item['image'] = wav # (44100,)
+        # item['wav'] = wav
+        item['file_path_wav_'] = wave_path
+        item['label'] = 'None'
+        item['target'] = 'None'
+        return item
+    def make_split_files(self):
+        raise NotImplementedError
+class ImpactSetWaveWithSilentTrain(ImpactSetWaveWithSilent):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('train', **specs_dataset_cfg)
+class ImpactSetWaveWithSilentValidation(ImpactSetWaveWithSilent):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('val', **specs_dataset_cfg)
+class ImpactSetWaveWithSilentTest(ImpactSetWaveWithSilent):
+    def __init__(self, specs_dataset_cfg):
+        super().__init__('test', **specs_dataset_cfg)
+class ImpactSetWaveCondOnImage(torch.utils.data.Dataset):
+    def __init__(self, split,
+                L=2.0, frame_transforms=None, denoise=False, splits_path='./data',
+                data_path='data/ImpactSet/impactset-proccess-resize',
+                p_outside_cond=0.):
+        super().__init__()
+        self.split = split
+        self.splits_path = splits_path
+        self.frame_transforms = frame_transforms
+        self.data_path = data_path
+        self.L = L
+        self.denoise = denoise
+        self.p_outside_cond = torch.tensor(p_outside_cond)
+        video_name_split_path = os.path.join(splits_path, f'countixAV_{split}.json')
+        if not os.path.exists(video_name_split_path):
+            self.make_split_files()
+        video_name = json.load(open(video_name_split_path, 'r'))
+        self.video_frame_cnt = {v: len(os.listdir(os.path.join(self.data_path, v, 'frames'))) for v in video_name}
+        self.left_over = int(FPS * L + 1)
+        for v, cnt in self.video_frame_cnt.items():
+            if cnt - (3*self.left_over) <= 0:
+                video_name.remove(v)
+        self.video_audio_path = {v: os.path.join(self.data_path, v, f'audio/{v}_resampled.wav') for v in video_name}
+        self.dataset = video_name
+        video_timing_split_path = os.path.join(splits_path, f'countixAV_{split}_timing.json')
+        self.video_timing = json.load(open(video_timing_split_path, 'r'))
+        self.video_timing = {v: [int(float(t) * FPS) for t in ts] for v, ts in self.video_timing.items()}
+        if split != 'test':
+            video_class_path = os.path.join(splits_path, f'countixAV_{split}_class.json')
+            if not os.path.exists(video_class_path):
+                self.make_video_class()
+            self.video_class = json.load(open(video_class_path, 'r'))
+            self.class2video = {}
+            for v, c in self.video_class.items():
+                if c not in self.class2video.keys():
+                    self.class2video[c] = []
+                self.class2video[c].append(v)
+        self.wav_transforms = transforms.Compose([
+            MakeMono(),
+            Padding(target_len=int(SR * self.L)),
+        ])
+        if self.frame_transforms == None:
+            self.frame_transforms = transforms.Compose([
+                Resize3D(128),
+                RandomResizedCrop3D(112, scale=(0.5, 1.0)),
+                RandomHorizontalFlip3D(),
+                ColorJitter3D(brightness=0.1, saturation=0.1),
+                ToTensor3D(),
+                Normalize3D(mean=[0.485, 0.456, 0.406],
+                            std=[0.229, 0.224, 0.225]),
+            ])
+    def make_video_class(self):
+        meta_path = f'data/ImpactSet/data-info/CountixAV_{self.split}.csv'
+        video_class = {}
+        with open(meta_path, 'r') as f:
+            reader = csv.reader(f)
+            for i, row in enumerate(reader):
+                if i == 0:
+                    continue
+                vid, k_st, k_et = row[:3]
+                video_name = f'{vid}_{int(k_st):0>4d}_{int(k_et):0>4d}'
+                if video_name not in self.dataset:
+                    continue
+                video_class[video_name] = row[-1]
+        with open(os.path.join(self.splits_path, f'countixAV_{self.split}_class.json'), 'w') as f:
+            json.dump(video_class, f)
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        item = {}
+        video = self.dataset[idx]
+        available_frame_idx = self.video_frame_cnt[video] - self.left_over
+        rep_start_idx, rep_end_idx = self.video_timing[video]
+        rep_end_idx = min(available_frame_idx, rep_end_idx)
+        if available_frame_idx <= rep_start_idx + self.L * FPS:
+            idx_set = list(range(0, available_frame_idx))
+        else:
+            idx_set = list(range(rep_start_idx, rep_end_idx))
+        start_idx = sample(idx_set, k=1)[0]
+        wave_path = self.video_audio_path[video]
+        if self.denoise:
+            wave_path = wave_path.replace('.wav', '_denoised.wav')
+        # target
+        start_t = (start_idx + 0.5) / FPS
+        end_idx= non_negative(start_idx + FPS * self.L)
+        start_audio_idx = non_negative(start_t * SR)
+        wav, sr = soundfile.read(wave_path, frames=int(SR * self.L), start=start_audio_idx)
+        assert sr == SR
+        wav = self.wav_transforms(wav)
+        frame_path = os.path.join(self.data_path, video, 'frames')
+        frames = [Image.open(os.path.join(
+            frame_path, f'frame{i+1:0>6d}.jpg')).convert('RGB') for i in
+            range(start_idx, end_idx)]
+        if torch.all(torch.bernoulli(self.p_outside_cond) == 1.) and self.split != 'test':
+            # outside from the same class
+            cur_class = self.video_class[video]
+            tmp_video = copy.copy(self.class2video[cur_class])
+            if len(tmp_video) > 1:
+                # if only 1 video in the class, use itself
+                tmp_video.remove(video)
+            cond_video = sample(tmp_video, k=1)[0]
+            cond_available_frame_idx = self.video_frame_cnt[cond_video] - self.left_over
+            cond_start_idx = torch.randint(0, cond_available_frame_idx, (1,)).tolist()[0]
+        else:
+            cond_video = video
+            idx_set = list(range(0, start_idx)) + list(range(end_idx, available_frame_idx))
+            cond_start_idx = random.sample(idx_set, k=1)[0]
+        cond_end_idx = non_negative(cond_start_idx + FPS * self.L)
+        cond_start_t = (cond_start_idx + 0.5) / FPS
+        cond_audio_idx = non_negative(cond_start_t * SR)
+        cond_frame_path = os.path.join(self.data_path, cond_video, 'frames')
+        cond_wave_path = self.video_audio_path[cond_video]
+        cond_frames = [Image.open(os.path.join(
+            cond_frame_path, f'frame{i+1:0>6d}.jpg')).convert('RGB') for i in
+            range(cond_start_idx, cond_end_idx)]
+        cond_wav, sr = soundfile.read(cond_wave_path, frames=int(SR * self.L), start=cond_audio_idx)
+        assert sr == SR
+        cond_wav = self.wav_transforms(cond_wav)
+        item['image'] = wav # (44100,)
+        item['cond_image'] = cond_wav # (44100,)
+        item['file_path_wav_'] = wave_path
+        item['file_path_cond_wav_'] = cond_wave_path
+        if self.frame_transforms is not None:
+            cond_frames = self.frame_transforms(cond_frames)
+            frames = self.frame_transforms(frames)
+        item['feature'] = np.stack(cond_frames + frames, axis=0) # (30 * L, 112, 112, 3)
+        item['file_path_feats_'] = (frame_path, start_idx)
+        item['file_path_cond_feats_'] = (cond_frame_path, cond_start_idx)
+        item['label'] = 'None'
+        item['target'] = 'None'
+        return item
+    def make_split_files(self):
+        raise NotImplementedError
+class ImpactSetWaveCondOnImageTrain(ImpactSetWaveCondOnImage):
+    def __init__(self, dataset_cfg):
+        train_transforms = transforms.Compose([
+            Resize3D(128),
+            RandomResizedCrop3D(112, scale=(0.5, 1.0)),
+            RandomHorizontalFlip3D(),
+            ColorJitter3D(brightness=0.4, saturation=0.4, contrast=0.2, hue=0.1),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('train', frame_transforms=train_transforms, **dataset_cfg)
+class ImpactSetWaveCondOnImageValidation(ImpactSetWaveCondOnImage):
+    def __init__(self, dataset_cfg):
+        valid_transforms = transforms.Compose([
+            Resize3D(128),
+            CenterCrop3D(112),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('val', frame_transforms=valid_transforms, **dataset_cfg)
+class ImpactSetWaveCondOnImageTest(ImpactSetWaveCondOnImage):
+    def __init__(self, dataset_cfg):
+        test_transforms = transforms.Compose([
+            Resize3D(128),
+            CenterCrop3D(112),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('test', frame_transforms=test_transforms, **dataset_cfg)
+class ImpactSetCleanWaveCondOnImage(ImpactSetWaveCondOnImage):
+    def __init__(self, split, L=2, frame_transforms=None, denoise=False, splits_path='./data', data_path='data/ImpactSet/impactset-proccess-resize', p_outside_cond=0):
+        super().__init__(split, L, frame_transforms, denoise, splits_path, data_path, p_outside_cond)
+        pred_timing_path = f'data/countixAV_{split}_timing_processed_0.20.json'
+        assert os.path.exists(pred_timing_path)
+        self.pred_timing = json.load(open(pred_timing_path, 'r'))
+        self.dataset = []
+        for v, ts in self.pred_timing.items():
+            if v in self.video_audio_path.keys():
+                for t in ts:
+                    self.dataset.append([v, t])
+    def __getitem__(self, idx):
+        item = {}
+        video, start_t = self.dataset[idx]
+        available_frame_idx = self.video_frame_cnt[video] - self.left_over
+        available_timing = (available_frame_idx + 0.5) / FPS
+        start_t = float(start_t)
+        start_t = min(start_t, available_timing)
+        start_idx = non_negative(start_t * FPS - 0.5)
+        wave_path = self.video_audio_path[video]
+        if self.denoise:
+            wave_path = wave_path.replace('.wav', '_denoised.wav')
+        # target
+        end_idx= non_negative(start_idx + FPS * self.L)
+        start_audio_idx = non_negative(start_t * SR)
+        wav, sr = soundfile.read(wave_path, frames=int(SR * self.L), start=start_audio_idx)
+        assert sr == SR
+        wav = self.wav_transforms(wav)
+        frame_path = os.path.join(self.data_path, video, 'frames')
+        frames = [Image.open(os.path.join(
+            frame_path, f'frame{i+1:0>6d}.jpg')).convert('RGB') for i in
+            range(start_idx, end_idx)]
+        if torch.all(torch.bernoulli(self.p_outside_cond) == 1.):
+            other_video = list(self.pred_timing.keys())
+            other_video.remove(video)
+            cond_video = sample(other_video, k=1)[0]
+            cond_available_frame_idx = self.video_frame_cnt[cond_video] - self.left_over
+            cond_available_timing = (cond_available_frame_idx + 0.5) / FPS
+        else:
+            cond_video = video
+            cond_available_timing = available_timing
+        cond_start_t = sample(self.pred_timing[cond_video], k=1)[0]
+        cond_start_t = float(cond_start_t)
+        cond_start_t = min(cond_start_t, cond_available_timing)
+        cond_start_idx = non_negative(cond_start_t * FPS - 0.5)
+        cond_end_idx = non_negative(cond_start_idx + FPS * self.L)
+        cond_audio_idx = non_negative(cond_start_t * SR)
+        cond_frame_path = os.path.join(self.data_path, cond_video, 'frames')
+        cond_wave_path = self.video_audio_path[cond_video]
+        cond_frames = [Image.open(os.path.join(
+            cond_frame_path, f'frame{i+1:0>6d}.jpg')).convert('RGB') for i in
+            range(cond_start_idx, cond_end_idx)]
+        cond_wav, sr = soundfile.read(cond_wave_path, frames=int(SR * self.L), start=cond_audio_idx)
+        assert sr == SR
+        cond_wav = self.wav_transforms(cond_wav)
+        item['image'] = wav # (44100,)
+        item['cond_image'] = cond_wav # (44100,)
+        item['file_path_wav_'] = wave_path
+        item['file_path_cond_wav_'] = cond_wave_path
+        if self.frame_transforms is not None:
+            cond_frames = self.frame_transforms(cond_frames)
+            frames = self.frame_transforms(frames)
+        item['feature'] = np.stack(cond_frames + frames, axis=0) # (30 * L, 112, 112, 3)
+        item['file_path_feats_'] = (frame_path, start_idx)
+        item['file_path_cond_feats_'] = (cond_frame_path, cond_start_idx)
+        item['label'] = 'None'
+        item['target'] = 'None'
+        return item
+class ImpactSetCleanWaveCondOnImageTrain(ImpactSetCleanWaveCondOnImage):
+    def __init__(self, dataset_cfg):
+        train_transforms = transforms.Compose([
+            Resize3D(128),
+            RandomResizedCrop3D(112, scale=(0.5, 1.0)),
+            RandomHorizontalFlip3D(),
+            ColorJitter3D(brightness=0.4, saturation=0.4, contrast=0.2, hue=0.1),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('train', frame_transforms=train_transforms, **dataset_cfg)
+class ImpactSetCleanWaveCondOnImageValidation(ImpactSetCleanWaveCondOnImage):
+    def __init__(self, dataset_cfg):
+        valid_transforms = transforms.Compose([
+            Resize3D(128),
+            CenterCrop3D(112),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('val', frame_transforms=valid_transforms, **dataset_cfg)
+class ImpactSetCleanWaveCondOnImageTest(ImpactSetCleanWaveCondOnImage):
+    def __init__(self, dataset_cfg):
+        test_transforms = transforms.Compose([
+            Resize3D(128),
+            CenterCrop3D(112),
+            ToTensor3D(),
+            Normalize3D(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225]),
+        ])
+        super().__init__('test', frame_transforms=test_transforms, **dataset_cfg)
+if __name__ == '__main__':
+    import sys
+    from omegaconf import OmegaConf
+    cfg = OmegaConf.load('configs/countixAV_transformer_denoise_clean.yaml')
+    data = instantiate_from_config(cfg.data)
+    data.prepare_data()
+    data.setup()
+    print(data.datasets['train'])
+    print(len(data.datasets['train']))
+    # print(data.datasets['train'][24])
+    exit()
+    stats = []
+    torch.manual_seed(0)
+    np.random.seed(0)
+    random.seed = 0
+    for k in range(1):
+        x = np.arange(SR * 2)
+        for i in tqdm(range(len(data.datasets['train']))):
+            wav = data.datasets['train'][i]['wav']
+            spec = data.datasets['train'][i]['image']
+            spec = 0.5 * (spec + 1)
+            spec_rms = rms(spec)
+            stats.append(float(spec_rms))
+            # plt.plot(x, wav)
+            # plt.ylim(-1, 1)
+            # plt.savefig(f'tmp/th0.1_wav_e_{k}_{i}_{mean_val:.3f}_{spec_rms:.3f}.png')
+            # plt.close()
+            # plt.cla()
+            soundfile.write(f'tmp/wav_e_{k}_{i}_{spec_rms:.3f}.wav', wav, SR)
+            draw_spec(spec, f'tmp/wav_spec_e_{k}_{i}_{spec_rms:.3f}.png')
+            if i == 100:
+                break
+    # plt.hist(stats, bins=50)
+    # plt.savefig(f'tmp/rms_spec_stats.png')

foleycrafter/models/specvqgan/data/transforms.py ADDED Viewed

	@@ -0,0 +1,685 @@

+import torch
+import torchaudio
+import torchaudio.functional
+from torchvision import transforms
+import torchvision.transforms.functional as F
+import torch.nn as nn
+from PIL import Image
+import numpy as np
+import math
+import random
+import soundfile
+import os
+import librosa
+import albumentations
+from torch_pitch_shift import *
+SR = 22050
+class ResizeShortSide(object):
+    def __init__(self, size):
+        super().__init__()
+        self.size = size
+    def __call__(self, x):
+        '''
+        x must be PIL.Image
+        '''
+        w, h = x.size
+        short_side = min(w, h)
+        w_target = int((w / short_side) * self.size)
+        h_target = int((h / short_side) * self.size)
+        return x.resize((w_target, h_target))
+class Crop(object):
+    def __init__(self, cropped_shape=None, random_crop=False):
+        self.cropped_shape = cropped_shape
+        if cropped_shape is not None:
+            mel_num, spec_len = cropped_shape
+            if random_crop:
+                self.cropper = albumentations.RandomCrop
+            else:
+                self.cropper = albumentations.CenterCrop
+            self.preprocessor = albumentations.Compose([self.cropper(mel_num, spec_len)])
+        else:
+            self.preprocessor = lambda **kwargs: kwargs
+    def __call__(self, item):
+        item['image'] = self.preprocessor(image=item['image'])['image']
+        if 'cond_image' in item.keys():
+            item['cond_image'] = self.preprocessor(image=item['cond_image'])['image']
+        return item
+class CropImage(Crop):
+    def __init__(self, *crop_args):
+        super().__init__(*crop_args)
+class CropFeats(Crop):
+    def __init__(self, *crop_args):
+        super().__init__(*crop_args)
+    def __call__(self, item):
+        item['feature'] = self.preprocessor(image=item['feature'])['image']
+        return item
+class CropCoords(Crop):
+    def __init__(self, *crop_args):
+        super().__init__(*crop_args)
+    def __call__(self, item):
+        item['coord'] = self.preprocessor(image=item['coord'])['image']
+        return item
+class RandomResizedCrop3D(nn.Module):
+    """Crop the given series of images to random size and aspect ratio.
+    The image can be a PIL Images or a Tensor, in which case it is expected
+    to have [N, ..., H, W] shape, where ... means an arbitrary number of leading dimensions
+    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
+    aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop
+    is finally resized to given size.
+    This is popularly used to train the Inception networks.
+    Args:
+      size (int or sequence): expected output size of each edge. If size is an
+        int instead of sequence like (h, w), a square output size ``(size, size)`` is
+        made. If provided a tuple or list of length 1, it will be interpreted as (size[0], size[0]).
+      scale (tuple of float): range of size of the origin size cropped
+      ratio (tuple of float): range of aspect ratio of the origin aspect ratio cropped.
+      interpolation (int): Desired interpolation enum defined by `filters`_.
+        Default is ``PIL.Image.BILINEAR``. If input is Tensor, only ``PIL.Image.NEAREST``, ``PIL.Image.BILINEAR``
+        and ``PIL.Image.BICUBIC`` are supported.
+    """
+    def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.), interpolation=transforms.InterpolationMode.BILINEAR):
+        super().__init__()
+        if isinstance(size, tuple) and len(size) == 2:
+            self.size = size
+        else:
+            self.size = (size, size)
+        self.interpolation = interpolation
+        self.scale = scale
+        self.ratio = ratio
+    @staticmethod
+    def get_params(img, scale, ratio):
+        """Get parameters for ``crop`` for a random sized crop.
+        Args:
+          img (PIL Image or Tensor): Input image.
+          scale (list): range of scale of the origin size cropped
+          ratio (list): range of aspect ratio of the origin aspect ratio cropped
+        Returns:
+          tuple: params (i, j, h, w) to be passed to ``crop`` for a random
+            sized crop.
+        """
+        width, height = img.size
+        area = height * width
+        for _ in range(10):
+            target_area = area * \
+                torch.empty(1).uniform_(scale[0], scale[1]).item()
+            log_ratio = torch.log(torch.tensor(ratio))
+            aspect_ratio = torch.exp(
+                torch.empty(1).uniform_(log_ratio[0], log_ratio[1])
+            ).item()
+            w = int(round(math.sqrt(target_area * aspect_ratio)))
+            h = int(round(math.sqrt(target_area / aspect_ratio)))
+            if 0 < w <= width and 0 < h <= height:
+                i = torch.randint(0, height - h + 1, size=(1,)).item()
+                j = torch.randint(0, width - w + 1, size=(1,)).item()
+                return i, j, h, w
+        # Fallback to central crop
+        in_ratio = float(width) / float(height)
+        if in_ratio < min(ratio):
+            w = width
+            h = int(round(w / min(ratio)))
+        elif in_ratio > max(ratio):
+            h = height
+            w = int(round(h * max(ratio)))
+        else:  # whole image
+            w = width
+            h = height
+        i = (height - h) // 2
+        j = (width - w) // 2
+        return i, j, h, w
+    def forward(self, imgs):
+        """
+        Args:
+          img (PIL Image or Tensor): Image to be cropped and resized.
+        Returns:
+          PIL Image or Tensor: Randomly cropped and resized image.
+        """
+        i, j, h, w = self.get_params(imgs[0], self.scale, self.ratio)
+        return [F.resized_crop(img, i, j, h, w, self.size, self.interpolation) for img in imgs]
+class Resize3D(object):
+    def __init__(self, size):
+        super().__init__()
+        self.size = size
+    def __call__(self, imgs):
+        '''
+        x must be PIL.Image
+        '''
+        return [x.resize((self.size, self.size)) for x in imgs]
+class RandomHorizontalFlip3D(object):
+    def __init__(self, p=0.5):
+        super().__init__()
+        self.p = p
+    def __call__(self, imgs):
+        '''
+        x must be PIL.Image
+        '''
+        if np.random.rand() < self.p:
+            return [x.transpose(Image.FLIP_LEFT_RIGHT) for x in imgs]
+        else:
+            return imgs
+class ColorJitter3D(torch.nn.Module):
+    """Randomly change the brightness, contrast and saturation of an image.
+    Args:
+    brightness (float or tuple of float (min, max)): How much to jitter brightness.
+        brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
+        or the given [min, max]. Should be non negative numbers.
+    contrast (float or tuple of float (min, max)): How much to jitter contrast.
+        contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
+        or the given [min, max]. Should be non negative numbers.
+    saturation (float or tuple of float (min, max)): How much to jitter saturation.
+        saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
+        or the given [min, max]. Should be non negative numbers.
+    hue (float or tuple of float (min, max)): How much to jitter hue.
+        hue_factor is chosen uniformly from [-hue, hue] or the given [min, max].
+        Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
+    """
+    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
+        super().__init__()
+        self.brightness = (1-brightness, 1+brightness)
+        self.contrast = (1-contrast, 1+contrast)
+        self.saturation = (1-saturation, 1+saturation)
+        self.hue = (0-hue, 0+hue)
+    @staticmethod
+    def get_params(brightness, contrast, saturation, hue):
+        """Get a randomized transform to be applied on image.
+        Arguments are same as that of __init__.
+        Returns:
+            Transform which randomly adjusts brightness, contrast and
+            saturation in a random order.
+        """
+        tfs = []
+        if brightness is not None:
+            brightness_factor = random.uniform(brightness[0], brightness[1])
+            tfs.append(transforms.Lambda(
+                lambda img: F.adjust_brightness(img, brightness_factor)))
+        if contrast is not None:
+            contrast_factor = random.uniform(contrast[0], contrast[1])
+            tfs.append(transforms.Lambda(
+                lambda img: F.adjust_contrast(img, contrast_factor)))
+        if saturation is not None:
+            saturation_factor = random.uniform(saturation[0], saturation[1])
+            tfs.append(transforms.Lambda(
+                lambda img: F.adjust_saturation(img, saturation_factor)))
+        if hue is not None:
+            hue_factor = random.uniform(hue[0], hue[1])
+            tfs.append(transforms.Lambda(
+                lambda img: F.adjust_hue(img, hue_factor)))
+        random.shuffle(tfs)
+        transform = transforms.Compose(tfs)
+        return transform
+    def forward(self, imgs):
+        """
+        Args:
+          img (PIL Image or Tensor): Input image.
+        Returns:
+          PIL Image or Tensor: Color jittered image.
+        """
+        transform = self.get_params(
+            self.brightness, self.contrast, self.saturation, self.hue)
+        return [transform(img) for img in imgs]
+class ToTensor3D(object):
+    def __init__(self):
+        super().__init__()
+    def __call__(self, imgs):
+        '''
+        x must be PIL.Image
+        '''
+        return [F.to_tensor(img) for img in imgs]
+class Normalize3D(object):
+    def __init__(self, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], inplace=False):
+        super().__init__()
+        self.mean = mean
+        self.std = std
+        self.inplace = inplace
+    def __call__(self, imgs):
+        '''
+        x must be PIL.Image
+        '''
+        return [F.normalize(img, self.mean, self.std, self.inplace) for img in imgs]
+class CenterCrop3D(object):
+    def __init__(self, size):
+        super().__init__()
+        self.size = size
+    def __call__(self, imgs):
+        '''
+        x must be PIL.Image
+        '''
+        return [F.center_crop(img, self.size) for img in imgs]
+class FrequencyMasking(object):
+    def __init__(self, freq_mask_param: int, iid_masks: bool = False):
+        super().__init__()
+        self.masking = torchaudio.transforms.FrequencyMasking(freq_mask_param, iid_masks)
+    def __call__(self, item):
+        if 'cond_image' in item.keys():
+            batched_spec = torch.stack(
+                [torch.tensor(item['image']), torch.tensor(item['cond_image'])], dim=0
+            )[:, None] # (2, 1, H, W)
+            masked = self.masking(batched_spec).numpy()
+            item['image'] = masked[0, 0]
+            item['cond_image'] = masked[1, 0]
+        elif 'image' in item.keys():
+            inp = torch.tensor(item['image'])
+            item['image'] = self.masking(inp).numpy()
+        else:
+            raise NotImplementedError()
+        return item
+class TimeMasking(object):
+    def __init__(self, time_mask_param: int, iid_masks: bool = False):
+        super().__init__()
+        self.masking = torchaudio.transforms.TimeMasking(time_mask_param, iid_masks)
+    def __call__(self, item):
+        if 'cond_image' in item.keys():
+            batched_spec = torch.stack(
+                [torch.tensor(item['image']), torch.tensor(item['cond_image'])], dim=0
+            )[:, None] # (2, 1, H, W)
+            masked = self.masking(batched_spec).numpy()
+            item['image'] = masked[0, 0]
+            item['cond_image'] = masked[1, 0]
+        elif 'image' in item.keys():
+            inp = torch.tensor(item['image'])
+            item['image'] = self.masking(inp).numpy()
+        else:
+            raise NotImplementedError()
+        return item
+class PitchShift(nn.Module):
+    def __init__(self, up=12, down=-12, sample_rate=SR):
+        super().__init__()
+        self.range = (down, up)
+        self.sr = sample_rate
+    def forward(self, x):
+        assert len(x.shape) == 2
+        x = x[:, None, :]
+        ratio = float(random.randint(self.range[0], self.range[1]) / 12.)
+        shifted = pitch_shift(x, ratio, self.sr)
+        return shifted.squeeze()
+class MelSpectrogram(object):
+    def __init__(self, sr, nfft, fmin, fmax, nmels, hoplen, spec_power, inverse=False):
+        self.sr = sr
+        self.nfft = nfft
+        self.fmin = fmin
+        self.fmax = fmax
+        self.nmels = nmels
+        self.hoplen = hoplen
+        self.spec_power = spec_power
+        self.inverse = inverse
+        self.mel_basis = librosa.filters.mel(sr=sr, n_fft=nfft, fmin=fmin, fmax=fmax, n_mels=nmels)
+    def __call__(self, x):
+        x = x.numpy()
+        if self.inverse:
+            spec = librosa.feature.inverse.mel_to_stft(
+                x, sr=self.sr, n_fft=self.nfft, fmin=self.fmin, fmax=self.fmax, power=self.spec_power
+            )
+            wav = librosa.griffinlim(spec, hop_length=self.hoplen)
+            return torch.FloatTensor(wav)
+        else:
+            spec = np.abs(librosa.stft(x, n_fft=self.nfft, hop_length=self.hoplen)) ** self.spec_power
+            mel_spec = np.dot(self.mel_basis, spec)
+            return torch.FloatTensor(mel_spec)
+class SpectrogramTorchAudio(object):
+    def __init__(self, nfft, hoplen, spec_power, inverse=False):
+        self.nfft = nfft
+        self.hoplen = hoplen
+        self.spec_power = spec_power
+        self.inverse = inverse
+        self.spec_trans = torchaudio.transforms.Spectrogram(
+            n_fft=self.nfft,
+            hop_length=self.hoplen,
+            power=self.spec_power,
+        )
+        self.inv_spec_trans = torchaudio.transforms.GriffinLim(
+            n_fft=self.nfft,
+            hop_length=self.hoplen,
+            power=self.spec_power,
+        )
+    def __call__(self, x):
+        if self.inverse:
+            wav = self.inv_spec_trans(x)
+            return wav
+        else:
+            spec = torch.abs(self.spec_trans(x))
+            return spec
+class MelScaleTorchAudio(object):
+    def __init__(self, sr, stft, fmin, fmax, nmels, inverse=False):
+        self.sr = sr
+        self.stft = stft
+        self.fmin = fmin
+        self.fmax = fmax
+        self.nmels = nmels
+        self.inverse = inverse
+        self.mel_trans = torchaudio.transforms.MelScale(
+            n_mels=self.nmels,
+            sample_rate=self.sr,
+            f_min=self.fmin,
+            f_max=self.fmax,
+            n_stft=self.stft,
+            norm='slaney'
+        )
+        self.inv_mel_trans = torchaudio.transforms.InverseMelScale(
+            n_mels=self.nmels,
+            sample_rate=self.sr,
+            f_min=self.fmin,
+            f_max=self.fmax,
+            n_stft=self.stft,
+            norm='slaney'
+        )
+    def __call__(self, x):
+        if self.inverse:
+            spec = self.inv_mel_trans(x)
+            return spec
+        else:
+            mel_spec = self.mel_trans(x)
+            return mel_spec
+class Padding(object):
+    def __init__(self, target_len, inverse=False):
+        self.target_len=int(target_len)
+        self.inverse = inverse
+    def __call__(self, x):
+        if self.inverse:
+            return x
+        else:
+            x = x.squeeze()
+            if x.shape[0] < self.target_len:
+                pad = torch.zeros((self.target_len,), dtype=x.dtype, device=x.device)
+                pad[:x.shape[0]] = x
+                x = pad
+            elif x.shape[0] > self.target_len:
+                raise NotImplementedError()
+            return x
+class MakeMono(object):
+    def __init__(self, inverse=False):
+        self.inverse = inverse
+    def __call__(self, x):
+        if self.inverse:
+            return x
+        else:
+            x = x.squeeze()
+            if len(x.shape) == 1:
+                return torch.FloatTensor(x)
+            elif len(x.shape) == 2:
+                target_dim = int(torch.argmin(torch.tensor(x.shape)))
+                return torch.mean(x, dim=target_dim)
+            else:
+                raise NotImplementedError
+class LowerThresh(object):
+    def __init__(self, min_val, inverse=False):
+        self.min_val = torch.tensor(min_val)
+        self.inverse = inverse
+    def __call__(self, x):
+        if self.inverse:
+            return x
+        else:
+            return torch.maximum(self.min_val, x)
+class Add(object):
+    def __init__(self, val, inverse=False):
+        self.inverse = inverse
+        self.val = val
+    def __call__(self, x):
+        if self.inverse:
+            return x - self.val
+        else:
+            return x + self.val
+class Subtract(Add):
+    def __init__(self, val, inverse=False):
+        self.inverse = inverse
+        self.val = val
+    def __call__(self, x):
+        if self.inverse:
+            return x + self.val
+        else:
+            return x - self.val
+class Multiply(object):
+    def __init__(self, val, inverse=False) -> None:
+        self.val = val
+        self.inverse = inverse
+    def __call__(self, x):
+        if self.inverse:
+            return x / self.val
+        else:
+            return x * self.val
+class Divide(Multiply):
+    def __init__(self, val, inverse=False):
+        self.inverse = inverse
+        self.val = val
+    def __call__(self, x):
+        if self.inverse:
+            return x * self.val
+        else:
+            return x / self.val
+class Log10(object):
+    def __init__(self, inverse=False):
+        self.inverse = inverse
+    def __call__(self, x):
+        if self.inverse:
+            return 10 ** x
+        else:
+            return torch.log10(x)
+class Clip(object):
+    def __init__(self, min_val, max_val, inverse=False):
+        self.min_val = min_val
+        self.max_val = max_val
+        self.inverse = inverse
+    def __call__(self, x):
+        if self.inverse:
+            return x
+        else:
+            return torch.clip(x, self.min_val, self.max_val)
+class TrimSpec(object):
+    def __init__(self, max_len, inverse=False):
+        self.max_len = max_len
+        self.inverse = inverse
+    def __call__(self, x):
+        if self.inverse:
+            return x
+        else:
+            return x[:, :self.max_len]
+class MaxNorm(object):
+    def __init__(self, inverse=False):
+        self.inverse = inverse
+        self.eps = 1e-10
+    def __call__(self, x):
+        if self.inverse:
+            return x
+        else:
+            return x / (x.max() + self.eps)
+class NormalizeAudio(object):
+    def __init__(self, inverse=False, desired_rms=0.1, eps=1e-4):
+        self.inverse = inverse
+        self.desired_rms = desired_rms
+        self.eps = torch.tensor(eps)
+    def __call__(self, x):
+        if self.inverse:
+            return x
+        else:
+            rms = torch.maximum(self.eps, torch.sqrt(torch.mean(x**2)))
+            x = x * (self.desired_rms / rms)
+            x[x > 1.] = 1.
+            x[x < -1.] = -1.
+            return x
+class RandomNormalizeAudio(object):
+    def __init__(self, inverse=False, rms_range=[0.05, 0.2], eps=1e-4):
+        self.inverse = inverse
+        self.rms_low, self.rms_high = rms_range
+        self.eps = torch.tensor(eps)
+    def __call__(self, x):
+        if self.inverse:
+            return x
+        else:
+            rms = torch.maximum(self.eps, torch.sqrt(torch.mean(x**2)))
+            desired_rms = (torch.rand(1) * (self.rms_high - self.rms_low)) + self.rms_low
+            x = x * (desired_rms / rms)
+            x[x > 1.] = 1.
+            x[x < -1.] = -1.
+            return x
+class MakeDouble(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, x):
+        return x.to(torch.double)
+class MakeFloat(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, x):
+        return x.to(torch.float)
+class Wave2Spectrogram(nn.Module):
+    def __init__(self, mel_num, spec_crop_len):
+        super().__init__()
+        self.trans = transforms.Compose([
+            LowerThresh(1e-5),
+            Log10(),
+            Multiply(20),
+            Subtract(20),
+            Add(100),
+            Divide(100),
+            Clip(0, 1.0),
+            TrimSpec(173),
+            transforms.CenterCrop((mel_num, spec_crop_len))
+        ])
+    def forward(self, x):
+        return self.trans(x)
+# Module-level waveform -> normalised log-mel pipeline used by
+# ``get_spectrogram_torch``. Steps, in order:
+#   1. magnitude spectrogram (n_fft=1024, hop=256, power=1)
+#   2. 80-band mel projection, 125-7600 Hz, for 22050 Hz audio (513 STFT bins)
+#   3. floor at 1e-5, then log10
+#   4. affine rescale: x20, -20, +100, /100
+#   5. clip to [0, 1]
+TRANSFORMS = transforms.Compose([
+    SpectrogramTorchAudio(nfft=1024, hoplen=1024//4, spec_power=1),
+    MelScaleTorchAudio(sr=22050, stft=513, fmin=125, fmax=7600, nmels=80),
+    LowerThresh(1e-5),
+    Log10(),
+    Multiply(20),
+    Subtract(20),
+    Add(100),
+    Divide(100),
+    Clip(0, 1.0),
+])
+def get_spectrogram_torch(audio_path, save_dir, length, save_results=True):
+    wav, _ = soundfile.read(audio_path)
+    wav = torch.FloatTensor(wav)
+    y = torch.zeros(length)
+    if wav.shape[0] < length:
+        y[:len(wav)] = wav
+    else:
+        y = wav[:length]
+    mel_spec = TRANSFORMS(y).numpy()
+    y = y.numpy()
+    if save_results:
+        os.makedirs(save_dir, exist_ok=True)
+        audio_name = os.path.basename(audio_path).split('.')[0]
+        np.save(os.path.join(save_dir, audio_name + '_mel.npy'), mel_spec)
+        np.save(os.path.join(save_dir, audio_name + '_audio.npy'), y)
+    else:
+        return y, mel_spec

foleycrafter/models/specvqgan/data/utils.py ADDED Viewed

	@@ -0,0 +1,265 @@

+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import json
+from random import shuffle, choice, sample
+from moviepy.editor import VideoFileClip
+import librosa
+from scipy import signal
+from scipy.io import wavfile
+import torchaudio
+torchaudio.set_audio_backend("sox_io")
+INTERVAL = 1000
+# discard
+stft = torchaudio.transforms.MelSpectrogram(
+    sample_rate=16000, hop_length=161, n_mels=64).cuda()
+def log10(x): return torch.log(x)/torch.log(torch.tensor(10.))
+def norm_range(x, min_val, max_val):
+    return 2.*(x - min_val)/float(max_val - min_val) - 1.
+def normalize_spec(spec, spec_min, spec_max):
+    return norm_range(spec, spec_min, spec_max)
+def db_from_amp(x, cuda=False):
+    # rescale the audio
+    if cuda:
+        return 20. * log10(torch.max(torch.tensor(1e-5).to('cuda'), x.float()))
+    else:
+        return 20. * log10(torch.max(torch.tensor(1e-5), x.float()))
+def audio_stft(audio, stft=stft):
+    # We'll apply stft to the audio samples to convert it to a HxW matrix
+    N, C, A = audio.size()
+    audio = audio.view(N * C, A)
+    spec = stft(audio)
+    spec = spec.transpose(-1, -2)
+    spec = db_from_amp(spec, cuda=True)
+    spec = normalize_spec(spec, -100., 100.)
+    _, T, F = spec.size()
+    spec = spec.view(N, C, T, F)
+    return spec
+# discard
+# def get_spec(
+#     wavs,
+#     sample_rate=16000,
+#     use_volume_jittering=False,
+#     center=False,
+# ):
+#     # Volume  jittering - scale volume by factor in range (0.9, 1.1)
+#     if use_volume_jittering:
+#         wavs = [wav * np.random.uniform(0.9, 1.1) for wav in wavs]
+#     if center:
+#         wavs = [center_only(wav) for wav in wavs]
+#     # Convert to log filterbank
+#     specs = [logfbank(
+#         wav,
+#         sample_rate,
+#         winlen=0.009,
+#         winstep=0.005,  # if num_sec==1 else 0.01,
+#         nfilt=256,
+#         nfft=1024
+#     ).astype('float32').T for wav in wavs]
+#     # Convert to 32-bit float and expand dim
+#     specs = np.stack(specs, axis=0)
+#     specs = np.expand_dims(specs, 1)
+#     specs = torch.as_tensor(specs)  # Nx1xFxT
+#     return specs
+def center_only(audio, sr=16000, L=1.0):
+    # center_wav = np.arange(0, L, L/(0.5*sr)) ** 2
+    # center_wav = np.concatenate([center_wav, center_wav[::-1]])
+    # center_wav[L*sr//2:3*L*sr//4] = 1
+    # only take 0.3 sec audio
+    center_wav = np.zeros(int(L * sr))
+    center_wav[int(0.4*L*sr):int(0.7*L*sr)] = 1
+    return audio * center_wav
+def get_spec_librosa(
+    wavs,
+    sample_rate=16000,
+    use_volume_jittering=False,
+    center=False,
+):
+    # Volume  jittering - scale volume by factor in range (0.9, 1.1)
+    if use_volume_jittering:
+        wavs = [wav * np.random.uniform(0.9, 1.1) for wav in wavs]
+    if center:
+        wavs = [center_only(wav) for wav in wavs]
+    # Convert to log filterbank
+    specs = [librosa.feature.melspectrogram(
+        y=wav,
+        sr=sample_rate,
+        n_fft=400,
+        hop_length=126,
+        n_mels=128,
+    ).astype('float32') for wav in wavs]
+    # Convert to 32-bit float and expand dim
+    specs = [librosa.power_to_db(spec) for spec in specs]
+    specs = np.stack(specs, axis=0)
+    specs = np.expand_dims(specs, 1)
+    specs = torch.as_tensor(specs)  # Nx1xFxT
+    return specs
+def calcEuclideanDistance_Mat(X, Y):
+    """
+    Inputs:
+    - X: A numpy array of shape (N, F)
+    - Y: A numpy array of shape (M, F)
+    Returns:
+    A numpy array D of shape (N, M) where D[i, j] is the Euclidean distance
+    between X[i] and Y[j].
+    """
+    return ((torch.sum(X ** 2, axis=1, keepdims=True)) + (torch.sum(Y ** 2, axis=1, keepdims=True)).T - 2 * X @ Y.T) ** 0.5
+def calcEuclideanDistance(x1, x2):
+    return torch.sum((x1 - x2)**2, dim=1)**0.5
+def split_data(in_list, portion=(0.9, 0.95), is_shuffle=True):
+    if is_shuffle:
+        shuffle(in_list)
+    if type(in_list) == str:
+        with open(in_list) as l:
+            fw_list = json.load(l)
+    elif type(in_list) == list:
+        fw_list = in_list
+    else:
+        print(type(in_list))
+        raise TypeError('Invalid input list type')
+    c1, c2 = int(len(fw_list) * portion[0]), int(len(fw_list) * portion[1])
+    tr_list, va_list, te_list = fw_list[:c1], fw_list[c1:c2], fw_list[c2:]
+    print(
+        f'==> train set: {len(tr_list)}, validation set: {len(va_list)}, test set: {len(te_list)}')
+    return tr_list, va_list, te_list
+def load_one_clip(video_path):
+    v = VideoFileClip(video_path)
+    fps = int(v.fps)
+    frames = [f for f in v.iter_frames()][:-1]
+    frame_cnt = len(frames)
+    frame_length = 1000./fps
+    total_length = int(1000 * (frame_cnt / fps))
+    a = v.audio
+    sr = a.fps
+    a = np.array([fa for fa in a.iter_frames()])
+    a = librosa.resample(a, sr, 48000)
+    if len(a.shape) > 1:
+        a = np.mean(a, axis=1)
+    while True:
+        idx = np.random.choice(np.arange(frame_cnt - 1), 1)[0]
+        frame_clip = frames[idx]
+        start_time = int(idx * frame_length + 0.5 * frame_length - 500)
+        end_time = start_time + INTERVAL
+        if start_time < 0 or end_time > total_length:
+            continue
+        wave_clip = a[48 * start_time: 48 * end_time]
+        if wave_clip.shape[0] != 48000:
+            continue
+        break
+    return frame_clip, wave_clip
+def resize_frame(frame):
+    H, W = frame.size
+    short_edge = min(H, W)
+    scale = 256 / short_edge
+    H_tar, W_tar = int(np.round(H * scale)), int(np.round(W * scale))
+    return frame.resize((H_tar, W_tar))
+def get_spectrogram(wave, amp_jitter, amp_jitter_range, log_scale=True, sr=48000):
+    # random clip-level amplitude jittering
+    if amp_jitter:
+        amplified = wave * np.random.uniform(*amp_jitter_range)
+        if wave.dtype == np.int16:
+            amplified[amplified >= 32767] = 32767
+            amplified[amplified <= -32768] = -32768
+            wave = amplified.astype('int16')
+        elif wave.dtype == np.float32 or wave.dtype == np.float64:
+            amplified[amplified >= 1] = 1
+            amplified[amplified <= -1] = -1
+    # fr, ts, spectrogram = signal.spectrogram(wave[:48000], fs=sr, nperseg=480, noverlap=240, nfft=512)
+    # spectrogram = librosa.feature.melspectrogram(S=spectrogram, n_mels=257) # Try log-mel spectrogram?
+    spectrogram = librosa.feature.melspectrogram(
+        y=wave[:48000], sr=sr, hop_length=240, win_length=480, n_mels=257)
+    if log_scale:
+        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
+    assert spectrogram.shape[0] == 257
+    return spectrogram
+def cropAudio(audio, sr, f_idx, fps=10, length=1., left_shift=0):
+    time_per_frame = 1./fps
+    assert audio.shape[0] > sr * length
+    start_time = f_idx * time_per_frame - left_shift
+    start_time = 0 if start_time < 0 else start_time
+    start_idx = int(np.round(sr * start_time))
+    end_idx = int(np.round(start_idx + (sr * length)))
+    if end_idx > audio.shape[0]:
+        end_idx = audio.shape[0]
+        start_idx = int(end_idx - (sr * length))
+    try:
+        assert audio[start_idx:end_idx].shape[0] == sr * length
+    except:
+        print(audio.shape, start_idx, end_idx, end_idx - start_idx)
+        exit(1)
+    return audio[start_idx:end_idx]
+def pick_async_frame_idx(idx, total_frames, fps=10, gap=2.0, length=1.0, cnt=1):
+    assert idx < total_frames - fps * length
+    lower_bound = idx - int((length + gap) * fps)
+    upper_bound = idx + int((length + gap) * fps)
+    proposal = list(range(0, lower_bound)) + \
+        list(range(upper_bound, int(total_frames - fps * length)))
+    # assert len(proposal) >= cnt
+    avail_cnt = len(proposal)
+    try:
+        for i in range(cnt - avail_cnt):
+            proposal.append(proposal[i % avail_cnt])
+    except Exception as e:
+        print(idx, total_frames, proposal)
+        raise e
+    return sample(proposal, k=cnt)
+def adjust_learning_rate(optimizer, epoch, args):
+    """Decay the learning rate based on schedule"""
+    lr = args.lr
+    if args.cos:  # cosine lr schedule
+        lr *= 0.5 * (1. + math.cos(math.pi * epoch / args.epoch))
+    else:  # stepwise lr schedule
+        for milestone in args.schedule:
+            lr *= 0.1 if epoch >= milestone else 1.
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr

foleycrafter/models/specvqgan/models/av_cond_transformer.py ADDED Viewed

	@@ -0,0 +1,528 @@

+import sys
+import pytorch_lightning as pl
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision import transforms
+import torchaudio
+from omegaconf.listconfig import ListConfig
+sys.path.insert(0, '.')  # nopep8
+from foleycrafter.models.specvqgan.modules.transformer.mingpt import (GPTClass, GPTFeats, GPTFeatsClass)
+from foleycrafter.models.specvqgan.data.transforms import Wave2Spectrogram, PitchShift, NormalizeAudio
+from train import instantiate_from_config
+SR = 22050
+def disabled_train(self, mode=True):
+    """Overwrite model.train with this function to make sure train/eval mode
+    does not change anymore."""
+    return self
+class Net2NetTransformerAVCond(pl.LightningModule):
+    def __init__(self, transformer_config, first_stage_config,
+                 cond_stage_config,
+                 drop_condition=False, drop_video=False, drop_cond_video=False,
+                 first_stage_permuter_config=None, cond_stage_permuter_config=None,
+                 ckpt_path=None, ignore_keys=[],
+                 first_stage_key="image",
+                 cond_first_stage_key="cond_image",
+                 cond_stage_key="depth",
+                 downsample_cond_size=-1,
+                 pkeep=1.0,
+                 clip=30,
+                 p_audio_aug=0.5,
+                 p_pitch_shift=0.,
+                 p_normalize=0.,
+                 mel_num=80,
+                 spec_crop_len=160):
+        """Build the two stage models, the permuters, the transformer and the
+        waveform-to-spectrogram pipeline, then optionally restore a checkpoint.
+
+        Every ``*_config`` argument is passed to ``instantiate_from_config``.
+        Both permuters default to the ``Identity`` permuter. ``ckpt_path``,
+        when given, is loaded through ``init_from_ckpt``. The remaining
+        arguments are stored on the instance or used to configure
+        ``wav_transforms``.
+        """
+        super().__init__()
+        # Stage models are created first and put in eval mode by their helpers.
+        self.init_first_stage_from_ckpt(first_stage_config)
+        self.init_cond_stage_from_ckpt(cond_stage_config)
+        if first_stage_permuter_config is None:
+            first_stage_permuter_config = {"target": "foleycrafter.models.specvqgan.modules.transformer.permuter.Identity"}
+        if cond_stage_permuter_config is None:
+            cond_stage_permuter_config = {"target": "foleycrafter.models.specvqgan.modules.transformer.permuter.Identity"}
+        self.first_stage_permuter = instantiate_from_config(config=first_stage_permuter_config)
+        self.cond_stage_permuter = instantiate_from_config(config=cond_stage_permuter_config)
+        self.transformer = instantiate_from_config(config=transformer_config)
+        # Waveform pipeline: optional RMS normalisation and pitch shift, then a
+        # magnitude spectrogram, mel projection and log-scale / crop.
+        # NOTE(review): the mel projection hard-codes n_mels=80 while the final
+        # crop uses ``mel_num``; confirm the two are meant to stay in sync.
+        self.wav_transforms = nn.Sequential(
+            transforms.RandomApply([NormalizeAudio()], p=p_normalize),
+            transforms.RandomApply([PitchShift()], p=p_pitch_shift),
+            torchaudio.transforms.Spectrogram(
+                n_fft=1024,
+                hop_length=1024//4,
+                power=1,
+            ),
+            # transforms.RandomApply([
+            #     torchaudio.transforms.FrequencyMasking(freq_mask_param=40, iid_masks=False)
+            # ], p=p_audio_aug),
+            # transforms.RandomApply([
+            #     torchaudio.transforms.TimeMasking(time_mask_param=int(32 * 2), iid_masks=False)
+            # ], p=p_audio_aug),
+            torchaudio.transforms.MelScale(
+                n_mels=80,
+                sample_rate=SR,
+                f_min=125,
+                f_max=7600,
+                n_stft=513,
+                norm='slaney'
+            ),
+            Wave2Spectrogram(mel_num, spec_crop_len),
+        )
+        # NOTE(review): this overwrites whatever the caller passed as
+        # ``ignore_keys``, so the argument currently has no effect.
+        ignore_keys = ['wav_transforms']
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+        self.drop_condition = drop_condition
+        self.drop_video = drop_video
+        self.drop_cond_video = drop_cond_video
+        print(f'>>> Feature setting: all cond: {self.drop_condition}, video: {self.drop_video}, cond video: {self.drop_cond_video}')
+        self.first_stage_key = first_stage_key
+        self.cond_first_stage_key = cond_first_stage_key
+        self.cond_stage_key = cond_stage_key
+        self.downsample_cond_size = downsample_cond_size
+        self.pkeep = pkeep
+        # Number of leading code indices kept per sample in forward/log_images.
+        self.clip = clip
+        print('>>> model init done.')
+    def init_from_ckpt(self, path, ignore_keys=list()):
+        sd = torch.load(path, map_location="cpu")["state_dict"]
+        for k in sd.keys():
+            for ik in ignore_keys:
+                if k.startswith(ik):
+                    self.print("Deleting key {} from state_dict.".format(k))
+                    del sd[k]
+        self.load_state_dict(sd, strict=False)
+        print(f"Restored from {path}")
+    def init_first_stage_from_ckpt(self, config):
+        model = instantiate_from_config(config)
+        model = model.eval()
+        model.train = disabled_train
+        self.first_stage_model = model
+    def init_cond_stage_from_ckpt(self, config):
+        model = instantiate_from_config(config)
+        model = model.eval()
+        model.train = disabled_train
+        self.cond_stage_model = model
+    def forward(self, x, c, xp):
+        """Run one teacher-forced pass and return ``(logits, target)``.
+
+        ``x`` and ``xp`` are encoded with the first-stage model and ``c`` with
+        the conditioning model. Unless ``drop_condition`` is set, the first
+        ``clip`` indices of ``xp`` are prepended to those of ``x`` as context
+        and excluded again from the target.
+        """
+        # one step to produce the logits
+        _, z_indices = self.encode_to_z(x) # VQ-GAN encoding
+        _, zp_indices = self.encode_to_z(xp)
+        _, c_indices = self.encode_to_c(c) # Conv1-1 down dim + col-major permuter
+        # Keep only the first ``clip`` indices of each sequence.
+        z_indices = z_indices[:, :self.clip]
+        zp_indices = zp_indices[:, :self.clip]
+        if not self.drop_condition:
+            z_indices = torch.cat([zp_indices, z_indices], dim=1)
+        # During training each index is kept with probability ``pkeep`` and
+        # otherwise replaced by a random vocabulary index.
+        if self.training and self.pkeep < 1.0:
+            mask = torch.bernoulli(self.pkeep * torch.ones(z_indices.shape, device=z_indices.device))
+            mask = mask.round().to(dtype=torch.int64)
+            r_indices = torch.randint_like(z_indices, self.transformer.config.vocab_size)
+            a_indices = mask*z_indices+(1-mask)*r_indices
+        else:
+            a_indices = z_indices
+        # target includes all sequence elements (no need to handle first one
+        # differently because we are conditioning)
+        if self.drop_condition:
+            target = z_indices
+        else:
+            target = z_indices[:, self.clip:]
+        # in the case we do not want to encode condition anyhow (e.g. inputs are features)
+        if isinstance(self.transformer, (GPTFeats, GPTClass, GPTFeatsClass)):
+            # make the prediction
+            # NOTE(review): this branch feeds ``z_indices`` rather than the
+            # noised ``a_indices``, so ``pkeep`` has no effect here; confirm.
+            logits, _, _ = self.transformer(z_indices[:, :-1], c)
+            # cut off conditioning outputs - output i corresponds to p(z_i | z_{<i}, c)
+            if isinstance(self.transformer, GPTFeatsClass):
+                cond_size = c['feature'].size(-1) + c['target'].size(-1)
+            else:
+                cond_size = c.size(-1)
+            if self.drop_condition:
+                logits = logits[:, cond_size-1:]
+            else:
+                logits = logits[:, cond_size-1:][:, self.clip:]
+        else:
+            cz_indices = torch.cat((c_indices, a_indices), dim=1)
+            # make the prediction
+            logits, _, _ = self.transformer(cz_indices[:, :-1])
+            # cut off conditioning outputs - output i corresponds to p(z_i | z_{<i}, c)
+            logits = logits[:, c_indices.shape[1]-1:]
+        return logits, target
+    def top_k_logits(self, logits, k):
+        v, ix = torch.topk(logits, k)
+        out = logits.clone()
+        out[out < v[..., [-1]]] = -float('Inf')
+        return out
+    @torch.no_grad()
+    def sample(self, x, c, steps, temperature=1.0, sample=False, top_k=None,
+               callback=lambda k: None):
+        x = x if isinstance(self.transformer, (GPTFeats, GPTClass, GPTFeatsClass)) else torch.cat((c, x), dim=1)
+        block_size = self.transformer.get_block_size()
+        assert not self.transformer.training
+        if self.pkeep <= 0.0:
+            raise NotImplementedError('Implement for GPTFeatsCLass')
+            raise NotImplementedError('Implement for GPTFeats')
+            raise NotImplementedError('Implement for GPTClass')
+            raise NotImplementedError('also the model outputs attention')
+            # one pass suffices since input is pure noise anyway
+            assert len(x.shape)==2
+            # noise_shape = (x.shape[0], steps-1)
+            # noise = torch.randint(self.transformer.config.vocab_size, noise_shape).to(x)
+            noise = c.clone()[:,x.shape[1]-c.shape[1]:-1]
+            x = torch.cat((x,noise),dim=1)
+            logits, _ = self.transformer(x)
+            # take all logits for now and scale by temp
+            logits = logits / temperature
+            # optionally crop probabilities to only the top k options
+            if top_k is not None:
+                logits = self.top_k_logits(logits, top_k)
+            # apply softmax to convert to probabilities
+            probs = F.softmax(logits, dim=-1)
+            # sample from the distribution or take the most likely
+            if sample:
+                shape = probs.shape
+                probs = probs.reshape(shape[0]*shape[1],shape[2])
+                ix = torch.multinomial(probs, num_samples=1)
+                probs = probs.reshape(shape[0],shape[1],shape[2])
+                ix = ix.reshape(shape[0],shape[1])
+            else:
+                _, ix = torch.topk(probs, k=1, dim=-1)
+            # cut off conditioning
+            x = ix[:, c.shape[1]-1:]
+        else:
+            for k in range(steps):
+                callback(k)
+                if isinstance(self.transformer, (GPTFeats, GPTClass, GPTFeatsClass)):
+                    # if assert is removed, you need to make sure that the combined len is not longer block_s
+                    if isinstance(self.transformer, GPTFeatsClass):
+                        cond_size = c['feature'].size(-1) + c['target'].size(-1)
+                    else:
+                        cond_size = c.size(-1)
+                    assert x.size(1) + cond_size <= block_size
+                    x_cond = x
+                    c_cond = c
+                    logits, _, att = self.transformer(x_cond, c_cond)
+                else:
+                    assert x.size(1) <= block_size  # make sure model can see conditioning
+                    x_cond = x if x.size(1) <= block_size else x[:, -block_size:]  # crop context if needed
+                    logits, _, att = self.transformer(x_cond)
+                # pluck the logits at the final step and scale by temperature
+                logits = logits[:, -1, :] / temperature
+                # optionally crop probabilities to only the top k options
+                if top_k is not None:
+                    logits = self.top_k_logits(logits, top_k)
+                # apply softmax to convert to probabilities
+                probs = F.softmax(logits, dim=-1)
+                # sample from the distribution or take the most likely
+                if sample:
+                    ix = torch.multinomial(probs, num_samples=1)
+                else:
+                    _, ix = torch.topk(probs, k=1, dim=-1)
+                # append to the sequence and continue
+                x = torch.cat((x, ix), dim=1)
+            # cut off conditioning
+            x = x if isinstance(self.transformer, (GPTFeats, GPTClass, GPTFeatsClass)) else x[:, c.shape[1]:]
+        return x, att.detach().cpu()
+    @torch.no_grad()
+    def encode_to_z(self, x):
+        quant_z, _, info = self.first_stage_model.encode(x)
+        indices = info[2].view(quant_z.shape[0], -1)
+        indices = self.first_stage_permuter(indices)
+        return quant_z, indices
+    @torch.no_grad()
+    def encode_to_c(self, c):
+        if self.downsample_cond_size > -1:
+            c = F.interpolate(c, size=(self.downsample_cond_size, self.downsample_cond_size))
+        quant_c, _, info = self.cond_stage_model.encode(c)
+        if isinstance(self.transformer, (GPTFeats, GPTClass, GPTFeatsClass)):
+            # these are not indices but raw features or a class
+            indices = info[2]
+        else:
+            indices = info[2].view(quant_c.shape[0], -1)
+            indices = self.cond_stage_permuter(indices)
+        return quant_c, indices
+    @torch.no_grad()
+    def decode_to_img(self, index, zshape, stage='first'):
+        if stage == 'first':
+            index = self.first_stage_permuter(index, reverse=True)
+        elif stage == 'cond':
+            print('in cond stage in decode_to_img which is unexpected ')
+            index = self.cond_stage_permuter(index, reverse=True)
+        else:
+            raise NotImplementedError
+        bhwc = (zshape[0], zshape[2], zshape[3], zshape[1])
+        quant_z = self.first_stage_model.quantize.get_codebook_entry(index.reshape(-1), shape=bhwc)
+        x = self.first_stage_model.decode(quant_z)
+        return x
+    @torch.no_grad()
+    def log_images(self, batch, temperature=None, top_k=None, callback=None, lr_interface=False, **kwargs):
+        log = dict()
+        N = 4
+        if lr_interface:
+            x, c, xp = self.get_xcxp(batch, N, diffuse=False, upsample_factor=8)
+        else:
+            x, c, xp = self.get_xcxp(batch, N)
+        x = x.to(device=self.device)
+        xp = xp.to(device=self.device)
+        # c = c.to(device=self.device)
+        if isinstance(c, dict):
+            c = {k: v.to(self.device) for k, v in c.items()}
+        else:
+            c = c.to(self.device)
+        quant_z, z_indices = self.encode_to_z(x)
+        quant_zp, zp_indices = self.encode_to_z(xp)
+        quant_c, c_indices = self.encode_to_c(c)  # output can be features or a single class or a featcls dict
+        z_indices_rec = z_indices.clone()
+        zp_indices_clip = zp_indices[:, :self.clip]
+        z_indices_clip = z_indices[:, :self.clip]
+        # create a "half"" sample
+        z_start_indices = z_indices_clip[:, :z_indices_clip.shape[1]//2]
+        if self.drop_condition:
+            steps = z_indices_clip.shape[1]-z_start_indices.shape[1]
+        else:
+            z_start_indices = torch.cat([zp_indices_clip, z_start_indices], dim=-1)
+            steps = 2*z_indices_clip.shape[1]-z_start_indices.shape[1]
+        index_sample, att_half = self.sample(z_start_indices, c_indices,
+                                   steps=steps,
+                                   temperature=temperature if temperature is not None else 1.0,
+                                   sample=True,
+                                   top_k=top_k if top_k is not None else 100,
+                                   callback=callback if callback is not None else lambda k: None)
+        if self.drop_condition:
+            z_indices_rec[:, :self.clip] = index_sample
+        else:
+            z_indices_rec[:, :self.clip] = index_sample[:, self.clip:]
+        x_sample = self.decode_to_img(z_indices_rec, quant_z.shape)
+        # sample
+        z_start_indices = z_indices_clip[:, :0]
+        if not self.drop_condition:
+            z_start_indices = torch.cat([zp_indices_clip, z_start_indices], dim=-1)
+        index_sample, att_nopix = self.sample(z_start_indices, c_indices,
+                                              steps=z_indices_clip.shape[1],
+                                              temperature=temperature if temperature is not None else 1.0,
+                                              sample=True,
+                                              top_k=top_k if top_k is not None else 100,
+                                              callback=callback if callback is not None else lambda k: None)
+        if self.drop_condition:
+            z_indices_rec[:, :self.clip] = index_sample
+        else:
+            z_indices_rec[:, :self.clip] = index_sample[:, self.clip:]
+        x_sample_nopix = self.decode_to_img(z_indices_rec, quant_z.shape)
+        # det sample
+        z_start_indices = z_indices_clip[:, :0]
+        if not self.drop_condition:
+            z_start_indices = torch.cat([zp_indices_clip, z_start_indices], dim=-1)
+        index_sample, att_det = self.sample(z_start_indices, c_indices,
+                                            steps=z_indices_clip.shape[1],
+                                            sample=False,
+                                            callback=callback if callback is not None else lambda k: None)
+        if self.drop_condition:
+            z_indices_rec[:, :self.clip] = index_sample
+        else:
+            z_indices_rec[:, :self.clip] = index_sample[:, self.clip:]
+        x_sample_det = self.decode_to_img(z_indices_rec, quant_z.shape)
+        # reconstruction
+        x_rec = self.decode_to_img(z_indices, quant_z.shape)
+        log["inputs"] = x
+        log["reconstructions"] = x_rec
+        if isinstance(self.cond_stage_key, str):
+            cond_is_not_image = self.cond_stage_key != "image"
+            cond_has_segmentation = self.cond_stage_key == "segmentation"
+        elif isinstance(self.cond_stage_key, ListConfig):
+            cond_is_not_image = 'image' not in self.cond_stage_key
+            cond_has_segmentation = 'segmentation' in self.cond_stage_key
+        else:
+            raise NotImplementedError
+        if cond_is_not_image:
+            cond_rec = self.cond_stage_model.decode(quant_c)
+            if cond_has_segmentation:
+                # get image from segmentation mask
+                num_classes = cond_rec.shape[1]
+                c = torch.argmax(c, dim=1, keepdim=True)
+                c = F.one_hot(c, num_classes=num_classes)
+                c = c.squeeze(1).permute(0, 3, 1, 2).float()
+                c = self.cond_stage_model.to_rgb(c)
+                cond_rec = torch.argmax(cond_rec, dim=1, keepdim=True)
+                cond_rec = F.one_hot(cond_rec, num_classes=num_classes)
+                cond_rec = cond_rec.squeeze(1).permute(0, 3, 1, 2).float()
+                cond_rec = self.cond_stage_model.to_rgb(cond_rec)
+            log["conditioning_rec"] = cond_rec
+            log["conditioning"] = c
+        log["samples_half"] = x_sample
+        log["samples_nopix"] = x_sample_nopix
+        log["samples_det"] = x_sample_det
+        log["att_half"] = att_half
+        log["att_nopix"] = att_nopix
+        log["att_det"] = att_det
+        return log
+    def spec_transform(self, batch):
+        wav = batch[self.first_stage_key]
+        wav_cond = batch[self.cond_first_stage_key]
+        N = wav.shape[0]
+        wav_cat = torch.cat([wav, wav_cond], dim=0)
+        self.wav_transforms.to(wav_cat.device)
+        spec = self.wav_transforms(wav_cat.to(torch.float32))
+        batch[self.first_stage_key] = 2 * spec[:N] - 1
+        batch[self.cond_first_stage_key] = 2 * spec[N:] - 1
+        return batch
+    def get_input(self, key, batch):
+        if isinstance(key, str):
+            # if batch[key] is 1D; else the batch[key] is 2D
+            if key in ['feature', 'target']:
+                if self.drop_condition or self.drop_cond_video:
+                    cond_size = batch[key].shape[1] // 2
+                    batch[key] = batch[key][:, cond_size:]
+                x = self.cond_stage_model.get_input(
+                    batch, key, drop_cond=(self.drop_condition or self.drop_cond_video)
+                )
+            else:
+                x = batch[key]
+                if len(x.shape) == 3:
+                    x = x[..., None]
+                x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
+            if x.dtype == torch.double:
+                x = x.float()
+        elif isinstance(key, ListConfig):
+            x = self.cond_stage_model.get_input(batch, key)
+            for k, v in x.items():
+                if v.dtype == torch.double:
+                    x[k] = v.float()
+        return x
+    def get_xcxp(self, batch, N=None):
+        if len(batch[self.first_stage_key].shape) == 2:
+            batch = self.spec_transform(batch)
+        x = self.get_input(self.first_stage_key, batch)
+        c = self.get_input(self.cond_stage_key, batch)
+        xp = self.get_input(self.cond_first_stage_key, batch)
+        if N is not None:
+            x = x[:N]
+            xp = xp[:N]
+            if isinstance(self.cond_stage_key, ListConfig):
+                c = {k: v[:N] for k, v in c.items()}
+            else:
+                c = c[:N]
+        # Drop additional information during training
+        if self.drop_condition:
+            xp[:] = 0
+        if self.drop_video:
+            c[:] = 0
+        return x, c, xp
+    def shared_step(self, batch, batch_idx):
+        x, c, xp = self.get_xcxp(batch)
+        logits, target = self(x, c, xp)
+        loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), target.reshape(-1))
+        return loss
+    def training_step(self, batch, batch_idx):
+        loss = self.shared_step(batch, batch_idx)
+        self.log("train/loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+        return loss
+    def validation_step(self, batch, batch_idx):
+        loss = self.shared_step(batch, batch_idx)
+        self.log("val/loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+        return loss
+    def configure_optimizers(self):
+        """
+        Following minGPT:
+        This long function is unfortunately doing something very simple and is being very defensive:
+        We are separating out all parameters of the model into two buckets: those that will experience
+        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
+        We are then returning the PyTorch optimizer object.
+        """
+        # separate out all parameters to those that will and won't experience regularizing weight decay
+        decay = set()
+        no_decay = set()
+        whitelist_weight_modules = (torch.nn.Linear, )
+        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding, torch.nn.Conv1d, torch.nn.LSTM, torch.nn.GRU)
+        for mn, m in self.transformer.named_modules():
+            for pn, p in m.named_parameters():
+                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
+                if pn.endswith('bias'):
+                    # all biases will not be decayed
+                    no_decay.add(fpn)
+                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
+                    # weights of whitelist modules will be weight decayed
+                    decay.add(fpn)
+                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
+                    # weights of blacklist modules will NOT be weight decayed
+                    no_decay.add(fpn)
+                elif ('weight' in pn or 'bias' in pn) and isinstance(m, (torch.nn.LSTM, torch.nn.GRU)):
+                    no_decay.add(fpn)
+        # special case the position embedding parameter in the root GPT module as not decayed
+        no_decay.add('pos_emb')
+        # validate that we considered every parameter
+        param_dict = {pn: p for pn, p in self.transformer.named_parameters()}
+        inter_params = decay & no_decay
+        union_params = decay | no_decay
+        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
+        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
+                                                    % (str(param_dict.keys() - union_params), )
+        # create the pytorch optimizer object
+        optim_groups = [
+            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": 0.01},
+            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
+        ]
+        optimizer = torch.optim.AdamW(optim_groups, lr=self.learning_rate, betas=(0.9, 0.95))
+        return optimizer
+if __name__ == '__main__':
+    from omegaconf import OmegaConf
+    cfg_image = OmegaConf.load('./configs/vggsound_transformer.yaml')
+    cfg_image.model.params.first_stage_config.params.ckpt_path = './logs/2021-05-19T22-16-54_vggsound_codebook/checkpoints/last.ckpt'
+    transformer_cfg = cfg_image.model.params.transformer_config
+    first_stage_cfg = cfg_image.model.params.first_stage_config
+    cond_stage_cfg = cfg_image.model.params.cond_stage_config
+    permuter_cfg = cfg_image.model.params.permuter_config
+    transformer = Net2NetTransformerAVCond(
+        transformer_cfg, first_stage_cfg, cond_stage_cfg, permuter_cfg
+    )
+    c = torch.rand(2, 2048, 212)
+    x = torch.rand(2, 1, 80, 848)
+    logits, target = transformer(x, c)
+    print(logits.shape, target.shape)

foleycrafter/models/specvqgan/models/cond_transformer.py ADDED Viewed

	@@ -0,0 +1,455 @@

+import sys
+import pytorch_lightning as pl
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from omegaconf.listconfig import ListConfig
+from torchvision import transforms
+from foleycrafter.models.specvqgan.data.transforms import Wave2Spectrogram
+import torchaudio
+sys.path.insert(0, '.')  # nopep8
+from foleycrafter.models.specvqgan.modules.transformer.mingpt import (GPTClass, GPTFeats, GPTFeatsClass)
+from train import instantiate_from_config
+def disabled_train(self, mode=True):
+    """Overwrite model.train with this function to make sure train/eval mode
+    does not change anymore.
+
+    ``mode`` is ignored and the first argument is returned unchanged.
+    """
+    return self
+class Net2NetTransformer(pl.LightningModule):
+    def __init__(self, transformer_config, first_stage_config,
+                 cond_stage_config,
+                 first_stage_permuter_config=None, cond_stage_permuter_config=None,
+                 ckpt_path=None, ignore_keys=[],
+                 first_stage_key="image",
+                 cond_stage_key="depth",
+                 downsample_cond_size=-1,
+                 pkeep=1.0,
+                 mel_num=80,
+                 spec_crop_len=160):
+        super().__init__()
+        self.init_first_stage_from_ckpt(first_stage_config)
+        self.init_cond_stage_from_ckpt(cond_stage_config)
+        if first_stage_permuter_config is None:
+            first_stage_permuter_config = {"target": "foleycrafter.models.specvqgan.modules.transformer.permuter.Identity"}
+        if cond_stage_permuter_config is None:
+            cond_stage_permuter_config = {"target": "foleycrafter.models.specvqgan.modules.transformer.permuter.Identity"}
+        self.first_stage_permuter = instantiate_from_config(config=first_stage_permuter_config)
+        self.cond_stage_permuter = instantiate_from_config(config=cond_stage_permuter_config)
+        self.transformer = instantiate_from_config(config=transformer_config)
+        self.wav_transforms = nn.Sequential(
+            torchaudio.transforms.Spectrogram(
+                n_fft=1024,
+                hop_length=1024//4,
+                power=1,
+            ),
+            torchaudio.transforms.MelScale(
+                n_mels=80,
+                sample_rate=22050,
+                f_min=125,
+                f_max=7600,
+                n_stft=513,
+                norm='slaney'
+            ),
+            Wave2Spectrogram(mel_num, spec_crop_len),
+        )
+        ignore_keys = ['wav_transforms']
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+        self.first_stage_key = first_stage_key
+        self.cond_stage_key = cond_stage_key
+        self.downsample_cond_size = downsample_cond_size
+        self.pkeep = pkeep
+        print('>>> model init done.')
+    def init_from_ckpt(self, path, ignore_keys=list()):
+        sd = torch.load(path, map_location="cpu")["state_dict"]
+        for k in sd.keys():
+            for ik in ignore_keys:
+                if k.startswith(ik):
+                    self.print("Deleting key {} from state_dict.".format(k))
+                    del sd[k]
+        self.load_state_dict(sd, strict=False)
+        print(f"Restored from {path}")
+    def init_first_stage_from_ckpt(self, config):
+        model = instantiate_from_config(config)
+        model = model.eval()
+        model.train = disabled_train
+        self.first_stage_model = model
+    def init_cond_stage_from_ckpt(self, config):
+        model = instantiate_from_config(config)
+        model = model.eval()
+        model.train = disabled_train
+        self.cond_stage_model = model
+    def forward(self, x, c):
+        # one step to produce the logits
+        _, z_indices = self.encode_to_z(x)
+        _, c_indices = self.encode_to_c(c)
+        if self.training and self.pkeep < 1.0:
+            mask = torch.bernoulli(self.pkeep * torch.ones(z_indices.shape, device=z_indices.device))
+            mask = mask.round().to(dtype=torch.int64)
+            r_indices = torch.randint_like(z_indices, self.transformer.config.vocab_size)
+            a_indices = mask*z_indices+(1-mask)*r_indices
+        else:
+            a_indices = z_indices
+        # target includes all sequence elements (no need to handle first one
+        # differently because we are conditioning)
+        target = z_indices
+        # in the case we do not want to encode condition anyhow (e.g. inputs are features)
+        if isinstance(self.transformer, (GPTFeats, GPTClass, GPTFeatsClass)):
+            # make the prediction
+            logits, _, _ = self.transformer(z_indices[:, :-1], c)
+            # cut off conditioning outputs - output i corresponds to p(z_i | z_{<i}, c)
+            if isinstance(self.transformer, GPTFeatsClass):
+                cond_size = c['feature'].size(-1) + c['target'].size(-1)
+            else:
+                cond_size = c.size(-1)
+            logits = logits[:, cond_size-1:]
+        else:
+            cz_indices = torch.cat((c_indices, a_indices), dim=1)
+            # make the prediction
+            logits, _, _ = self.transformer(cz_indices[:, :-1])
+            # cut off conditioning outputs - output i corresponds to p(z_i | z_{<i}, c)
+            logits = logits[:, c_indices.shape[1]-1:]
+        return logits, target
+    def top_k_logits(self, logits, k):
+        v, ix = torch.topk(logits, k)
+        out = logits.clone()
+        out[out < v[..., [-1]]] = -float('Inf')
+        return out
+    @torch.no_grad()
+    def sample(self, x, c, steps, temperature=1.0, sample=False, top_k=None,
+               callback=lambda k: None):
+        x = x if isinstance(self.transformer, (GPTFeats, GPTClass, GPTFeatsClass)) else torch.cat((c, x), dim=1)
+        block_size = self.transformer.get_block_size()
+        assert not self.transformer.training
+        if self.pkeep <= 0.0:
+            raise NotImplementedError('Implement for GPTFeatsCLass')
+            raise NotImplementedError('Implement for GPTFeats')
+            raise NotImplementedError('Implement for GPTClass')
+            raise NotImplementedError('also the model outputs attention')
+            # one pass suffices since input is pure noise anyway
+            assert len(x.shape)==2
+            # noise_shape = (x.shape[0], steps-1)
+            # noise = torch.randint(self.transformer.config.vocab_size, noise_shape).to(x)
+            noise = c.clone()[:,x.shape[1]-c.shape[1]:-1]
+            x = torch.cat((x,noise),dim=1)
+            logits, _ = self.transformer(x)
+            # take all logits for now and scale by temp
+            logits = logits / temperature
+            # optionally crop probabilities to only the top k options
+            if top_k is not None:
+                logits = self.top_k_logits(logits, top_k)
+            # apply softmax to convert to probabilities
+            probs = F.softmax(logits, dim=-1)
+            # sample from the distribution or take the most likely
+            if sample:
+                shape = probs.shape
+                probs = probs.reshape(shape[0]*shape[1],shape[2])
+                ix = torch.multinomial(probs, num_samples=1)
+                probs = probs.reshape(shape[0],shape[1],shape[2])
+                ix = ix.reshape(shape[0],shape[1])
+            else:
+                _, ix = torch.topk(probs, k=1, dim=-1)
+            # cut off conditioning
+            x = ix[:, c.shape[1]-1:]
+        else:
+            for k in range(steps):
+                callback(k)
+                if isinstance(self.transformer, (GPTFeats, GPTClass, GPTFeatsClass)):
+                    # if assert is removed, you need to make sure that the combined len is not longer block_s
+                    if isinstance(self.transformer, GPTFeatsClass):
+                        cond_size = c['feature'].size(-1) + c['target'].size(-1)
+                    else:
+                        cond_size = c.size(-1)
+                    assert x.size(1) + cond_size <= block_size
+                    x_cond = x
+                    c_cond = c
+                    logits, _, att = self.transformer(x_cond, c_cond)
+                else:
+                    assert x.size(1) <= block_size  # make sure model can see conditioning
+                    x_cond = x if x.size(1) <= block_size else x[:, -block_size:]  # crop context if needed
+                    logits, _, att = self.transformer(x_cond)
+                # pluck the logits at the final step and scale by temperature
+                logits = logits[:, -1, :] / temperature
+                # optionally crop probabilities to only the top k options
+                if top_k is not None:
+                    logits = self.top_k_logits(logits, top_k)
+                # apply softmax to convert to probabilities
+                probs = F.softmax(logits, dim=-1)
+                # sample from the distribution or take the most likely
+                if sample:
+                    ix = torch.multinomial(probs, num_samples=1)
+                else:
+                    _, ix = torch.topk(probs, k=1, dim=-1)
+                # append to the sequence and continue
+                x = torch.cat((x, ix), dim=1)
+            # cut off conditioning
+            x = x if isinstance(self.transformer, (GPTFeats, GPTClass, GPTFeatsClass)) else x[:, c.shape[1]:]
+        return x, att.detach().cpu()
+    @torch.no_grad()
+    def encode_to_z(self, x):
+        quant_z, _, info = self.first_stage_model.encode(x)
+        indices = info[2].view(quant_z.shape[0], -1)
+        indices = self.first_stage_permuter(indices)
+        return quant_z, indices
+    @torch.no_grad()
+    def encode_to_c(self, c):
+        if self.downsample_cond_size > -1:
+            c = F.interpolate(c, size=(self.downsample_cond_size, self.downsample_cond_size))
+        quant_c, _, info = self.cond_stage_model.encode(c)
+        if isinstance(self.transformer, (GPTFeats, GPTClass, GPTFeatsClass)):
+            # these are not indices but raw features or a class
+            indices = info[2]
+        else:
+            indices = info[2].view(quant_c.shape[0], -1)
+            indices = self.cond_stage_permuter(indices)
+        return quant_c, indices
+    @torch.no_grad()
+    def decode_to_img(self, index, zshape, stage='first'):
+        if stage == 'first':
+            index = self.first_stage_permuter(index, reverse=True)
+        elif stage == 'cond':
+            print('in cond stage in decode_to_img which is unexpected ')
+            index = self.cond_stage_permuter(index, reverse=True)
+        else:
+            raise NotImplementedError
+        bhwc = (zshape[0], zshape[2], zshape[3], zshape[1])
+        quant_z = self.first_stage_model.quantize.get_codebook_entry(index.reshape(-1), shape=bhwc)
+        x = self.first_stage_model.decode(quant_z)
+        return x
+    @torch.no_grad()
+    def log_images(self, batch, temperature=None, top_k=None, callback=None, lr_interface=False, **kwargs):
+        log = dict()
+        N = 4
+        if lr_interface:
+            x, c = self.get_xc(batch, N, diffuse=False, upsample_factor=8)
+        else:
+            x, c = self.get_xc(batch, N)
+        x = x.to(device=self.device)
+        # c = c.to(device=self.device)
+        if isinstance(c, dict):
+            c = {k: v.to(self.device) for k, v in c.items()}
+        else:
+            c = c.to(self.device)
+        quant_z, z_indices = self.encode_to_z(x)
+        quant_c, c_indices = self.encode_to_c(c)  # output can be features or a single class or a featcls dict
+        # create a "half"" sample
+        z_start_indices = z_indices[:, :z_indices.shape[1]//2]
+        index_sample, att_half = self.sample(z_start_indices, c_indices,
+                                   steps=z_indices.shape[1]-z_start_indices.shape[1],
+                                   temperature=temperature if temperature is not None else 1.0,
+                                   sample=True,
+                                   top_k=top_k if top_k is not None else 100,
+                                   callback=callback if callback is not None else lambda k: None)
+        x_sample = self.decode_to_img(index_sample, quant_z.shape)
+        # sample
+        z_start_indices = z_indices[:, :0]
+        index_sample, att_nopix = self.sample(z_start_indices, c_indices,
+                                              steps=z_indices.shape[1],
+                                              temperature=temperature if temperature is not None else 1.0,
+                                              sample=True,
+                                              top_k=top_k if top_k is not None else 100,
+                                              callback=callback if callback is not None else lambda k: None)
+        x_sample_nopix = self.decode_to_img(index_sample, quant_z.shape)
+        # det sample
+        z_start_indices = z_indices[:, :0]
+        index_sample, att_det = self.sample(z_start_indices, c_indices,
+                                            steps=z_indices.shape[1],
+                                            sample=False,
+                                            callback=callback if callback is not None else lambda k: None)
+        x_sample_det = self.decode_to_img(index_sample, quant_z.shape)
+        # reconstruction
+        x_rec = self.decode_to_img(z_indices, quant_z.shape)
+        log["inputs"] = x
+        log["reconstructions"] = x_rec
+        if isinstance(self.cond_stage_key, str):
+            cond_is_not_image = self.cond_stage_key != "image"
+            cond_has_segmentation = self.cond_stage_key == "segmentation"
+        elif isinstance(self.cond_stage_key, ListConfig):
+            cond_is_not_image = 'image' not in self.cond_stage_key
+            cond_has_segmentation = 'segmentation' in self.cond_stage_key
+        else:
+            raise NotImplementedError
+        if cond_is_not_image:
+            cond_rec = self.cond_stage_model.decode(quant_c)
+            if cond_has_segmentation:
+                # get image from segmentation mask
+                num_classes = cond_rec.shape[1]
+                c = torch.argmax(c, dim=1, keepdim=True)
+                c = F.one_hot(c, num_classes=num_classes)
+                c = c.squeeze(1).permute(0, 3, 1, 2).float()
+                c = self.cond_stage_model.to_rgb(c)
+                cond_rec = torch.argmax(cond_rec, dim=1, keepdim=True)
+                cond_rec = F.one_hot(cond_rec, num_classes=num_classes)
+                cond_rec = cond_rec.squeeze(1).permute(0, 3, 1, 2).float()
+                cond_rec = self.cond_stage_model.to_rgb(cond_rec)
+            log["conditioning_rec"] = cond_rec
+            log["conditioning"] = c
+        log["samples_half"] = x_sample
+        log["samples_nopix"] = x_sample_nopix
+        log["samples_det"] = x_sample_det
+        log["att_half"] = att_half
+        log["att_nopix"] = att_nopix
+        log["att_det"] = att_det
+        return log
+    def spec_transform(self, batch):
+        wav = batch[self.first_stage_key]
+        N = wav.shape[0]
+        self.wav_transforms.to(wav.device)
+        spec = self.wav_transforms(wav.to(torch.float32))
+        batch[self.first_stage_key] = 2 * spec[:N] - 1
+        return batch
+    def get_input(self, key, batch):
+        if isinstance(key, str):
+            # if batch[key] is 1D; else the batch[key] is 2D
+            if key in ['feature', 'target']:
+                x = self.cond_stage_model.get_input(batch, key)
+            else:
+                x = batch[key]
+                if len(x.shape) == 3:
+                    x = x[..., None]
+                x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
+            if x.dtype == torch.double:
+                x = x.float()
+        elif isinstance(key, ListConfig):
+            x = self.cond_stage_model.get_input(batch, key)
+            for k, v in x.items():
+                if v.dtype == torch.double:
+                    x[k] = v.float()
+        return x
+    def get_xc(self, batch, N=None):
+        if len(batch[self.first_stage_key].shape) == 2:
+            batch = self.spec_transform(batch)
+        x = self.get_input(self.first_stage_key, batch)
+        c = self.get_input(self.cond_stage_key, batch)
+        if N is not None:
+            x = x[:N]
+            if isinstance(self.cond_stage_key, ListConfig):
+                c = {k: v[:N] for k, v in c.items()}
+            else:
+                c = c[:N]
+        return x, c
+    def shared_step(self, batch, batch_idx):
+        x, c = self.get_xc(batch)
+        logits, target = self(x, c)
+        loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), target.reshape(-1))
+        return loss
+    def training_step(self, batch, batch_idx):
+        loss = self.shared_step(batch, batch_idx)
+        self.log("train/loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+        return loss
+    def validation_step(self, batch, batch_idx):
+        loss = self.shared_step(batch, batch_idx)
+        self.log("val/loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+        return loss
+    def configure_optimizers(self):
+        """
+        Following minGPT:
+        This long function is unfortunately doing something very simple and is being very defensive:
+        We are separating out all parameters of the model into two buckets: those that will experience
+        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
+        We are then returning the PyTorch optimizer object.
+        """
+        # separate out all parameters to those that will and won't experience regularizing weight decay
+        decay = set()
+        no_decay = set()
+        whitelist_weight_modules = (torch.nn.Linear, )
+        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding, torch.nn.Conv1d, torch.nn.LSTM, torch.nn.GRU)
+        for mn, m in self.transformer.named_modules():
+            for pn, p in m.named_parameters():
+                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
+                if pn.endswith('bias'):
+                    # all biases will not be decayed
+                    no_decay.add(fpn)
+                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
+                    # weights of whitelist modules will be weight decayed
+                    decay.add(fpn)
+                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
+                    # weights of blacklist modules will NOT be weight decayed
+                    no_decay.add(fpn)
+                elif ('weight' in pn or 'bias' in pn) and isinstance(m, (torch.nn.LSTM, torch.nn.GRU)):
+                    no_decay.add(fpn)
+        # special case the position embedding parameter in the root GPT module as not decayed
+        no_decay.add('pos_emb')
+        # validate that we considered every parameter
+        param_dict = {pn: p for pn, p in self.transformer.named_parameters()}
+        inter_params = decay & no_decay
+        union_params = decay | no_decay
+        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
+        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
+                                                    % (str(param_dict.keys() - union_params), )
+        # create the pytorch optimizer object
+        optim_groups = [
+            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": 0.01},
+            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
+        ]
+        optimizer = torch.optim.AdamW(optim_groups, lr=self.learning_rate, betas=(0.9, 0.95))
+        return optimizer
+if __name__ == '__main__':
+    from omegaconf import OmegaConf
+    cfg_image = OmegaConf.load('./configs/vggsound_transformer.yaml')
+    cfg_image.model.params.first_stage_config.params.ckpt_path = './logs/2021-05-19T22-16-54_vggsound_codebook/checkpoints/last.ckpt'
+    transformer_cfg = cfg_image.model.params.transformer_config
+    first_stage_cfg = cfg_image.model.params.first_stage_config
+    cond_stage_cfg = cfg_image.model.params.cond_stage_config
+    permuter_cfg = cfg_image.model.params.permuter_config
+    transformer = Net2NetTransformer(
+        transformer_cfg, first_stage_cfg, cond_stage_cfg, permuter_cfg
+    )
+    c = torch.rand(2, 2048, 212)
+    x = torch.rand(2, 1, 80, 160)
+    logits, target = transformer(x, c)
+    print(logits.shape, target.shape)

foleycrafter/models/specvqgan/models/vqgan.py ADDED Viewed

	@@ -0,0 +1,397 @@

+import torch
+import torch.nn as nn
+import torchaudio
+from torchvision import transforms
+import torch.nn.functional as F
+import pytorch_lightning as pl
+import sys
+import math
+sys.path.insert(0, '.')  # nopep8
+from train import instantiate_from_config
+from foleycrafter.models.specvqgan.data.transforms import Wave2Spectrogram, NormalizeAudio
+from foleycrafter.models.specvqgan.modules.diffusionmodules.model import Encoder, Decoder, Encoder1d, Decoder1d
+from foleycrafter.models.specvqgan.modules.vqvae.quantize import VectorQuantizer, VectorQuantizer1d
+class VQModel(pl.LightningModule):
+    def __init__(self,
+                 ddconfig,
+                 lossconfig,
+                 n_embed,
+                 embed_dim,
+                 ckpt_path=None,
+                 ignore_keys=[],
+                 image_key="image",
+                 colorize_nlabels=None,
+                 monitor=None,
+                 L=10.,
+                 mel_num=80,
+                 spec_crop_len=160,
+                 normalize=False,
+                 freeze_encoder=False,
+                 ):
+        super().__init__()
+        self.image_key = image_key
+        # we need this one for compatibility in train.ImageLogger.log_img if statement
+        self.first_stage_key = image_key
+        self.encoder = Encoder(**ddconfig)
+        self.decoder = Decoder(**ddconfig)
+        self.loss = instantiate_from_config(lossconfig)
+        self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25)
+        self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        aug_list = [
+            torchaudio.transforms.Spectrogram(
+                n_fft=1024,
+                hop_length=1024//4,
+                power=1,
+            ),
+            torchaudio.transforms.MelScale(
+                n_mels=80,
+                sample_rate=22050,
+                f_min=125,
+                f_max=7600,
+                n_stft=513,
+                norm='slaney'
+            ),
+            Wave2Spectrogram(mel_num, spec_crop_len),
+        ]
+        if normalize:
+            aug_list = [transforms.RandomApply([NormalizeAudio()], p=1. if normalize else 0.)] + aug_list
+        if not freeze_encoder:
+            self.wav_transforms = nn.Sequential(*aug_list)
+        ignore_keys += ['first_stage_model.wav_transforms', 'wav_transforms']
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+        if colorize_nlabels is not None:
+            assert type(colorize_nlabels)==int
+            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+        if monitor is not None:
+            self.monitor = monitor
+        self.used_codes = []
+        self.counts = [0 for _ in range(self.quantize.n_e)]
+        if freeze_encoder:
+            for param in self.encoder.parameters():
+                param.requires_grad = False
+            for param in self.quantize.parameters():
+                param.requires_grad = False
+            for param in self.quant_conv.parameters():
+                param.requires_grad = False
+    def init_from_ckpt(self, path, ignore_keys=list()):
+        sd = torch.load(path, map_location="cpu")["state_dict"]
+        keys = list(sd.keys())
+        for k in keys:
+            for ik in ignore_keys:
+                if k.startswith(ik):
+                    print("Deleting key {} from state_dict.".format(k))
+                    del sd[k]
+        self.load_state_dict(sd, strict=False)
+        print(f"Restored from {path}")
+    def encode(self, x):
+        h = self.encoder(x)  # 2d: (B, 256, 16, 16) <- (B, 3, 256, 256)
+        h = self.quant_conv(h)  # 2d: (B, 256, 16, 16)
+        quant, emb_loss, info = self.quantize(h)  # (B, 256, 16, 16), (), ((), (768, 1024), (768, 1))
+        if not self.training:
+            self.counts = [info[2].squeeze().tolist().count(i) + self.counts[i] for i in range(self.quantize.n_e)]
+        return quant, emb_loss, info
+    def decode(self, quant):
+        quant = self.post_quant_conv(quant)
+        dec = self.decoder(quant)
+        return dec
+    def decode_code(self, code_b):
+        quant_b = self.quantize.embed_code(code_b)
+        dec = self.decode(quant_b)
+        return dec
+    def forward(self, input):
+        quant, diff, _ = self.encode(input)
+        dec = self.decode(quant)
+        return dec, diff
+    def get_input(self, batch, k):
+        x = batch[k]
+        if len(x.shape) == 2:
+            x = self.spec_trans(x)
+        if len(x.shape) == 3:
+            x = x[..., None]
+        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
+        return x.float()
+    def spec_trans(self, wav):
+        self.wav_transforms.to(wav.device)
+        spec = self.wav_transforms(wav.to(torch.float32))
+        return 2 * spec - 1
+    def training_step(self, batch, batch_idx, optimizer_idx):
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss = self(x)
+        if optimizer_idx == 0:
+            # autoencode
+            aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
+                                            last_layer=self.get_last_layer(), split="train")
+            self.log("train/aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+            return aeloss
+        if optimizer_idx == 1:
+            # discriminator
+            discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
+                                            last_layer=self.get_last_layer(), split="train")
+            self.log("train/disc_loss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+            return discloss
+    def validation_step(self, batch, batch_idx):
+        if batch_idx == 0 and self.global_step != 0 and sum(self.counts) > 0:
+            zero_hit_codes = len([1 for count in self.counts if count == 0])
+            used_codes = []
+            for c, count in enumerate(self.counts):
+                used_codes.extend([c] * count)
+            self.logger.experiment.add_histogram('val/code_hits', torch.tensor(used_codes), self.global_step)
+            self.logger.experiment.add_scalar('val/zero_hit_codes', zero_hit_codes, self.global_step)
+            self.counts = [0 for _ in range(self.quantize.n_e)]
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss = self(x)
+        aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0, self.global_step,
+                                        last_layer=self.get_last_layer(), split="val")
+        discloss, log_dict_disc = self.loss(qloss, x, xrec, 1, self.global_step,
+                                            last_layer=self.get_last_layer(), split="val")
+        rec_loss = log_dict_ae['val/rec_loss']
+        self.log('val/rec_loss', rec_loss, prog_bar=True, logger=True, on_step=True, on_epoch=True, sync_dist=True)
+        self.log('val/aeloss', aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True, sync_dist=True)
+        self.log_dict(log_dict_ae)
+        self.log_dict(log_dict_disc)
+        return self.log_dict
+    def configure_optimizers(self):
+        lr = self.learning_rate
+        opt_ae = torch.optim.Adam(list(self.encoder.parameters()) +
+                                  list(self.decoder.parameters()) +
+                                  list(self.quantize.parameters()) +
+                                  list(self.quant_conv.parameters()) +
+                                  list(self.post_quant_conv.parameters()),
+                                  lr=lr, betas=(0.5, 0.9))
+        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+                                    lr=lr, betas=(0.5, 0.9))
+        return [opt_ae, opt_disc], []
+    def get_last_layer(self):
+        return self.decoder.conv_out.weight
+    def log_images(self, batch, **kwargs):
+        log = dict()
+        x = self.get_input(batch, self.image_key)
+        x = x.to(self.device)
+        xrec, _ = self(x)
+        if x.shape[1] > 3:
+            # colorize with random projection
+            assert xrec.shape[1] > 3
+            x = self.to_rgb(x)
+            xrec = self.to_rgb(xrec)
+        log["inputs"] = x
+        log["reconstructions"] = xrec
+        return log
+    def to_rgb(self, x):
+        assert self.image_key == "segmentation"
+        if not hasattr(self, "colorize"):
+            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+        x = F.conv2d(x, weight=self.colorize)
+        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+        return x
+class VQModel1d(VQModel):
+    def __init__(self, ddconfig, lossconfig, n_embed, embed_dim, ckpt_path=None, ignore_keys=[],
+                 image_key='feature', colorize_nlabels=None, monitor=None):
+        # ckpt_path is none to super because otherwise will try to load 1D checkpoint into 2D model
+        super().__init__(ddconfig, lossconfig, n_embed, embed_dim)
+        self.image_key = image_key
+        # we need this one for compatibility in train.ImageLogger.log_img if statement
+        self.first_stage_key = image_key
+        self.encoder = Encoder1d(**ddconfig)
+        self.decoder = Decoder1d(**ddconfig)
+        self.loss = instantiate_from_config(lossconfig)
+        self.quantize = VectorQuantizer1d(n_embed, embed_dim, beta=0.25)
+        self.quant_conv = torch.nn.Conv1d(ddconfig['z_channels'], embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv1d(embed_dim, ddconfig['z_channels'], 1)
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+        if colorize_nlabels is not None:
+            assert type(colorize_nlabels)==int
+            self.register_buffer('colorize', torch.randn(3, colorize_nlabels, 1, 1))
+        if monitor is not None:
+            self.monitor = monitor
+    def get_input(self, batch, k):
+        x = batch[k]
+        if self.image_key == 'feature':
+            x = x.permute(0, 2, 1)
+        elif self.image_key == 'image':
+            x = x.unsqueeze(1)
+        x = x.to(memory_format=torch.contiguous_format)
+        return x.float()
+    def forward(self, input):
+        if self.image_key == 'image':
+            input = input.squeeze(1)
+        quant, diff, _ = self.encode(input)
+        dec = self.decode(quant)
+        if self.image_key == 'image':
+            dec = dec.unsqueeze(1)
+        return dec, diff
+    def log_images(self, batch, **kwargs):
+        if self.image_key == 'image':
+            log = dict()
+            x = self.get_input(batch, self.image_key)
+            x = x.to(self.device)
+            xrec, _ = self(x)
+            if x.shape[1] > 3:
+                # colorize with random projection
+                assert xrec.shape[1] > 3
+                x = self.to_rgb(x)
+                xrec = self.to_rgb(xrec)
+            log['inputs'] = x
+            log['reconstructions'] = xrec
+            return log
+        else:
+            raise NotImplementedError('1d input should be treated differently')
+    def to_rgb(self, batch, **kwargs):
+        raise NotImplementedError('1d input should be treated differently')
+class VQSegmentationModel(VQModel):
+    def __init__(self, n_labels, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.register_buffer("colorize", torch.randn(3, n_labels, 1, 1))
+    def configure_optimizers(self):
+        lr = self.learning_rate
+        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
+                                  list(self.decoder.parameters())+
+                                  list(self.quantize.parameters())+
+                                  list(self.quant_conv.parameters())+
+                                  list(self.post_quant_conv.parameters()),
+                                  lr=lr, betas=(0.5, 0.9))
+        return opt_ae
+    def training_step(self, batch, batch_idx):
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss = self(x)
+        aeloss, log_dict_ae = self.loss(qloss, x, xrec, split="train")
+        self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+        return aeloss
+    def validation_step(self, batch, batch_idx):
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss = self(x)
+        aeloss, log_dict_ae = self.loss(qloss, x, xrec, split="val")
+        self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+        total_loss = log_dict_ae["val/total_loss"]
+        self.log("val/total_loss", total_loss,
+                 prog_bar=True, logger=True, on_step=True, on_epoch=True, sync_dist=True)
+        return aeloss
+    @torch.no_grad()
+    def log_images(self, batch, **kwargs):
+        log = dict()
+        x = self.get_input(batch, self.image_key)
+        x = x.to(self.device)
+        xrec, _ = self(x)
+        if x.shape[1] > 3:
+            # colorize with random projection
+            assert xrec.shape[1] > 3
+            # convert logits to indices
+            xrec = torch.argmax(xrec, dim=1, keepdim=True)
+            xrec = F.one_hot(xrec, num_classes=x.shape[1])
+            xrec = xrec.squeeze(1).permute(0, 3, 1, 2).float()
+            x = self.to_rgb(x)
+            xrec = self.to_rgb(xrec)
+        log["inputs"] = x
+        log["reconstructions"] = xrec
+        return log
+class VQNoDiscModel(VQModel):
+    def __init__(self,
+                 ddconfig,
+                 lossconfig,
+                 n_embed,
+                 embed_dim,
+                 ckpt_path=None,
+                 ignore_keys=[],
+                 image_key="image",
+                 colorize_nlabels=None
+                 ):
+        super().__init__(ddconfig=ddconfig, lossconfig=lossconfig, n_embed=n_embed, embed_dim=embed_dim,
+                         ckpt_path=ckpt_path, ignore_keys=ignore_keys, image_key=image_key,
+                         colorize_nlabels=colorize_nlabels)
+    def training_step(self, batch, batch_idx):
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss = self(x)
+        # autoencode
+        aeloss, log_dict_ae = self.loss(qloss, x, xrec, self.global_step, split="train")
+        output = pl.TrainResult(minimize=aeloss)
+        output.log("train/aeloss", aeloss,
+                   prog_bar=True, logger=True, on_step=True, on_epoch=True)
+        output.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+        return output
+    def validation_step(self, batch, batch_idx):
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss = self(x)
+        aeloss, log_dict_ae = self.loss(qloss, x, xrec, self.global_step, split="val")
+        rec_loss = log_dict_ae["val/rec_loss"]
+        output = pl.EvalResult(checkpoint_on=rec_loss)
+        output.log("val/rec_loss", rec_loss,
+                   prog_bar=True, logger=True, on_step=True, on_epoch=True)
+        output.log("val/aeloss", aeloss,
+                   prog_bar=True, logger=True, on_step=True, on_epoch=True)
+        output.log_dict(log_dict_ae)
+        return output
+    def configure_optimizers(self):
+        optimizer = torch.optim.Adam(list(self.encoder.parameters()) +
+                                     list(self.decoder.parameters()) +
+                                     list(self.quantize.parameters()) +
+                                     list(self.quant_conv.parameters()) +
+                                     list(self.post_quant_conv.parameters()),
+                                     lr=self.learning_rate, betas=(0.5, 0.9))
+        return optimizer
+if __name__ == '__main__':
+    from omegaconf import OmegaConf
+    from train import instantiate_from_config
+    image_key = 'image'
+    cfg_audio = OmegaConf.load('./configs/vggsound_codebook.yaml')
+    model = VQModel(cfg_audio.model.params.ddconfig,
+                    cfg_audio.model.params.lossconfig,
+                    cfg_audio.model.params.n_embed,
+                    cfg_audio.model.params.embed_dim,
+                    image_key='image')
+    batch = {
+        'image': torch.rand((4, 80, 848)),
+        'file_path_': ['data/vggsound/mel123.npy', 'data/vggsound/mel123.npy', 'data/vggsound/mel123.npy'],
+        'class': [1, 1, 1],
+    }
+    xrec, qloss = model(model.get_input(batch, image_key))
+    print(xrec.shape, qloss.shape)

foleycrafter/models/specvqgan/modules/diffusionmodules/model.py ADDED Viewed

	@@ -0,0 +1,999 @@

+# pytorch_diffusion + derived encoder decoder
+import math
+import torch
+import torch.nn as nn
+import numpy as np
+def get_timestep_embedding(timesteps, embedding_dim):
+    """
+    This matches the implementation in Denoising Diffusion Probabilistic Models:
+    From Fairseq.
+    Build sinusoidal embeddings.
+    This matches the implementation in tensor2tensor, but differs slightly
+    from the description in Section 3.5 of "Attention Is All You Need".
+    """
+    assert len(timesteps.shape) == 1
+    half_dim = embedding_dim // 2
+    emb = math.log(10000) / (half_dim - 1)
+    emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
+    emb = emb.to(device=timesteps.device)
+    emb = timesteps.float()[:, None] * emb[None, :]
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+    if embedding_dim % 2 == 1:  # zero pad
+        emb = torch.nn.functional.pad(emb, (0,1,0,0))
+    return emb
+def nonlinearity(x):
+    # swish
+    return x*torch.sigmoid(x)
+def Normalize(in_channels):
+    return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+class Upsample(nn.Module):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            self.conv = torch.nn.Conv2d(in_channels,
+                                        in_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, x):
+        x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        if self.with_conv:
+            x = self.conv(x)
+        return x
+class Upsample1d(Upsample):
+    def __init__(self, in_channels, with_conv):
+        super().__init__(in_channels, with_conv)
+        if self.with_conv:
+            self.conv = torch.nn.Conv1d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+class Downsample(nn.Module):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            # no asymmetric padding in torch conv, must do it ourselves
+            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+            self.pad = (0, 1, 0, 1)
+        else:
+            self.avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
+    def forward(self, x):
+        if self.with_conv:  # bp: check self.avgpool and self.pad
+            x = torch.nn.functional.pad(x, self.pad, mode="constant", value=0)
+            x = self.conv(x)
+        else:
+            x = self.avg_pool(x)
+        return x
+class Downsample1d(Downsample):
+    def __init__(self, in_channels, with_conv):
+        super().__init__(in_channels, with_conv)
+        if self.with_conv:
+            # no asymmetric padding in torch conv, must do it ourselves
+            # TODO: can we replace it just with conv2d with padding 1?
+            self.conv = torch.nn.Conv1d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+            self.pad = (1, 1)
+        else:
+            self.avg_pool = nn.AvgPool1d(kernel_size=2, stride=2)
+class ResnetBlock(nn.Module):
+    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
+                 dropout, temb_channels=512):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.norm1 = Normalize(in_channels)
+        self.conv1 = torch.nn.Conv2d(in_channels,
+                                     out_channels,
+                                     kernel_size=3,
+                                     stride=1,
+                                     padding=1)
+        if temb_channels > 0:
+            self.temb_proj = torch.nn.Linear(temb_channels,
+                                             out_channels)
+        self.norm2 = Normalize(out_channels)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = torch.nn.Conv2d(out_channels,
+                                     out_channels,
+                                     kernel_size=3,
+                                     stride=1,
+                                     padding=1)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = torch.nn.Conv2d(in_channels,
+                                                     out_channels,
+                                                     kernel_size=3,
+                                                     stride=1,
+                                                     padding=1)
+            else:
+                self.nin_shortcut = torch.nn.Conv2d(in_channels,
+                                                    out_channels,
+                                                    kernel_size=1,
+                                                    stride=1,
+                                                    padding=0)
+    def forward(self, x, temb):
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+        if temb is not None:
+            h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                x = self.conv_shortcut(x)
+            else:
+                x = self.nin_shortcut(x)
+        return x+h
+class ResnetBlock1d(ResnetBlock):
+    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
+                 dropout, temb_channels=512):
+        super().__init__(in_channels=in_channels, out_channels=out_channels,
+                         conv_shortcut=conv_shortcut, dropout=dropout, temb_channels=temb_channels)
+        # redefining different elements (forward is goint to be the same as in RenetBlock)
+        if temb_channels > 0:
+            raise NotImplementedError('go to ResnetBlock and figure out how to deal with it in forward')
+            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
+        self.conv1 = torch.nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.conv2 = torch.nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = torch.nn.Conv1d(in_channels, out_channels, kernel_size=3,
+                                                     stride=1, padding=1)
+            else:
+                self.nin_shortcut = torch.nn.Conv1d(in_channels, out_channels, kernel_size=1,
+                                                    stride=1, padding=0)
+class AttnBlock(nn.Module):
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = Normalize(in_channels)
+        self.q = torch.nn.Conv2d(in_channels,
+                                 in_channels,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.k = torch.nn.Conv2d(in_channels,
+                                 in_channels,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.v = torch.nn.Conv2d(in_channels,
+                                 in_channels,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.proj_out = torch.nn.Conv2d(in_channels,
+                                        in_channels,
+                                        kernel_size=1,
+                                        stride=1,
+                                        padding=0)
+    def forward(self, x):
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        # compute attention
+        b,c,h,w = q.shape
+        q = q.reshape(b,c,h*w)
+        q = q.permute(0,2,1)   # b,hw,c
+        k = k.reshape(b,c,h*w) # b,c,hw
+        w_ = torch.bmm(q,k)     # b,hw,hw    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+        w_ = w_ * (int(c)**(-0.5))
+        w_ = torch.nn.functional.softmax(w_, dim=2)
+        # attend to values
+        v = v.reshape(b,c,h*w)
+        w_ = w_.permute(0,2,1)   # b,hw,hw (first hw of k, second of q)
+        h_ = torch.bmm(v,w_)     # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+        h_ = h_.reshape(b,c,h,w)
+        h_ = self.proj_out(h_)
+        return x+h_
+class AttnBlock1d(nn.Module):
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = Normalize(in_channels)
+        self.q = torch.nn.Conv1d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = torch.nn.Conv1d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = torch.nn.Conv1d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = torch.nn.Conv1d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, x):
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        # compute attention
+        b, c, t = q.shape
+        q = q.permute(0, 2, 1)   # b,t,c
+        w_ = torch.bmm(q, k)     # b,t,t    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+        w_ = w_ * (int(c) ** (-0.5))
+        w_ = torch.nn.functional.softmax(w_, dim=2)
+        # attend to values
+        w_ = w_.permute(0, 2, 1)  # b,t,t (first t of k, second of q)
+        h_ = torch.bmm(v, w_)  # b,c,t (t of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+        h_ = self.proj_out(h_)
+        return x + h_
+class Model(nn.Module):
+    """Down/middle/up convolutional network with skip connections and optional timestep conditioning.
+
+    The down path stores every intermediate activation; the up path pops them
+    in reverse order and concatenates each one with the current activation on
+    the channel axis before the next ResnetBlock.
+
+    Args:
+        ch: base channel count; level ``i`` uses ``ch * ch_mult[i]`` channels.
+        out_ch: number of channels produced by the final convolution.
+        ch_mult: per-level channel multipliers; its length is the number of levels.
+        num_res_blocks: ResnetBlocks per level on the down path (the up path
+            uses one more per level).
+        attn_resolutions: an AttnBlock follows each ResnetBlock whenever the
+            tracked resolution ``curr_res`` is contained in this collection.
+        dropout: dropout rate passed to every ResnetBlock.
+        resamp_with_conv: passed as ``with_conv`` to Downsample and Upsample.
+        in_channels: number of channels expected by the first convolution.
+        resolution: initial value of ``curr_res``; halved after each
+            downsample and doubled after each upsample.
+        use_timestep: build the timestep-embedding MLP and require ``t`` in forward.
+    """
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, use_timestep=True):
+        super().__init__()
+        self.ch = ch
+        self.temb_ch = self.ch*4
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.use_timestep = use_timestep
+        if self.use_timestep:
+            # timestep embedding
+            self.temb = nn.Module()
+            self.temb.dense = nn.ModuleList([
+                torch.nn.Linear(self.ch,
+                                self.temb_ch),
+                torch.nn.Linear(self.temb_ch,
+                                self.temb_ch),
+            ])
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(in_channels,
+                                       self.ch,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+        curr_res = resolution
+        # Input multiplier of level i is the output multiplier of level i-1 (1 for the first level).
+        in_ch_mult = (1,)+tuple(ch_mult)
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch*in_ch_mult[i_level]
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            # Every level except the last ends with a downsample.
+            if i_level != self.num_resolutions-1:
+                down.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch*ch_mult[i_level]
+            skip_in = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks+1):
+                # The last block of a level uses the level's input channel count for its skip.
+                if i_block == self.num_res_blocks:
+                    skip_in = ch*in_ch_mult[i_level]
+                # Input width accounts for the skip tensor concatenated in forward().
+                block.append(ResnetBlock(in_channels=block_in+skip_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            # Every level except level 0 ends with an upsample.
+            if i_level != 0:
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up) # prepend to get consistent order
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        out_ch,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, x, t=None):
+        """Run the network on ``x``; ``t`` is required when ``use_timestep`` is set."""
+        #assert x.shape[2] == x.shape[3] == self.resolution
+        if self.use_timestep:
+            # timestep embedding
+            assert t is not None
+            temb = get_timestep_embedding(t, self.ch)
+            temb = self.temb.dense[0](temb)
+            temb = nonlinearity(temb)
+            temb = self.temb.dense[1](temb)
+        else:
+            temb = None
+        # downsampling
+        # hs collects every activation of the down path; the up path pops from it.
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions-1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks+1):
+                # Concatenate the most recent stored activation on the channel axis.
+                h = self.up[i_level].block[i_block](
+                    torch.cat([h, hs.pop()], dim=1), temb)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+class Encoder(nn.Module):
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, z_channels, double_z=True, **ignore_kwargs):
+        super().__init__()
+        self.ch = ch
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(in_channels,
+                                       self.ch,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+        curr_res = resolution
+        in_ch_mult = (1,)+tuple(ch_mult)
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch*in_ch_mult[i_level]
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions-1:
+                down.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        2*z_channels if double_z else z_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, x):
+        #assert x.shape[2] == x.shape[3] == self.resolution, "{}, {}, {}".format(x.shape[2], x.shape[3], self.resolution)
+        # timestep embedding
+        temb = None
+        # downsampling
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions-1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+class Encoder1d(Encoder):
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, z_channels, double_z=True, **ignore_kwargs):
+        super().__init__(ch=ch, out_ch=out_ch, ch_mult=ch_mult, num_res_blocks=num_res_blocks,
+                         attn_resolutions=attn_resolutions, dropout=dropout,
+                         resamp_with_conv=resamp_with_conv,
+                         in_channels=in_channels, resolution=resolution, z_channels=z_channels,
+                         double_z=double_z, **ignore_kwargs)
+        self.ch = ch
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        # downsampling
+        self.conv_in = torch.nn.Conv1d(in_channels,
+                                       self.ch,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+        curr_res = resolution
+        in_ch_mult = (1,)+tuple(ch_mult)
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch*in_ch_mult[i_level]
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(ResnetBlock1d(in_channels=block_in,
+                                           out_channels=block_out,
+                                           temb_channels=self.temb_ch,
+                                           dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock1d(block_in))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions-1:
+                down.downsample = Downsample1d(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock1d(in_channels=block_in,
+                                         out_channels=block_in,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout)
+        self.mid.attn_1 = AttnBlock1d(block_in)
+        self.mid.block_2 = ResnetBlock1d(in_channels=block_in,
+                                         out_channels=block_in,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout)
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv1d(block_in,
+                                        2*z_channels if double_z else z_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+class Decoder(nn.Module):
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, z_channels, give_pre_end=False, **ignorekwargs):
+        super().__init__()
+        self.ch = ch
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.give_pre_end = give_pre_end
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        in_ch_mult = (1,)+tuple(ch_mult)
+        block_in = ch*ch_mult[self.num_resolutions-1]
+        curr_res = resolution // 2**(self.num_resolutions-1)
+        # self.z_shape = (1,z_channels,curr_res,curr_res)
+        # print("Working with z of shape {} = {} dimensions.".format(
+        #     self.z_shape, np.prod(self.z_shape)))
+        # z to block_in
+        self.conv_in = torch.nn.Conv2d(z_channels,
+                                       block_in,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks+1):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up) # prepend to get consistent order
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        out_ch,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, z):
+        #assert z.shape[1:] == self.z_shape[1:]
+        self.last_z_shape = z.shape
+        # timestep embedding
+        temb = None
+        # z to block_in
+        h = self.conv_in(z)
+        # middle
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks+1):
+                h = self.up[i_level].block[i_block](h, temb)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        # end
+        if self.give_pre_end:
+            return h
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+class Decoder1d(Decoder):
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, z_channels, give_pre_end=False, **ignorekwargs):
+        super().__init__(ch=ch, out_ch=out_ch, ch_mult=ch_mult, num_res_blocks=num_res_blocks,
+                         attn_resolutions=attn_resolutions, dropout=dropout,
+                         resamp_with_conv=resamp_with_conv,
+                         in_channels=in_channels, resolution=resolution, z_channels=z_channels,
+                         give_pre_end=give_pre_end, **ignorekwargs)
+        self.ch = ch
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.give_pre_end = give_pre_end
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        in_ch_mult = (1,) + tuple(ch_mult)
+        block_in = ch * ch_mult[self.num_resolutions-1]
+        curr_res = resolution // 2**(self.num_resolutions-1)
+        # self.z_shape = (1,z_channels,curr_res,curr_res)
+        # print("Working with z of shape {} = {} dimensions.".format(
+        #     self.z_shape, np.prod(self.z_shape)))
+        # z to block_in
+        self.conv_in = torch.nn.Conv1d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock1d(in_channels=block_in, out_channels=block_in,
+                                         temb_channels=self.temb_ch, dropout=dropout)
+        self.mid.attn_1 = AttnBlock1d(block_in)
+        self.mid.block_2 = ResnetBlock1d(in_channels=block_in, out_channels=block_in,
+                                         temb_channels=self.temb_ch, dropout=dropout)
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks+1):
+                block.append(ResnetBlock1d(in_channels=block_in, out_channels=block_out,
+                                           temb_channels=self.temb_ch, dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock1d(block_in))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample1d(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv1d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
+class VUNet(nn.Module):
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True,
+                 in_channels, c_channels,
+                 resolution, z_channels, use_timestep=False, **ignore_kwargs):
+        super().__init__()
+        self.ch = ch
+        self.temb_ch = self.ch*4
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.use_timestep = use_timestep
+        if self.use_timestep:
+            # timestep embedding
+            self.temb = nn.Module()
+            self.temb.dense = nn.ModuleList([
+                torch.nn.Linear(self.ch,
+                                self.temb_ch),
+                torch.nn.Linear(self.temb_ch,
+                                self.temb_ch),
+            ])
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(c_channels,
+                                       self.ch,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+        curr_res = resolution
+        in_ch_mult = (1,)+tuple(ch_mult)
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch*in_ch_mult[i_level]
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions-1:
+                down.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        self.z_in = torch.nn.Conv2d(z_channels,
+                                    block_in,
+                                    kernel_size=1,
+                                    stride=1,
+                                    padding=0)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=2*block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch*ch_mult[i_level]
+            skip_in = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks+1):
+                if i_block == self.num_res_blocks:
+                    skip_in = ch*in_ch_mult[i_level]
+                block.append(ResnetBlock(in_channels=block_in+skip_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up) # prepend to get consistent order
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        out_ch,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, x, z):
+        #assert x.shape[2] == x.shape[3] == self.resolution
+        if self.use_timestep:
+            # timestep embedding
+            assert t is not None
+            temb = get_timestep_embedding(t, self.ch)
+            temb = self.temb.dense[0](temb)
+            temb = nonlinearity(temb)
+            temb = self.temb.dense[1](temb)
+        else:
+            temb = None
+        # downsampling
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions-1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        # middle
+        h = hs[-1]
+        z = self.z_in(z)
+        h = torch.cat((h,z),dim=1)
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks+1):
+                h = self.up[i_level].block[i_block](
+                    torch.cat([h, hs.pop()], dim=1), temb)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+class SimpleDecoder(nn.Module):
+    def __init__(self, in_channels, out_channels, *args, **kwargs):
+        super().__init__()
+        self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1),
+                                     ResnetBlock(in_channels=in_channels,
+                                                 out_channels=2 * in_channels,
+                                                 temb_channels=0, dropout=0.0),
+                                     ResnetBlock(in_channels=2 * in_channels,
+                                                out_channels=4 * in_channels,
+                                                temb_channels=0, dropout=0.0),
+                                     ResnetBlock(in_channels=4 * in_channels,
+                                                out_channels=2 * in_channels,
+                                                temb_channels=0, dropout=0.0),
+                                     nn.Conv2d(2*in_channels, in_channels, 1),
+                                     Upsample(in_channels, with_conv=True)])
+        # end
+        self.norm_out = Normalize(in_channels)
+        self.conv_out = torch.nn.Conv2d(in_channels,
+                                        out_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, x):
+        for i, layer in enumerate(self.model):
+            if i in [1,2,3]:
+                x = layer(x, None)
+            else:
+                x = layer(x)
+        h = self.norm_out(x)
+        h = nonlinearity(h)
+        x = self.conv_out(h)
+        return x
+class UpsampleDecoder(nn.Module):
+    def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
+                 ch_mult=(2,2), dropout=0.0):
+        super().__init__()
+        # upsampling
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        block_in = in_channels
+        curr_res = resolution // 2 ** (self.num_resolutions - 1)
+        self.res_blocks = nn.ModuleList()
+        self.upsample_blocks = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            res_block = []
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks + 1):
+                res_block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+            self.res_blocks.append(nn.ModuleList(res_block))
+            if i_level != self.num_resolutions - 1:
+                self.upsample_blocks.append(Upsample(block_in, True))
+                curr_res = curr_res * 2
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        out_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, x):
+        # upsampling
+        h = x
+        for k, i_level in enumerate(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.res_blocks[i_level][i_block](h, None)
+            if i_level != self.num_resolutions - 1:
+                h = self.upsample_blocks[k](h)
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+if __name__ == '__main__':
+    ddconfig = {
+        'ch': 128,
+        'num_res_blocks': 2,
+        'dropout': 0.0,
+        'z_channels': 256,
+        'double_z': False,
+    }
+    # Audio example ##
+    ddconfig['in_channels'] = 1
+    ddconfig['resolution'] = 848
+    ddconfig['attn_resolutions'] = [53]
+    ddconfig['ch_mult'] = [1, 1, 2, 2, 4]
+    ddconfig['out_ch'] = 1
+    # input
+    inputs = torch.rand(4, 1, 80, 848)
+    print('Input:', inputs.shape)
+    # Encoder
+    encoder = Encoder(**ddconfig)
+    enc_outs = encoder(inputs)
+    print('Encoder out:', enc_outs.shape)
+    # Decoder
+    decoder = Decoder(**ddconfig)
+    quant_outs = torch.rand(4, 256, 5, 53)
+    dec_outs = decoder(quant_outs)
+    print('Decoder out:', dec_outs.shape)

foleycrafter/models/specvqgan/modules/discriminator/model.py ADDED Viewed

	@@ -0,0 +1,295 @@

+import functools
+import torch
+import torch.nn as nn
+class ActNorm(nn.Module):
+    def __init__(self, num_features, logdet=False, affine=True,
+                 allow_reverse_init=False):
+        assert affine
+        super().__init__()
+        self.logdet = logdet
+        self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1))
+        self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1))
+        self.allow_reverse_init = allow_reverse_init
+        self.register_buffer('initialized', torch.tensor(0, dtype=torch.uint8))
+    def initialize(self, input):
+        with torch.no_grad():
+            flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1)
+            mean = (
+                flatten.mean(1)
+                .unsqueeze(1)
+                .unsqueeze(2)
+                .unsqueeze(3)
+                .permute(1, 0, 2, 3)
+            )
+            std = (
+                flatten.std(1)
+                .unsqueeze(1)
+                .unsqueeze(2)
+                .unsqueeze(3)
+                .permute(1, 0, 2, 3)
+            )
+            self.loc.data.copy_(-mean)
+            self.scale.data.copy_(1 / (std + 1e-6))
+    def forward(self, input, reverse=False):
+        if reverse:
+            return self.reverse(input)
+        if len(input.shape) == 2:
+            input = input[:, :, None, None]
+            squeeze = True
+        else:
+            squeeze = False
+        _, _, height, width = input.shape
+        if self.training and self.initialized.item() == 0:
+            self.initialize(input)
+            self.initialized.fill_(1)
+        h = self.scale * (input + self.loc)
+        if squeeze:
+            h = h.squeeze(-1).squeeze(-1)
+        if self.logdet:
+            log_abs = torch.log(torch.abs(self.scale))
+            logdet = height * width * torch.sum(log_abs)
+            logdet = logdet * torch.ones(input.shape[0]).to(input)
+            return h, logdet
+        return h
+    def reverse(self, output):
+        if self.training and self.initialized.item() == 0:
+            if not self.allow_reverse_init:
+                raise RuntimeError(
+                    "Initializing ActNorm in reverse direction is "
+                    "disabled by default. Use allow_reverse_init=True to enable."
+                )
+            else:
+                self.initialize(output)
+                self.initialized.fill_(1)
+        if len(output.shape) == 2:
+            output = output[:, :, None, None]
+            squeeze = True
+        else:
+            squeeze = False
+        h = output / self.scale - self.loc
+        if squeeze:
+            h = h.squeeze(-1).squeeze(-1)
+        return h
+def weights_init(m):
+    classname = m.__class__.__name__
+    if classname.find('Conv') != -1:
+        nn.init.normal_(m.weight.data, 0.0, 0.02)
+    elif classname.find('BatchNorm') != -1:
+        nn.init.normal_(m.weight.data, 1.0, 0.02)
+        nn.init.constant_(m.bias.data, 0)
+class NLayerDiscriminator(nn.Module):
+    """Defines a PatchGAN discriminator as in Pix2Pix
+        --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
+    """
+    def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
+        """Construct a PatchGAN discriminator
+        Parameters:
+            input_nc (int)  -- the number of channels in input images
+            ndf (int)       -- the number of filters in the last conv layer
+            n_layers (int)  -- the number of conv layers in the discriminator
+            norm_layer      -- normalization layer
+        """
+        super(NLayerDiscriminator, self).__init__()
+        if not use_actnorm:
+            norm_layer = nn.BatchNorm2d
+        else:
+            norm_layer = ActNorm
+        if type(norm_layer) == functools.partial:  # no need to use bias as BatchNorm2d has affine parameters
+            use_bias = norm_layer.func != nn.BatchNorm2d
+        else:
+            use_bias = norm_layer != nn.BatchNorm2d
+        kw = 4
+        padw = 1
+        sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
+        nf_mult = 1
+        nf_mult_prev = 1
+        for n in range(1, n_layers):  # gradually increase the number of filters
+            nf_mult_prev = nf_mult
+            nf_mult = min(2 ** n, 8)
+            sequence += [
+                nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
+                norm_layer(ndf * nf_mult),
+                nn.LeakyReLU(0.2, True)
+            ]
+        nf_mult_prev = nf_mult
+        nf_mult = min(2 ** n_layers, 8)
+        sequence += [
+            nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
+            norm_layer(ndf * nf_mult),
+            nn.LeakyReLU(0.2, True)
+        ]
+        # output 1 channel prediction map
+        sequence += [nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)]
+        self.main = nn.Sequential(*sequence)
+    def forward(self, input):
+        """Standard forward."""
+        return self.main(input)
+class NLayerDiscriminator1dFeats(NLayerDiscriminator):
+    """Defines a PatchGAN discriminator as in Pix2Pix
+        --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
+    """
+    def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
+        """Construct a PatchGAN discriminator
+        Parameters:
+            input_nc (int)  -- the number of channels in input feats
+            ndf (int)       -- the number of filters in the last conv layer
+            n_layers (int)  -- the number of conv layers in the discriminator
+            norm_layer      -- normalization layer
+        """
+        super().__init__(input_nc=input_nc, ndf=64, n_layers=n_layers, use_actnorm=use_actnorm)
+        if not use_actnorm:
+            norm_layer = nn.BatchNorm1d
+        else:
+            norm_layer = ActNorm
+        if type(norm_layer) == functools.partial:  # no need to use bias as BatchNorm has affine parameters
+            use_bias = norm_layer.func != nn.BatchNorm1d
+        else:
+            use_bias = norm_layer != nn.BatchNorm1d
+        kw = 4
+        padw = 1
+        sequence = [nn.Conv1d(input_nc, input_nc//2, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
+        nf_mult = input_nc//2
+        nf_mult_prev = 1
+        for n in range(1, n_layers):  # gradually decrease the number of filters
+            nf_mult_prev = nf_mult
+            nf_mult = max(nf_mult_prev // (2 ** n), 8)
+            sequence += [
+                nn.Conv1d(nf_mult_prev, nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
+                norm_layer(nf_mult),
+                nn.LeakyReLU(0.2, True)
+            ]
+        nf_mult_prev = nf_mult
+        nf_mult = max(nf_mult_prev // (2 ** n), 8)
+        sequence += [
+            nn.Conv1d(nf_mult_prev, nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
+            norm_layer(nf_mult),
+            nn.LeakyReLU(0.2, True)
+        ]
+        nf_mult_prev = nf_mult
+        nf_mult = max(nf_mult_prev // (2 ** n), 8)
+        sequence += [
+            nn.Conv1d(nf_mult_prev, nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
+            norm_layer(nf_mult),
+            nn.LeakyReLU(0.2, True)
+        ]
+        # output 1 channel prediction map
+        sequence += [nn.Conv1d(nf_mult, 1, kernel_size=kw, stride=1, padding=padw)]
+        self.main = nn.Sequential(*sequence)
+class NLayerDiscriminator1dSpecs(NLayerDiscriminator):
+    """Defines a PatchGAN discriminator as in Pix2Pix
+        --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
+    """
+    def __init__(self, input_nc=80, ndf=64, n_layers=3, use_actnorm=False):
+        """Construct a PatchGAN discriminator
+        Parameters:
+            input_nc (int)  -- the number of channels in input specs
+            ndf (int)       -- the number of filters in the last conv layer
+            n_layers (int)  -- the number of conv layers in the discriminator
+            norm_layer      -- normalization layer
+        """
+        super().__init__(input_nc=input_nc, ndf=64, n_layers=n_layers, use_actnorm=use_actnorm)
+        if not use_actnorm:
+            norm_layer = nn.BatchNorm1d
+        else:
+            norm_layer = ActNorm
+        if type(norm_layer) == functools.partial:  # no need to use bias as BatchNorm has affine parameters
+            use_bias = norm_layer.func != nn.BatchNorm1d
+        else:
+            use_bias = norm_layer != nn.BatchNorm1d
+        kw = 4
+        padw = 1
+        sequence = [nn.Conv1d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
+        nf_mult = 1
+        nf_mult_prev = 1
+        for n in range(1, n_layers):  # gradually decrease the number of filters
+            nf_mult_prev = nf_mult
+            nf_mult = min(2 ** n, 8)
+            sequence += [
+                nn.Conv1d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
+                norm_layer(ndf * nf_mult),
+                nn.LeakyReLU(0.2, True)
+            ]
+        nf_mult_prev = nf_mult
+        nf_mult = min(2 ** n_layers, 8)
+        sequence += [
+            nn.Conv1d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
+            norm_layer(ndf * nf_mult),
+            nn.LeakyReLU(0.2, True)
+        ]
+        # output 1 channel prediction map
+        sequence += [nn.Conv1d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)]
+        self.main = nn.Sequential(*sequence)
+    def forward(self, input):
+        """Standard forward."""
+        # (B, C, L)
+        input = input.squeeze(1)
+        input = self.main(input)
+        return input
+if __name__ == '__main__':
+    import torch
+    ## FEATURES
+    disc_in_channels = 2048
+    disc_num_layers = 2
+    use_actnorm = False
+    disc_ndf = 64
+    discriminator = NLayerDiscriminator1dFeats(input_nc=disc_in_channels, n_layers=disc_num_layers,
+                                            use_actnorm=use_actnorm, ndf=disc_ndf).apply(weights_init)
+    inputs = torch.rand((6, 2048, 212))
+    outputs = discriminator(inputs)
+    print(outputs.shape)
+    ## AUDIO
+    disc_in_channels = 1
+    disc_num_layers = 3
+    use_actnorm = False
+    disc_ndf = 64
+    discriminator = NLayerDiscriminator(input_nc=disc_in_channels, n_layers=disc_num_layers,
+                                        use_actnorm=use_actnorm, ndf=disc_ndf).apply(weights_init)
+    inputs = torch.rand((6, 1, 80, 848))
+    outputs = discriminator(inputs)
+    print(outputs.shape)
+    ## IMAGE
+    disc_in_channels = 3
+    disc_num_layers = 3
+    use_actnorm = False
+    disc_ndf = 64
+    discriminator = NLayerDiscriminator(input_nc=disc_in_channels, n_layers=disc_num_layers,
+                                        use_actnorm=use_actnorm, ndf=disc_ndf).apply(weights_init)
+    inputs = torch.rand((6, 3, 256, 256))
+    outputs = discriminator(inputs)
+    print(outputs.shape)

foleycrafter/models/specvqgan/modules/losses/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from foleycrafter.models.specvqgan.modules.losses.vqperceptual import DummyLoss
+# relative imports pain
+import os
+import sys
+path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'vggishish')
+sys.path.append(path)

foleycrafter/models/specvqgan/modules/losses/lpaps.py ADDED Viewed

	@@ -0,0 +1,152 @@

+"""
+    Based on https://github.com/CompVis/taming-transformers/blob/52720829/taming/modules/losses/lpips.py
+    Adapted for spectrograms by Vladimir Iashin (v-iashin)
+"""
+from collections import namedtuple
+import numpy as np
+import torch
+import torch.nn as nn
+import sys
+sys.path.insert(0, '.')  # nopep8
+from foleycrafter.models.specvqgan.modules.losses.vggishish.model import VGGishish
+from foleycrafter.models.specvqgan.util import get_ckpt_path
+class LPAPS(nn.Module):
+    # Learned perceptual metric
+    def __init__(self, use_dropout=True):
+        super().__init__()
+        self.scaling_layer = ScalingLayer()
+        self.chns = [64, 128, 256, 512, 512]  # vggish16 features
+        self.net = vggishish16(pretrained=True, requires_grad=False)
+        self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
+        self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
+        self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
+        self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
+        self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
+        self.load_from_pretrained()
+        for param in self.parameters():
+            param.requires_grad = False
+    def load_from_pretrained(self, name="vggishish_lpaps"):
+        ckpt = get_ckpt_path(name, "specvqgan/modules/autoencoder/lpaps")
+        self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
+        print("loaded pretrained LPAPS loss from {}".format(ckpt))
+    @classmethod
+    def from_pretrained(cls, name="vggishish_lpaps"):
+        if name != "vggishish_lpaps":
+            raise NotImplementedError
+        model = cls()
+        ckpt = get_ckpt_path(name)
+        model.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
+        return model
+    def forward(self, input, target):
+        in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target))
+        outs0, outs1 = self.net(in0_input), self.net(in1_input)
+        feats0, feats1, diffs = {}, {}, {}
+        lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
+        for kk in range(len(self.chns)):
+            feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
+            diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
+        res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))]
+        val = res[0]
+        for l in range(1, len(self.chns)):
+            val += res[l]
+        return val
+class ScalingLayer(nn.Module):
+    def __init__(self):
+        super(ScalingLayer, self).__init__()
+        # we are gonna use get_ckpt_path to donwload the stats as well
+        stat_path = get_ckpt_path('vggishish_mean_std_melspec_10s_22050hz', 'specvqgan/modules/autoencoder/lpaps')
+        # if for images we normalize on the channel dim, in spectrogram we will norm on frequency dimension
+        means, stds = np.loadtxt(stat_path, dtype=np.float32).T
+        # the normalization in means and stds are given for [0, 1], but specvqgan expects [-1, 1]:
+        means = 2 * means - 1
+        stds = 2 * stds
+        # input is expected to be (B, 1, F, T)
+        self.register_buffer('shift', torch.from_numpy(means)[None, None, :, None])
+        self.register_buffer('scale', torch.from_numpy(stds)[None, None, :, None])
+    def forward(self, inp):
+        return (inp - self.shift) / self.scale
+class NetLinLayer(nn.Module):
+    """ A single linear layer which does a 1x1 conv """
+    def __init__(self, chn_in, chn_out=1, use_dropout=False):
+        super(NetLinLayer, self).__init__()
+        layers = [nn.Dropout(), ] if (use_dropout) else []
+        layers += [nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), ]
+        self.model = nn.Sequential(*layers)
+class vggishish16(torch.nn.Module):
+    def __init__(self, requires_grad=False, pretrained=True):
+        super().__init__()
+        vgg_pretrained_features = self.vggishish16(pretrained=pretrained).features
+        self.slice1 = torch.nn.Sequential()
+        self.slice2 = torch.nn.Sequential()
+        self.slice3 = torch.nn.Sequential()
+        self.slice4 = torch.nn.Sequential()
+        self.slice5 = torch.nn.Sequential()
+        self.N_slices = 5
+        for x in range(4):
+            self.slice1.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(4, 9):
+            self.slice2.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(9, 16):
+            self.slice3.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(16, 23):
+            self.slice4.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(23, 30):
+            self.slice5.add_module(str(x), vgg_pretrained_features[x])
+        if not requires_grad:
+            for param in self.parameters():
+                param.requires_grad = False
+    def forward(self, X):
+        h = self.slice1(X)
+        h_relu1_2 = h
+        h = self.slice2(h)
+        h_relu2_2 = h
+        h = self.slice3(h)
+        h_relu3_3 = h
+        h = self.slice4(h)
+        h_relu4_3 = h
+        h = self.slice5(h)
+        h_relu5_3 = h
+        vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
+        out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
+        return out
+    def vggishish16(self, pretrained: bool = True) -> VGGishish:
+        # loading vggishish pretrained on vggsound
+        num_classes_vggsound = 309
+        conv_layers = [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512]
+        model = VGGishish(conv_layers, use_bn=False, num_classes=num_classes_vggsound)
+        if pretrained:
+            ckpt_path = get_ckpt_path('vggishish_lpaps', "specvqgan/modules/autoencoder/lpaps")
+            ckpt = torch.load(ckpt_path, map_location=torch.device("cpu"))
+            model.load_state_dict(ckpt, strict=False)
+        return model
+def normalize_tensor(x, eps=1e-10):
+    norm_factor = torch.sqrt(torch.sum(x**2, dim=1, keepdim=True))
+    return x / (norm_factor+eps)
+def spatial_average(x, keepdim=True):
+    return x.mean([2, 3], keepdim=keepdim)
+if __name__ == '__main__':
+    inputs = torch.rand((16, 1, 80, 848))
+    reconstructions = torch.rand((16, 1, 80, 848))
+    lpips = LPAPS().eval()
+    loss_p = lpips(inputs.contiguous(), reconstructions.contiguous())
+    # (16, 1, 1, 1)
+    print(loss_p.shape)

foleycrafter/models/specvqgan/modules/losses/vggishish/configs/melception.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+seed: 1337
+log_code_state: True
+# patterns to ignore when backing up the code folder
+patterns_to_ignore: ['logs', '.git', '__pycache__', 'data', 'checkpoints', '*.pt']
+# data:
+mels_path: '/home/nvme/data/vggsound/features/melspec_10s_22050hz/'
+spec_shape: [80, 860]
+cropped_size: [80, 848]
+random_crop: False
+# train:
+device: 'cuda:0'
+batch_size: 8
+num_workers: 0
+optimizer: adam
+betas: [0.9, 0.999]
+momentum: 0.9
+learning_rate: 3e-4
+weight_decay: 0
+num_epochs: 100
+patience: 3
+logdir: './logs'
+cls_weights_in_loss: False

foleycrafter/models/specvqgan/modules/losses/vggishish/configs/vggish.yaml ADDED Viewed

	@@ -0,0 +1,34 @@

+seed: 1337
+log_code_state: True
+# patterns to ignore when backing up the code folder
+patterns_to_ignore: ['logs', '.git', '__pycache__']
+# data:
+mels_path: '/home/nvme/data/vggsound/features/melspec_10s_22050hz/'
+spec_shape: [80, 860]
+cropped_size: [80, 848]
+random_crop: False
+# model:
+# original VGG family configurations, except that the trailing 'MP' entry is omitted
+# 'vggish': [64, 'MP', 128, 'MP', 256, 256, 'MP', 512, 512]
+# 'vgg11': [64, 'MP', 128, 'MP', 256, 256, 'MP', 512, 512, 'MP', 512, 512],
+# 'vgg13': [64, 64, 'MP', 128, 128, 'MP', 256, 256, 'MP', 512, 512, 'MP', 512, 512],
+# 'vgg16': [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512],
+# 'vgg19': [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 256, 'MP', 512, 512, 512, 512, 'MP', 512, 512, 512, 512],
+conv_layers: [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512]
+use_bn: False
+# train:
+device: 'cuda:0'
+batch_size: 32
+num_workers: 0
+optimizer: adam
+betas: [0.9, 0.999]
+momentum: 0.9
+learning_rate: 3e-4
+weight_decay: 0.0001
+num_epochs: 100
+patience: 3
+logdir: './logs'
+cls_weights_in_loss: False

foleycrafter/models/specvqgan/modules/losses/vggishish/configs/vggish_gh.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+seed: 1337
+log_code_state: True
+patterns_to_ignore: ['logs', '.git', '__pycache__']
+mels_path: '/home/duyxxd/SpecVQGAN/data/greatesthit/melspec_10s_22050hz'
+batch_size: 32
+num_workers: 8
+device: 'cuda:0'
+conv_layers: [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512]
+use_bn: False
+optimizer: adam
+learning_rate: 1e-4
+betas: [0.9, 0.999]
+cropped_size: [80, 160]
+momentum: 0.9
+weight_decay: 1e-4
+cls_weights_in_loss: False
+num_epochs: 100
+patience: 20
+logdir: '/home/duyxxd/SpecVQGAN/logs'
+exp_name: 'mix'
+action_only: False
+material_only: False
+load_model: /home/duyxxd/SpecVQGAN/logs/vggishish16.pt

foleycrafter/models/specvqgan/modules/losses/vggishish/configs/vggish_gh_action.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+seed: 1337
+log_code_state: True
+patterns_to_ignore: ['logs', '.git', '__pycache__']
+mels_path: '/home/duyxxd/SpecVQGAN/data/greatesthit/melspec_10s_22050hz'
+batch_size: 32
+num_workers: 8
+device: 'cuda:0'
+conv_layers: [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512]
+use_bn: False
+optimizer: adam
+learning_rate: 1e-4
+betas: [0.9, 0.999]
+cropped_size: [80, 160]
+momentum: 0.9
+weight_decay: 1e-4
+cls_weights_in_loss: False
+num_epochs: 20
+patience: 20
+logdir: '/home/duyxxd/SpecVQGAN/logs'
+exp_name: 'action'
+action_only: True
+material_only: False
+load_model: /home/duyxxd/SpecVQGAN/logs/vggishish16.pt

foleycrafter/models/specvqgan/modules/losses/vggishish/configs/vggish_gh_material.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+seed: 1337
+log_code_state: True
+patterns_to_ignore: ['logs', '.git', '__pycache__']
+mels_path: '/home/duyxxd/SpecVQGAN/data/greatesthit/melspec_10s_22050hz'
+batch_size: 32
+num_workers: 8
+device: 'cuda:0'
+conv_layers: [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512]
+use_bn: False
+optimizer: adam
+learning_rate: 1e-4
+betas: [0.9, 0.999]
+cropped_size: [80, 160]
+momentum: 0.9
+weight_decay: 1e-4
+cls_weights_in_loss: False
+num_epochs: 20
+patience: 20
+logdir: '/home/duyxxd/SpecVQGAN/logs'
+exp_name: 'material'
+action_only: False
+material_only: True
+load_model: /home/duyxxd/SpecVQGAN/logs/vggishish16.pt

foleycrafter/models/specvqgan/modules/losses/vggishish/dataset.py ADDED Viewed

	@@ -0,0 +1,295 @@

+import collections
+import csv
+import logging
+import os
+import random
+import math
+import json
+from glob import glob
+from pathlib import Path
+import numpy as np
+import torch
+import torchvision
+logger = logging.getLogger(f'main.{__name__}')
+class VGGSound(torch.utils.data.Dataset):
+    def __init__(self, split, specs_dir, transforms=None, splits_path='./data', meta_path='./data/vggsound.csv'):
+        super().__init__()
+        self.split = split
+        self.specs_dir = specs_dir
+        self.transforms = transforms
+        self.splits_path = splits_path
+        self.meta_path = meta_path
+        vggsound_meta = list(csv.reader(open(meta_path), quotechar='"'))
+        unique_classes = sorted(list(set(row[2] for row in vggsound_meta)))
+        self.label2target = {label: target for target, label in enumerate(unique_classes)}
+        self.target2label = {target: label for label, target in self.label2target.items()}
+        self.video2target = {row[0]: self.label2target[row[2]] for row in vggsound_meta}
+        split_clip_ids_path = os.path.join(splits_path, f'vggsound_{split}_partial.txt')
+        print('&&&&&&&&&&&&&&&&', split_clip_ids_path)
+        if not os.path.exists(split_clip_ids_path):
+            self.make_split_files()
+        clip_ids_with_timestamp = open(split_clip_ids_path).read().splitlines()
+        clip_paths = [os.path.join(specs_dir, v + '_mel.npy') for v in clip_ids_with_timestamp]
+        self.dataset = clip_paths
+        # self.dataset = clip_paths[:10000]  # overfit one batch
+        # 'zyTX_1BXKDE_16000_26000'[:11] -> 'zyTX_1BXKDE'
+        vid_classes = [self.video2target[Path(path).stem[:11]] for path in self.dataset]
+        class2count = collections.Counter(vid_classes)
+        self.class_counts = torch.tensor([class2count[cls] for cls in range(len(class2count))])
+        # self.sample_weights = [len(self.dataset) / class2count[self.video2target[Path(path).stem[:11]]] for path in self.dataset]
+    def __getitem__(self, idx):
+        item = {}
+        spec_path = self.dataset[idx]
+        # 'zyTX_1BXKDE_16000_26000' -> 'zyTX_1BXKDE'
+        video_name = Path(spec_path).stem[:11]
+        item['input'] = np.load(spec_path)
+        item['input_path'] = spec_path
+        # if self.split in ['train', 'valid']:
+        item['target'] = self.video2target[video_name]
+        item['label'] = self.target2label[item['target']]
+        if self.transforms is not None:
+            item = self.transforms(item)
+        return item
+    def __len__(self):
+        return len(self.dataset)
+    def make_split_files(self):
+        random.seed(1337)
+        logger.info(f'The split files do not exist @ {self.splits_path}. Calculating the new ones.')
+        # The downloaded videos (some went missing on YouTube and no longer available)
+        available_vid_paths = sorted(glob(os.path.join(self.specs_dir, '*_mel.npy')))
+        logger.info(f'The number of clips available after download: {len(available_vid_paths)}')
+        # original (full) train and test sets
+        vggsound_meta = list(csv.reader(open(self.meta_path), quotechar='"'))
+        train_vids = {row[0] for row in vggsound_meta if row[3] == 'train'}
+        test_vids = {row[0] for row in vggsound_meta if row[3] == 'test'}
+        logger.info(f'The number of videos in vggsound train set: {len(train_vids)}')
+        logger.info(f'The number of videos in vggsound test set: {len(test_vids)}')
+        # class counts in test set. We would like to have the same distribution in valid
+        unique_classes = sorted(list(set(row[2] for row in vggsound_meta)))
+        label2target = {label: target for target, label in enumerate(unique_classes)}
+        video2target = {row[0]: label2target[row[2]] for row in vggsound_meta}
+        test_vid_classes = [video2target[vid] for vid in test_vids]
+        test_target2count = collections.Counter(test_vid_classes)
+        # now given the counts from test set, sample the same count for validation and the rest leave in train
+        train_vids_wo_valid, valid_vids = set(), set()
+        for target, label in enumerate(label2target.keys()):
+            class_train_vids = [vid for vid in train_vids if video2target[vid] == target]
+            random.shuffle(class_train_vids)
+            count = test_target2count[target]
+            valid_vids.update(class_train_vids[:count])
+            train_vids_wo_valid.update(class_train_vids[count:])
+        # make file with a list of available test videos (each video should contain timestamps as well)
+        train_i = valid_i = test_i = 0
+        with open(os.path.join(self.splits_path, 'vggsound_train.txt'), 'w') as train_file, \
+             open(os.path.join(self.splits_path, 'vggsound_valid.txt'), 'w') as valid_file, \
+             open(os.path.join(self.splits_path, 'vggsound_test.txt'), 'w') as test_file:
+            for path in available_vid_paths:
+                path = path.replace('_mel.npy', '')
+                vid_name = Path(path).name
+                # 'zyTX_1BXKDE_16000_26000'[:11] -> 'zyTX_1BXKDE'
+                if vid_name[:11] in train_vids_wo_valid:
+                    train_file.write(vid_name + '\n')
+                    train_i += 1
+                elif vid_name[:11] in valid_vids:
+                    valid_file.write(vid_name + '\n')
+                    valid_i += 1
+                elif vid_name[:11] in test_vids:
+                    test_file.write(vid_name + '\n')
+                    test_i += 1
+                else:
+                    raise Exception(f'Clip {vid_name} is neither in train, valid nor test. Strange.')
+        logger.info(f'Put {train_i} clips to the train set and saved it to ./data/vggsound_train.txt')
+        logger.info(f'Put {valid_i} clips to the valid set and saved it to ./data/vggsound_valid.txt')
+        logger.info(f'Put {test_i} clips to the test set and saved it to ./data/vggsound_test.txt')
+def get_GH_data_identifier(video_name, start_idx, split='_'):
+    if isinstance(start_idx, str):
+        return video_name + split + start_idx
+    elif isinstance(start_idx, int):
+        return video_name + split + str(start_idx)
+    else:
+        raise NotImplementedError
+class GreatestHit(torch.utils.data.Dataset):
+    def __init__(self, split, spec_dir_path, spec_transform=None, L=2.0, action_only=False,
+                material_only=False, splits_path='/home/duyxxd/SpecVQGAN/data',
+                meta_path='/home/duyxxd/SpecVQGAN/data/info_r2plus1d_dim1024_15fps.json'):
+        super().__init__()
+        self.split = split
+        self.specs_dir = spec_dir_path
+        self.splits_path = splits_path
+        self.meta_path = meta_path
+        self.spec_transform = spec_transform
+        self.L = L
+        self.spec_take_first = int(math.ceil(860 * (L / 10.) / 32) * 32)
+        self.spec_take_first = 860 if self.spec_take_first > 860 else self.spec_take_first
+        self.spec_take_first = 173
+        greatesthit_meta = json.load(open(self.meta_path, 'r'))
+        self.video_idx2label = {
+            get_GH_data_identifier(greatesthit_meta['video_name'][i], greatesthit_meta['start_idx'][i]):
+            greatesthit_meta['hit_type'][i] for i in range(len(greatesthit_meta['video_name']))
+        }
+        self.available_video_hit = list(self.video_idx2label.keys())
+        self.video_idx2path = {
+            vh: os.path.join(self.specs_dir,
+                vh.replace('_', '_denoised_') + '_' + self.video_idx2label[vh].replace(' ', '_') +'_mel.npy')
+            for vh in self.available_video_hit
+        }
+        self.video_idx2idx = {
+            get_GH_data_identifier(greatesthit_meta['video_name'][i], greatesthit_meta['start_idx'][i]):
+            i for i in range(len(greatesthit_meta['video_name']))
+        }
+        split_clip_ids_path = os.path.join(splits_path, f'greatesthit_{split}_2.00_single_type_only.json')
+        if not os.path.exists(split_clip_ids_path):
+            raise NotImplementedError()
+        clip_video_hit = json.load(open(split_clip_ids_path, 'r'))
+        self.dataset = list(clip_video_hit.keys())
+        if action_only:
+            self.video_idx2label = {k: v.split(' ')[1] for k, v in clip_video_hit.items()}
+        elif material_only:
+            self.video_idx2label = {k: v.split(' ')[0] for k, v in clip_video_hit.items()}
+        else:
+            self.video_idx2label = clip_video_hit
+        self.video2indexes = {}
+        for video_idx in self.dataset:
+            video, start_idx = video_idx.split('_')
+            if video not in self.video2indexes.keys():
+                self.video2indexes[video] = []
+            self.video2indexes[video].append(start_idx)
+        for video in self.video2indexes.keys():
+            if len(self.video2indexes[video]) == 1: # given video contains only one hit
+                self.dataset.remove(
+                    get_GH_data_identifier(video, self.video2indexes[video][0])
+                )
+        vid_classes = list(self.video_idx2label.values())
+        unique_classes = sorted(list(set(vid_classes)))
+        self.label2target = {label: target for target, label in enumerate(unique_classes)}
+        if action_only:
+            label2target_fix = {'hit': 0, 'scratch': 1}
+        elif material_only:
+            label2target_fix = {'carpet': 0, 'ceramic': 1, 'cloth': 2, 'dirt': 3, 'drywall': 4, 'glass': 5, 'grass': 6, 'gravel': 7, 'leaf': 8, 'metal': 9, 'paper': 10, 'plastic': 11, 'plastic-bag': 12, 'rock': 13, 'tile': 14, 'water': 15, 'wood': 16}
+        else:
+            label2target_fix = {'carpet hit': 0, 'carpet scratch': 1, 'ceramic hit': 2, 'ceramic scratch': 3, 'cloth hit': 4, 'cloth scratch': 5, 'dirt hit': 6, 'dirt scratch': 7, 'drywall hit': 8, 'drywall scratch': 9, 'glass hit': 10, 'glass scratch': 11, 'grass hit': 12, 'grass scratch': 13, 'gravel hit': 14, 'gravel scratch': 15, 'leaf hit': 16, 'leaf scratch': 17, 'metal hit': 18, 'metal scratch': 19, 'paper hit': 20, 'paper scratch': 21, 'plastic hit': 22, 'plastic scratch': 23, 'plastic-bag hit': 24, 'plastic-bag scratch': 25, 'rock hit': 26, 'rock scratch': 27, 'tile hit': 28, 'tile scratch': 29, 'water hit': 30, 'water scratch': 31, 'wood hit': 32, 'wood scratch': 33}
+        for k in self.label2target.keys():
+            assert k in label2target_fix.keys()
+        self.label2target = label2target_fix
+        self.target2label = {target: label for label, target in self.label2target.items()}
+        class2count = collections.Counter(vid_classes)
+        self.class_counts = torch.tensor([class2count[cls] for cls in range(len(class2count))])
+        print(self.label2target)
+        print(len(vid_classes), len(class2count), class2count)
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        item = {}
+        video_idx = self.dataset[idx]
+        spec_path = self.video_idx2path[video_idx]
+        spec = np.load(spec_path) # (80, 860)
+        # concat spec outside dataload
+        item['input'] = 2 * spec - 1 # (80, 860)
+        item['input'] = item['input'][:, :self.spec_take_first] # (80, 173) (since 2sec audio can only generate 173)
+        item['file_path'] = spec_path
+        item['label'] = self.video_idx2label[video_idx]
+        item['target'] = self.label2target[item['label']]
+        if self.spec_transform is not None:
+            item = self.spec_transform(item)
+        return item
+class AMT_test(torch.utils.data.Dataset):
+    def __init__(self, spec_dir_path, spec_transform=None, action_only=False, material_only=False):
+        super().__init__()
+        self.specs_dir = spec_dir_path
+        self.spec_transform = spec_transform
+        self.spec_take_first = 173
+        self.dataset = sorted([os.path.join(self.specs_dir, f) for f in os.listdir(self.specs_dir)])
+        if action_only:
+            self.label2target = {'hit': 0, 'scratch': 1}
+        elif material_only:
+            self.label2target = {'carpet': 0, 'ceramic': 1, 'cloth': 2, 'dirt': 3, 'drywall': 4, 'glass': 5, 'grass': 6, 'gravel': 7, 'leaf': 8, 'metal': 9, 'paper': 10, 'plastic': 11, 'plastic-bag': 12, 'rock': 13, 'tile': 14, 'water': 15, 'wood': 16}
+        else:
+            self.label2target = {'carpet hit': 0, 'carpet scratch': 1, 'ceramic hit': 2, 'ceramic scratch': 3, 'cloth hit': 4, 'cloth scratch': 5, 'dirt hit': 6, 'dirt scratch': 7, 'drywall hit': 8, 'drywall scratch': 9, 'glass hit': 10, 'glass scratch': 11, 'grass hit': 12, 'grass scratch': 13, 'gravel hit': 14, 'gravel scratch': 15, 'leaf hit': 16, 'leaf scratch': 17, 'metal hit': 18, 'metal scratch': 19, 'paper hit': 20, 'paper scratch': 21, 'plastic hit': 22, 'plastic scratch': 23, 'plastic-bag hit': 24, 'plastic-bag scratch': 25, 'rock hit': 26, 'rock scratch': 27, 'tile hit': 28, 'tile scratch': 29, 'water hit': 30, 'water scratch': 31, 'wood hit': 32, 'wood scratch': 33}
+        self.target2label = {v: k for k, v in self.label2target.items()}
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        item = {}
+        spec_path = self.dataset[idx]
+        spec = np.load(spec_path) # (80, 860)
+        # concat spec outside dataload
+        item['input'] = 2 * spec - 1 # (80, 860)
+        item['input'] = item['input'][:, :self.spec_take_first] # (80, 173) (since 2sec audio can only generate 173)
+        item['file_path'] = spec_path
+        if self.spec_transform is not None:
+            item = self.spec_transform(item)
+        return item
+if __name__ == '__main__':
+    from transforms import Crop, StandardNormalizeAudio, ToTensor
+    specs_path = '/home/nvme/data/vggsound/features/melspec_10s_22050hz/'
+    transforms = torchvision.transforms.transforms.Compose([
+        StandardNormalizeAudio(specs_path),
+        ToTensor(),
+        Crop([80, 848]),
+    ])
+    datasets = {
+        'train': VGGSound('train', specs_path, transforms),
+        'valid': VGGSound('valid', specs_path, transforms),
+        'test': VGGSound('test', specs_path, transforms),
+    }
+    print(datasets['train'][0])
+    print(datasets['valid'][0])
+    print(datasets['test'][0])
+    print(datasets['train'].class_counts)
+    print(datasets['valid'].class_counts)
+    print(datasets['test'].class_counts)

foleycrafter/models/specvqgan/modules/losses/vggishish/logger.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import logging
+import os
+import time
+from shutil import copytree, ignore_patterns
+import torch
+from omegaconf import OmegaConf
+from torch.utils.tensorboard import SummaryWriter, summary
+class LoggerWithTBoard(SummaryWriter):
+    def __init__(self, cfg):
+        # current time stamp and experiment log directory
+        self.start_time = time.strftime('%y-%m-%dT%H-%M-%S', time.localtime())
+        if cfg.exp_name is not None:
+            self.logdir = os.path.join(cfg.logdir, self.start_time + f'_{cfg.exp_name}')
+        else:
+            self.logdir = os.path.join(cfg.logdir, self.start_time)
+        # init tboard
+        super().__init__(self.logdir)
+        # backup the cfg
+        OmegaConf.save(cfg, os.path.join(self.log_dir, 'cfg.yaml'))
+        # backup the code state
+        if cfg.log_code_state:
+            dest_dir = os.path.join(self.logdir, 'code')
+            copytree(os.getcwd(), dest_dir, ignore=ignore_patterns(*cfg.patterns_to_ignore))
+        # init logger which handles printing and logging mostly same things to the log file
+        self.print_logger = logging.getLogger('main')
+        self.print_logger.setLevel(logging.INFO)
+        msgfmt = '[%(levelname)s] %(asctime)s - %(name)s \n    %(message)s'
+        datefmt = '%d %b %Y %H:%M:%S'
+        formatter = logging.Formatter(msgfmt, datefmt)
+        # stdout
+        sh = logging.StreamHandler()
+        sh.setLevel(logging.DEBUG)
+        sh.setFormatter(formatter)
+        self.print_logger.addHandler(sh)
+        # log file
+        fh = logging.FileHandler(os.path.join(self.log_dir, 'log.txt'))
+        fh.setLevel(logging.INFO)
+        fh.setFormatter(formatter)
+        self.print_logger.addHandler(fh)
+        self.print_logger.info(f'Saving logs and checkpoints @ {self.logdir}')
+    def log_param_num(self, model):
+        param_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        self.print_logger.info(f'The number of parameters: {param_num/1e+6:.3f} mil')
+        self.add_scalar('num_params', param_num, 0)
+        return param_num
+    def log_iter_loss(self, loss, iter, phase):
+        self.add_scalar(f'{phase}/loss_iter', loss, iter)
+    def log_epoch_loss(self, loss, epoch, phase):
+        self.add_scalar(f'{phase}/loss', loss, epoch)
+        self.print_logger.info(f'{phase} ({epoch}): loss {loss:.3f};')
+    def log_epoch_metrics(self, metrics_dict, epoch, phase):
+        for metric, val in metrics_dict.items():
+            self.add_scalar(f'{phase}/{metric}', val, epoch)
+        metrics_dict = {k: round(v, 4) for k, v in metrics_dict.items()}
+        self.print_logger.info(f'{phase} ({epoch}) metrics: {metrics_dict};')
+    def log_test_metrics(self, metrics_dict, hparams_dict, best_epoch):
+        allowed_types = (int, float, str, bool, torch.Tensor)
+        hparams_dict = {k: v for k, v in hparams_dict.items() if isinstance(v, allowed_types)}
+        metrics_dict = {f'test/{k}': round(v, 4) for k, v in metrics_dict.items()}
+        exp, ssi, sei = summary.hparams(hparams_dict, metrics_dict)
+        self.file_writer.add_summary(exp)
+        self.file_writer.add_summary(ssi)
+        self.file_writer.add_summary(sei)
+        for k, v in metrics_dict.items():
+            self.add_scalar(k, v, best_epoch)
+        self.print_logger.info(f'test ({best_epoch}) metrics: {metrics_dict};')
+    def log_best_model(self, model, loss, epoch, optimizer, metrics_dict):
+        model_name = model.__class__.__name__
+        self.best_model_path = os.path.join(self.logdir, f'{model_name}-{self.start_time}.pt')
+        checkpoint = {
+            'loss': loss,
+            'metrics': metrics_dict,
+            'epoch': epoch,
+            'optimizer': optimizer.state_dict(),
+            'model': model.state_dict(),
+        }
+        torch.save(checkpoint, self.best_model_path)
+        self.print_logger.info(f'Saved model in {self.best_model_path}')

foleycrafter/models/specvqgan/modules/losses/vggishish/loss.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
class WeightedCrossEntropy(nn.CrossEntropyLoss):
    """Cross-entropy loss whose class weighting can be switched on and off per call.

    The parent loss is always run with `reduction='none'`; the reduction over samples is done
    here, either as a weighted average (weights looked up by target class) or as a plain mean.
    """

    def __init__(self, weights, **pytorch_ce_loss_args) -> None:
        # per-sample losses are required because the reduction happens in `__call__`
        super().__init__(reduction='none', **pytorch_ce_loss_args)
        # indexed with the targets, i.e. one weight per class
        self.weights = weights

    def __call__(self, outputs, targets, to_weight=True):
        per_sample_loss = super().__call__(outputs, targets)
        if not to_weight:
            return per_sample_loss.mean()
        sample_weights = self.weights[targets]
        # normalising by the sum of the weights matches `nn.CrossEntropyLoss(weight=...)`
        return (per_sample_loss * sample_weights).sum() / sample_weights.sum()
if __name__ == '__main__':
    # Sanity check: the custom loss has to agree with the reductions built into PyTorch,
    # both with and without the class weights.
    logits = torch.randn(10, 5)
    labels = torch.randint(0, 5, (10,))
    class_weights = torch.tensor([1., 2., 3., 4., 5.])

    reference_weighted = nn.CrossEntropyLoss(weight=class_weights)
    reference_unweighted = nn.CrossEntropyLoss()
    custom = WeightedCrossEntropy(class_weights)

    assert torch.allclose(reference_weighted(logits, labels), custom(logits, labels, to_weight=True))
    assert torch.allclose(reference_unweighted(logits, labels), custom(logits, labels, to_weight=False))

    weighted_loss = custom(logits, labels, to_weight=True)
    unweighted_loss = custom(logits, labels, to_weight=False)
    print(weighted_loss, unweighted_loss)

foleycrafter/models/specvqgan/modules/losses/vggishish/metrics.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import logging
+import numpy as np
+import scipy
+import torch
+from sklearn.metrics import average_precision_score, roc_auc_score
+logger = logging.getLogger(f'main.{__name__}')
def metrics(targets, outputs, topk=(1, 5)):
    """
    Adapted from https://github.com/hche11/VGGSound/blob/master/utils.py
    Calculate statistics including accuracy@k, mAP, AUC, and d-prime.
        Args:
            targets: 1d tensor of class ids, (dataset_size, )
            outputs: 2d tensor of logits, (dataset_size, classes_num) - before softmax
            topk: tuple of `k` values for which accuracy@k is reported
        Returns:
            metrics_dict: a dict of metrics with the keys `accuracy_<k>` (one per `k`),
                `mAP`, `mROCAUC`, and `dprime`
    """
    # a bare `import scipy` does not guarantee that the `stats` submodule is loaded
    from scipy import stats

    metrics_dict = dict()
    num_cls = outputs.shape[-1]
    # everything below runs on cpu (numpy / sklearn), so drop the graph and the device first
    targets = targets.detach().cpu()
    outputs = outputs.detach().cpu()
    # accuracy@k. `torch.topk` fails if k exceeds the number of classes, hence the cap;
    # for such k the slice below simply takes all columns
    max_k = min(max(topk), num_cls)
    _, preds = torch.topk(outputs, k=max_k, dim=1)
    correct_for_maxtopk = preds == targets.view(-1, 1).expand_as(preds)
    for k in topk:
        metrics_dict[f'accuracy_{k}'] = float(correct_for_maxtopk[:, :k].sum() / correct_for_maxtopk.shape[0])
    # avg precision, average roc_auc, and dprime
    targets = torch.nn.functional.one_hot(targets, num_classes=num_cls)
    # class probabilities
    targets_pred = torch.softmax(outputs, dim=1)
    targets = targets.numpy()
    targets_pred = targets_pred.numpy()
    # one-vs-rest
    avg_p = [average_precision_score(targets[:, c], targets_pred[:, c], average=None) for c in range(num_cls)]
    try:
        roc_aucs = [roc_auc_score(targets[:, c], targets_pred[:, c], average=None) for c in range(num_cls)]
    except ValueError:
        # ROC AUC is undefined for a class with only one label value in `targets`;
        # fall back to chance-level placeholders instead of failing the whole run
        logger.warning('Weird... Some classes never occured in targets. Do not trust the metrics.')
        roc_aucs = np.array([0.5])
        avg_p = np.array([0])
    metrics_dict['mAP'] = np.mean(avg_p)
    metrics_dict['mROCAUC'] = np.mean(roc_aucs)
    # Percent point function (ppf) (inverse of cdf — percentiles).
    metrics_dict['dprime'] = stats.norm.ppf(metrics_dict['mROCAUC']) * np.sqrt(2)
    return metrics_dict
if __name__ == '__main__':
    # toy example: 6 samples scored over 4 classes
    toy_logits = torch.tensor([
        [1.2, 1.3, 1.1, 1.5],
        [1.3, 1.4, 1.0, 1.1],
        [1.5, 1.1, 1.4, 1.3],
        [1.0, 1.2, 1.4, 1.5],
        [1.2, 1.3, 1.1, 1.1],
        [1.2, 1.1, 1.1, 1.1],
    ]).float()
    toy_targets = torch.tensor([3, 3, 1, 2, 1, 0])
    print(metrics(toy_targets, toy_logits, topk=(1, 3)))

foleycrafter/models/specvqgan/modules/losses/vggishish/model.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import torch
+import torch.nn as nn
class VGGishish(nn.Module):
    """VGG-style classifier for single-channel 2D inputs (a channel dim is added in `forward`)."""

    def __init__(self, conv_layers, use_bn, num_classes):
        '''
        Mostly from
            https://pytorch.org/vision/0.8/_modules/torchvision/models/vgg.html

        Args:
            conv_layers: a list from config; an int adds a 3x3 conv with that many output
                channels (followed by ReLU), the string 'MP' adds a 2x2 max-pool
            use_bn: if True, a BatchNorm2d is inserted between every conv and its ReLU
            num_classes: the size of the output layer
        '''
        super().__init__()
        layers = []
        in_channels = 1
        # a list of channels with 'MP' (maxpool) from config
        for v in conv_layers:
            if v == 'MP':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1, stride=1)
                if use_bn:
                    layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
                else:
                    layers += [conv2d, nn.ReLU(inplace=True)]
                in_channels = v
        self.features = nn.Sequential(*layers)
        self.avgpool = nn.AdaptiveAvgPool2d((5, 10))
        self.flatten = nn.Flatten()
        # after the loop `in_channels` is the width of the last conv, so the classifier fits
        # any `conv_layers` (it equals the formerly hard-coded 512 when the last conv has 512)
        self.classifier = nn.Sequential(
            nn.Linear(in_channels * 5 * 10, 4096),
            nn.ReLU(True),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Linear(4096, num_classes)
        )
        # weight init
        self.reset_parameters()

    def forward(self, x):
        # adding channel dim for conv2d (B, 1, F, T) <- (B, F, T)
        x = x.unsqueeze(1)
        # backbone (B, C, F', T') <- (B, 1, F, T); every 'MP' halves F and T
        x = self.features(x)
        # adaptive avg pooling (B, C, 5, 10) <- (B, C, F', T'), whatever F' and T' are
        x = self.avgpool(x)
        # flatten (B, C*5*10)
        x = self.flatten(x)
        # classify (B, num_classes)
        x = self.classifier(x)
        return x

    def reset_parameters(self):
        """Initialise convs (Kaiming normal), batch norms (1 / 0), and linears (N(0, 0.01))."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
if __name__ == '__main__':
    # smoke test: a batch of 3 random inputs through a VGG-16-like stack without the last max-pool
    num_classes = 309
    dummy_batch = torch.rand(3, 80, 848)
    vgg_stack = [64, 64, 'MP', 128, 128, 'MP', 256, 256, 256, 'MP', 512, 512, 512, 'MP', 512, 512, 512]
    # a lighter alternative: [64, 'MP', 128, 'MP', 256, 256, 'MP', 512, 512, 'MP']
    net = VGGishish(vgg_stack, use_bn=False, num_classes=num_classes)
    print(net(dummy_batch).shape)

foleycrafter/models/specvqgan/modules/losses/vggishish/predict.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import os
+from torch.utils.data import DataLoader
+import torchvision
+from tqdm import tqdm
+from dataset import VGGSound
+import torch
+import torch.nn as nn
+from metrics import metrics
+from omegaconf import OmegaConf
+from model import VGGishish
+from transforms import Crop, StandardNormalizeAudio, ToTensor
if __name__ == '__main__':
    # Evaluates a trained VGGishish checkpoint on the VGGSound test split and prints the metrics.
    # The config path comes from the command line in OmegaConf dotlist form (`config=<yaml>`);
    # any other `key=value` argument overrides the value from the yaml file.
    cfg_cli = OmegaConf.from_cli()
    print(cfg_cli.config)
    cfg_yml = OmegaConf.load(cfg_cli.config)
    # the latter arguments are prioritized
    cfg = OmegaConf.merge(cfg_yml, cfg_cli)
    OmegaConf.set_readonly(cfg, True)
    print(OmegaConf.to_yaml(cfg))

    # NOTE(review): here `Crop` runs after `ToTensor`, while the training scripts crop first;
    # confirm that `Crop` handles tensors before relying on `cfg.cropped_size` in this script
    transforms = [
        StandardNormalizeAudio(cfg.mels_path),
        ToTensor(),
    ]
    if cfg.cropped_size not in [None, 'None', 'none']:
        transforms.append(Crop(cfg.cropped_size))
    transforms = torchvision.transforms.transforms.Compose(transforms)

    datasets = {
        'test': VGGSound('test', cfg.mels_path, transforms),
    }

    loaders = {
        'test': DataLoader(datasets['test'], batch_size=cfg.batch_size,
                           num_workers=cfg.num_workers, pin_memory=True)
    }

    device = torch.device(cfg.device if torch.cuda.is_available() else 'cpu')
    model = VGGishish(cfg.conv_layers, cfg.use_bn, num_classes=len(datasets['test'].target2label))
    model = model.to(device)

    # inference only: no optimizer is needed, just the loss for reporting
    criterion = nn.CrossEntropyLoss()

    # loading the best model; the checkpoint is looked up by the name of the config's folder
    folder_name = os.path.split(cfg.config)[0].split('/')[-1]
    print(folder_name)
    ckpt = torch.load(f'./logs/{folder_name}/vggishish-{folder_name}.pt', map_location='cpu')
    model.load_state_dict(ckpt['model'])
    print((f'The model was trained for {ckpt["epoch"]} epochs. Loss: {ckpt["loss"]:.4f}'))

    # Testing the model
    model.eval()
    running_loss = 0
    preds_from_each_batch = []
    targets_from_each_batch = []

    for batch in tqdm(loaders['test']):
        inputs = batch['input'].to(device)
        targets = batch['target'].to(device)

        # forward only
        with torch.no_grad():
            outputs = model(inputs)
            loss = criterion(outputs, targets)

        # loss
        running_loss += loss.item()

        # for metrics calculation later on
        preds_from_each_batch += [outputs.detach().cpu()]
        targets_from_each_batch += [targets.cpu()]

    # logging metrics
    preds_from_each_batch = torch.cat(preds_from_each_batch)
    targets_from_each_batch = torch.cat(targets_from_each_batch)
    test_metrics_dict = metrics(targets_from_each_batch, preds_from_each_batch)
    # average of the per-batch mean losses
    test_metrics_dict['avg_loss'] = running_loss / len(loaders['test'])
    test_metrics_dict['param_num'] = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(test_metrics_dict)

foleycrafter/models/specvqgan/modules/losses/vggishish/predict_gh.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import os
+import sys
+import json
+from torch.utils.data import DataLoader
+import torchvision
+from tqdm import tqdm
+from dataset import GreatestHit, AMT_test
+import torch
+import torch.nn as nn
+from metrics import metrics
+from omegaconf import OmegaConf
+from model import VGGishish
+from transforms import Crop, StandardNormalizeAudio, ToTensor
if __name__ == '__main__':
    # Predicts a class for every mel-spectrogram found under `target_path` and dumps the
    # predictions as a json list next to it.
    # Usage: python predict_gh.py <config.yaml> <target_path> <checkpoint.pt>
    cfg_cli = sys.argv[1]
    target_path = sys.argv[2]
    model_path = sys.argv[3]
    # the config is taken from the yaml file as-is, there are no CLI overrides in this script
    cfg = OmegaConf.load(cfg_cli)
    OmegaConf.set_readonly(cfg, True)

    device = torch.device(cfg.device if torch.cuda.is_available() else 'cpu')
    transforms = [
        StandardNormalizeAudio(cfg.mels_path),
    ]
    if cfg.cropped_size not in [None, 'None', 'none']:
        transforms.append(Crop(cfg.cropped_size))
    transforms.append(ToTensor())
    transforms = torchvision.transforms.transforms.Compose(transforms)

    testset = AMT_test(target_path, transforms, action_only=cfg.action_only, material_only=cfg.material_only)
    loader = DataLoader(testset, batch_size=cfg.batch_size,
                        num_workers=cfg.num_workers, pin_memory=True)

    model = VGGishish(cfg.conv_layers, cfg.use_bn, num_classes=len(testset.label2target))
    # map to cpu first so that a checkpoint saved on a GPU also loads on a CPU-only machine
    ckpt = torch.load(model_path, map_location='cpu')['model']
    model.load_state_dict(ckpt, strict=True)
    model = model.to(device)
    model.eval()

    preds_from_each_batch = []
    file_path_from_each_batch = []
    for batch in tqdm(loader):
        inputs = batch['input'].to(device)
        file_path = batch['file_path']
        with torch.no_grad():
            outputs = model(inputs)
        # collected on cpu, the top-1 class is taken after the loop
        preds_from_each_batch += [outputs.detach().cpu()]
        file_path_from_each_batch += file_path
    preds_from_each_batch = torch.cat(preds_from_each_batch)
    _, preds = torch.topk(preds_from_each_batch, k=1)
    pred_dict = {fp: int(p.item()) for fp, p in zip(file_path_from_each_batch, preds)}
    # the output list is ordered by file index: the files are expected to be `0.npy`, `1.npy`, ...
    mel_parent_dir = os.path.dirname(list(pred_dict.keys())[0])
    pred_list = [pred_dict[os.path.join(mel_parent_dir, f'{i}.npy')] for i in range(len(pred_dict))]
    # the context manager closes (and flushes) the file even if the dump fails
    with open(target_path + f'_{cfg.exp_name}_preds.json', 'w') as out_file:
        json.dump(pred_list, out_file)

foleycrafter/models/specvqgan/modules/losses/vggishish/train_melception.py ADDED Viewed

	@@ -0,0 +1,241 @@

+import random
+import numpy as np
+import torch
+import torchvision
+from omegaconf import OmegaConf
+from torch.utils.data.dataloader import DataLoader
+from torchvision.models.inception import BasicConv2d, Inception3
+from tqdm import tqdm
+from dataset import VGGSound
+from logger import LoggerWithTBoard
+from loss import WeightedCrossEntropy
+from metrics import metrics
+from transforms import Crop, StandardNormalizeAudio, ToTensor
+# TODO: refactor  ./evaluation/feature_extractors/melception.py to handle this class as well.
+# So far couldn't do it because of the difference in outputs
class Melception(Inception3):
    """Inception v3 modified to take 1-channel mel-spectrograms instead of RGB images.

    Example: `inception = Melception(num_classes=309)`
    """

    def __init__(self, num_classes, **kwargs):
        super().__init__(num_classes=num_classes, **kwargs)
        # the stem conv is the same as in
        # https://github.com/pytorch/vision/blob/5339e63148/torchvision/models/inception.py#L95
        # except that it expects 1 input channel instead of 3 (RGB)
        self.Conv2d_1a_3x3 = BasicConv2d(1, 32, kernel_size=3, stride=2)
        # the 'height' of a mel spec is only 80 (vs 299 for RGB inputs),
        # therefore both max-pools of Inception are turned into no-ops
        for pool_name in ('maxpool1', 'maxpool2'):
            setattr(self, pool_name, torch.nn.Identity())

    def forward(self, x):
        # the parent expects a channel dim, which is inserted at position 1
        return super().forward(x.unsqueeze(1))
def train_inception_scorer(cfg):
    """Train, validate, and test a `Melception` classifier on VGGSound mel-spectrograms.

    RNGs are seeded from `cfg.seed`, then every epoch runs a 'train' and a 'valid' phase.
    The checkpoint with the lowest validation loss is saved through the logger, and training
    stops early once the validation loss has not improved for `cfg.patience` epochs.
    Afterwards the best checkpoint is reloaded and evaluated on the test split.

    Args:
        cfg: experiment config, also passed to `LoggerWithTBoard`. Fields read here: `seed`,
            `mels_path`, `cropped_size`, `batch_size`, `num_workers`, `device`, `optimizer`
            ('adam' or 'sgd'), `learning_rate`, `betas` (adam) / `momentum` (sgd),
            `weight_decay`, `cls_weights_in_loss`, `num_epochs`, and `patience`.

    Raises:
        NotImplementedError: if `cfg.optimizer` is neither 'adam' nor 'sgd'.
    """
    logger = LoggerWithTBoard(cfg)

    random.seed(cfg.seed)
    np.random.seed(cfg.seed)
    torch.manual_seed(cfg.seed)
    torch.cuda.manual_seed_all(cfg.seed)
    # makes iterations faster (in this case 30%) if your inputs are of a fixed size
    # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
    torch.backends.cudnn.benchmark = True

    meta_path = './data/vggsound.csv'
    train_ids_path = './data/vggsound_train.txt'
    cache_path = './data/'
    splits_path = cache_path

    transforms = [
        StandardNormalizeAudio(cfg.mels_path, train_ids_path, cache_path),
    ]
    if cfg.cropped_size not in [None, 'None', 'none']:
        logger.print_logger.info(f'Using cropping {cfg.cropped_size}')
        transforms.append(Crop(cfg.cropped_size))
    transforms.append(ToTensor())
    transforms = torchvision.transforms.transforms.Compose(transforms)

    datasets = {
        'train': VGGSound('train', cfg.mels_path, transforms, splits_path, meta_path),
        'valid': VGGSound('valid', cfg.mels_path, transforms, splits_path, meta_path),
        'test': VGGSound('test', cfg.mels_path, transforms, splits_path, meta_path),
    }

    loaders = {
        'train': DataLoader(datasets['train'], batch_size=cfg.batch_size, shuffle=True, drop_last=True,
                            num_workers=cfg.num_workers, pin_memory=True),
        'valid': DataLoader(datasets['valid'], batch_size=cfg.batch_size,
                            num_workers=cfg.num_workers, pin_memory=True),
        'test': DataLoader(datasets['test'], batch_size=cfg.batch_size,
                           num_workers=cfg.num_workers, pin_memory=True),
    }

    device = torch.device(cfg.device if torch.cuda.is_available() else 'cpu')

    model = Melception(num_classes=len(datasets['train'].target2label))
    model = model.to(device)
    param_num = logger.log_param_num(model)

    if cfg.optimizer == 'adam':
        optimizer = torch.optim.Adam(
            model.parameters(), lr=cfg.learning_rate, betas=cfg.betas, weight_decay=cfg.weight_decay)
    elif cfg.optimizer == 'sgd':
        optimizer = torch.optim.SGD(
            model.parameters(), lr=cfg.learning_rate, momentum=cfg.momentum, weight_decay=cfg.weight_decay)
    else:
        raise NotImplementedError

    # inverse-frequency class weights, or uniform weights if the weighting is switched off
    if cfg.cls_weights_in_loss:
        weights = 1 / datasets['train'].class_counts
    else:
        weights = torch.ones(len(datasets['train'].target2label))
    criterion = WeightedCrossEntropy(weights.to(device))

    # loop over the train and validation multiple times (typical PT boilerplate)
    no_change_epochs = 0
    best_valid_loss = float('inf')
    early_stop_triggered = False

    for epoch in range(cfg.num_epochs):

        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0
            preds_from_each_batch = []
            targets_from_each_batch = []

            prog_bar = tqdm(loaders[phase], f'{phase} ({epoch})', ncols=0)
            for i, batch in enumerate(prog_bar):
                inputs = batch['input'].to(device)
                targets = batch['target'].to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                with torch.set_grad_enabled(phase == 'train'):
                    if phase == 'train':
                        # in train mode inception v3 also returns the logits of its auxiliary head
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, targets, to_weight=True)
                        loss2 = criterion(aux_outputs, targets, to_weight=True)
                        # the auxiliary head contributes with a 0.4 factor; the sum must not be
                        # replaced by the main-head loss alone, or the auxiliary head never trains
                        loss = loss1 + 0.4*loss2
                    else:
                        # in eval mode only the main logits are returned;
                        # the validation loss is the plain (unweighted) mean
                        outputs = model(inputs)
                        loss = criterion(outputs, targets, to_weight=False)

                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # loss
                running_loss += loss.item()

                # for metrics calculation later on
                preds_from_each_batch += [outputs.detach().cpu()]
                targets_from_each_batch += [targets.cpu()]

                # iter logging
                if i % 50 == 0:
                    logger.log_iter_loss(loss.item(), epoch*len(loaders[phase])+i, phase)
                    # tracks loss in the tqdm progress bar
                    prog_bar.set_postfix(loss=loss.item())

            # logging loss
            epoch_loss = running_loss / len(loaders[phase])
            logger.log_epoch_loss(epoch_loss, epoch, phase)

            # logging metrics
            preds_from_each_batch = torch.cat(preds_from_each_batch)
            targets_from_each_batch = torch.cat(targets_from_each_batch)
            metrics_dict = metrics(targets_from_each_batch, preds_from_each_batch)
            logger.log_epoch_metrics(metrics_dict, epoch, phase)

            # Early stopping
            if phase == 'valid':
                if epoch_loss < best_valid_loss:
                    no_change_epochs = 0
                    best_valid_loss = epoch_loss
                    logger.log_best_model(model, epoch_loss, epoch, optimizer, metrics_dict)
                else:
                    no_change_epochs += 1
                    logger.print_logger.info(
                        f'Valid loss hasnt changed for {no_change_epochs} patience: {cfg.patience}'
                    )
                    if no_change_epochs >= cfg.patience:
                        early_stop_triggered = True

        if early_stop_triggered:
            logger.print_logger.info(f'Training is early stopped @ {epoch}')
            break

    logger.print_logger.info('Finished Training')

    # loading the best model
    ckpt = torch.load(logger.best_model_path)
    model.load_state_dict(ckpt['model'])
    logger.print_logger.info(f'Loading the best model from {logger.best_model_path}')
    logger.print_logger.info((f'The model was trained for {ckpt["epoch"]} epochs. Loss: {ckpt["loss"]:.4f}'))

    # Testing the model
    model.eval()
    running_loss = 0
    preds_from_each_batch = []
    targets_from_each_batch = []

    for batch in loaders['test']:
        inputs = batch['input'].to(device)
        targets = batch['target'].to(device)

        # forward only: no gradients are needed at test time
        with torch.no_grad():
            outputs = model(inputs)
            loss = criterion(outputs, targets, to_weight=False)

        # loss
        running_loss += loss.item()

        # for metrics calculation later on
        preds_from_each_batch += [outputs.detach().cpu()]
        targets_from_each_batch += [targets.cpu()]

    # logging metrics
    preds_from_each_batch = torch.cat(preds_from_each_batch)
    targets_from_each_batch = torch.cat(targets_from_each_batch)
    test_metrics_dict = metrics(targets_from_each_batch, preds_from_each_batch)
    test_metrics_dict['avg_loss'] = running_loss / len(loaders['test'])
    test_metrics_dict['param_num'] = param_num
    # TODO: the hparams metrics reportedly do not show up in tensorboard when the experiment is
    # started from cli: `python train_melception.py config=./configs/vggish.yaml`,
    # while they are logged when it is started from the vscode debugger
    logger.log_test_metrics(test_metrics_dict, dict(cfg), ckpt['epoch'])

    logger.print_logger.info('Finished the experiment')
if __name__ == '__main__':
    # Expected input size: (3, 299, 299) in RGB -> (1, 80, 848) in Mel Spec
    # The yaml file named by the `config=` CLI argument provides the base values;
    # the remaining CLI arguments are merged on top of it and therefore take priority.
    cli_args = OmegaConf.from_cli()
    file_args = OmegaConf.load(cli_args.config)
    cfg = OmegaConf.merge(file_args, cli_args)
    OmegaConf.set_readonly(cfg, True)
    print(OmegaConf.to_yaml(cfg))

    train_inception_scorer(cfg)

foleycrafter/models/specvqgan/modules/losses/vggishish/train_vggishish.py ADDED Viewed

	@@ -0,0 +1,199 @@

+from loss import WeightedCrossEntropy
+import random
+import numpy as np
+import torch
+import torchvision
+from omegaconf import OmegaConf
+from torch.utils.data.dataloader import DataLoader
+from tqdm import tqdm
+from dataset import VGGSound
+from transforms import Crop, StandardNormalizeAudio, ToTensor
+from logger import LoggerWithTBoard
+from metrics import metrics
+from model import VGGishish
if __name__ == "__main__":
    # ---- config: the yaml file provides the base, CLI arguments are merged on top of it ----
    cli_args = OmegaConf.from_cli()
    file_args = OmegaConf.load(cli_args.config)
    cfg = OmegaConf.merge(file_args, cli_args)
    OmegaConf.set_readonly(cfg, True)
    print(OmegaConf.to_yaml(cfg))

    logger = LoggerWithTBoard(cfg)

    # ---- reproducibility: the same seed for every RNG in use ----
    random.seed(cfg.seed)
    np.random.seed(cfg.seed)
    torch.manual_seed(cfg.seed)
    torch.cuda.manual_seed_all(cfg.seed)
    # cudnn autotuning speeds up the iterations (here by ~30%) when the input size is fixed
    # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
    torch.backends.cudnn.benchmark = True

    # ---- data: normalize -> (optionally) crop -> to tensor ----
    pipeline = [StandardNormalizeAudio(cfg.mels_path)]
    if cfg.cropped_size not in [None, 'None', 'none']:
        logger.print_logger.info(f'Using cropping {cfg.cropped_size}')
        pipeline.append(Crop(cfg.cropped_size))
    pipeline.append(ToTensor())
    preprocessing = torchvision.transforms.transforms.Compose(pipeline)

    datasets = {split: VGGSound(split, cfg.mels_path, preprocessing) for split in ('train', 'valid', 'test')}

    # only the training loader shuffles and drops the last incomplete batch
    shared_loader_args = dict(batch_size=cfg.batch_size, num_workers=cfg.num_workers, pin_memory=True)
    loaders = {
        'train': DataLoader(datasets['train'], shuffle=True, drop_last=True, **shared_loader_args),
        'valid': DataLoader(datasets['valid'], **shared_loader_args),
        'test': DataLoader(datasets['test'], **shared_loader_args),
    }

    # ---- model, optimizer, and loss ----
    device = torch.device(cfg.device if torch.cuda.is_available() else 'cpu')
    num_classes = len(datasets['train'].target2label)
    model = VGGishish(cfg.conv_layers, cfg.use_bn, num_classes=num_classes).to(device)
    param_num = logger.log_param_num(model)

    if cfg.optimizer == 'sgd':
        optimizer = torch.optim.SGD(
            model.parameters(), lr=cfg.learning_rate, momentum=cfg.momentum, weight_decay=cfg.weight_decay)
    elif cfg.optimizer == 'adam':
        optimizer = torch.optim.Adam(
            model.parameters(), lr=cfg.learning_rate, betas=cfg.betas, weight_decay=cfg.weight_decay)
    else:
        raise NotImplementedError

    # inverse-frequency class weights, or uniform weights if the weighting is switched off
    if cfg.cls_weights_in_loss:
        class_weights = 1 / datasets['train'].class_counts
    else:
        class_weights = torch.ones(len(datasets['train'].target2label))
    criterion = WeightedCrossEntropy(class_weights.to(device))

    # ---- training with validation after every epoch (typical PT boilerplate) ----
    epochs_without_improvement = 0
    lowest_valid_loss = float('inf')
    stop_early = False

    for epoch in range(cfg.num_epochs):

        for phase in ['train', 'valid']:
            is_train = phase == 'train'
            # `train(False)` is what `eval()` does
            model.train(is_train)

            loss_sum = 0
            epoch_outputs = []
            epoch_targets = []

            prog_bar = tqdm(loaders[phase], f'{phase} ({epoch})', ncols=0)
            for i, batch in enumerate(prog_bar):
                inputs = batch['input'].to(device)
                targets = batch['target'].to(device)

                # gradients from the previous step must not accumulate
                optimizer.zero_grad()

                # the class weights are applied to the training loss only
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    loss = criterion(outputs, targets, to_weight=is_train)

                if is_train:
                    loss.backward()
                    optimizer.step()

                batch_loss = loss.item()
                loss_sum += batch_loss

                # kept on cpu for the epoch-level metrics
                epoch_outputs.append(outputs.detach().cpu())
                epoch_targets.append(targets.cpu())

                # every 50th iteration goes to the logger and to the tqdm progress bar
                if i % 50 == 0:
                    logger.log_iter_loss(batch_loss, epoch*len(loaders[phase])+i, phase)
                    prog_bar.set_postfix(loss=batch_loss)

            # epoch loss = average of the per-batch losses
            epoch_loss = loss_sum / len(loaders[phase])
            logger.log_epoch_loss(epoch_loss, epoch, phase)

            # epoch metrics over all samples seen in this phase
            metrics_dict = metrics(torch.cat(epoch_targets), torch.cat(epoch_outputs))
            logger.log_epoch_metrics(metrics_dict, epoch, phase)

            # early stopping is driven by the validation loss only
            if not is_train:
                if epoch_loss < lowest_valid_loss:
                    epochs_without_improvement = 0
                    lowest_valid_loss = epoch_loss
                    logger.log_best_model(model, epoch_loss, epoch, optimizer, metrics_dict)
                else:
                    epochs_without_improvement += 1
                    logger.print_logger.info(
                        f'Valid loss hasnt changed for {epochs_without_improvement} patience: {cfg.patience}'
                    )
                    if epochs_without_improvement >= cfg.patience:
                        stop_early = True

        if stop_early:
            logger.print_logger.info(f'Training is early stopped @ {epoch}')
            break

    logger.print_logger.info('Finished Training')

    # ---- testing the checkpoint with the lowest validation loss ----
    ckpt = torch.load(logger.best_model_path)
    model.load_state_dict(ckpt['model'])
    logger.print_logger.info(f'Loading the best model from {logger.best_model_path}')
    logger.print_logger.info((f'The model was trained for {ckpt["epoch"]} epochs. Loss: {ckpt["loss"]:.4f}'))

    model.eval()
    test_loss_sum = 0
    test_outputs = []
    test_targets = []

    for i, batch in enumerate(loaders['test']):
        inputs = batch['input'].to(device)
        targets = batch['target'].to(device)

        optimizer.zero_grad()

        # forward only, unweighted loss
        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            loss = criterion(outputs, targets, to_weight=False)

        test_loss_sum += loss.item()

        test_outputs.append(outputs.detach().cpu())
        test_targets.append(targets.cpu())

    test_metrics_dict = metrics(torch.cat(test_targets), torch.cat(test_outputs))
    test_metrics_dict['avg_loss'] = test_loss_sum / len(loaders['test'])
    test_metrics_dict['param_num'] = param_num
    # TODO: the hparams metrics reportedly do not show up in tensorboard when the experiment is
    # started from cli: `python train_vggishish.py config=./configs/vggish.yaml`,
    # while they are logged when it is started from the vscode debugger
    logger.log_test_metrics(test_metrics_dict, dict(cfg), ckpt['epoch'])

    logger.print_logger.info('Finished the experiment')

foleycrafter/models/specvqgan/modules/losses/vggishish/train_vggishish_gh.py ADDED Viewed

	@@ -0,0 +1,218 @@

+from loss import WeightedCrossEntropy
+import random
+import os
+import sys
+import json
+import numpy as np
+import torch
+import torchvision
+from omegaconf import OmegaConf
+from torch.utils.data.dataloader import DataLoader
+from tqdm import tqdm
+from dataset import GreatestHit, AMT_test
+from transforms import Crop, StandardNormalizeAudio, ToTensor
+from logger import LoggerWithTBoard
+from metrics import metrics
+from model import VGGishish
+if __name__ == "__main__":
+    # Training script for the VGGishish classifier on the GreatestHit dataset:
+    # trains and validates with early stopping, then evaluates the best checkpoint on the test split.
+    # The first CLI argument is the path to the YAML config.
+    cfg_cli = sys.argv[1]
+    cfg_yml = OmegaConf.load(cfg_cli)
+    # the config comes from the YAML file only (no CLI overrides are merged in this script)
+    cfg = cfg_yml
+    OmegaConf.set_readonly(cfg, True)
+    print(OmegaConf.to_yaml(cfg))
+    logger = LoggerWithTBoard(cfg)
+    # seeding python, numpy and torch (CPU and all GPUs) with the same seed
+    random.seed(cfg.seed)
+    np.random.seed(cfg.seed)
+    torch.manual_seed(cfg.seed)
+    torch.cuda.manual_seed_all(cfg.seed)
+    # makes iterations faster (in this case 30%) if your inputs are of a fixed size
+    # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
+    torch.backends.cudnn.benchmark = True
+    # input pipeline: normalization -> (optional) crop -> conversion to tensors
+    transforms = [
+        StandardNormalizeAudio(cfg.mels_path),
+    ]
+    # the config may spell "no cropping" either as a real None or as a string
+    if cfg.cropped_size not in [None, 'None', 'none']:
+        logger.print_logger.info(f'Using cropping {cfg.cropped_size}')
+        transforms.append(Crop(cfg.cropped_size))
+    transforms.append(ToTensor())
+    transforms = torchvision.transforms.transforms.Compose(transforms)
+    # all three splits share the same transforms and label-selection flags
+    datasets = {
+        'train': GreatestHit('train', cfg.mels_path, transforms, action_only=cfg.action_only, material_only=cfg.material_only),
+        'valid': GreatestHit('valid', cfg.mels_path, transforms, action_only=cfg.action_only, material_only=cfg.material_only),
+        'test': GreatestHit('test', cfg.mels_path, transforms, action_only=cfg.action_only, material_only=cfg.material_only),
+    }
+    # only the train loader shuffles and drops the last incomplete batch
+    loaders = {
+        'train': DataLoader(datasets['train'], batch_size=cfg.batch_size, shuffle=True, drop_last=True,
+                            num_workers=cfg.num_workers, pin_memory=True),
+        'valid': DataLoader(datasets['valid'], batch_size=cfg.batch_size,
+                            num_workers=cfg.num_workers, pin_memory=True),
+        'test': DataLoader(datasets['test'], batch_size=cfg.batch_size,
+                           num_workers=cfg.num_workers, pin_memory=True),
+    }
+    # falls back to CPU when CUDA is not available
+    device = torch.device(cfg.device if torch.cuda.is_available() else 'cpu')
+    # the number of classes is taken from the train split label mapping
+    model = VGGishish(cfg.conv_layers, cfg.use_bn, num_classes=len(datasets['train'].label2target))
+    model = model.to(device)
+    if cfg.load_model is not None:
+        # warm start: copies every pretrained weight except the ones whose name contains 'classifier'
+        state_dict = torch.load(cfg.load_model, map_location=device)['model']
+        target_dict = {}
+        # ignore the last layer
+        for key, v in state_dict.items():
+            # ignore classifier
+            if 'classifier' not in key:
+                target_dict[key] = v
+        # strict=False tolerates the classifier keys that were filtered out above
+        model.load_state_dict(target_dict, strict=False)
+    param_num = logger.log_param_num(model)
+    if cfg.optimizer == 'adam':
+        optimizer = torch.optim.Adam(
+            model.parameters(), lr=cfg.learning_rate, betas=cfg.betas, weight_decay=cfg.weight_decay)
+    elif cfg.optimizer == 'sgd':
+        optimizer = torch.optim.SGD(
+            model.parameters(), lr=cfg.learning_rate, momentum=cfg.momentum, weight_decay=cfg.weight_decay)
+    else:
+        raise NotImplementedError
+    # class weights: inverse train-set class counts, or uniform weights
+    if cfg.cls_weights_in_loss:
+        weights = 1 / datasets['train'].class_counts
+    else:
+        weights = torch.ones(len(datasets['train'].label2target))
+    criterion = WeightedCrossEntropy(weights.to(device))
+    # loop over the train and validation multiple times (typical PT boilerplate)
+    no_change_epochs = 0
+    best_valid_loss = float('inf')
+    early_stop_triggered = False
+    for epoch in range(cfg.num_epochs):
+        for phase in ['train', 'valid']:
+            if phase == 'train':
+                model.train()
+            else:
+                model.eval()
+            running_loss = 0
+            preds_from_each_batch = []
+            targets_from_each_batch = []
+            prog_bar = tqdm(loaders[phase], f'{phase} ({epoch})', ncols=0)
+            for i, batch in enumerate(prog_bar):
+                inputs = batch['input'].to(device)
+                targets = batch['target'].to(device)
+                # zero the parameter gradients
+                optimizer.zero_grad()
+                # forward (+ backward and optimizer step in the train phase only)
+                with torch.set_grad_enabled(phase == 'train'):
+                    outputs = model(inputs)
+                    # `to_weight` is True only in the train phase
+                    # (presumably it toggles the class weighting inside the criterion -- TODO confirm)
+                    loss = criterion(outputs, targets, to_weight=phase == 'train')
+                if phase == 'train':
+                    loss.backward()
+                    optimizer.step()
+                # loss
+                running_loss += loss.item()
+                # for metrics calculation later on
+                preds_from_each_batch += [outputs.detach().cpu()]
+                targets_from_each_batch += [targets.cpu()]
+                # iter logging (every 50th iteration; the step is the global iteration index of the phase)
+                if i % 50 == 0:
+                    logger.log_iter_loss(loss.item(), epoch*len(loaders[phase])+i, phase)
+                    # tracks loss in the tqdm progress bar
+                    prog_bar.set_postfix(loss=loss.item())
+            # logging loss (average of the per-batch losses)
+            epoch_loss = running_loss / len(loaders[phase])
+            logger.log_epoch_loss(epoch_loss, epoch, phase)
+            # logging metrics
+            preds_from_each_batch = torch.cat(preds_from_each_batch)
+            targets_from_each_batch = torch.cat(targets_from_each_batch)
+            # only top-1 is requested for the action-only setup, otherwise top-1 and top-5
+            if cfg.action_only:
+                metrics_dict = metrics(targets_from_each_batch, preds_from_each_batch, topk=(1,))
+            else:
+                metrics_dict = metrics(targets_from_each_batch, preds_from_each_batch, topk=(1, 5))
+            logger.log_epoch_metrics(metrics_dict, epoch, phase)
+            # Early stopping: a new best validation loss saves the model and resets the counter,
+            # otherwise the counter grows until it reaches `cfg.patience`
+            if phase == 'valid':
+                if epoch_loss < best_valid_loss:
+                    no_change_epochs = 0
+                    best_valid_loss = epoch_loss
+                    logger.log_best_model(model, epoch_loss, epoch, optimizer, metrics_dict)
+                else:
+                    no_change_epochs += 1
+                    logger.print_logger.info(
+                        f'Valid loss hasnt changed for {no_change_epochs} patience: {cfg.patience}'
+                    )
+                    if no_change_epochs >= cfg.patience:
+                        early_stop_triggered = True
+        # the flag is checked after both phases of the epoch are done
+        if early_stop_triggered:
+            logger.print_logger.info(f'Training is early stopped @ {epoch}')
+            break
+    logger.print_logger.info('Finished Training')
+    # loading the best model
+    ckpt = torch.load(logger.best_model_path)
+    model.load_state_dict(ckpt['model'])
+    logger.print_logger.info(f'Loading the best model from {logger.best_model_path}')
+    logger.print_logger.info((f'The model was trained for {ckpt["epoch"]} epochs. Loss: {ckpt["loss"]:.4f}'))
+    # Testing the model
+    model.eval()
+    running_loss = 0
+    preds_from_each_batch = []
+    targets_from_each_batch = []
+    for i, batch in enumerate(loaders['test']):
+        inputs = batch['input'].to(device)
+        targets = batch['target'].to(device)
+        # zero the parameter gradients (no backward pass is done in this loop)
+        optimizer.zero_grad()
+        # forward only: gradients are disabled
+        with torch.set_grad_enabled(False):
+            outputs = model(inputs)
+            loss = criterion(outputs, targets, to_weight=False)
+        # loss
+        running_loss += loss.item()
+        # for metrics calculation later on
+        preds_from_each_batch += [outputs.detach().cpu()]
+        targets_from_each_batch += [targets.cpu()]
+    # logging metrics
+    preds_from_each_batch = torch.cat(preds_from_each_batch)
+    targets_from_each_batch = torch.cat(targets_from_each_batch)
+    if cfg.action_only:
+        test_metrics_dict = metrics(targets_from_each_batch, preds_from_each_batch, topk=(1,))
+    else:
+        test_metrics_dict = metrics(targets_from_each_batch, preds_from_each_batch, topk=(1, 5))
+    test_metrics_dict['avg_loss'] = running_loss / len(loaders['test'])
+    test_metrics_dict['param_num'] = param_num
+    # TODO: it is unclear why tboard doesn't keep metrics (hparams) when
+    # this experiment is run from cli: `python train_vggishish.py config=./configs/vggish.yaml`
+    # while the metrics are logged when it is run in the vscode debugger
+    logger.log_test_metrics(test_metrics_dict, dict(cfg), ckpt['epoch'])
+    logger.print_logger.info('Finished the experiment')

foleycrafter/models/specvqgan/modules/losses/vggishish/transforms.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import logging
+import os
+from pathlib import Path
+import albumentations
+import numpy as np
+import torch
+from tqdm import tqdm
+logger = logging.getLogger(f'main.{__name__}')
+class StandardNormalizeAudio(object):
+    '''
+        Frequency-wise normalization
+    '''
+    def __init__(self, specs_dir, train_ids_path='./data/vggsound_train.txt', cache_path='./data/'):
+        self.specs_dir = specs_dir
+        self.train_ids_path = train_ids_path
+        # making the stats filename to match the specs dir name
+        self.cache_path = os.path.join(cache_path, f'train_means_stds_{Path(specs_dir).stem}.txt')
+        logger.info('Assuming that the input stats are calculated using preprocessed spectrograms (log)')
+        self.train_stats = self.calculate_or_load_stats()
+    def __call__(self, item):
+        # just to generalizat the input handling. Useful for FID, IS eval and training other staff
+        if isinstance(item, dict):
+            if 'input' in item:
+                input_key = 'input'
+            elif 'image' in item:
+                input_key = 'image'
+            else:
+                raise NotImplementedError
+            item[input_key] = (item[input_key] - self.train_stats['means']) / self.train_stats['stds']
+        elif isinstance(item, torch.Tensor):
+            # broadcasts np.ndarray (80, 1) to (1, 80, 1) because item is torch.Tensor (B, 80, T)
+            item = (item - self.train_stats['means']) / self.train_stats['stds']
+        else:
+            raise NotImplementedError
+        return item
+    def calculate_or_load_stats(self):
+        try:
+            # (F, 2)
+            train_stats = np.loadtxt(self.cache_path)
+            means, stds = train_stats.T
+            logger.info('Trying to load train stats for Standard Normalization of inputs')
+        except OSError:
+            logger.info('Could not find the precalculated stats for Standard Normalization. Calculating...')
+            train_vid_ids = open(self.train_ids_path)
+            specs_paths = [os.path.join(self.specs_dir, f'{i.rstrip()}_mel.npy') for i in train_vid_ids]
+            means = [None] * len(specs_paths)
+            stds = [None] * len(specs_paths)
+            for i, path in enumerate(tqdm(specs_paths)):
+                spec = np.load(path)
+                means[i] = spec.mean(axis=1)
+                stds[i] = spec.std(axis=1)
+            # (F) <- (num_files, F)
+            means = np.array(means).mean(axis=0)
+            stds = np.array(stds).mean(axis=0)
+            # saving in two columns
+            np.savetxt(self.cache_path, np.vstack([means, stds]).T, fmt='%0.8f')
+        means = means.reshape(-1, 1)
+        stds = stds.reshape(-1, 1)
+        return {'means': means, 'stds': stds}
+class ToTensor(object):
+    def __call__(self, item):
+        item['input'] = torch.from_numpy(item['input']).float()
+        if 'target' in item:
+            item['target'] = torch.tensor(item['target'])
+        return item
+class Crop(object):
+    def __init__(self, cropped_shape=None, random_crop=False):
+        self.cropped_shape = cropped_shape
+        if cropped_shape is not None:
+            mel_num, spec_len = cropped_shape
+            if random_crop:
+                self.cropper = albumentations.RandomCrop
+            else:
+                self.cropper = albumentations.CenterCrop
+            self.preprocessor = albumentations.Compose([self.cropper(mel_num, spec_len)])
+        else:
+            self.preprocessor = lambda **kwargs: kwargs
+    def __call__(self, item):
+        item['input'] = self.preprocessor(image=item['input'])['image']
+        return item
+if __name__ == '__main__':
+    cropper = Crop([80, 848])
+    item = {'input': torch.rand([80, 860])}
+    outputs = cropper(item)
+    print(outputs['input'].shape)

foleycrafter/models/specvqgan/modules/losses/vqperceptual.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import sys
+sys.path.insert(0, '.')  # nopep8
+from foleycrafter.models.specvqgan.modules.discriminator.model import (NLayerDiscriminator, NLayerDiscriminator1dFeats,
+                                                   NLayerDiscriminator1dSpecs,
+                                                   weights_init)
+from foleycrafter.models.specvqgan.modules.losses.lpaps import LPAPS
+class DummyLoss(nn.Module):
+    def __init__(self):
+        super().__init__()
+def adopt_weight(weight, global_step, threshold=0, value=0.):
+    if global_step < threshold:
+        weight = value
+    return weight
+def hinge_d_loss(logits_real, logits_fake):
+    loss_real = torch.mean(F.relu(1. - logits_real))
+    loss_fake = torch.mean(F.relu(1. + logits_fake))
+    d_loss = 0.5 * (loss_real + loss_fake)
+    return d_loss
+def vanilla_d_loss(logits_real, logits_fake):
+    d_loss = 0.5 * (
+        torch.mean(torch.nn.functional.softplus(-logits_real)) +
+        torch.mean(torch.nn.functional.softplus(logits_fake)))
+    return d_loss
+class VQLPAPSWithDiscriminator(nn.Module):
+    def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
+                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
+                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
+                 disc_ndf=64, disc_loss="hinge", min_adapt_weight=0.0, max_adapt_weight=1e4):
+        super().__init__()
+        assert disc_loss in ["hinge", "vanilla"]
+        self.codebook_weight = codebook_weight
+        self.pixel_weight = pixelloss_weight
+        self.perceptual_loss = LPAPS().eval()
+        self.perceptual_weight = perceptual_weight
+        self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
+                                                 n_layers=disc_num_layers,
+                                                 use_actnorm=use_actnorm,
+                                                 ndf=disc_ndf
+                                                 ).apply(weights_init)
+        self.discriminator_iter_start = disc_start
+        if disc_loss == "hinge":
+            self.disc_loss = hinge_d_loss
+        elif disc_loss == "vanilla":
+            self.disc_loss = vanilla_d_loss
+        else:
+            raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
+        print(f"VQLPAPSWithDiscriminator running with {disc_loss} loss.")
+        self.disc_factor = disc_factor
+        self.discriminator_weight = disc_weight
+        self.disc_conditional = disc_conditional
+        self.min_adapt_weight = min_adapt_weight
+        self.max_adapt_weight = max_adapt_weight
+    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
+        if last_layer is not None:
+            nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
+            g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
+        else:
+            nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
+            g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
+        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
+        d_weight = torch.clamp(d_weight, self.min_adapt_weight, self.max_adapt_weight).detach()
+        d_weight = d_weight * self.discriminator_weight
+        return d_weight
+    def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx,
+                global_step, last_layer=None, cond=None, split="train"):
+        rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
+        if self.perceptual_weight > 0:
+            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
+            rec_loss = rec_loss + self.perceptual_weight * p_loss
+        else:
+            p_loss = torch.tensor([0.0])
+        nll_loss = rec_loss
+        # nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
+        nll_loss = torch.mean(nll_loss)
+        # now the GAN part
+        if optimizer_idx == 0:
+            # generator update
+            if cond is None:
+                assert not self.disc_conditional
+                logits_fake = self.discriminator(reconstructions.contiguous())
+            else:
+                assert self.disc_conditional
+                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
+            g_loss = -torch.mean(logits_fake)
+            try:
+                d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
+            except RuntimeError:
+                assert not self.training
+                d_weight = torch.tensor(0.0)
+            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+            loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean()
+            log = {"{}/total_loss".format(split): loss.clone().detach().mean(),
+                   "{}/quant_loss".format(split): codebook_loss.detach().mean(),
+                   "{}/nll_loss".format(split): nll_loss.detach().mean(),
+                   "{}/rec_loss".format(split): rec_loss.detach().mean(),
+                   "{}/p_loss".format(split): p_loss.detach().mean(),
+                   "{}/d_weight".format(split): d_weight.detach(),
+                   "{}/disc_factor".format(split): torch.tensor(disc_factor),
+                   "{}/g_loss".format(split): g_loss.detach().mean(),
+                   }
+            return loss, log
+        if optimizer_idx == 1:
+            # second pass for discriminator update
+            if cond is None:
+                logits_real = self.discriminator(inputs.contiguous().detach())
+                logits_fake = self.discriminator(reconstructions.contiguous().detach())
+            else:
+                logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
+                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
+            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
+            log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
+                   "{}/logits_real".format(split): logits_real.detach().mean(),
+                   "{}/logits_fake".format(split): logits_fake.detach().mean()
+                   }
+            return d_loss, log
+class VQLPAPSWithDiscriminator1dFeats(VQLPAPSWithDiscriminator):
+    def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
+                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
+                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
+                 disc_ndf=64, disc_loss="hinge", min_adapt_weight=0.0, max_adapt_weight=1e4):
+        super().__init__(disc_start=disc_start, codebook_weight=codebook_weight,
+                         pixelloss_weight=pixelloss_weight, disc_num_layers=disc_num_layers,
+                         disc_in_channels=disc_in_channels, disc_factor=disc_factor, disc_weight=disc_weight,
+                         perceptual_weight=perceptual_weight, use_actnorm=use_actnorm,
+                         disc_conditional=disc_conditional, disc_ndf=disc_ndf, disc_loss=disc_loss,
+                         min_adapt_weight=min_adapt_weight, max_adapt_weight=max_adapt_weight)
+        self.discriminator = NLayerDiscriminator1dFeats(input_nc=disc_in_channels, n_layers=disc_num_layers,
+                                                   use_actnorm=use_actnorm, ndf=disc_ndf).apply(weights_init)
+class VQLPAPSWithDiscriminator1dSpecs(VQLPAPSWithDiscriminator):
+    def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
+                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
+                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
+                 disc_ndf=64, disc_loss="hinge", min_adapt_weight=0.0, max_adapt_weight=1e4):
+        super().__init__(disc_start=disc_start, codebook_weight=codebook_weight,
+                         pixelloss_weight=pixelloss_weight, disc_num_layers=disc_num_layers,
+                         disc_in_channels=disc_in_channels, disc_factor=disc_factor, disc_weight=disc_weight,
+                         perceptual_weight=perceptual_weight, use_actnorm=use_actnorm,
+                         disc_conditional=disc_conditional, disc_ndf=disc_ndf, disc_loss=disc_loss,
+                         min_adapt_weight=min_adapt_weight, max_adapt_weight=max_adapt_weight)
+        self.discriminator = NLayerDiscriminator1dSpecs(input_nc=disc_in_channels, n_layers=disc_num_layers,
+                                                   use_actnorm=use_actnorm, ndf=disc_ndf).apply(weights_init)
+if __name__ == '__main__':
+    from foleycrafter.models.specvqgan.modules.diffusionmodules.model import Decoder, Decoder1d
+    optimizer_idx = 0
+    loss_config = {
+        'disc_conditional': False,
+        'disc_start': 30001,
+        'disc_weight': 0.8,
+        'codebook_weight': 1.0,
+    }
+    ddconfig = {
+        'ch': 128,
+        'num_res_blocks': 2,
+        'dropout': 0.0,
+        'z_channels': 256,
+        'double_z': False,
+    }
+    qloss = torch.rand(1, requires_grad=True)
+    ## AUDIO
+    loss_config['disc_in_channels'] = 1
+    ddconfig['in_channels'] = 1
+    ddconfig['resolution'] = 848
+    ddconfig['attn_resolutions'] = [53]
+    ddconfig['out_ch'] = 1
+    ddconfig['ch_mult'] = [1, 1, 2, 2, 4]
+    decoder = Decoder(**ddconfig)
+    loss = VQLPAPSWithDiscriminator(**loss_config)
+    x = torch.rand(16, 1, 80, 848)
+    # subtracting something which uses dec_conv_out so that it will be in a graph
+    xrec = torch.rand(16, 1, 80, 848) - decoder.conv_out(torch.rand(16, 128, 80, 848)).mean()
+    aeloss, log_dict_ae = loss(qloss, x, xrec, optimizer_idx, global_step=0,last_layer=decoder.conv_out.weight)
+    print(aeloss)
+    print(log_dict_ae)

foleycrafter/models/specvqgan/modules/misc/class_cond.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import torch
+class ClassOnlyStage(object):
+    def __init__(self):
+        pass
+    def eval(self):
+        return self
+    def encode(self, c):
+        """fake vqmodel interface because self.cond_stage_model should have something
+        similar to coord.py but even more `dummy`"""
+        # assert 0.0 <= c.min() and c.max() <= 1.0
+        info = None, None, c
+        return c, None, info
+    def decode(self, c):
+        return c
+    def get_input(self, batch, k):
+        return batch[k].unsqueeze(1).to(memory_format=torch.contiguous_format)