diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..32e481d7c7cabef1a9f06f94782c01764f5408fb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,25 @@
+__pycache__/
+.cache/
+datasets/
+outputs/
+out/
+out2/
+debug/
+checkpoint/
+*.zip
+*.npy
+core
+history/
+tools/*
+tools
+eval_outputs/
+pretrained/
+.nfs00d20000091cb3390001ead3
+scripts/research/
+
+.idea
+.vscode
+.github
+.ipynb_checkpoints/
+_screenshots/
+flagged
diff --git a/README.md b/README.md
index 025204c959a676579de8666c6b0525b34cfe3c62..86d0401f8fd04102f8bfa9ae041033831be50e17 100644
--- a/README.md
+++ b/README.md
@@ -1,37 +1,75 @@
----
-title: StyleNeRF
-emoji: 🏃
-colorFrom: blue
-colorTo: indigo
-sdk: gradio
-app_file: app.py
-pinned: false
----
+# StyleNeRF: A Style-based 3D-Aware Generator for High-resolution Image Synthesis
-# Configuration
+![Random Sample](./docs/random_sample.jpg)
-`title`: _string_
-Display title for the Space
+**StyleNeRF: A Style-based 3D-Aware Generator for High-resolution Image Synthesis**
+Jiatao Gu, Lingjie Liu, Peng Wang, Christian Theobalt
+### [Project Page](http://jiataogu.me/style_nerf) | [Video](http://jiataogu.me/style_nerf) | [Paper](https://arxiv.org/abs/2110.08985) | [Data](#dataset)
-`emoji`: _string_
-Space emoji (emoji-only character allowed)
+Abstract: *We propose StyleNeRF, a 3D-aware generative model for photo-realistic high-resolution image synthesis with high multi-view consistency, which can be trained on unstructured 2D images. Existing approaches either cannot synthesize high-resolution images with fine details or yield noticeable 3D-inconsistent artifacts. In addition, many of them lack control over style attributes and explicit 3D camera poses. StyleNeRF integrates the neural radiance field (NeRF) into a style-based generator to tackle the aforementioned challenges, i.e., improving rendering efficiency and 3D consistency for high-resolution image generation. We perform volume rendering only to produce a low-resolution feature map and progressively apply upsampling in 2D to address the first issue. To mitigate the inconsistencies caused by 2D upsampling, we propose multiple designs, including a better upsampler and a new regularization loss. With these designs, StyleNeRF can synthesize high-resolution images at interactive rates while preserving 3D consistency at high quality. StyleNeRF also enables control of camera poses and different levels of styles, which can generalize to unseen views. It also supports challenging tasks, including zoom-in and-out, style mixing, inversion, and semantic editing.*
-`colorFrom`: _string_
-Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+## Requirements
+The codebase is tested on
+* Python 3.7
+* PyTorch 1.7.1
+* 8 Nvidia GPUs (Tesla V100 32GB) with CUDA 11.0
-`colorTo`: _string_
-Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+Install the additional Python libraries with:
-`sdk`: _string_
-Can be either `gradio` or `streamlit`
+```
+pip install -r requirements.txt
+```
-`sdk_version` : _string_
-Only applicable for `streamlit` SDK.
-See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+Please refer to https://github.com/NVlabs/stylegan2-ada-pytorch for additional software/hardware requirements.
+
+## Dataset
+We follow the same dataset format as StyleGAN2-ADA, which can be either an image folder or a zipped archive.
+
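+As a reference, here is a minimal sketch of packaging a flat folder of images into a zip archive that can be passed as `data=` (paths are placeholders, and the images are assumed to already be at the training resolution; the `dataset_tool.py` script from stylegan2-ada-pytorch can also be used for resizing and validation):
+
+```python
+# Sketch: zip a flat folder of images into a dataset archive (placeholder paths).
+import zipfile
+from pathlib import Path
+
+src = Path("datasets/ffhq_images")      # folder of .png / .jpg files
+dst = Path("datasets/ffhq_images.zip")  # archive to pass as data=${DATASET}
+
+with zipfile.ZipFile(dst, "w", zipfile.ZIP_STORED) as zf:
+    images = sorted(p for p in src.iterdir() if p.suffix.lower() in {".png", ".jpg", ".jpeg"})
+    for img in images:
+        zf.write(img, arcname=img.name)  # keep a flat layout inside the archive
+print(f"wrote {dst} with {len(images)} images")
+```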
+
+## Train a new StyleNeRF model
+```bash
+python run_train.py outdir=${OUTDIR} data=${DATASET} spec=paper512 model=stylenerf_ffhq
+```
+It will automatically detect all usable GPUs.
+
+Please check the configuration files under ```conf/model``` and ```conf/spec```. You can always add your own model config. For more details on how to use Hydra configuration, please see https://hydra.cc/docs/intro/.
+
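+To inspect the fully composed configuration without launching a run, Hydra's compose API can be used. This is only a sketch (it assumes Hydra >= 1.1 and uses placeholder paths); the overrides mirror the command line above:
+
+```python
+# Sketch: compose the training config offline and inspect the resolved options.
+from hydra import initialize, compose
+from omegaconf import OmegaConf
+
+with initialize(config_path="conf"):
+    cfg = compose(config_name="config",
+                  overrides=["model=stylenerf_ffhq", "spec=paper512",
+                             "outdir=outputs", "data=datasets/ffhq_512.zip"])
+print(OmegaConf.to_yaml(cfg.model.G_kwargs))  # generator settings after composition
+```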
+## Render the pretrained model
+```bash
+python generate.py --outdir=${OUTDIR} --trunc=0.7 --seeds=${SEEDS} --network=${CHECKPOINT_PATH} --render-program="rotation_camera"
+```
+It supports different rotation trajectories for rendering new videos.
+
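+Under the hood, a rotation trajectory simply sweeps the camera pose while keeping the latent code fixed. The sketch below illustrates the idea with the same calls used in `app.py` (the checkpoint path and pose range are placeholders, and `app.py` additionally rebuilds the generator via `training.networks.Generator` before rendering):
+
+```python
+# Sketch: render a short yaw sweep with a fixed latent, mirroring app.py.
+import numpy as np
+import torch
+import dnnlib, legacy
+
+device = torch.device('cuda')
+with dnnlib.util.open_url('pretrained/ffhq_512.pkl') as f:        # placeholder checkpoint
+    G = legacy.load_network_pkl(f)['G_ema'].to(device).eval()
+
+z = torch.from_numpy(np.random.RandomState(0).randn(1, G.z_dim).astype('float32')).to(device)
+
+frames = []
+with torch.no_grad():
+    ws = G.mapping(z=z, c=None, truncation_psi=0.7)
+    for u in np.linspace(0.2, 0.8, 30):                           # sweep the yaw parameter
+        cam = G.synthesis.get_camera(batch_size=1, mode=[u, 0.5, 0.5], device=device, fov=12)
+        img = G.get_final_output(styles=ws, camera_matrices=cam)  # (1, 3, H, W) in [-1, 1]
+        frames.append(((img[0].permute(1, 2, 0).clamp(-1, 1) * 0.5 + 0.5) * 255)
+                      .byte().cpu().numpy())
+```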
+## Run a demo page
+```bash
+python web_demo.py 21111
+```
+By default, it will run a Gradio-powered demo at https://localhost:21111
+![Web demo](./docs/web_demo.gif)
+## Run a GUI visualizer
+```bash
+python visualizer.py
+```
+An interactive application will show up for users to play with.
+![GUI demo](./docs/gui_demo.gif)
+## Citation
+
+```
+@inproceedings{
+ gu2022stylenerf,
+ title={StyleNeRF: A Style-based 3D Aware Generator for High-resolution Image Synthesis},
+ author={Jiatao Gu and Lingjie Liu and Peng Wang and Christian Theobalt},
+ booktitle={International Conference on Learning Representations},
+ year={2022},
+ url={https://openreview.net/forum?id=iUuzzTMUw9K}
+}
+```
+
+
+## License
+
+Copyright © Facebook, Inc. All Rights Reserved.
+
+The majority of StyleNeRF is licensed under [CC-BY-NC](https://creativecommons.org/licenses/by-nc/4.0/); however, portions of this project are available under separate license terms: all code used or modified from [stylegan2-ada-pytorch](https://github.com/NVlabs/stylegan2-ada-pytorch) is under the [Nvidia Source Code License](https://nvlabs.github.io/stylegan2-ada-pytorch/license.html).
-`app_file`: _string_
-Path to your main application file (which contains either `gradio` or `streamlit` Python code).
-Path is relative to the root of the repository.
-`pinned`: _boolean_
-Whether the Space stays on top of your list.
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..10e6df314bf2bf6af124c74775b0cebc06503fb7
--- /dev/null
+++ b/app.py
@@ -0,0 +1,213 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import os, sys
+os.system('pip install -r requirements.txt')
+
+import gradio as gr
+import numpy as np
+import dnnlib
+import time
+import legacy
+import torch
+import glob
+
+import cv2
+import signal
+from torch_utils import misc
+from renderer import Renderer
+from training.networks import Generator
+from huggingface_hub import hf_hub_download
+
+
+device = torch.device('cuda')
+port = int(sys.argv[1]) if len(sys.argv) > 1 else 21111
+
+
+
+def handler(signum, frame):
+ res = input("Ctrl-c was pressed. Do you really want to exit? y/n ")
+ if res == 'y':
+ gr.close_all()
+ exit(1)
+
+signal.signal(signal.SIGINT, handler)
+
+
+def set_random_seed(seed):
+ torch.manual_seed(seed)
+ np.random.seed(seed)
+
+
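+# Map the UI yaw/pitch sliders (roughly in [-1, 1]) to the generator's (u, v) camera
+# parameterization and return camera matrices for rendering.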
+def get_camera_traj(model, pitch, yaw, fov=12, batch_size=1, model_name='FFHQ512'):
+ gen = model.synthesis
+ range_u, range_v = gen.C.range_u, gen.C.range_v
+ if not (('car' in model_name) or ('Car' in model_name)): # TODO: hack, better option?
+ yaw, pitch = 0.5 * yaw, 0.3 * pitch
+ pitch = pitch + np.pi/2
+ u = (yaw - range_u[0]) / (range_u[1] - range_u[0])
+ v = (pitch - range_v[0]) / (range_v[1] - range_v[0])
+ else:
+ u = (yaw + 1) / 2
+ v = (pitch + 1) / 2
+ cam = gen.get_camera(batch_size=batch_size, mode=[u, v, 0.5], device=device, fov=fov)
+ return cam
+
+
+def check_name(model_name='FFHQ512'):
+ """Gets model by name."""
+ if model_name == 'FFHQ512':
+ network_pkl = hf_hub_download(repo_id='thomagram/stylenerf-ffhq-config-basic', filename='ffhq_512.pkl')
+
+ # TODO: checkpoint to be updated!
+ # elif model_name == 'FFHQ512v2':
+ # network_pkl = "./pretrained/ffhq_512_eg3d.pkl"
+ # elif model_name == 'AFHQ512':
+ # network_pkl = "./pretrained/afhq_512.pkl"
+ # elif model_name == 'MetFaces512':
+ # network_pkl = "./pretrained/metfaces_512.pkl"
+ # elif model_name == 'CompCars256':
+ # network_pkl = "./pretrained/cars_256.pkl"
+ # elif model_name == 'FFHQ1024':
+ # network_pkl = "./pretrained/ffhq_1024.pkl"
+ else:
+ if os.path.isdir(model_name):
+ network_pkl = sorted(glob.glob(model_name + '/*.pkl'))[-1]
+ else:
+ network_pkl = model_name
+ return network_pkl
+
+
+def get_model(network_pkl, render_option=None):
+ print('Loading networks from "%s"...' % network_pkl)
+ with dnnlib.util.open_url(network_pkl) as f:
+ network = legacy.load_network_pkl(f)
+ G = network['G_ema'].to(device) # type: ignore
+
+ with torch.no_grad():
+ G2 = Generator(*G.init_args, **G.init_kwargs).to(device)
+ misc.copy_params_and_buffers(G, G2, require_all=False)
+
+ print('compile and go through the initial image')
+ G2 = G2.eval()
+    init_z = torch.from_numpy(np.random.RandomState(0).rand(1, G2.z_dim).astype('float32')).to(device)
+ init_cam = get_camera_traj(G2, 0, 0, model_name=network_pkl)
+ dummy = G2(z=init_z, c=None, camera_matrices=init_cam, render_option=render_option, theta=0)
+ res = dummy['img'].shape[-1]
+ imgs = np.zeros((res, res//2, 3))
+ return G2, res, imgs
+
+
+global_states = list(get_model(check_name()))
+wss = [None, None]
+
+def proc_seed(history, seed):
+ if isinstance(seed, str):
+ seed = 0
+ else:
+        seed = int(seed)
+    return seed
+
+
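+# Main Gradio callback: reload the checkpoint if the selection changed, regenerate the
+# two seed previews when seeds/truncation change, mix their W+ codes (coarse layers via
+# mix1, fine layers via mix2), render the mixed style from the requested camera pose,
+# and return the concatenated preview image together with the updated state.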
+def f_synthesis(model_name, model_find, render_option, early, trunc, seed1, seed2, mix1, mix2, yaw, pitch, roll, fov, history):
+ history = history or {}
+ seeds = []
+
+ if model_find != "":
+ model_name = model_find
+
+ model_name = check_name(model_name)
+ if model_name != history.get("model_name", None):
+ model, res, imgs = get_model(model_name, render_option)
+ global_states[0] = model
+ global_states[1] = res
+ global_states[2] = imgs
+
+ model, res, imgs = global_states
+ for idx, seed in enumerate([seed1, seed2]):
+ if isinstance(seed, str):
+ seed = 0
+ else:
+ seed = int(seed)
+
+ if (seed != history.get(f'seed{idx}', -1)) or \
+ (model_name != history.get("model_name", None)) or \
+ (trunc != history.get("trunc", 0.7)) or \
+ (wss[idx] is None):
+ print(f'use seed {seed}')
+ set_random_seed(seed)
+ z = torch.from_numpy(np.random.RandomState(int(seed)).randn(1, model.z_dim).astype('float32')).to(device)
+ ws = model.mapping(z=z, c=None, truncation_psi=trunc)
+ img = model.get_final_output(styles=ws, camera_matrices=get_camera_traj(model, 0, 0), render_option=render_option)
+ ws = ws.detach().cpu().numpy()
+ img = img[0].permute(1,2,0).detach().cpu().numpy()
+
+
+ imgs[idx * res // 2: (1 + idx) * res // 2] = cv2.resize(
+ np.asarray(img).clip(-1, 1) * 0.5 + 0.5,
+            (res//2, res//2), interpolation=cv2.INTER_AREA)
+ wss[idx] = ws
+ else:
+ seed = history[f'seed{idx}']
+ seeds += [seed]
+
+ history[f'seed{idx}'] = seed
+ history['trunc'] = trunc
+ history['model_name'] = model_name
+
+ set_random_seed(sum(seeds))
+
+    # style mixing: blend the two W+ codes (coarse layers weighted by mix1, fine layers by mix2)
+ ws1, ws2 = [torch.from_numpy(ws).to(device) for ws in wss]
+ ws = ws1.clone()
+ ws[:, :8] = ws1[:, :8] * mix1 + ws2[:, :8] * (1 - mix1)
+ ws[:, 8:] = ws1[:, 8:] * mix2 + ws2[:, 8:] * (1 - mix2)
+
+ # set visualization for other types of inputs.
+ if early == 'Normal Map':
+ render_option += ',normal,early'
+ elif early == 'Gradient Map':
+ render_option += ',gradient,early'
+
+ start_t = time.time()
+ with torch.no_grad():
+ cam = get_camera_traj(model, pitch, yaw, fov, model_name=model_name)
+ image = model.get_final_output(
+ styles=ws, camera_matrices=cam,
+ theta=roll * np.pi,
+ render_option=render_option)
+ end_t = time.time()
+
+ image = image[0].permute(1,2,0).detach().cpu().numpy().clip(-1, 1) * 0.5 + 0.5
+
+ if imgs.shape[0] == image.shape[0]:
+ image = np.concatenate([imgs, image], 1)
+ else:
+ a = image.shape[0]
+ b = int(imgs.shape[1] / imgs.shape[0] * a)
+ print(f'resize {a} {b} {image.shape} {imgs.shape}')
+        image = np.concatenate([cv2.resize(imgs, (b, a), interpolation=cv2.INTER_AREA), image], 1)
+
+ print(f'rendering time = {end_t-start_t:.4f}s')
+ image = (image * 255).astype('uint8')
+ return image, history
+
+model_name = gr.inputs.Dropdown(['FFHQ512']) # 'FFHQ512v2', 'AFHQ512', 'MetFaces512', 'CompCars256', 'FFHQ1024'
+model_find = gr.inputs.Textbox(label="checkpoint path", default="")
+render_option = gr.inputs.Textbox(label="rendering options", default='steps:40')
+trunc = gr.inputs.Slider(default=0.7, maximum=1.0, minimum=0.0, label='truncation trick')
+seed1 = gr.inputs.Number(default=1, label="seed1")
+seed2 = gr.inputs.Number(default=9, label="seed2")
+mix1 = gr.inputs.Slider(minimum=0, maximum=1, default=0, label="linear mixing ratio (geometry)")
+mix2 = gr.inputs.Slider(minimum=0, maximum=1, default=0, label="linear mixing ratio (appearance)")
+early = gr.inputs.Radio(['None', 'Normal Map', 'Gradient Map'], default='None', label='intermediate output')
+yaw = gr.inputs.Slider(minimum=-1, maximum=1, default=0, label="yaw")
+pitch = gr.inputs.Slider(minimum=-1, maximum=1, default=0, label="pitch")
+roll = gr.inputs.Slider(minimum=-1, maximum=1, default=0, label="roll (optional, not recommended)")
+fov = gr.inputs.Slider(minimum=9, maximum=15, default=12, label="fov")
+css = ".output_image {height: 40rem !important; width: 100% !important;}"
+
+gr.Interface(fn=f_synthesis,
+ inputs=[model_name, model_find, render_option, early, trunc, seed1, seed2, mix1, mix2, yaw, pitch, roll, fov, "state"],
+             title="Interactive Web Demo for StyleNeRF (ICLR 2022)",
+ outputs=["image", "state"],
+ layout='unaligned',
+ css=css, theme='dark-huggingface',
+ live=True).launch(server_port=port)
diff --git a/conf/config.yaml b/conf/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..695ff97d89a051761eaa6ee90e28828b787cf985
--- /dev/null
+++ b/conf/config.yaml
@@ -0,0 +1,49 @@
+defaults:
+ - _self_
+ - model: default
+ - spec: paper512
+
+# general options
+outdir: ~
+dry_run: False
+debug: False
+resume_run: ~
+
+snap: 50 # Snapshot interval [default: 50 ticks]
+imgsnap: 10
+metrics: [ "fid50k_full" ]
+seed: 2
+num_fp16_res: 4
+auto: False
+
+# dataset
+data: ~
+resolution: ~
+cond: False
+subset: ~ # Train with only N images [default: all]
+mirror: False
+
+# discriminator augmentation
+aug: noaug
+p: ~
+target: ~
+augpipe: ~
+
+# transfer learning
+resume: ~
+freezed: ~
+
+# performance options
+fp32: False
+nhwc: False
+allow_tf32: False
+nobench: False
+workers: 3
+
+launcher: "spawn"
+partition: ~
+comment: ~
+gpus: ~ # Number of GPUs to use [default: 1]
+port: ~
+nodes: ~
+timeout: ~
\ No newline at end of file
diff --git a/conf/hydra/local.yaml b/conf/hydra/local.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b06588d70d5b05be55f2b1fe328dee4e54654436
--- /dev/null
+++ b/conf/hydra/local.yaml
@@ -0,0 +1,16 @@
+sweep:
+ dir: /checkpoint/${env:USER}/space/gan/${env:PREFIX}/${hydra.job.name}
+ subdir: ${hydra.job.num}
+launcher:
+ submitit_folder: ${hydra.sweep.dir}
+ timeout_min: 4320
+ cpus_per_task: 64
+ gpus_per_node: 8
+ tasks_per_node: 1
+ mem_gb: 400
+ nodes: 1
+ name: ${env:PREFIX}_${hydra.job.config_name}
+ # partition: devlab,learnlab,learnfair,scavenge
+ # constraint: volta32gb
+ # max_num_timeout: 30
+ # exclude: learnfair1381,learnfair5192,learnfair2304
\ No newline at end of file
diff --git a/conf/model/default.yaml b/conf/model/default.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..cfe8e91130eeecbd1779e1be03f0ebabf8d8ed38
--- /dev/null
+++ b/conf/model/default.yaml
@@ -0,0 +1,35 @@
+# @package _group_
+name: default
+
+G_kwargs:
+ class_name: "training.networks.Generator"
+ z_dim: 512
+ w_dim: 512
+
+ mapping_kwargs:
+ num_layers: ${spec.map}
+
+ synthesis_kwargs:
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: skip
+
+D_kwargs:
+ class_name: "training.networks.Discriminator"
+ epilogue_kwargs:
+ mbstd_group_size: ${spec.mbstd}
+
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: resnet
+
+# loss kwargs
+loss_kwargs:
+ pl_batch_shrink: 2
+ pl_decay: 0.01
+ pl_weight: 2
+ style_mixing_prob: 0.9
\ No newline at end of file
diff --git a/conf/model/stylenerf_afhq.yaml b/conf/model/stylenerf_afhq.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9ef687e9f67b9e0acdf606bd6958a717691136a4
--- /dev/null
+++ b/conf/model/stylenerf_afhq.yaml
@@ -0,0 +1,108 @@
+# @package _group_
+name: stylenerf_afhq
+
+G_kwargs:
+ class_name: "training.networks.Generator"
+ z_dim: 512
+ w_dim: 512
+
+ mapping_kwargs:
+ num_layers: ${spec.map}
+
+ synthesis_kwargs:
+ # global settings
+ num_fp16_res: ${num_fp16_res}
+ channel_base: 1
+ channel_max: 1024
+ conv_clamp: 256
+ kernel_size: 1
+ architecture: skip
+ upsample_mode: "nn_cat"
+
+ z_dim_bg: 32
+ z_dim: 0
+ resolution_vol: 32
+ resolution_start: 32
+ rgb_out_dim: 256
+
+ use_noise: False
+ module_name: "training.stylenerf.NeRFSynthesisNetwork"
+ no_bbox: True
+ margin: 0
+ magnitude_ema_beta: 0.999
+
+ camera_kwargs:
+ range_v: [1.4157963267948965, 1.7257963267948966]
+ range_u: [-0.3, 0.3]
+ range_radius: [1.0, 1.0]
+ depth_range: [0.88, 1.12]
+ fov: 12
+ gaussian_camera: True
+ angular_camera: True
+ depth_transform: ~
+ dists_normalized: False
+ ray_align_corner: False
+ bg_start: 0.5
+
+ renderer_kwargs:
+ n_bg_samples: 4
+ n_ray_samples: 14
+ abs_sigma: False
+ hierarchical: True
+ no_background: False
+
+ foreground_kwargs:
+ positional_encoding: "normal"
+ downscale_p_by: 1
+ use_style: "StyleGAN2"
+ predict_rgb: True
+ use_viewdirs: False
+
+ background_kwargs:
+ positional_encoding: "normal"
+ hidden_size: 64
+ n_blocks: 4
+ downscale_p_by: 1
+ skips: []
+ inverse_sphere: True
+ use_style: "StyleGAN2"
+ predict_rgb: True
+ use_viewdirs: False
+
+ upsampler_kwargs:
+ channel_base: ${model.G_kwargs.synthesis_kwargs.channel_base}
+ channel_max: ${model.G_kwargs.synthesis_kwargs.channel_max}
+ no_2d_renderer: False
+ no_residual_img: False
+ block_reses: ~
+ shared_rgb_style: False
+ upsample_type: "bilinear"
+
+ progressive: True
+
+    # regularization
+ n_reg_samples: 16
+ reg_full: True
+
+D_kwargs:
+ class_name: "training.stylenerf.Discriminator"
+ epilogue_kwargs:
+ mbstd_group_size: ${spec.mbstd}
+
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: skip
+ progressive: ${model.G_kwargs.synthesis_kwargs.progressive}
+ lowres_head: ${model.G_kwargs.synthesis_kwargs.resolution_start}
+ upsample_type: "bilinear"
+ resize_real_early: True
+
+# loss kwargs
+loss_kwargs:
+ pl_batch_shrink: 2
+ pl_decay: 0.01
+ pl_weight: 2
+ style_mixing_prob: 0.9
+ curriculum: [500,5000]
\ No newline at end of file
diff --git a/conf/model/stylenerf_cars.yaml b/conf/model/stylenerf_cars.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e58681935afd227835771be5b5cf54b70f255190
--- /dev/null
+++ b/conf/model/stylenerf_cars.yaml
@@ -0,0 +1,108 @@
+# @package _group_
+name: stylenerf_ffhq
+
+G_kwargs:
+ class_name: "training.networks.Generator"
+ z_dim: 512
+ w_dim: 512
+
+ mapping_kwargs:
+ num_layers: ${spec.map}
+
+ synthesis_kwargs:
+ # global settings
+ num_fp16_res: ${num_fp16_res}
+ channel_base: 1
+ channel_max: 1024
+ conv_clamp: 256
+ kernel_size: 1
+ architecture: skip
+ upsample_mode: "pixelshuffle"
+
+ z_dim_bg: 32
+ z_dim: 0
+ resolution_vol: 32
+ resolution_start: 32
+ rgb_out_dim: 256
+
+ use_noise: False
+ module_name: "training.stylenerf.NeRFSynthesisNetwork"
+ no_bbox: True
+ margin: 0
+ magnitude_ema_beta: 0.999
+
+ camera_kwargs:
+ range_v: [1.4157963267948965, 1.7257963267948966]
+ range_u: [-3.141592653589793, 3.141592653589793]
+ range_radius: [1.0, 1.0]
+ depth_range: [0.8, 1.2]
+ fov: 16
+ gaussian_camera: False
+ angular_camera: True
+ depth_transform: ~
+ dists_normalized: False
+ ray_align_corner: False
+ bg_start: 0.5
+
+ renderer_kwargs:
+ n_bg_samples: 4
+ n_ray_samples: 16
+ abs_sigma: False
+ hierarchical: True
+ no_background: False
+
+ foreground_kwargs:
+ positional_encoding: "normal"
+ downscale_p_by: 1
+ use_style: "StyleGAN2"
+ predict_rgb: True
+ use_viewdirs: False
+
+ background_kwargs:
+ positional_encoding: "normal"
+ hidden_size: 64
+ n_blocks: 4
+ downscale_p_by: 1
+ skips: []
+ inverse_sphere: True
+ use_style: "StyleGAN2"
+ predict_rgb: True
+ use_viewdirs: False
+
+ upsampler_kwargs:
+ channel_base: ${model.G_kwargs.synthesis_kwargs.channel_base}
+ channel_max: ${model.G_kwargs.synthesis_kwargs.channel_max}
+ no_2d_renderer: False
+ no_residual_img: False
+ block_reses: ~
+ shared_rgb_style: False
+ upsample_type: "bilinear"
+
+ progressive: True
+
+    # regularization
+ n_reg_samples: 0
+ reg_full: False
+
+D_kwargs:
+ class_name: "training.stylenerf.Discriminator"
+ epilogue_kwargs:
+ mbstd_group_size: ${spec.mbstd}
+
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: skip
+ progressive: ${model.G_kwargs.synthesis_kwargs.progressive}
+ lowres_head: ${model.G_kwargs.synthesis_kwargs.resolution_start}
+ upsample_type: "bilinear"
+ resize_real_early: True
+
+# loss kwargs
+loss_kwargs:
+ pl_batch_shrink: 2
+ pl_decay: 0.01
+ pl_weight: 2
+ style_mixing_prob: 0.9
+ curriculum: [500,5000]
\ No newline at end of file
diff --git a/conf/model/stylenerf_cars_debug.yaml b/conf/model/stylenerf_cars_debug.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3cd38ba1e42ab7b8b5e1e26216e17d856ee2518f
--- /dev/null
+++ b/conf/model/stylenerf_cars_debug.yaml
@@ -0,0 +1,105 @@
+# @package _group_
+name: stylenerf_ffhq
+
+G_kwargs:
+ class_name: "training.networks.Generator"
+ z_dim: 512
+ w_dim: 512
+
+ mapping_kwargs:
+ num_layers: ${spec.map}
+
+ synthesis_kwargs:
+ # global settings
+ num_fp16_res: ${num_fp16_res}
+ channel_base: 1
+ channel_max: 1024
+ conv_clamp: 256
+ kernel_size: 1
+ architecture: skip
+ upsample_mode: "pixelshuffle"
+
+ z_dim_bg: 0
+ z_dim: 0
+ resolution_vol: 128
+ resolution_start: 32
+ rgb_out_dim: 256
+
+ use_noise: False
+ module_name: "training.stylenerf.NeRFSynthesisNetwork"
+ no_bbox: True
+ margin: 0
+ magnitude_ema_beta: 0.999
+
+ camera_kwargs:
+ range_v: [1.4157963267948965, 1.7257963267948966]
+ range_u: [-3.141592653589793, 3.141592653589793]
+ range_radius: [1.0, 1.0]
+ depth_range: [0.8, 1.2]
+ fov: 16
+ gaussian_camera: False
+ angular_camera: True
+ depth_transform: ~
+ dists_normalized: False
+ ray_align_corner: False
+ bg_start: 0.5
+
+ renderer_kwargs:
+ n_bg_samples: 0
+ n_ray_samples: 32
+ abs_sigma: False
+ hierarchical: True
+ no_background: True
+
+ foreground_kwargs:
+ downscale_p_by: 1
+ use_style: "StyleGAN2"
+ predict_rgb: False
+ add_rgb: True
+ use_viewdirs: False
+ n_blocks: 0
+
+ input_kwargs:
+ output_mode: 'tri_plane_reshape'
+ input_mode: 'random'
+ in_res: 4
+ out_res: 256
+ out_dim: 32
+
+ upsampler_kwargs:
+ channel_base: ${model.G_kwargs.synthesis_kwargs.channel_base}
+ channel_max: ${model.G_kwargs.synthesis_kwargs.channel_max}
+ no_2d_renderer: False
+ no_residual_img: False
+ block_reses: ~
+ shared_rgb_style: False
+ upsample_type: "bilinear"
+
+ progressive: True
+
+    # regularization
+ n_reg_samples: 0
+ reg_full: False
+
+D_kwargs:
+ class_name: "training.stylenerf.Discriminator"
+ epilogue_kwargs:
+ mbstd_group_size: ${spec.mbstd}
+
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: skip
+ progressive: ${model.G_kwargs.synthesis_kwargs.progressive}
+ lowres_head: ${model.G_kwargs.synthesis_kwargs.resolution_start}
+ upsample_type: "bilinear"
+ resize_real_early: True
+
+# loss kwargs
+loss_kwargs:
+ pl_batch_shrink: 2
+ pl_decay: 0.01
+ pl_weight: 2
+ style_mixing_prob: 0.9
+ curriculum: [500,5000]
\ No newline at end of file
diff --git a/conf/model/stylenerf_ffhq.yaml b/conf/model/stylenerf_ffhq.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d026ddb5be557adbe691d36b25218808ab5dc840
--- /dev/null
+++ b/conf/model/stylenerf_ffhq.yaml
@@ -0,0 +1,108 @@
+# @package _group_
+name: stylenerf_ffhq
+
+G_kwargs:
+ class_name: "training.networks.Generator"
+ z_dim: 512
+ w_dim: 512
+
+ mapping_kwargs:
+ num_layers: ${spec.map}
+
+ synthesis_kwargs:
+ # global settings
+ num_fp16_res: ${num_fp16_res}
+ channel_base: 1
+ channel_max: 1024
+ conv_clamp: 256
+ kernel_size: 1
+ architecture: skip
+ upsample_mode: "nn_cat"
+
+ z_dim_bg: 32
+ z_dim: 0
+ resolution_vol: 32
+ resolution_start: 32
+ rgb_out_dim: 256
+
+ use_noise: False
+ module_name: "training.stylenerf.NeRFSynthesisNetwork"
+ no_bbox: True
+ margin: 0
+ magnitude_ema_beta: 0.999
+
+ camera_kwargs:
+ range_v: [1.4157963267948965, 1.7257963267948966]
+ range_u: [-0.3, 0.3]
+ range_radius: [1.0, 1.0]
+ depth_range: [0.88, 1.12]
+ fov: 12
+ gaussian_camera: True
+ angular_camera: True
+ depth_transform: ~
+ dists_normalized: False
+ ray_align_corner: False
+ bg_start: 0.5
+
+ renderer_kwargs:
+ n_bg_samples: 4
+ n_ray_samples: 14
+ abs_sigma: False
+ hierarchical: True
+ no_background: False
+
+ foreground_kwargs:
+ positional_encoding: "normal"
+ downscale_p_by: 1
+ use_style: "StyleGAN2"
+ predict_rgb: True
+ use_viewdirs: False
+
+ background_kwargs:
+ positional_encoding: "normal"
+ hidden_size: 64
+ n_blocks: 4
+ downscale_p_by: 1
+ skips: []
+ inverse_sphere: True
+ use_style: "StyleGAN2"
+ predict_rgb: True
+ use_viewdirs: False
+
+ upsampler_kwargs:
+ channel_base: ${model.G_kwargs.synthesis_kwargs.channel_base}
+ channel_max: ${model.G_kwargs.synthesis_kwargs.channel_max}
+ no_2d_renderer: False
+ no_residual_img: False
+ block_reses: ~
+ shared_rgb_style: False
+ upsample_type: "bilinear"
+
+ progressive: True
+
+    # regularization
+ n_reg_samples: 16
+ reg_full: True
+
+D_kwargs:
+ class_name: "training.stylenerf.Discriminator"
+ epilogue_kwargs:
+ mbstd_group_size: ${spec.mbstd}
+
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: skip
+ progressive: ${model.G_kwargs.synthesis_kwargs.progressive}
+ lowres_head: ${model.G_kwargs.synthesis_kwargs.resolution_start}
+ upsample_type: "bilinear"
+ resize_real_early: True
+
+# loss kwargs
+loss_kwargs:
+ pl_batch_shrink: 2
+ pl_decay: 0.01
+ pl_weight: 2
+ style_mixing_prob: 0.9
+ curriculum: [500,5000]
\ No newline at end of file
diff --git a/conf/model/stylenerf_ffhq_ae.yaml b/conf/model/stylenerf_ffhq_ae.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0b96dc23d80060893492832defb41c0c1a23341f
--- /dev/null
+++ b/conf/model/stylenerf_ffhq_ae.yaml
@@ -0,0 +1,118 @@
+# @package _group_
+name: stylenerf_ffhq
+
+G_kwargs:
+ class_name: "training.networks.Generator"
+ z_dim: 512
+ w_dim: 512
+
+ mapping_kwargs:
+ num_layers: ${spec.map}
+
+ synthesis_kwargs:
+ # global settings
+ num_fp16_res: ${num_fp16_res}
+ channel_base: 1
+ channel_max: 1024
+ conv_clamp: 256
+ kernel_size: 1
+ architecture: skip
+ upsample_mode: "nn_cat"
+
+ z_dim: 0
+ resolution_vol: 128
+ resolution_start: 128
+ rgb_out_dim: 32
+
+ use_noise: False
+ module_name: "training.stylenerf.NeRFSynthesisNetwork"
+ no_bbox: True
+ margin: 0
+ magnitude_ema_beta: 0.999
+
+ camera_kwargs:
+ range_v: [1.4157963267948965, 1.7257963267948966]
+ range_u: [-0.3, 0.3]
+ range_radius: [1.0, 1.0]
+ depth_range: [0.88, 1.12]
+ fov: 12
+ gaussian_camera: True
+ angular_camera: True
+ depth_transform: ~
+ dists_normalized: True
+ ray_align_corner: False
+ bg_start: 0.5
+
+ renderer_kwargs:
+ n_ray_samples: 32
+ abs_sigma: False
+ hierarchical: True
+ no_background: True
+
+ foreground_kwargs:
+ downscale_p_by: 1
+ use_style: "StyleGAN2"
+ predict_rgb: False
+ use_viewdirs: False
+ add_rgb: True
+ n_blocks: 0
+
+ input_kwargs:
+ output_mode: 'tri_plane_reshape'
+ input_mode: 'random'
+ in_res: 4
+ out_res: 256
+ out_dim: 32
+
+ upsampler_kwargs:
+ no_2d_renderer: False
+ no_residual_img: False
+ block_reses: ~
+ shared_rgb_style: False
+ upsample_type: "bilinear"
+
+ progressive: True
+
+    # regularization
+ n_reg_samples: 0
+ reg_full: False
+
+ encoder_kwargs:
+ class_name: "training.stylenerf.Encoder"
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: skip
+ progressive: ${..synthesis_kwargs.progressive}
+ lowres_head: ${..synthesis_kwargs.resolution_start}
+ upsample_type: "bilinear"
+ model_kwargs:
+ output_mode: "W+"
+ predict_camera: False
+
+D_kwargs:
+ class_name: "training.stylenerf.Discriminator"
+ epilogue_kwargs:
+ mbstd_group_size: ${spec.mbstd}
+
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: skip
+
+ predict_camera: True
+
+ progressive: ${model.G_kwargs.synthesis_kwargs.progressive}
+ lowres_head: ${model.G_kwargs.synthesis_kwargs.resolution_start}
+ upsample_type: "bilinear"
+ resize_real_early: True
+
+# loss kwargs
+loss_kwargs:
+ pl_batch_shrink: 2
+ pl_decay: 0.01
+ pl_weight: 2
+ style_mixing_prob: 0.9
+ curriculum: [500,5000]
\ No newline at end of file
diff --git a/conf/model/stylenerf_ffhq_ae_basic.yaml b/conf/model/stylenerf_ffhq_ae_basic.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..93251bd5ec01f5005f9e30aa6c384f43d271ee6b
--- /dev/null
+++ b/conf/model/stylenerf_ffhq_ae_basic.yaml
@@ -0,0 +1,110 @@
+# @package _group_
+name: stylenerf_ffhq
+
+G_kwargs:
+ class_name: "training.networks.Generator"
+ z_dim: 512
+ w_dim: 512
+
+ mapping_kwargs:
+ num_layers: ${spec.map}
+
+ synthesis_kwargs:
+ # global settings
+ num_fp16_res: ${num_fp16_res}
+ channel_base: 1
+ channel_max: 1024
+ conv_clamp: 256
+ kernel_size: 1
+ architecture: skip
+ upsample_mode: "nn_cat"
+
+ z_dim: 0
+ resolution_vol: 32
+ resolution_start: 32
+ rgb_out_dim: 32
+
+ use_noise: False
+ module_name: "training.stylenerf.NeRFSynthesisNetwork"
+ no_bbox: True
+ margin: 0
+ magnitude_ema_beta: 0.999
+
+ camera_kwargs:
+ range_v: [1.4157963267948965, 1.7257963267948966]
+ range_u: [-0.3, 0.3]
+ range_radius: [1.0, 1.0]
+ depth_range: [0.88, 1.12]
+ fov: 12
+ gaussian_camera: True
+ angular_camera: True
+ depth_transform: ~
+ dists_normalized: True
+ ray_align_corner: False
+ bg_start: 0.5
+
+ renderer_kwargs:
+ n_ray_samples: 32
+ abs_sigma: False
+ hierarchical: True
+ no_background: True
+
+ foreground_kwargs:
+ downscale_p_by: 1
+ use_style: "StyleGAN2"
+ predict_rgb: False
+ use_viewdirs: False
+ add_rgb: True
+
+ upsampler_kwargs:
+ no_2d_renderer: False
+ no_residual_img: False
+ block_reses: ~
+ shared_rgb_style: False
+ upsample_type: "bilinear"
+
+ progressive: True
+
+    # regularization
+ n_reg_samples: 0
+ reg_full: False
+
+ encoder_kwargs:
+ class_name: "training.stylenerf.Encoder"
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: skip
+ progressive: ${..synthesis_kwargs.progressive}
+ lowres_head: ${..synthesis_kwargs.resolution_start}
+ upsample_type: "bilinear"
+ model_kwargs:
+ output_mode: "W+"
+ predict_camera: False
+
+D_kwargs:
+ class_name: "training.stylenerf.Discriminator"
+ epilogue_kwargs:
+ mbstd_group_size: ${spec.mbstd}
+
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: skip
+
+ predict_camera: True
+
+ progressive: ${model.G_kwargs.synthesis_kwargs.progressive}
+ lowres_head: ${model.G_kwargs.synthesis_kwargs.resolution_start}
+ upsample_type: "bilinear"
+ resize_real_early: True
+
+# loss kwargs
+loss_kwargs:
+ pl_batch_shrink: 2
+ pl_decay: 0.01
+ pl_weight: 2
+ style_mixing_prob: 0.9
+ curriculum: [500,5000]
\ No newline at end of file
diff --git a/conf/model/stylenerf_ffhq_debug.yaml b/conf/model/stylenerf_ffhq_debug.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d05beb9c12513ef61fdd6531679b7590942fdf0e
--- /dev/null
+++ b/conf/model/stylenerf_ffhq_debug.yaml
@@ -0,0 +1,103 @@
+# @package _group_
+name: stylenerf_ffhq
+
+G_kwargs:
+ class_name: "training.networks.Generator"
+ z_dim: 512
+ w_dim: 512
+
+ mapping_kwargs:
+ num_layers: ${spec.map}
+
+ synthesis_kwargs:
+ # global settings
+ num_fp16_res: ${num_fp16_res}
+ channel_base: 1
+ channel_max: 1024
+ conv_clamp: 256
+ kernel_size: 1
+ architecture: skip
+ upsample_mode: "nn_cat"
+
+ z_dim: 0
+ resolution_vol: 128
+ resolution_start: 128
+ rgb_out_dim: 32
+
+ use_noise: False
+ module_name: "training.stylenerf.NeRFSynthesisNetwork"
+ no_bbox: True
+ margin: 0
+ magnitude_ema_beta: 0.999
+
+ camera_kwargs:
+ range_v: [1.4157963267948965, 1.7257963267948966]
+ range_u: [-0.3, 0.3]
+ range_radius: [1.0, 1.0]
+ depth_range: [0.88, 1.12]
+ fov: 12
+ gaussian_camera: True
+ angular_camera: True
+ depth_transform: ~
+ dists_normalized: True
+ ray_align_corner: False
+ bg_start: 0.5
+
+ renderer_kwargs:
+ n_ray_samples: 32
+ abs_sigma: False
+ hierarchical: True
+ no_background: True
+
+ foreground_kwargs:
+ downscale_p_by: 1
+ use_style: "StyleGAN2"
+ predict_rgb: False
+ use_viewdirs: False
+ add_rgb: True
+ n_blocks: 0
+
+ input_kwargs:
+ output_mode: 'tri_plane_reshape'
+ input_mode: 'random'
+ in_res: 4
+ out_res: 256
+ out_dim: 32
+ keep_posenc: -1
+ keep_nerf_latents: False
+
+ upsampler_kwargs:
+ no_2d_renderer: False
+ no_residual_img: False
+ block_reses: ~
+ shared_rgb_style: False
+ upsample_type: "bilinear"
+
+ progressive: True
+
+    # regularization
+ n_reg_samples: 0
+ reg_full: False
+
+D_kwargs:
+ class_name: "training.stylenerf.Discriminator"
+ epilogue_kwargs:
+ mbstd_group_size: ${spec.mbstd}
+
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: skip
+ progressive: ${model.G_kwargs.synthesis_kwargs.progressive}
+ lowres_head: ${model.G_kwargs.synthesis_kwargs.resolution_start}
+ upsample_type: "bilinear"
+ resize_real_early: True
+
+# loss kwargs
+loss_kwargs:
+ pl_batch_shrink: 2
+ pl_decay: 0.01
+ pl_weight: 2
+ style_mixing_prob: 0.9
+ curriculum: [500,5000]
\ No newline at end of file
diff --git a/conf/model/stylenerf_ffhq_eg3d.yaml b/conf/model/stylenerf_ffhq_eg3d.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c8ed080ae13494cf987bb2d4fbf5d7e5066000ff
--- /dev/null
+++ b/conf/model/stylenerf_ffhq_eg3d.yaml
@@ -0,0 +1,100 @@
+# @package _group_
+name: stylenerf_ffhq
+
+G_kwargs:
+ class_name: "training.networks.Generator"
+ z_dim: 512
+ w_dim: 512
+
+ mapping_kwargs:
+ num_layers: ${spec.map}
+
+ synthesis_kwargs:
+ # global settings
+ num_fp16_res: ${num_fp16_res}
+ channel_base: 1
+ channel_max: 512
+ conv_clamp: 256
+ kernel_size: 3
+ architecture: skip
+
+ z_dim: 0
+ resolution_vol: 128
+ resolution_start: 32
+ rgb_out_dim: 32
+
+ use_noise: False
+ module_name: "training.stylenerf.NeRFSynthesisNetwork"
+ no_bbox: True
+ margin: 0
+ magnitude_ema_beta: 0.999
+
+ camera_kwargs:
+ range_v: [1.4157963267948965, 1.7257963267948966]
+ range_u: [-0.3, 0.3]
+ range_radius: [1.0, 1.0]
+ depth_range: [0.88, 1.12]
+ fov: 12
+ gaussian_camera: True
+ angular_camera: True
+ depth_transform: ~
+ dists_normalized: True
+ ray_align_corner: False
+ bg_start: 0.5
+
+ renderer_kwargs:
+ n_ray_samples: 32
+ abs_sigma: False
+ hierarchical: True
+ no_background: True
+
+ foreground_kwargs:
+ downscale_p_by: 1
+ use_style: "StyleGAN2"
+ predict_rgb: False
+ use_viewdirs: False
+ add_rgb: True
+ n_blocks: 0
+
+ input_kwargs:
+ output_mode: 'tri_plane_reshape'
+ input_mode: 'random'
+ in_res: 4
+ out_res: 256
+ out_dim: 32
+
+ upsampler_kwargs:
+ no_2d_renderer: False
+ block_reses: ~
+ shared_rgb_style: False
+ upsample_type: "bilinear"
+
+ progressive: False
+ prog_nerf_only: True
+
+    # regularization
+ n_reg_samples: 0
+ reg_full: False
+
+D_kwargs:
+ class_name: "training.stylenerf.Discriminator"
+ epilogue_kwargs:
+ mbstd_group_size: ${spec.mbstd}
+
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: skip
+ progressive: False
+ dual_input_res: 128
+ upsample_type: "bilinear"
+ resize_real_early: True
+
+# loss kwargs
+loss_kwargs:
+ pl_batch_shrink: 2
+ pl_decay: 0.01
+ pl_weight: 2
+ style_mixing_prob: 0.9
+ curriculum: [0,5000]
\ No newline at end of file
diff --git a/conf/model/stylenerf_ffhq_warped_depth.yaml b/conf/model/stylenerf_ffhq_warped_depth.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..95bb7ecf64c2d174df94af750c3db9ced5a3397e
--- /dev/null
+++ b/conf/model/stylenerf_ffhq_warped_depth.yaml
@@ -0,0 +1,97 @@
+# @package _group_
+name: stylenerf_ffhq_warped_depth
+
+G_kwargs:
+ class_name: "training.networks.Generator"
+ z_dim: 512
+ w_dim: 512
+
+ mapping_kwargs:
+ num_layers: ${spec.map}
+
+ synthesis_kwargs:
+ # global settings
+ num_fp16_res: ${num_fp16_res}
+ channel_base: 1
+ channel_max: 1024
+ conv_clamp: 256
+ kernel_size: 1
+ architecture: skip
+ upsample_mode: "nn_cat"
+
+ z_dim_bg: 0
+ z_dim: 0
+ resolution_vol: 32
+ resolution_start: 32
+ rgb_out_dim: 256
+
+ use_noise: False
+ module_name: "training.stylenerf.NeRFSynthesisNetwork"
+ no_bbox: True
+ margin: 0
+ magnitude_ema_beta: 0.999
+
+ camera_kwargs:
+ range_v: [1.4157963267948965, 1.7257963267948966]
+ range_u: [-0.3, 0.3]
+ range_radius: [1.0, 1.0]
+ depth_range: [0.88, 3.2]
+ fov: 12
+ gaussian_camera: True
+ angular_camera: True
+ depth_transform: InverseWarp
+ dists_normalized: True
+ ray_align_corner: False
+ bg_start: 0.5
+
+ renderer_kwargs:
+ n_bg_samples: 0
+ n_ray_samples: 48
+ abs_sigma: False
+ hierarchical: True
+ no_background: True
+
+ foreground_kwargs:
+ positional_encoding: "normal"
+ downscale_p_by: 1
+ use_style: "StyleGAN2"
+ predict_rgb: True
+ use_viewdirs: False
+
+ upsampler_kwargs:
+ channel_base: ${model.G_kwargs.synthesis_kwargs.channel_base}
+ channel_max: ${model.G_kwargs.synthesis_kwargs.channel_max}
+ no_2d_renderer: False
+ no_residual_img: False
+ block_reses: ~
+ shared_rgb_style: False
+ upsample_type: "bilinear"
+
+ progressive: True
+
+    # regularization
+ n_reg_samples: 16
+ reg_full: True
+
+D_kwargs:
+ class_name: "training.stylenerf.Discriminator"
+ epilogue_kwargs:
+ mbstd_group_size: ${spec.mbstd}
+
+ num_fp16_res: ${num_fp16_res}
+ channel_base: ${spec.fmaps}
+ channel_max: 512
+ conv_clamp: 256
+ architecture: skip
+ progressive: ${model.G_kwargs.synthesis_kwargs.progressive}
+ lowres_head: ${model.G_kwargs.synthesis_kwargs.resolution_start}
+ upsample_type: "bilinear"
+ resize_real_early: True
+
+# loss kwargs
+loss_kwargs:
+ pl_batch_shrink: 2
+ pl_decay: 0.01
+ pl_weight: 2
+ style_mixing_prob: 0.9
+ curriculum: [500,5000]
\ No newline at end of file
diff --git a/conf/spec/cifar.yaml b/conf/spec/cifar.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..019e5fb4be842212575f59575e7175d0e6d05638
--- /dev/null
+++ b/conf/spec/cifar.yaml
@@ -0,0 +1,13 @@
+# @package _group_
+
+name: cifar
+ref_gpus: 2
+kimg: 100000
+mb: 64
+mbstd: 32
+fmaps: 1
+lrate: 0.0025
+gamma: 0.01
+ema: 500
+ramp: 0.05
+map: 2
diff --git a/conf/spec/nerf32.yaml b/conf/spec/nerf32.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..67cf83bc71cb61b7c33483bf63f583425dd3cecd
--- /dev/null
+++ b/conf/spec/nerf32.yaml
@@ -0,0 +1,14 @@
+# @package _group_
+
+name: nerf32
+ref_gpus: 8
+kimg: 25000
+mb: 64
+mbstd: 8
+fmaps: 0.5
+lrate: 0.0025
+lrate_disc: 0.0025
+gamma: 0.003
+ema: 20
+ramp: ~
+map: 8
diff --git a/conf/spec/paper1024.yaml b/conf/spec/paper1024.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7e2fdca5e8693b9b954aea46a799a9672a0eae54
--- /dev/null
+++ b/conf/spec/paper1024.yaml
@@ -0,0 +1,14 @@
+# @package _group_
+
+name: paper1024
+ref_gpus: 8
+kimg: 25000
+mb: 32
+mbstd: 4
+fmaps: 1
+lrate: 0.002
+lrate_disc: 0.002
+gamma: 2
+ema: 10
+ramp: ~
+map: 8
diff --git a/conf/spec/paper256.yaml b/conf/spec/paper256.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8168fd72cb9fa2212cefc85d820dfa1a96fa4a32
--- /dev/null
+++ b/conf/spec/paper256.yaml
@@ -0,0 +1,14 @@
+# @package _group_
+
+name: paper256
+ref_gpus: 8
+kimg: 25000
+mb: 64
+mbstd: 8
+fmaps: 0.5
+lrate: 0.0025
+lrate_disc: 0.0025
+gamma: 0.5
+ema: 20
+ramp: ~
+map: 8
diff --git a/conf/spec/paper512.yaml b/conf/spec/paper512.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dfdf69e8fb0dfa2d9066293d2fad3a57d02fae03
--- /dev/null
+++ b/conf/spec/paper512.yaml
@@ -0,0 +1,12 @@
+name: paper512
+ref_gpus: 8
+kimg: 25000
+mb: 64
+mbstd: 8
+fmaps: 1
+lrate: 0.0025
+lrate_disc: 0.0025
+gamma: 0.5
+ema: 20
+ramp: ~
+map: 8
diff --git a/conf/spec/stylegan2.yaml b/conf/spec/stylegan2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..23b1d38d48c7716d2cb262a863a4efe38dd8bf8b
--- /dev/null
+++ b/conf/spec/stylegan2.yaml
@@ -0,0 +1,14 @@
+# @package _group_
+
+name: stylegan2
+ref_gpus: 8
+kimg: 25000
+mb: 32
+mbstd: 4
+fmaps: 1
+lrate: 0.002
+lrate_disc: 0.0025
+gamma: 10
+ema: 10
+ramp: ~
+map: 8
diff --git a/dnnlib/__init__.py b/dnnlib/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..0a2cd19cb09ef6cd5a9f74d3b97a91c6aa080558
--- /dev/null
+++ b/dnnlib/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+from .util import EasyDict, make_cache_dir_path
diff --git a/dnnlib/camera.py b/dnnlib/camera.py
new file mode 100644
index 0000000000000000000000000000000000000000..8846713c04afa650de92261ea87bdd4be00dac9a
--- /dev/null
+++ b/dnnlib/camera.py
@@ -0,0 +1,687 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+
+import numpy as np
+from numpy.lib.function_base import angle
+import torch
+import torch.nn.functional as F
+import math
+
+from scipy.spatial.transform import Rotation as Rot
+HUGE_NUMBER = 1e10
+TINY_NUMBER = 1e-6 # float32 only has 7 decimal digits precision
+
+
+def get_camera_mat(fov=49.13, invert=True):
+ # fov = 2 * arctan(sensor / (2 * focal))
+ # focal = (sensor / 2) * 1 / (tan(0.5 * fov))
+ # in our case, sensor = 2 as pixels are in [-1, 1]
+ focal = 1. / np.tan(0.5 * fov * np.pi/180.)
+ focal = focal.astype(np.float32)
+ mat = torch.tensor([
+ [focal, 0., 0., 0.],
+ [0., focal, 0., 0.],
+ [0., 0., 1, 0.],
+ [0., 0., 0., 1.]
+ ]).reshape(1, 4, 4)
+ if invert:
+ mat = torch.inverse(mat)
+ return mat
+
+
+def get_random_pose(range_u, range_v, range_radius, batch_size=32,
+ invert=False, gaussian=False, angular=False):
+ loc, (u, v) = sample_on_sphere(range_u, range_v, size=(batch_size), gaussian=gaussian, angular=angular)
+ radius = range_radius[0] + torch.rand(batch_size) * (range_radius[1] - range_radius[0])
+ loc = loc * radius.unsqueeze(-1)
+ R = look_at(loc)
+ RT = torch.eye(4).reshape(1, 4, 4).repeat(batch_size, 1, 1)
+ RT[:, :3, :3] = R
+ RT[:, :3, -1] = loc
+
+ if invert:
+ RT = torch.inverse(RT)
+
+ def N(a, range_a):
+ if range_a[0] == range_a[1]:
+ return a * 0
+ return (a - range_a[0]) / (range_a[1] - range_a[0])
+
+ val_u, val_v, val_r = N(u, range_u), N(v, range_v), N(radius, range_radius)
+ return RT, (val_u, val_v, val_r)
+
+
+def get_camera_pose(range_u, range_v, range_r, val_u=0.5, val_v=0.5, val_r=0.5,
+ batch_size=32, invert=False, gaussian=False, angular=False):
+ r0, rr = range_r[0], range_r[1] - range_r[0]
+ r = r0 + val_r * rr
+ if not gaussian:
+ u0, ur = range_u[0], range_u[1] - range_u[0]
+ v0, vr = range_v[0], range_v[1] - range_v[0]
+ u = u0 + val_u * ur
+ v = v0 + val_v * vr
+ else:
+ mean_u, mean_v = sum(range_u) / 2, sum(range_v) / 2
+ vu, vv = mean_u - range_u[0], mean_v - range_v[0]
+ u = mean_u + vu * val_u
+ v = mean_v + vv * val_v
+
+ loc, _ = sample_on_sphere((u, u), (v, v), size=(batch_size), angular=angular)
+ radius = torch.ones(batch_size) * r
+ loc = loc * radius.unsqueeze(-1)
+ R = look_at(loc)
+ RT = torch.eye(4).reshape(1, 4, 4).repeat(batch_size, 1, 1)
+ RT[:, :3, :3] = R
+ RT[:, :3, -1] = loc
+
+ if invert:
+ RT = torch.inverse(RT)
+ return RT
+
+
+def get_camera_pose_v2(range_u, range_v, range_r, mode, invert=False, gaussian=False, angular=False):
+ r0, rr = range_r[0], range_r[1] - range_r[0]
+ val_u, val_v = mode[:,0], mode[:,1]
+ val_r = torch.ones_like(val_u) * 0.5
+ if not gaussian:
+ u0, ur = range_u[0], range_u[1] - range_u[0]
+ v0, vr = range_v[0], range_v[1] - range_v[0]
+ u = u0 + val_u * ur
+ v = v0 + val_v * vr
+ else:
+ mean_u, mean_v = sum(range_u) / 2, sum(range_v) / 2
+ vu, vv = mean_u - range_u[0], mean_v - range_v[0]
+ u = mean_u + vu * val_u
+ v = mean_v + vv * val_v
+
+ loc = to_sphere(u, v, angular)
+ radius = r0 + val_r * rr
+ loc = loc * radius.unsqueeze(-1)
+ R = look_at(loc)
+ RT = torch.eye(4).to(R.device).reshape(1, 4, 4).repeat(R.size(0), 1, 1)
+ RT[:, :3, :3] = R
+ RT[:, :3, -1] = loc
+
+ if invert:
+ RT = torch.inverse(RT)
+ return RT, (val_u, val_v, val_r)
+
+
+def to_sphere(u, v, angular=False):
+ T = torch if isinstance(u, torch.Tensor) else np
+ if not angular:
+ theta = 2 * math.pi * u
+ phi = T.arccos(1 - 2 * v)
+ else:
+ theta, phi = u, v
+
+ cx = T.sin(phi) * T.cos(theta)
+ cy = T.sin(phi) * T.sin(theta)
+ cz = T.cos(phi)
+ return T.stack([cx, cy, cz], -1)
+
+
+def sample_on_sphere(range_u=(0, 1), range_v=(0, 1), size=(1,),
+ to_pytorch=True, gaussian=False, angular=False):
+ if not gaussian:
+ u = np.random.uniform(*range_u, size=size)
+ v = np.random.uniform(*range_v, size=size)
+ else:
+ mean_u, mean_v = sum(range_u) / 2, sum(range_v) / 2
+ var_u, var_v = mean_u - range_u[0], mean_v - range_v[0]
+ u = np.random.normal(size=size) * var_u + mean_u
+ v = np.random.normal(size=size) * var_v + mean_v
+
+ sample = to_sphere(u, v, angular)
+ if to_pytorch:
+ sample = torch.tensor(sample).float()
+ u, v = torch.tensor(u).float(), torch.tensor(v).float()
+
+ return sample, (u, v)
+
+
+def look_at(eye, at=np.array([0, 0, 0]), up=np.array([0, 0, 1]), eps=1e-5,
+ to_pytorch=True):
+ if not isinstance(eye, torch.Tensor):
+ # this is the original code from GRAF
+ at = at.astype(float).reshape(1, 3)
+ up = up.astype(float).reshape(1, 3)
+ eye = eye.reshape(-1, 3)
+ up = up.repeat(eye.shape[0] // up.shape[0], axis=0)
+ eps = np.array([eps]).reshape(1, 1).repeat(up.shape[0], axis=0)
+ z_axis = eye - at
+ z_axis /= np.max(np.stack([np.linalg.norm(z_axis,
+ axis=1, keepdims=True), eps]))
+ x_axis = np.cross(up, z_axis)
+ x_axis /= np.max(np.stack([np.linalg.norm(x_axis,
+ axis=1, keepdims=True), eps]))
+ y_axis = np.cross(z_axis, x_axis)
+ y_axis /= np.max(np.stack([np.linalg.norm(y_axis,
+ axis=1, keepdims=True), eps]))
+ r_mat = np.concatenate(
+ (x_axis.reshape(-1, 3, 1), y_axis.reshape(-1, 3, 1), z_axis.reshape(
+ -1, 3, 1)), axis=2)
+ if to_pytorch:
+ r_mat = torch.tensor(r_mat).float()
+ else:
+
+ def normalize(x, axis=-1, order=2):
+ l2 = x.norm(p=order, dim=axis, keepdim=True).clamp(min=1e-8)
+ return x / l2
+
+ at, up = torch.from_numpy(at).float().to(eye.device), torch.from_numpy(up).float().to(eye.device)
+ z_axis = normalize(eye - at[None, :])
+ x_axis = normalize(torch.cross(up[None,:].expand_as(z_axis), z_axis, dim=-1))
+ y_axis = normalize(torch.cross(z_axis, x_axis, dim=-1))
+ r_mat = torch.stack([x_axis, y_axis, z_axis], dim=-1)
+
+ return r_mat
+
+
+def get_rotation_matrix(axis='z', value=0., batch_size=32):
+ r = Rot.from_euler(axis, value * 2 * np.pi).as_dcm()
+ r = torch.from_numpy(r).reshape(1, 3, 3).repeat(batch_size, 1, 1)
+ return r
+
+
+def get_corner_rays(corner_pixels, camera_matrices, res):
+ assert (res + 1) * (res + 1) == corner_pixels.size(1)
+ batch_size = camera_matrices[0].size(0)
+ rays, origins, _ = get_camera_rays(camera_matrices, corner_pixels)
+ corner_rays = torch.cat([rays, torch.cross(origins, rays, dim=-1)], -1)
+ corner_rays = corner_rays.reshape(batch_size, res+1, res+1, 6).permute(0,3,1,2)
+ corner_rays = torch.cat([corner_rays[..., :-1, :-1], corner_rays[..., 1:, :-1], corner_rays[..., 1:, 1:], corner_rays[..., :-1, 1:]], 1)
+ return corner_rays
+
+
+def arange_pixels(
+ resolution=(128, 128),
+ batch_size=1,
+ subsample_to=None,
+ invert_y_axis=False,
+ margin=0,
+ corner_aligned=True,
+ jitter=None
+ ):
+    ''' Arranges pixels for the given resolution in the normalized range [-1, 1].
+
+ The function returns the unscaled pixel locations as integers and the
+ scaled float values.
+
+ Args:
+ resolution (tuple): image resolution
+ batch_size (int): batch size
+ subsample_to (int): if integer and > 0, the points are randomly
+ subsampled to this value
+ '''
+ h, w = resolution
+ n_points = resolution[0] * resolution[1]
+ uh = 1 if corner_aligned else 1 - (1 / h)
+ uw = 1 if corner_aligned else 1 - (1 / w)
+ if margin > 0:
+ uh = uh + (2 / h) * margin
+ uw = uw + (2 / w) * margin
+ w, h = w + margin * 2, h + margin * 2
+
+ x, y = torch.linspace(-uw, uw, w), torch.linspace(-uh, uh, h)
+ if jitter is not None:
+ dx = (torch.ones_like(x).uniform_() - 0.5) * 2 / w * jitter
+ dy = (torch.ones_like(y).uniform_() - 0.5) * 2 / h * jitter
+ x, y = x + dx, y + dy
+ x, y = torch.meshgrid(x, y)
+ pixel_scaled = torch.stack([x, y], -1).permute(1,0,2).reshape(1, -1, 2).repeat(batch_size, 1, 1)
+
+ # Subsample points if subsample_to is not None and > 0
+ if (subsample_to is not None and subsample_to > 0 and subsample_to < n_points):
+ idx = np.random.choice(pixel_scaled.shape[1], size=(subsample_to,),
+ replace=False)
+ pixel_scaled = pixel_scaled[:, idx]
+
+ if invert_y_axis:
+ pixel_scaled[..., -1] *= -1.
+
+ return pixel_scaled
+
+
+def to_pytorch(tensor, return_type=False):
+ ''' Converts input tensor to pytorch.
+
+ Args:
+ tensor (tensor): Numpy or Pytorch tensor
+ return_type (bool): whether to return input type
+ '''
+ is_numpy = False
+ if type(tensor) == np.ndarray:
+ tensor = torch.from_numpy(tensor)
+ is_numpy = True
+ tensor = tensor.clone()
+ if return_type:
+ return tensor, is_numpy
+ return tensor
+
+
+def transform_to_world(pixels, depth, camera_mat, world_mat, scale_mat=None,
+ invert=True, use_absolute_depth=True):
+ ''' Transforms pixel positions p with given depth value d to world coordinates.
+
+ Args:
+ pixels (tensor): pixel tensor of size B x N x 2
+ depth (tensor): depth tensor of size B x N x 1
+ camera_mat (tensor): camera matrix
+ world_mat (tensor): world matrix
+ scale_mat (tensor): scale matrix
+ invert (bool): whether to invert matrices (default: true)
+ '''
+ assert(pixels.shape[-1] == 2)
+ if scale_mat is None:
+ scale_mat = torch.eye(4).unsqueeze(0).repeat(
+ camera_mat.shape[0], 1, 1).to(camera_mat.device)
+
+ # Convert to pytorch
+ pixels, is_numpy = to_pytorch(pixels, True)
+ depth = to_pytorch(depth)
+ camera_mat = to_pytorch(camera_mat)
+ world_mat = to_pytorch(world_mat)
+ scale_mat = to_pytorch(scale_mat)
+
+ # Invert camera matrices
+ if invert:
+ camera_mat = torch.inverse(camera_mat)
+ world_mat = torch.inverse(world_mat)
+ scale_mat = torch.inverse(scale_mat)
+
+    # Transform pixels to homogeneous coordinates
+ pixels = pixels.permute(0, 2, 1)
+ pixels = torch.cat([pixels, torch.ones_like(pixels)], dim=1)
+
+ # Project pixels into camera space
+ if use_absolute_depth:
+ pixels[:, :2] = pixels[:, :2] * depth.permute(0, 2, 1).abs()
+ pixels[:, 2:3] = pixels[:, 2:3] * depth.permute(0, 2, 1)
+ else:
+ pixels[:, :3] = pixels[:, :3] * depth.permute(0, 2, 1)
+
+ # Transform pixels to world space
+ p_world = scale_mat @ world_mat @ camera_mat @ pixels
+
+ # Transform p_world back to 3D coordinates
+ p_world = p_world[:, :3].permute(0, 2, 1)
+
+ if is_numpy:
+ p_world = p_world.numpy()
+ return p_world
+
+
+def transform_to_camera_space(p_world, world_mat, camera_mat=None, scale_mat=None):
+ ''' Transforms world points to camera space.
+ Args:
+ p_world (tensor): world points tensor of size B x N x 3
+ camera_mat (tensor): camera matrix
+ world_mat (tensor): world matrix
+ scale_mat (tensor): scale matrix
+ '''
+ batch_size, n_p, _ = p_world.shape
+ device = p_world.device
+
+    # Transform world points to homogeneous coordinates
+ p_world = torch.cat([p_world, torch.ones(
+ batch_size, n_p, 1).to(device)], dim=-1).permute(0, 2, 1)
+
+ # Apply matrices to transform p_world to camera space
+ if scale_mat is None:
+ if camera_mat is None:
+ p_cam = world_mat @ p_world
+ else:
+ p_cam = camera_mat @ world_mat @ p_world
+ else:
+ p_cam = camera_mat @ world_mat @ scale_mat @ p_world
+
+ # Transform points back to 3D coordinates
+ p_cam = p_cam[:, :3].permute(0, 2, 1)
+ return p_cam
+
+
+def origin_to_world(n_points, camera_mat, world_mat, scale_mat=None,
+ invert=False):
+ ''' Transforms origin (camera location) to world coordinates.
+
+ Args:
+ n_points (int): how often the transformed origin is repeated in the
+ form (batch_size, n_points, 3)
+ camera_mat (tensor): camera matrix
+ world_mat (tensor): world matrix
+ scale_mat (tensor): scale matrix
+        invert (bool): whether to invert the matrices (default: false)
+ '''
+ batch_size = camera_mat.shape[0]
+ device = camera_mat.device
+    # Create origin in homogeneous coordinates
+ p = torch.zeros(batch_size, 4, n_points).to(device)
+ p[:, -1] = 1.
+
+ if scale_mat is None:
+ scale_mat = torch.eye(4).unsqueeze(
+ 0).repeat(batch_size, 1, 1).to(device)
+
+ # Invert matrices
+ if invert:
+ camera_mat = torch.inverse(camera_mat)
+ world_mat = torch.inverse(world_mat)
+ scale_mat = torch.inverse(scale_mat)
+
+ # Apply transformation
+ p_world = scale_mat @ world_mat @ camera_mat @ p
+
+ # Transform points back to 3D coordinates
+ p_world = p_world[:, :3].permute(0, 2, 1)
+ return p_world
+
+
+def image_points_to_world(image_points, camera_mat, world_mat, scale_mat=None,
+ invert=False, negative_depth=True):
+ ''' Transforms points on image plane to world coordinates.
+
+ In contrast to transform_to_world, no depth value is needed as points on
+ the image plane have a fixed depth of 1.
+
+ Args:
+ image_points (tensor): image points tensor of size B x N x 2
+ camera_mat (tensor): camera matrix
+ world_mat (tensor): world matrix
+ scale_mat (tensor): scale matrix
+ invert (bool): whether to invert matrices
+ '''
+ batch_size, n_pts, dim = image_points.shape
+ assert(dim == 2)
+ device = image_points.device
+ d_image = torch.ones(batch_size, n_pts, 1).to(device)
+ if negative_depth:
+ d_image *= -1.
+ return transform_to_world(image_points, d_image, camera_mat, world_mat,
+ scale_mat, invert=invert)
+
+
+def image_points_to_camera(image_points, camera_mat,
+ invert=False, negative_depth=True, use_absolute_depth=True):
+ batch_size, n_pts, dim = image_points.shape
+ assert(dim == 2)
+ device = image_points.device
+ d_image = torch.ones(batch_size, n_pts, 1).to(device)
+ if negative_depth:
+ d_image *= -1.
+
+ # Convert to pytorch
+ pixels, is_numpy = to_pytorch(image_points, True)
+ depth = to_pytorch(d_image)
+ camera_mat = to_pytorch(camera_mat)
+
+ # Invert camera matrices
+ if invert:
+ camera_mat = torch.inverse(camera_mat)
+
+    # Transform pixels to homogeneous coordinates
+ pixels = pixels.permute(0, 2, 1)
+ pixels = torch.cat([pixels, torch.ones_like(pixels)], dim=1)
+
+ # Project pixels into camera space
+ if use_absolute_depth:
+ pixels[:, :2] = pixels[:, :2] * depth.permute(0, 2, 1).abs()
+ pixels[:, 2:3] = pixels[:, 2:3] * depth.permute(0, 2, 1)
+ else:
+ pixels[:, :3] = pixels[:, :3] * depth.permute(0, 2, 1)
+
+    # Transform pixels to camera space
+ p_camera = camera_mat @ pixels
+
+    # Transform p_camera back to 3D coordinates
+ p_camera = p_camera[:, :3].permute(0, 2, 1)
+
+ if is_numpy:
+ p_camera = p_camera.numpy()
+ return p_camera
+
+
+def camera_points_to_image(camera_points, camera_mat,
+ invert=False, negative_depth=True, use_absolute_depth=True):
+ batch_size, n_pts, dim = camera_points.shape
+ assert(dim == 3)
+ device = camera_points.device
+
+ # Convert to pytorch
+ p_camera, is_numpy = to_pytorch(camera_points, True)
+ camera_mat = to_pytorch(camera_mat)
+
+ # Invert camera matrices
+ if invert:
+ camera_mat = torch.inverse(camera_mat)
+
+    # Project camera-space points to image pixels
+ p_camera = p_camera.permute(0, 2, 1) # B x 3 x N
+ pixels = camera_mat[:, :3, :3] @ p_camera
+
+ assert use_absolute_depth and negative_depth
+ pixels, p_depths = pixels[:, :2], pixels[:, 2:3]
+ p_depths = -p_depths # negative depth
+ pixels = pixels / p_depths
+
+ pixels = pixels.permute(0, 2, 1)
+ if is_numpy:
+ pixels = pixels.numpy()
+ return pixels
+
+
+def angular_interpolation(res, camera_mat):
+ batch_size = camera_mat.shape[0]
+ device = camera_mat.device
+ input_rays = image_points_to_camera(arange_pixels((res, res), batch_size,
+ invert_y_axis=True).to(device), camera_mat)
+ output_rays = image_points_to_camera(arange_pixels((res * 2, res * 2), batch_size,
+ invert_y_axis=True).to(device), camera_mat)
+ input_rays = input_rays / input_rays.norm(dim=-1, keepdim=True)
+ output_rays = output_rays / output_rays.norm(dim=-1, keepdim=True)
+
+ def dir2sph(v):
+ u = (v[..., :2] ** 2).sum(-1).sqrt()
+ theta = torch.atan2(u, v[..., 2]) / math.pi
+ phi = torch.atan2(v[..., 1], v[..., 0]) / math.pi
+ return torch.stack([theta, phi], 1)
+
+ input_rays = dir2sph(input_rays).reshape(batch_size, 2, res, res)
+ output_rays = dir2sph(output_rays).reshape(batch_size, 2, res * 2, res * 2)
+ return input_rays
+
+
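+# Spherical linear interpolation (slerp) between latent codes z1 and z2 at ratio t.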
+def interpolate_sphere(z1, z2, t):
+ p = (z1 * z2).sum(dim=-1, keepdim=True)
+ p = p / z1.pow(2).sum(dim=-1, keepdim=True).sqrt()
+ p = p / z2.pow(2).sum(dim=-1, keepdim=True).sqrt()
+ omega = torch.acos(p)
+ s1 = torch.sin((1-t)*omega)/torch.sin(omega)
+ s2 = torch.sin(t*omega)/torch.sin(omega)
+ z = s1 * z1 + s2 * z2
+ return z
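+
+# Usage sketch (illustrative shapes): spherical (slerp) interpolation between latent codes.
+#   z1, z2 = torch.randn(4, 512), torch.randn(4, 512)
+#   z_mid = interpolate_sphere(z1, z2, 0.5)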
+
+
+def get_camera_rays(camera_matrices, pixels=None, res=None, margin=0):
+ device = camera_matrices[0].device
+ batch_size = camera_matrices[0].shape[0]
+ if pixels is None:
+ assert res is not None
+ pixels = arange_pixels((res, res), batch_size, invert_y_axis=True, margin=margin).to(device)
+ n_points = pixels.size(1)
+ pixels_world = image_points_to_world(
+ pixels, camera_mat=camera_matrices[0],
+ world_mat=camera_matrices[1])
+ camera_world = origin_to_world(
+ n_points, camera_mat=camera_matrices[0],
+ world_mat=camera_matrices[1])
+ ray_vector = pixels_world - camera_world
+ ray_vector = ray_vector / ray_vector.norm(dim=-1, keepdim=True)
+ return ray_vector, camera_world, pixels_world
+
+
+def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor:
+ """
+ Converts 6D rotation representation by Zhou et al. [1] to rotation matrix
+ using Gram--Schmidt orthogonalization per Section B of [1].
+ Args:
+ d6: 6D rotation representation, of size (*, 6)
+
+ Returns:
+ batch of rotation matrices of size (*, 3, 3)
+
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+ On the Continuity of Rotation Representations in Neural Networks.
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+ Retrieved from http://arxiv.org/abs/1812.07035
+ """
+
+ a1, a2 = d6[..., :3], d6[..., 3:]
+ b1 = F.normalize(a1, dim=-1)
+ b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1
+ b2 = F.normalize(b2, dim=-1)
+ b3 = torch.cross(b1, b2, dim=-1)
+ return torch.stack((b1, b2, b3), dim=-2)
+
+
+def camera_9d_to_16d(d9):
+ d6, translation = d9[..., :6], d9[..., 6:]
+ rotation = rotation_6d_to_matrix(d6)
+ RT = torch.eye(4).to(device=d9.device, dtype=d9.dtype).reshape(
+ 1, 4, 4).repeat(d6.size(0), 1, 1)
+ RT[:, :3, :3] = rotation
+ RT[:, :3, -1] = translation
+ return RT.reshape(-1, 16)
+
+def matrix_to_rotation_6d(matrix: torch.Tensor) -> torch.Tensor:
+ """
+ Converts rotation matrices to 6D rotation representation by Zhou et al. [1]
+ by dropping the last row. Note that 6D representation is not unique.
+ Args:
+ matrix: batch of rotation matrices of size (*, 3, 3)
+
+ Returns:
+ 6D rotation representation, of size (*, 6)
+
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+ On the Continuity of Rotation Representations in Neural Networks.
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+ Retrieved from http://arxiv.org/abs/1812.07035
+ """
+ return matrix[..., :2, :].clone().reshape(*matrix.size()[:-2], 6)
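+
+# Round-trip sketch (illustrative): the 6D encoding preserves the rotation up to numerical precision.
+#   R = rotation_6d_to_matrix(torch.randn(8, 6))
+#   assert torch.allclose(rotation_6d_to_matrix(matrix_to_rotation_6d(R)), R, atol=1e-5)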
+
+
+def depth2pts_outside(ray_o, ray_d, depth):
+ '''
+ ray_o, ray_d: [..., 3]
+ depth: [...]; inverse of distance to sphere origin
+ '''
+ # note: d1 becomes negative if this mid point is behind camera
+ d1 = -torch.sum(ray_d * ray_o, dim=-1) / torch.sum(ray_d * ray_d, dim=-1)
+ p_mid = ray_o + d1.unsqueeze(-1) * ray_d
+ p_mid_norm = torch.norm(p_mid, dim=-1)
+ ray_d_cos = 1. / torch.norm(ray_d, dim=-1)
+ d2 = torch.sqrt(1. - p_mid_norm * p_mid_norm) * ray_d_cos
+ p_sphere = ray_o + (d1 + d2).unsqueeze(-1) * ray_d
+
+ rot_axis = torch.cross(ray_o, p_sphere, dim=-1)
+ rot_axis = rot_axis / torch.norm(rot_axis, dim=-1, keepdim=True)
+ phi = torch.asin(p_mid_norm)
+ theta = torch.asin(p_mid_norm * depth) # depth is inside [0, 1]
+ rot_angle = (phi - theta).unsqueeze(-1) # [..., 1]
+
+ # now rotate p_sphere
+ # Rodrigues formula: https://en.wikipedia.org/wiki/Rodrigues%27_rotation_formula
+ p_sphere_new = p_sphere * torch.cos(rot_angle) + \
+ torch.cross(rot_axis, p_sphere, dim=-1) * torch.sin(rot_angle) + \
+ rot_axis * torch.sum(rot_axis*p_sphere, dim=-1, keepdim=True) * (1.-torch.cos(rot_angle))
+ p_sphere_new = p_sphere_new / torch.norm(p_sphere_new, dim=-1, keepdim=True)
+ pts = torch.cat((p_sphere_new, depth.unsqueeze(-1)), dim=-1)
+
+ # now calculate conventional depth
+ depth_real = 1. / (depth + TINY_NUMBER) * torch.cos(theta) * ray_d_cos + d1
+ return pts, depth_real
+
+
+def intersect_sphere(ray_o, ray_d, radius=1):
+ '''
+ ray_o, ray_d: [..., 3]
+ compute the depth of the intersection point between this ray and unit sphere
+ '''
+ # note: d1 becomes negative if this mid point is behind camera
+ d1 = -torch.sum(ray_d * ray_o, dim=-1) / torch.sum(ray_d * ray_d, dim=-1)
+ p = ray_o + d1.unsqueeze(-1) * ray_d
+ # consider the case where the ray does not intersect the sphere
+ ray_d_cos = 1. / torch.norm(ray_d, dim=-1)
+ d2 = radius ** 2 - torch.sum(p * p, dim=-1)
+ mask = (d2 > 0)
+ d2 = torch.sqrt(d2.clamp(min=1e-6)) * ray_d_cos
+ d1, d2 = d1.unsqueeze(-1), d2.unsqueeze(-1)
+ depth_range = [d1 - d2, d1 + d2]
+ return depth_range, mask
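+
+# Note: depth2pts_outside and intersect_sphere follow the inverted-sphere background
+# parameterization of NeRF++ (Zhang et al., 2020): rays are split at the unit sphere and
+# background points are indexed by inverse distance in [0, 1].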
+
+
+def normalize(x, axis=-1, order=2):
+ if isinstance(x, torch.Tensor):
+ l2 = x.norm(p=order, dim=axis, keepdim=True)
+ return x / (l2 + 1e-8), l2
+
+ else:
+ l2 = np.linalg.norm(x, order, axis)
+ l2 = np.expand_dims(l2, axis)
+ l2[l2==0] = 1
+ return x / l2, l2
+
+
+def sample_pdf(bins, weights, N_importance, det=False, eps=1e-5):
+ """
+ Sample @N_importance samples from @bins with distribution defined by @weights.
+ Inputs:
+ bins: (N_rays, N_samples_+1) where N_samples_ is "the number of coarse samples per ray - 2"
+ weights: (N_rays, N_samples_)
+ N_importance: the number of samples to draw from the distribution
+ det: deterministic or not
+ eps: a small number to prevent division by zero
+ Outputs:
+ samples: the sampled samples
+ Source: https://github.com/kwea123/nerf_pl/blob/master/models/rendering.py
+ """
+ N_rays, N_samples_ = weights.shape
+ weights = weights + eps # prevent division by zero (don't do inplace op!)
+ pdf = weights / torch.sum(weights, -1, keepdim=True) # (N_rays, N_samples_)
+ cdf = torch.cumsum(pdf, -1) # (N_rays, N_samples), cumulative distribution function
+ cdf = torch.cat([torch.zeros_like(cdf[: ,:1]), cdf], -1) # (N_rays, N_samples_+1)
+ # padded to 0~1 inclusive
+
+ if det:
+ u = torch.linspace(0, 1, N_importance, device=bins.device)
+ u = u.expand(N_rays, N_importance)
+ else:
+ u = torch.rand(N_rays, N_importance, device=bins.device)
+ u = u.contiguous()
+
+ inds = torch.searchsorted(cdf, u)
+ below = torch.clamp_min(inds-1, 0)
+ above = torch.clamp_max(inds, N_samples_)
+
+ inds_sampled = torch.stack([below, above], -1).view(N_rays, 2*N_importance)
+ cdf_g = torch.gather(cdf, 1, inds_sampled)
+ cdf_g = cdf_g.view(N_rays, N_importance, 2)
+ bins_g = torch.gather(bins, 1, inds_sampled).view(N_rays, N_importance, 2)
+
+ denom = cdf_g[...,1]-cdf_g[...,0]
+ denom[denom < eps] = 1  # a bin with weight 0 will never be sampled anyway, so any value is fine here
+ samples = bins_g[..., 0] + (u - cdf_g[..., 0]) / denom * (bins_g[..., 1] - bins_g[..., 0])
+ return samples
+
+
+def kaiser_attenuation(n_taps, f_h, sr):
+ df = (2 * f_h) / (sr / 2)
+ return 2.285 * (n_taps - 1) * math.pi * df + 7.95
+
+
+def kaiser_beta(n_taps, f_h, sr):
+ atten = kaiser_attenuation(n_taps, f_h, sr)
+
+ if atten > 50:
+ return 0.1102 * (atten - 8.7)
+
+ elif 50 >= atten >= 21:
+ return 0.5842 * (atten - 21) ** 0.4 + 0.07886 * (atten - 21)
+
+ else:
+ return 0.0
+
+def sinc(x, eps=1e-10):
+ y = torch.sin(math.pi * x) / (math.pi * x + eps)
+ y = y.masked_fill(x.eq(0), 1.0)
+ return y
+
+
+def kaiser_window(n_taps, f_h, sr):
+ beta = kaiser_beta(n_taps, f_h, sr)
+ ind = torch.arange(n_taps) - (n_taps - 1) / 2
+ return torch.i0(beta * torch.sqrt(1 - ((2 * ind) / (n_taps - 1)) ** 2)) / torch.i0(
+ torch.tensor(beta)
+ )
+
+
+def lowpass_filter(n_taps, cutoff, band_half, sr):
+ window = kaiser_window(n_taps, band_half, sr)
+ ind = torch.arange(n_taps) - (n_taps - 1) / 2
+ lowpass = 2 * cutoff / sr * sinc(2 * cutoff / sr * ind) * window
+ return lowpass
+
+
+def filter_parameters(
+ n_layer,
+ n_critical,
+ sr_max,
+ cutoff_0,
+ cutoff_n,
+ stopband_0,
+ stopband_n
+):
+ cutoffs = []
+ stopbands = []
+ srs = []
+ band_halfs = []
+
+ for i in range(n_layer):
+ f_c = cutoff_0 * (cutoff_n / cutoff_0) ** min(i / (n_layer - n_critical), 1)
+ f_t = stopband_0 * (stopband_n / stopband_0) ** min(
+ i / (n_layer - n_critical), 1
+ )
+ s_i = 2 ** math.ceil(math.log(min(2 * f_t, sr_max), 2))
+ f_h = max(f_t, s_i / 2) - f_c
+
+ cutoffs.append(f_c)
+ stopbands.append(f_t)
+ srs.append(s_i)
+ band_halfs.append(f_h)
+
+ return {
+ "cutoffs": cutoffs,
+ "stopbands": stopbands,
+ "srs": srs,
+ "band_halfs": band_halfs,
+ }
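+
+# Usage sketch (illustrative values, not taken from a config in this repo): build per-layer
+# anti-aliasing low-pass filters from the schedule above.
+#   params = filter_parameters(n_layer=14, n_critical=2, sr_max=512,
+#                              cutoff_0=2, cutoff_n=256, stopband_0=4, stopband_n=300)
+#   filters = [lowpass_filter(6, c, h, s)
+#              for c, h, s in zip(params["cutoffs"], params["band_halfs"], params["srs"])]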
diff --git a/dnnlib/geometry.py b/dnnlib/geometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c2f4857be0947846328d6c43ec3d188fd632696
--- /dev/null
+++ b/dnnlib/geometry.py
@@ -0,0 +1,406 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+
+import torch
+import torch.nn.functional as F
+import math
+import random
+import numpy as np
+
+
+def positional_encoding(p, size, pe='normal', use_pos=False):
+ if pe == 'gauss':
+ p_transformed = np.pi * p @ size
+ p_transformed = torch.cat(
+ [torch.sin(p_transformed), torch.cos(p_transformed)], dim=-1)
+ else:
+ p_transformed = torch.cat([torch.cat(
+ [torch.sin((2 ** i) * np.pi * p),
+ torch.cos((2 ** i) * np.pi * p)],
+ dim=-1) for i in range(size)], dim=-1)
+ if use_pos:
+ p_transformed = torch.cat([p_transformed, p], -1)
+ return p_transformed
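+
+# Shape note: with pe='normal' and use_pos=False, an input of shape (..., D) maps to
+# (..., 2 * size * D); with pe='gauss', `size` is expected to be a D x F projection matrix
+# and the output has shape (..., 2 * F).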
+
+
+def upsample(img_nerf, size, filter=None):
+ up = size // img_nerf.size(-1)
+ if up <= 1:
+ return img_nerf
+
+ if filter is not None:
+ from torch_utils.ops import upfirdn2d
+ for _ in range(int(math.log2(up))):
+ img_nerf = upfirdn2d.upsample2d(img_nerf, filter, up=2)
+ else:
+ img_nerf = F.interpolate(img_nerf, (size, size), mode='bilinear', align_corners=False)
+ return img_nerf
+
+
+def downsample(img0, size, filter=None):
+ down = img0.size(-1) // size
+ if down <= 1:
+ return img0
+
+ if filter is not None:
+ from torch_utils.ops import upfirdn2d
+ for _ in range(int(math.log2(down))):
+ img0 = upfirdn2d.downsample2d(img0, filter, down=2)
+ else:
+ img0 = F.interpolate(img0, (size, size), mode='bilinear', align_corners=False)
+ return img0
+
+
+def normalize_vecs(vectors: torch.Tensor) -> torch.Tensor:
+ """
+ Normalize vector lengths.
+ """
+ return vectors / (torch.norm(vectors, dim=-1, keepdim=True))
+
+
+def repeat_vecs(vecs, n, dim=0):
+ return torch.stack(n*[vecs], dim=dim)
+
+
+def get_grids(H, W, device, align=True):
+ ch = 1 if align else 1 - (1 / H)
+ cw = 1 if align else 1 - (1 / W)
+ x, y = torch.meshgrid(torch.linspace(-cw, cw, W, device=device),
+ torch.linspace(ch, -ch, H, device=device))
+ return torch.stack([x, y], -1)
+
+
+def local_ensemble(pi, po, resolution):
+ ii = range(resolution)
+ ia = torch.tensor([max((i - 1)//2, 0) for i in ii]).long()
+ ib = torch.tensor([min((i + 1)//2, resolution//2-1) for i in ii]).long()
+
+ ul = torch.meshgrid(ia, ia)
+ ur = torch.meshgrid(ia, ib)
+ ll = torch.meshgrid(ib, ia)
+ lr = torch.meshgrid(ib, ib)
+
+ d_ul, p_ul = po - pi[ul], torch.stack(ul, -1)
+ d_ur, p_ur = po - pi[ur], torch.stack(ur, -1)
+ d_ll, p_ll = po - pi[ll], torch.stack(ll, -1)
+ d_lr, p_lr = po - pi[lr], torch.stack(lr, -1)
+
+ c_ul = d_ul.prod(dim=-1).abs()
+ c_ur = d_ur.prod(dim=-1).abs()
+ c_ll = d_ll.prod(dim=-1).abs()
+ c_lr = d_lr.prod(dim=-1).abs()
+
+ D = torch.stack([d_ul, d_ur, d_ll, d_lr], 0)
+ P = torch.stack([p_ul, p_ur, p_ll, p_lr], 0)
+ C = torch.stack([c_ul, c_ur, c_ll, c_lr], 0)
+ C = C / C.sum(dim=0, keepdim=True)
+ return D, P, C
+
+
+def get_initial_rays_trig(num_steps, fov, resolution, ray_start, ray_end, device='cpu'):
+ """Returns sample points, z_vals, ray directions in camera space."""
+
+ W, H = resolution
+ # Create full screen NDC (-1 to +1) coords [x, y, 0, 1].
+ # Y is flipped to follow image memory layouts.
+ x, y = torch.meshgrid(torch.linspace(-1, 1, W, device=device),
+ torch.linspace(1, -1, H, device=device))
+ x = x.T.flatten()
+ y = y.T.flatten()
+ z = -torch.ones_like(x, device=device) / math.tan((2 * math.pi * fov / 360)/2)
+
+ rays_d_cam = normalize_vecs(torch.stack([x, y, z], -1))
+
+ z_vals = torch.linspace(ray_start, ray_end, num_steps, device=device).reshape(1, num_steps, 1).repeat(W*H, 1, 1)
+ points = rays_d_cam.unsqueeze(1).repeat(1, num_steps, 1) * z_vals
+ return points, z_vals, rays_d_cam
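+
+# Returned shapes: points (W*H, num_steps, 3), z_vals (W*H, num_steps, 1),
+# rays_d_cam (W*H, 3); `fov` is given in degrees.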
+
+
+def sample_camera_positions(
+ device, n=1, r=1, horizontal_stddev=1, vertical_stddev=1,
+ horizontal_mean=math.pi*0.5, vertical_mean=math.pi*0.5, mode='normal'):
+ """
+ Samples n random locations along a sphere of radius r.
+ Uses a gaussian distribution for pitch and yaw
+ """
+ if mode == 'uniform':
+ theta = (torch.rand((n, 1),device=device) - 0.5) * 2 * horizontal_stddev + horizontal_mean
+ phi = (torch.rand((n, 1),device=device) - 0.5) * 2 * vertical_stddev + vertical_mean
+
+ elif mode == 'normal' or mode == 'gaussian':
+ theta = torch.randn((n, 1), device=device) * horizontal_stddev + horizontal_mean
+ phi = torch.randn((n, 1), device=device) * vertical_stddev + vertical_mean
+
+ elif mode == 'hybrid':
+ if random.random() < 0.5:
+ theta = (torch.rand((n, 1),device=device) - 0.5) * 2 * horizontal_stddev * 2 + horizontal_mean
+ phi = (torch.rand((n, 1),device=device) - 0.5) * 2 * vertical_stddev * 2 + vertical_mean
+ else:
+ theta = torch.randn((n, 1), device=device) * horizontal_stddev + horizontal_mean
+ phi = torch.randn((n, 1), device=device) * vertical_stddev + vertical_mean
+ else:
+ phi = torch.ones((n, 1), device=device, dtype=torch.float) * vertical_mean
+ theta = torch.ones((n, 1), device=device, dtype=torch.float) * horizontal_mean
+
+ phi = torch.clamp(phi, 1e-5, math.pi - 1e-5)
+
+ output_points = torch.zeros((n, 3), device=device)
+
+ output_points[:, 0:1] = r*torch.sin(phi) * torch.cos(theta)
+ output_points[:, 2:3] = r*torch.sin(phi) * torch.sin(theta)
+ output_points[:, 1:2] = r*torch.cos(phi)
+
+ return output_points, phi, theta
+
+
+def perturb_points(points, z_vals, ray_directions, device):
+ distance_between_points = z_vals[:,:,1:2,:] - z_vals[:,:,0:1,:]
+ offset = (torch.rand(z_vals.shape, device=device)-0.5) * distance_between_points
+ z_vals = z_vals + offset
+ points = points + offset * ray_directions.unsqueeze(2)
+ return points, z_vals
+
+
+def create_cam2world_matrix(forward_vector, origin, device=None):
+ """Takes in the direction the camera is pointing and the camera origin and returns a world2cam matrix."""
+
+ forward_vector = normalize_vecs(forward_vector)
+ up_vector = torch.tensor([0, 1, 0], dtype=torch.float, device=device).expand_as(forward_vector)
+ left_vector = normalize_vecs(torch.cross(up_vector, forward_vector, dim=-1))
+ up_vector = normalize_vecs(torch.cross(forward_vector, left_vector, dim=-1))
+
+ rotation_matrix = torch.eye(4, device=device).unsqueeze(0).repeat(forward_vector.shape[0], 1, 1)
+ rotation_matrix[:, :3, :3] = torch.stack((-left_vector, up_vector, -forward_vector), axis=-1)
+
+ translation_matrix = torch.eye(4, device=device).unsqueeze(0).repeat(forward_vector.shape[0], 1, 1)
+ translation_matrix[:, :3, 3] = origin
+
+ cam2world = translation_matrix @ rotation_matrix
+
+ return cam2world
+
+
+def transform_sampled_points(
+ points, z_vals, ray_directions, device,
+ h_stddev=1, v_stddev=1, h_mean=math.pi * 0.5,
+ v_mean=math.pi * 0.5, mode='normal'):
+ """
+ points: batch_size x total_pixels x num_steps x 3
+ z_vals: batch_size x total_pixels x num_steps
+ """
+ n, num_rays, num_steps, channels = points.shape
+ points, z_vals = perturb_points(points, z_vals, ray_directions, device)
+ camera_origin, pitch, yaw = sample_camera_positions(
+ n=points.shape[0], r=1,
+ horizontal_stddev=h_stddev, vertical_stddev=v_stddev,
+ horizontal_mean=h_mean, vertical_mean=v_mean,
+ device=device, mode=mode)
+ forward_vector = normalize_vecs(-camera_origin)
+ cam2world_matrix = create_cam2world_matrix(forward_vector, camera_origin, device=device)
+
+ points_homogeneous = torch.ones((points.shape[0], points.shape[1], points.shape[2], points.shape[3] + 1), device=device)
+ points_homogeneous[:, :, :, :3] = points
+
+ # should be n x 4 x 4 , n x r^2 x num_steps x 4
+ transformed_points = torch.bmm(cam2world_matrix, points_homogeneous.reshape(n, -1, 4).permute(0,2,1)).permute(0, 2, 1).reshape(n, num_rays, num_steps, 4)
+ transformed_ray_directions = torch.bmm(cam2world_matrix[..., :3, :3], ray_directions.reshape(n, -1, 3).permute(0,2,1)).permute(0, 2, 1).reshape(n, num_rays, 3)
+
+ homogeneous_origins = torch.zeros((n, 4, num_rays), device=device)
+ homogeneous_origins[:, 3, :] = 1
+
+ transformed_ray_origins = torch.bmm(cam2world_matrix, homogeneous_origins).permute(0, 2, 1).reshape(n, num_rays, 4)[..., :3]
+ return transformed_points[..., :3], z_vals, transformed_ray_directions, transformed_ray_origins, pitch, yaw
+
+
+def integration(
+ rgb_sigma, z_vals, device, noise_std=0.5,
+ last_back=False, white_back=False, clamp_mode=None, fill_mode=None):
+
+ rgbs = rgb_sigma[..., :3]
+ sigmas = rgb_sigma[..., 3:]
+
+ deltas = z_vals[..., 1:, :] - z_vals[..., :-1, :]
+ delta_inf = 1e10 * torch.ones_like(deltas[..., :1, :])
+ deltas = torch.cat([deltas, delta_inf], -2)
+
+ if noise_std > 0:
+ noise = torch.randn(sigmas.shape, device=device) * noise_std
+ else:
+ noise = 0
+
+ if clamp_mode == 'softplus':
+ alphas = 1 - torch.exp(-deltas * (F.softplus(sigmas + noise)))
+ elif clamp_mode == 'relu':
+ alphas = 1 - torch.exp(-deltas * (F.relu(sigmas + noise)))
+ else:
+ raise "Need to choose clamp mode"
+
+ alphas_shifted = torch.cat([torch.ones_like(alphas[..., :1, :]), 1-alphas + 1e-10], -2)
+ weights = alphas * torch.cumprod(alphas_shifted, -2)[..., :-1, :]
+ weights_sum = weights.sum(-2)
+
+ if last_back:
+ weights[..., -1, :] += (1 - weights_sum)
+
+ rgb_final = torch.sum(weights * rgbs, -2)
+ depth_final = torch.sum(weights * z_vals, -2)
+
+ if white_back:
+ rgb_final = rgb_final + 1-weights_sum
+
+ if fill_mode == 'debug':
+ rgb_final[weights_sum.squeeze(-1) < 0.9] = torch.tensor([1., 0, 0], device=rgb_final.device)
+ elif fill_mode == 'weight':
+ rgb_final = weights_sum.expand_as(rgb_final)
+
+ return rgb_final, depth_final, weights
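+
+# The compositing above is standard volume rendering with a clamped density:
+# alpha_i = 1 - exp(-delta_i * f(sigma_i + noise)) and w_i = alpha_i * prod_{j<i} (1 - alpha_j),
+# so weights.sum(-2) is the accumulated per-ray opacity.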
+
+
+def get_sigma_field_np(nerf, styles, resolution=512, block_resolution=64):
+ # Return a numpy array of sigma (density) values evaluated on a regular 3D grid.
+ bound = (nerf.depth_range[1] - nerf.depth_range[0]) * 0.5
+ X = torch.linspace(-bound, bound, resolution).split(block_resolution)
+
+ sigma_np = np.zeros([resolution, resolution, resolution], dtype=np.float32)
+
+ for xi, xs in enumerate(X):
+ for yi, ys in enumerate(X):
+ for zi, zs in enumerate(X):
+ xx, yy, zz = torch.meshgrid(xs, ys, zs)
+ pts = torch.stack([xx, yy, zz], dim=-1).unsqueeze(0).to(styles.device) # B, H, H, H, C
+ block_shape = [1, len(xs), len(ys), len(zs)]
+ feat_out, sigma_out = nerf.fg_nerf.forward_style2(pts, None, block_shape, ws=styles)
+ sigma_np[xi * block_resolution: xi * block_resolution + len(xs), \
+ yi * block_resolution: yi * block_resolution + len(ys), \
+ zi * block_resolution: zi * block_resolution + len(zs)] = sigma_out.reshape(block_shape[1:]).detach().cpu().numpy()
+
+ return sigma_np, bound
+
+
+def extract_geometry(nerf, styles, resolution, threshold):
+ import mcubes
+
+ print('threshold: {}'.format(threshold))
+ u, bound = get_sigma_field_np(nerf, styles, resolution)
+ vertices, triangles = mcubes.marching_cubes(u, threshold)
+ b_min_np = np.array([-bound, -bound, -bound])
+ b_max_np = np.array([ bound, bound, bound])
+
+ vertices = vertices / (resolution - 1.0) * (b_max_np - b_min_np)[None, :] + b_min_np[None, :]
+ return vertices.astype('float32'), triangles
+
+
+def render_mesh(meshes, camera_matrices, render_noise=True):
+ from pytorch3d.renderer import (
+ FoVPerspectiveCameras, look_at_view_transform,
+ RasterizationSettings, BlendParams,
+ MeshRenderer, MeshRasterizer, HardPhongShader, TexturesVertex
+ )
+ from pytorch3d.ops import interpolate_face_attributes
+ from pytorch3d.structures.meshes import Meshes
+
+ intrinsics, poses, _, _ = camera_matrices
+ device = poses.device
+ c2w = torch.matmul(poses, torch.diag(torch.tensor([-1.0, 1.0, -1.0, 1.0], device=device))[None, :, :]) # Different camera model...
+ w2c = torch.inverse(c2w)
+ R = c2w[:, :3, :3]
+ T = w2c[:, :3, 3]  # PyTorch3D uses a row-vector convention: R comes from c2w, while T comes from w2c
+ focal = intrinsics[0, 0, 0]
+ fov = torch.arctan(focal) * 2.0 / np.pi * 180
+
+
+ colors = []
+ offset = 1
+ for res, (mesh, face_vert_noise) in meshes.items():
+ raster_settings = RasterizationSettings(
+ image_size=res,
+ blur_radius=0.0,
+ faces_per_pixel=1,
+ )
+ mesh = Meshes(
+ verts=[torch.from_numpy(mesh.vertices).float().to(device)],
+ faces=[torch.from_numpy(mesh.faces).long().to(device)])
+
+ _colors = []
+ for i in range(len(poses)):
+ cameras = FoVPerspectiveCameras(device=device, R=R[i: i+1], T=T[i: i+1], fov=fov)
+ rasterizer = MeshRasterizer(cameras=cameras, raster_settings=raster_settings)
+ pix_to_face, zbuf, bary_coord, dists = rasterizer(mesh)
+ color = interpolate_face_attributes(pix_to_face, bary_coord, face_vert_noise).squeeze()
+
+ # hack
+ color[offset:, offset:] = color[:-offset, :-offset]
+ _colors += [color]
+ color = torch.stack(_colors, 0).permute(0,3,1,2)
+ colors += [color]
+ offset *= 2
+ return colors
+
+
+def rotate_vects(v, theta):
+ theta = theta / math.pi * 2
+ theta = theta + (theta < 0).type_as(theta) * 4
+ v = v.reshape(v.size(0), v.size(1) // 4, 4, v.size(2), v.size(3))
+ vs = []
+ order = [0,2,3,1] # Not working
+ iorder = [0,3,1,2] # Not working
+ for b in range(len(v)):
+ if (theta[b] - 0) < 1e-6:
+ u, l = 0, 0
+ elif (theta[b] - 1) < 1e-6:
+ u, l = 0, 1
+ elif (theta[b] - 2) < 1e-6:
+ u, l = 0, 2
+ elif (theta[b] - 3) < 1e-6:
+ u, l = 0, 3
+ else:
+ u, l = math.modf(theta[b])
+ l, r = int(l), int(l + 1) % 4
+ vv = v[b, :, order] # 0 -> 1 -> 3 -> 2
+ vl = torch.cat([vv[:, l:], vv[:, :l]], 1)
+ if u > 0:
+ vr = torch.cat([vv[:, r:], vv[:, :r]], 1)
+ vv = vl * (1-u) + vr * u
+ else:
+ vv = vl
+ vs.append(vv[:, iorder])
+ v = torch.stack(vs, 0)
+ v = v.reshape(v.size(0), -1, v.size(-2), v.size(-1))
+ return v
+
+
+def generate_option_outputs(render_option):
+ # Produce debugging visualizations (not used in the normal rendering process).
+ # Note: this is a fragment lifted from the renderer's forward pass and relies on local
+ # variables defined there (e.g. camera_world, ray_vector, fg_depth_map, styles, reformat).
+ if ('depth' in render_option.split(',')):
+ img = camera_world[:, :1] + fg_depth_map * ray_vector
+ img = reformat(img, tgt_res)
+
+ if 'gradient' in render_option.split(','):
+ points = (camera_world[:,:,None]+di[:,:,:,None]*ray_vector[:,:,None]).reshape(
+ batch_size, tgt_res, tgt_res, di.size(-1), 3)
+ with torch.enable_grad():
+ gradients = self.fg_nerf.forward_style2(
+ points, None, [batch_size, tgt_res, di.size(-1), tgt_res], get_normal=True,
+ ws=styles, z_shape=z_shape_obj, z_app=z_app_obj).reshape(
+ batch_size, di.size(-1), 3, tgt_res * tgt_res).permute(0,3,1,2)
+ avg_grads = (gradients * fg_weights.unsqueeze(-1)).sum(-2)
+ normal = reformat(normalize(avg_grads, axis=2)[0], tgt_res)
+ img = normal
+
+ if 'value' in render_option.split(','):
+ fg_feat = fg_feat[:,:,3:].norm(dim=-1,keepdim=True)
+ img = reformat(fg_feat.repeat(1,1,3), tgt_res) / fg_feat.max() * 2 - 1
+
+ if 'opacity' in render_option.split(','):
+ opacity = bg_lambda.unsqueeze(-1).repeat(1,1,3) * 2 - 1
+ img = reformat(opacity, tgt_res)
+
+ if 'normal' in render_option.split(','):
+ shift_l, shift_r = img[:,:,2:,:], img[:,:,:-2,:]
+ shift_u, shift_d = img[:,:,:,2:], img[:,:,:,:-2]
+ diff_hor = normalize(shift_r - shift_l, axis=1)[0][:, :, :, 1:-1]
+ diff_ver = normalize(shift_u - shift_d, axis=1)[0][:, :, 1:-1, :]
+ normal = torch.cross(diff_hor, diff_ver, dim=1)
+ img = normalize(normal, axis=1)[0]
+
+ return {'full_out': (None, img), 'reg_loss': {}}
diff --git a/dnnlib/util.py b/dnnlib/util.py
new file mode 100755
index 0000000000000000000000000000000000000000..1646ede6427095f7a88d2f52d378c7e46a379346
--- /dev/null
+++ b/dnnlib/util.py
@@ -0,0 +1,531 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Miscellaneous utility classes and functions."""
+
+import ctypes
+import fnmatch
+import importlib
+import inspect
+import numpy as np
+import os
+import shutil
+import sys
+import types
+import io
+import pickle
+import re
+import requests
+import html
+import hashlib
+import glob
+import tempfile
+import urllib
+import urllib.request
+import uuid
+import torch
+
+from distutils.util import strtobool
+from typing import Any, List, Tuple, Union
+
+
+# Util classes
+# ------------------------------------------------------------------------------------------
+
+
+class EasyDict(dict):
+ """Convenience class that behaves like a dict but allows access with the attribute syntax."""
+
+ def __getattr__(self, name: str) -> Any:
+ try:
+ return self[name]
+ except KeyError:
+ raise AttributeError(name)
+
+ def __setattr__(self, name: str, value: Any) -> None:
+ self[name] = value
+
+ def __delattr__(self, name: str) -> None:
+ del self[name]
+
+
+class Logger(object):
+ """Redirect stderr to stdout, optionally print stdout to a file, and optionally force flushing on both stdout and the file."""
+
+ def __init__(self, file_name: str = None, file_mode: str = "w", should_flush: bool = True):
+ self.file = None
+
+ if file_name is not None:
+ self.file = open(file_name, file_mode)
+
+ self.should_flush = should_flush
+ self.stdout = sys.stdout
+ self.stderr = sys.stderr
+
+ sys.stdout = self
+ sys.stderr = self
+
+ def __enter__(self) -> "Logger":
+ return self
+
+ def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+ self.close()
+
+ def write(self, text: Union[str, bytes]) -> None:
+ """Write text to stdout (and a file) and optionally flush."""
+ if isinstance(text, bytes):
+ text = text.decode()
+ if len(text) == 0: # workaround for a bug in VSCode debugger: sys.stdout.write(''); sys.stdout.flush() => crash
+ return
+
+ if self.file is not None:
+ self.file.write(text)
+
+ self.stdout.write(text)
+
+ if self.should_flush:
+ self.flush()
+
+ def flush(self) -> None:
+ """Flush written text to both stdout and a file, if open."""
+ if self.file is not None:
+ self.file.flush()
+
+ self.stdout.flush()
+
+ def close(self) -> None:
+ """Flush, close possible files, and remove stdout/stderr mirroring."""
+ self.flush()
+
+ # if using multiple loggers, prevent closing in wrong order
+ if sys.stdout is self:
+ sys.stdout = self.stdout
+ if sys.stderr is self:
+ sys.stderr = self.stderr
+
+ if self.file is not None:
+ self.file.close()
+ self.file = None
+
+
+# Cache directories
+# ------------------------------------------------------------------------------------------
+
+_dnnlib_cache_dir = None
+
+def set_cache_dir(path: str) -> None:
+ global _dnnlib_cache_dir
+ _dnnlib_cache_dir = path
+
+def make_cache_dir_path(*paths: str) -> str:
+ if _dnnlib_cache_dir is not None:
+ return os.path.join(_dnnlib_cache_dir, *paths)
+ if 'DNNLIB_CACHE_DIR' in os.environ:
+ return os.path.join(os.environ['DNNLIB_CACHE_DIR'], *paths)
+ if 'HOME' in os.environ:
+ return os.path.join(os.environ['HOME'], '.cache', 'dnnlib', *paths)
+ if 'USERPROFILE' in os.environ:
+ return os.path.join(os.environ['USERPROFILE'], '.cache', 'dnnlib', *paths)
+ return os.path.join(tempfile.gettempdir(), '.cache', 'dnnlib', *paths)
+
+# Small util functions
+# ------------------------------------------------------------------------------------------
+
+
+def format_time(seconds: Union[int, float]) -> str:
+ """Convert the seconds to human readable string with days, hours, minutes and seconds."""
+ s = int(np.rint(seconds))
+
+ if s < 60:
+ return "{0}s".format(s)
+ elif s < 60 * 60:
+ return "{0}m {1:02}s".format(s // 60, s % 60)
+ elif s < 24 * 60 * 60:
+ return "{0}h {1:02}m {2:02}s".format(s // (60 * 60), (s // 60) % 60, s % 60)
+ else:
+ return "{0}d {1:02}h {2:02}m".format(s // (24 * 60 * 60), (s // (60 * 60)) % 24, (s // 60) % 60)
+
+
+def ask_yes_no(question: str) -> bool:
+ """Ask the user the question until the user inputs a valid answer."""
+ while True:
+ try:
+ print("{0} [y/n]".format(question))
+ return strtobool(input().lower())
+ except ValueError:
+ pass
+
+
+def tuple_product(t: Tuple) -> Any:
+ """Calculate the product of the tuple elements."""
+ result = 1
+
+ for v in t:
+ result *= v
+
+ return result
+
+
+_str_to_ctype = {
+ "uint8": ctypes.c_ubyte,
+ "uint16": ctypes.c_uint16,
+ "uint32": ctypes.c_uint32,
+ "uint64": ctypes.c_uint64,
+ "int8": ctypes.c_byte,
+ "int16": ctypes.c_int16,
+ "int32": ctypes.c_int32,
+ "int64": ctypes.c_int64,
+ "float32": ctypes.c_float,
+ "float64": ctypes.c_double
+}
+
+
+def get_dtype_and_ctype(type_obj: Any) -> Tuple[np.dtype, Any]:
+ """Given a type name string (or an object having a __name__ attribute), return matching Numpy and ctypes types that have the same size in bytes."""
+ type_str = None
+
+ if isinstance(type_obj, str):
+ type_str = type_obj
+ elif hasattr(type_obj, "__name__"):
+ type_str = type_obj.__name__
+ elif hasattr(type_obj, "name"):
+ type_str = type_obj.name
+ else:
+ raise RuntimeError("Cannot infer type name from input")
+
+ assert type_str in _str_to_ctype.keys()
+
+ my_dtype = np.dtype(type_str)
+ my_ctype = _str_to_ctype[type_str]
+
+ assert my_dtype.itemsize == ctypes.sizeof(my_ctype)
+
+ return my_dtype, my_ctype
+
+
+def is_pickleable(obj: Any) -> bool:
+ try:
+ with io.BytesIO() as stream:
+ pickle.dump(obj, stream)
+ return True
+ except:
+ return False
+
+
+# Functionality to import modules/objects by name, and call functions by name
+# ------------------------------------------------------------------------------------------
+
+def get_module_from_obj_name(obj_name: str) -> Tuple[types.ModuleType, str]:
+ """Searches for the underlying module behind the name to some python object.
+ Returns the module and the object name (original name with module part removed)."""
+
+ # allow convenience shorthands, substitute them by full names
+ obj_name = re.sub("^np.", "numpy.", obj_name)
+ obj_name = re.sub("^tf.", "tensorflow.", obj_name)
+
+ # list alternatives for (module_name, local_obj_name)
+ parts = obj_name.split(".")
+ name_pairs = [(".".join(parts[:i]), ".".join(parts[i:])) for i in range(len(parts), 0, -1)]
+
+ # try each alternative in turn
+ for module_name, local_obj_name in name_pairs:
+ try:
+ module = importlib.import_module(module_name) # may raise ImportError
+ get_obj_from_module(module, local_obj_name) # may raise AttributeError
+ return module, local_obj_name
+ except:
+ pass
+
+ # maybe some of the modules themselves contain errors?
+ for module_name, _local_obj_name in name_pairs:
+ try:
+ importlib.import_module(module_name) # may raise ImportError
+ except ImportError:
+ if not str(sys.exc_info()[1]).startswith("No module named '" + module_name + "'"):
+ raise
+
+ # maybe the requested attribute is missing?
+ for module_name, local_obj_name in name_pairs:
+ try:
+ module = importlib.import_module(module_name) # may raise ImportError
+ get_obj_from_module(module, local_obj_name) # may raise AttributeError
+ except ImportError:
+ pass
+
+ # we are out of luck, but we have no idea why
+ raise ImportError(obj_name)
+
+
+def get_obj_from_module(module: types.ModuleType, obj_name: str) -> Any:
+ """Traverses the object name and returns the last (rightmost) python object."""
+ if obj_name == '':
+ return module
+ obj = module
+ for part in obj_name.split("."):
+ obj = getattr(obj, part)
+ return obj
+
+
+def get_obj_by_name(name: str) -> Any:
+ """Finds the python object with the given name."""
+ module, obj_name = get_module_from_obj_name(name)
+ return get_obj_from_module(module, obj_name)
+
+
+def call_func_by_name(*args, func_name: str = None, **kwargs) -> Any:
+ """Finds the python object with the given name and calls it as a function."""
+ assert func_name is not None
+ func_obj = get_obj_by_name(func_name)
+ assert callable(func_obj)
+ return func_obj(*args, **kwargs)
+
+
+def construct_class_by_name(*args, class_name: str = None, **kwargs) -> Any:
+ """Finds the python class with the given name and constructs it with the given arguments."""
+ return call_func_by_name(*args, func_name=class_name, **kwargs)
+
+
+def get_module_dir_by_obj_name(obj_name: str) -> str:
+ """Get the directory path of the module containing the given object name."""
+ module, _ = get_module_from_obj_name(obj_name)
+ return os.path.dirname(inspect.getfile(module))
+
+
+def is_top_level_function(obj: Any) -> bool:
+ """Determine whether the given object is a top-level function, i.e., defined at module scope using 'def'."""
+ return callable(obj) and obj.__name__ in sys.modules[obj.__module__].__dict__
+
+
+def get_top_level_function_name(obj: Any) -> str:
+ """Return the fully-qualified name of a top-level function."""
+ assert is_top_level_function(obj)
+ module = obj.__module__
+ if module == '__main__':
+ module = os.path.splitext(os.path.basename(sys.modules[module].__file__))[0]
+ return module + "." + obj.__name__
+
+
+# File system helpers
+# ------------------------------------------------------------------------------------------
+
+def list_dir_recursively_with_ignore(dir_path: str, ignores: List[str] = None, add_base_to_relative: bool = False) -> List[Tuple[str, str]]:
+ """List all files recursively in a given directory while ignoring given file and directory names.
+ Returns list of tuples containing both absolute and relative paths."""
+ assert os.path.isdir(dir_path)
+ base_name = os.path.basename(os.path.normpath(dir_path))
+
+ if ignores is None:
+ ignores = []
+
+ result = []
+
+ for root, dirs, files in os.walk(dir_path, topdown=True):
+ for ignore_ in ignores:
+ dirs_to_remove = [d for d in dirs if fnmatch.fnmatch(d, ignore_)]
+
+ # dirs need to be edited in-place
+ for d in dirs_to_remove:
+ dirs.remove(d)
+
+ files = [f for f in files if not fnmatch.fnmatch(f, ignore_)]
+
+ absolute_paths = [os.path.join(root, f) for f in files]
+ relative_paths = [os.path.relpath(p, dir_path) for p in absolute_paths]
+
+ if add_base_to_relative:
+ relative_paths = [os.path.join(base_name, p) for p in relative_paths]
+
+ assert len(absolute_paths) == len(relative_paths)
+ result += zip(absolute_paths, relative_paths)
+
+ return result
+
+
+def copy_files_and_create_dirs(files: List[Tuple[str, str]]) -> None:
+ """Takes in a list of tuples of (src, dst) paths and copies files.
+ Will create all necessary directories."""
+ for file in files:
+ target_dir_name = os.path.dirname(file[1])
+
+ # will create all intermediate-level directories
+ if not os.path.exists(target_dir_name):
+ os.makedirs(target_dir_name)
+
+ shutil.copyfile(file[0], file[1])
+
+
+# URL helpers
+# ------------------------------------------------------------------------------------------
+
+def is_url(obj: Any, allow_file_urls: bool = False) -> bool:
+ """Determine whether the given object is a valid URL string."""
+ if not isinstance(obj, str) or not "://" in obj:
+ return False
+ if allow_file_urls and obj.startswith('file://'):
+ return True
+ try:
+ res = requests.compat.urlparse(obj)
+ if not res.scheme or not res.netloc or not "." in res.netloc:
+ return False
+ res = requests.compat.urlparse(requests.compat.urljoin(obj, "/"))
+ if not res.scheme or not res.netloc or not "." in res.netloc:
+ return False
+ except:
+ return False
+ return True
+
+
+def open_url(url: str, cache_dir: str = None, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False, cache: bool = True) -> Any:
+ """Download the given URL and return a binary-mode file object to access the data."""
+ assert num_attempts >= 1
+ assert not (return_filename and (not cache))
+
+ # Doesn't look like a URL scheme, so interpret it as a local filename.
+ if not re.match('^[a-z]+://', url):
+ return url if return_filename else open(url, "rb")
+
+ # Handle file URLs. This code handles unusual file:// patterns that
+ # arise on Windows:
+ #
+ # file:///c:/foo.txt
+ #
+ # which would translate to a local '/c:/foo.txt' filename that's
+ # invalid. Drop the forward slash for such pathnames.
+ #
+ # If you touch this code path, you should test it on both Linux and
+ # Windows.
+ #
+ # Some internet resources suggest using urllib.request.url2pathname() but
+ # that converts forward slashes to backslashes and this causes
+ # its own set of problems.
+ if url.startswith('file://'):
+ filename = urllib.parse.urlparse(url).path
+ if re.match(r'^/[a-zA-Z]:', filename):
+ filename = filename[1:]
+ return filename if return_filename else open(filename, "rb")
+
+ assert is_url(url)
+
+ # Lookup from cache.
+ if cache_dir is None:
+ cache_dir = make_cache_dir_path('downloads')
+
+ url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest()
+ if cache:
+ cache_files = glob.glob(os.path.join(cache_dir, url_md5 + "_*"))
+ if len(cache_files) == 1:
+ filename = cache_files[0]
+ return filename if return_filename else open(filename, "rb")
+
+ # Download.
+ url_name = None
+ url_data = None
+ with requests.Session() as session:
+ if verbose:
+ print("Downloading %s ..." % url, end="", flush=True)
+ for attempts_left in reversed(range(num_attempts)):
+ try:
+ with session.get(url) as res:
+ res.raise_for_status()
+ if len(res.content) == 0:
+ raise IOError("No data received")
+
+ if len(res.content) < 8192:
+ content_str = res.content.decode("utf-8")
+ if "download_warning" in res.headers.get("Set-Cookie", ""):
+ links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link]
+ if len(links) == 1:
+ url = requests.compat.urljoin(url, links[0])
+ raise IOError("Google Drive virus checker nag")
+ if "Google Drive - Quota exceeded" in content_str:
+ raise IOError("Google Drive download quota exceeded -- please try again later")
+
+ match = re.search(r'filename="([^"]*)"', res.headers.get("Content-Disposition", ""))
+ url_name = match[1] if match else url
+ url_data = res.content
+ if verbose:
+ print(" done")
+ break
+ except KeyboardInterrupt:
+ raise
+ except:
+ if not attempts_left:
+ if verbose:
+ print(" failed")
+ raise
+ if verbose:
+ print(".", end="", flush=True)
+
+ # Save to cache.
+ if cache:
+ safe_name = re.sub(r"[^0-9a-zA-Z-._]", "_", url_name)
+ cache_file = os.path.join(cache_dir, url_md5 + "_" + safe_name)
+ temp_file = os.path.join(cache_dir, "tmp_" + uuid.uuid4().hex + "_" + url_md5 + "_" + safe_name)
+ os.makedirs(cache_dir, exist_ok=True)
+ with open(temp_file, "wb") as f:
+ f.write(url_data)
+ os.replace(temp_file, cache_file) # atomic
+ if return_filename:
+ return cache_file
+
+ # Return data as file object.
+ assert not return_filename
+ return io.BytesIO(url_data)
+
+
+def dividable(n, k=2):
+ if k == 2:
+ for i in range(int(np.sqrt(n)), 0, -1):
+ if n % i == 0:
+ break
+ return i, n // i
+ elif k == 3:
+ for i in range(int(float(n) ** (1/3)), 0, -1):
+ if n % i == 0:
+ b, c = dividable(n // i, 2)
+ return i, b, c
+ else:
+ raise NotImplementedError
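+
+# Examples: dividable(12) -> (3, 4); dividable(12, k=3) -> (2, 2, 3).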
+
+
+def visualize_feature_map(x, scale=1.0, mask=None, loc=None):
+ B, C, H, W = x.size()
+ lh, lw = dividable(C)
+ x = x.reshape(B, lh, lw, H, W).permute(0,1,3,2,4)
+ loc = [(3, 11), (5, 3), (3, 9)]  # hard-coded (block, column) picks; overrides the `loc` argument, so the branch below is never taken
+ if loc is None:
+ x = x.reshape(B, 1, lh*H, lw*W).repeat(1,3,1,1)
+ else:
+ x = [x[:, l[0], :, l[1]] for l in loc]
+ x = torch.stack(x, 1)
+ x = x / x.norm(dim=1, keepdim=True)
+ x = x / scale
+ return x
+
+
+def hash_func(x, res, T):
+ d = x.size(-1)
+ assert d <= 3
+
+ h = x[..., 0].clone()  # clone so the in-place accumulation below does not modify x
+ if res ** d < T:
+ f = [1, res, res * res]
+ for i in range(1, d):
+ h += x[..., i] * f[i]
+ else:
+ f = [1, 19349663, 83492791]
+ for i in range(1, d):
+ h = torch.bitwise_xor(h, x[..., i] * f[i])
+ h = h % T
+ return h
\ No newline at end of file
diff --git a/generate.py b/generate.py
new file mode 100755
index 0000000000000000000000000000000000000000..275ad074a6eeb0fa36fa3faa0b9d6cf336adf3c8
--- /dev/null
+++ b/generate.py
@@ -0,0 +1,202 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Generate images using pretrained network pickle."""
+
+import os
+import re
+import time
+import glob
+from typing import List, Optional
+
+import click
+import dnnlib
+import numpy as np
+import PIL.Image
+import torch
+import imageio
+import legacy
+from renderer import Renderer
+
+#----------------------------------------------------------------------------
+
+def num_range(s: str) -> List[int]:
+ '''Accept either a comma separated list of numbers 'a,b,c' or a range 'a-c' and return as a list of ints.'''
+
+ range_re = re.compile(r'^(\d+)-(\d+)$')
+ m = range_re.match(s)
+ if m:
+ return list(range(int(m.group(1)), int(m.group(2))+1))
+ vals = s.split(',')
+ return [int(x) for x in vals]
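+
+# Examples: num_range('1-3') -> [1, 2, 3]; num_range('0,2,5') -> [0, 2, 5].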
+
+#----------------------------------------------------------------------------
+os.environ['PYOPENGL_PLATFORM'] = 'egl'
+
+@click.command()
+@click.pass_context
+@click.option('--network', 'network_pkl', help='Network pickle filename', required=True)
+@click.option('--seeds', type=num_range, help='List of random seeds')
+@click.option('--trunc', 'truncation_psi', type=float, help='Truncation psi', default=1, show_default=True)
+@click.option('--class', 'class_idx', type=int, help='Class label (unconditional if not specified)')
+@click.option('--noise-mode', help='Noise mode', type=click.Choice(['const', 'random', 'none']), default='const', show_default=True)
+@click.option('--projected-w', help='Projection result file', type=str, metavar='FILE')
+@click.option('--outdir', help='Where to save the output images', type=str, required=True, metavar='DIR')
+@click.option('--render-program', default=None, show_default=True)
+@click.option('--render-option', default=None, type=str, help="e.g. up_256, camera, depth")
+@click.option('--n_steps', default=8, type=int, help="number of steps for each seed")
+@click.option('--no-video', default=False)
+@click.option('--relative_range_u_scale', default=1.0, type=float, help="relative scale on top of the original range u")
+def generate_images(
+ ctx: click.Context,
+ network_pkl: str,
+ seeds: Optional[List[int]],
+ truncation_psi: float,
+ noise_mode: str,
+ outdir: str,
+ class_idx: Optional[int],
+ projected_w: Optional[str],
+ render_program=None,
+ render_option=None,
+ n_steps=8,
+ no_video=False,
+ relative_range_u_scale=1.0
+):
+
+
+ device = torch.device('cuda')
+ if os.path.isdir(network_pkl):
+ network_pkl = sorted(glob.glob(network_pkl + '/*.pkl'))[-1]
+ print('Loading networks from "%s"...' % network_pkl)
+
+ with dnnlib.util.open_url(network_pkl) as f:
+ network = legacy.load_network_pkl(f)
+ G = network['G_ema'].to(device) # type: ignore
+ D = network['D'].to(device)
+ # from fairseq import pdb;pdb.set_trace()
+ os.makedirs(outdir, exist_ok=True)
+
+ # Labels.
+ label = torch.zeros([1, G.c_dim], device=device)
+ if G.c_dim != 0:
+ if class_idx is None:
+ ctx.fail('Must specify class label with --class when using a conditional network')
+ label[:, class_idx] = 1
+ else:
+ if class_idx is not None:
+ print ('warn: --class=lbl ignored when running on an unconditional network')
+
+ # avoid persistent classes...
+ from training.networks import Generator
+ # from training.stylenerf import Discriminator
+ from torch_utils import misc
+ with torch.no_grad():
+ G2 = Generator(*G.init_args, **G.init_kwargs).to(device)
+ misc.copy_params_and_buffers(G, G2, require_all=False)
+ # D2 = Discriminator(*D.init_args, **D.init_kwargs).to(device)
+ # misc.copy_params_and_buffers(D, D2, require_all=False)
+ G2 = Renderer(G2, D, program=render_program)
+
+ # Generate images.
+ all_imgs = []
+
+ def stack_imgs(imgs):
+ img = torch.stack(imgs, dim=2)
+ return img.reshape(img.size(0) * img.size(1), img.size(2) * img.size(3), 3)
+
+ def proc_img(img):
+ return (img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8).cpu()
+
+ if projected_w is not None:
+ ws = np.load(projected_w)
+ ws = torch.tensor(ws, device=device) # pylint: disable=not-callable
+ img = G2(styles=ws, truncation_psi=truncation_psi, noise_mode=noise_mode, render_option=render_option)
+ assert isinstance(img, List)
+ imgs = [proc_img(i) for i in img]
+ all_imgs += [imgs]
+
+ else:
+ for seed_idx, seed in enumerate(seeds):
+ print('Generating image for seed %d (%d/%d) ...' % (seed, seed_idx, len(seeds)))
+ G2.set_random_seed(seed)
+ z = torch.from_numpy(np.random.RandomState(seed).randn(2, G.z_dim)).to(device)
+ relative_range_u = [0.5 - 0.5 * relative_range_u_scale, 0.5 + 0.5 * relative_range_u_scale]
+ outputs = G2(
+ z=z,
+ c=label,
+ truncation_psi=truncation_psi,
+ noise_mode=noise_mode,
+ render_option=render_option,
+ n_steps=n_steps,
+ relative_range_u=relative_range_u,
+ return_cameras=True)
+ if isinstance(outputs, tuple):
+ img, cameras = outputs
+ else:
+ img = outputs
+
+ if isinstance(img, List):
+ imgs = [proc_img(i) for i in img]
+ if not no_video:
+ all_imgs += [imgs]
+
+ curr_out_dir = os.path.join(outdir, 'seed_{:0>6d}'.format(seed))
+ os.makedirs(curr_out_dir, exist_ok=True)
+
+ if (render_option is not None) and ("gen_ibrnet_metadata" in render_option):
+ intrinsics = []
+ poses = []
+ _, H, W, _ = imgs[0].shape
+ for i, camera in enumerate(cameras):
+ intri, pose, _, _ = camera
+ focal = (H - 1) * 0.5 / intri[0, 0, 0].item()
+ intri = np.diag([focal, focal, 1.0, 1.0]).astype(np.float32)
+ intri[0, 2], intri[1, 2] = (W - 1) * 0.5, (H - 1) * 0.5
+
+ pose = pose.squeeze().detach().cpu().numpy() @ np.diag([1, -1, -1, 1]).astype(np.float32)
+ intrinsics.append(intri)
+ poses.append(pose)
+
+ intrinsics = np.stack(intrinsics, axis=0)
+ poses = np.stack(poses, axis=0)
+
+ np.savez(os.path.join(curr_out_dir, 'cameras.npz'), intrinsics=intrinsics, poses=poses)
+ with open(os.path.join(curr_out_dir, 'meta.conf'), 'w') as f:
+ f.write('depth_range = {}\ntest_hold_out = {}\nheight = {}\nwidth = {}'.
+ format(G2.generator.synthesis.depth_range, 2, H, W))
+
+ img_dir = os.path.join(curr_out_dir, 'images_raw')
+ os.makedirs(img_dir, exist_ok=True)
+ for step, img in enumerate(imgs):
+ PIL.Image.fromarray(img[0].detach().cpu().numpy(), 'RGB').save(f'{img_dir}/{step:03d}.png')
+
+ else:
+ img = proc_img(img)[0]
+ PIL.Image.fromarray(img.numpy(), 'RGB').save(f'{outdir}/seed_{seed:0>6d}.png')
+
+ if len(all_imgs) > 0 and (not no_video):
+ # write to video
+ timestamp = time.strftime('%Y%m%d.%H%M%S',time.localtime(time.time()))
+ seeds = ','.join([str(s) for s in seeds]) if seeds is not None else 'projected'
+ network_pkl = network_pkl.split('/')[-1].split('.')[0]
+ all_imgs = [stack_imgs([a[k] for a in all_imgs]).numpy() for k in range(len(all_imgs[0]))]
+ imageio.mimwrite(f'{outdir}/{network_pkl}_{timestamp}_{seeds}.mp4', all_imgs, fps=30, quality=8)
+ outdir = f'{outdir}/{network_pkl}_{timestamp}_{seeds}'
+ os.makedirs(outdir, exist_ok=True)
+ for step, img in enumerate(all_imgs):
+ PIL.Image.fromarray(img, 'RGB').save(f'{outdir}/{step:04d}.png')
+
+
+#----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+ generate_images() # pylint: disable=no-value-for-parameter
+
+#----------------------------------------------------------------------------
diff --git a/gui_utils/__init__.py b/gui_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..939e7c6c8f94c4ea1141885c3c3295fe083b06aa
--- /dev/null
+++ b/gui_utils/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+# empty
diff --git a/gui_utils/gl_utils.py b/gui_utils/gl_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..64b6cb6510ab2f5075effe8684d824f50bd38272
--- /dev/null
+++ b/gui_utils/gl_utils.py
@@ -0,0 +1,374 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import os
+import functools
+import contextlib
+import numpy as np
+import OpenGL.GL as gl
+import OpenGL.GL.ARB.texture_float
+import dnnlib
+
+#----------------------------------------------------------------------------
+
+def init_egl():
+ assert os.environ['PYOPENGL_PLATFORM'] == 'egl' # Must be set before importing OpenGL.
+ import OpenGL.EGL as egl
+ import ctypes
+
+ # Initialize EGL.
+ display = egl.eglGetDisplay(egl.EGL_DEFAULT_DISPLAY)
+ assert display != egl.EGL_NO_DISPLAY
+ major = ctypes.c_int32()
+ minor = ctypes.c_int32()
+ ok = egl.eglInitialize(display, major, minor)
+ assert ok
+ assert major.value * 10 + minor.value >= 14
+
+ # Choose config.
+ config_attribs = [
+ egl.EGL_RENDERABLE_TYPE, egl.EGL_OPENGL_BIT,
+ egl.EGL_SURFACE_TYPE, egl.EGL_PBUFFER_BIT,
+ egl.EGL_NONE
+ ]
+ configs = (ctypes.c_int32 * 1)()
+ num_configs = ctypes.c_int32()
+ ok = egl.eglChooseConfig(display, config_attribs, configs, 1, num_configs)
+ assert ok
+ assert num_configs.value == 1
+ config = configs[0]
+
+ # Create dummy pbuffer surface.
+ surface_attribs = [
+ egl.EGL_WIDTH, 1,
+ egl.EGL_HEIGHT, 1,
+ egl.EGL_NONE
+ ]
+ surface = egl.eglCreatePbufferSurface(display, config, surface_attribs)
+ assert surface != egl.EGL_NO_SURFACE
+
+ # Setup GL context.
+ ok = egl.eglBindAPI(egl.EGL_OPENGL_API)
+ assert ok
+ context = egl.eglCreateContext(display, config, egl.EGL_NO_CONTEXT, None)
+ assert context != egl.EGL_NO_CONTEXT
+ ok = egl.eglMakeCurrent(display, surface, surface, context)
+ assert ok
+
+#----------------------------------------------------------------------------
+
+_texture_formats = {
+ ('uint8', 1): dnnlib.EasyDict(type=gl.GL_UNSIGNED_BYTE, format=gl.GL_LUMINANCE, internalformat=gl.GL_LUMINANCE8),
+ ('uint8', 2): dnnlib.EasyDict(type=gl.GL_UNSIGNED_BYTE, format=gl.GL_LUMINANCE_ALPHA, internalformat=gl.GL_LUMINANCE8_ALPHA8),
+ ('uint8', 3): dnnlib.EasyDict(type=gl.GL_UNSIGNED_BYTE, format=gl.GL_RGB, internalformat=gl.GL_RGB8),
+ ('uint8', 4): dnnlib.EasyDict(type=gl.GL_UNSIGNED_BYTE, format=gl.GL_RGBA, internalformat=gl.GL_RGBA8),
+ ('float32', 1): dnnlib.EasyDict(type=gl.GL_FLOAT, format=gl.GL_LUMINANCE, internalformat=OpenGL.GL.ARB.texture_float.GL_LUMINANCE32F_ARB),
+ ('float32', 2): dnnlib.EasyDict(type=gl.GL_FLOAT, format=gl.GL_LUMINANCE_ALPHA, internalformat=OpenGL.GL.ARB.texture_float.GL_LUMINANCE_ALPHA32F_ARB),
+ ('float32', 3): dnnlib.EasyDict(type=gl.GL_FLOAT, format=gl.GL_RGB, internalformat=gl.GL_RGB32F),
+ ('float32', 4): dnnlib.EasyDict(type=gl.GL_FLOAT, format=gl.GL_RGBA, internalformat=gl.GL_RGBA32F),
+}
+
+def get_texture_format(dtype, channels):
+ return _texture_formats[(np.dtype(dtype).name, int(channels))]
+
+#----------------------------------------------------------------------------
+
+def prepare_texture_data(image):
+ image = np.asarray(image)
+ if image.ndim == 2:
+ image = image[:, :, np.newaxis]
+ if image.dtype.name == 'float64':
+ image = image.astype('float32')
+ return image
+
+#----------------------------------------------------------------------------
+
+def draw_pixels(image, *, pos=0, zoom=1, align=0, rint=True):
+ pos = np.broadcast_to(np.asarray(pos, dtype='float32'), [2])
+ zoom = np.broadcast_to(np.asarray(zoom, dtype='float32'), [2])
+ align = np.broadcast_to(np.asarray(align, dtype='float32'), [2])
+ image = prepare_texture_data(image)
+ height, width, channels = image.shape
+ size = zoom * [width, height]
+ pos = pos - size * align
+ if rint:
+ pos = np.rint(pos)
+ fmt = get_texture_format(image.dtype, channels)
+
+ gl.glPushAttrib(gl.GL_CURRENT_BIT | gl.GL_PIXEL_MODE_BIT)
+ gl.glPushClientAttrib(gl.GL_CLIENT_PIXEL_STORE_BIT)
+ gl.glRasterPos2f(pos[0], pos[1])
+ gl.glPixelZoom(zoom[0], -zoom[1])
+ gl.glPixelStorei(gl.GL_UNPACK_ALIGNMENT, 1)
+ gl.glDrawPixels(width, height, fmt.format, fmt.type, image)
+ gl.glPopClientAttrib()
+ gl.glPopAttrib()
+
+#----------------------------------------------------------------------------
+
+def read_pixels(width, height, *, pos=0, dtype='uint8', channels=3):
+ pos = np.broadcast_to(np.asarray(pos, dtype='float32'), [2])
+ dtype = np.dtype(dtype)
+ fmt = get_texture_format(dtype, channels)
+ image = np.empty([height, width, channels], dtype=dtype)
+
+ gl.glPushClientAttrib(gl.GL_CLIENT_PIXEL_STORE_BIT)
+ gl.glPixelStorei(gl.GL_PACK_ALIGNMENT, 1)
+ gl.glReadPixels(int(np.round(pos[0])), int(np.round(pos[1])), width, height, fmt.format, fmt.type, image)
+ gl.glPopClientAttrib()
+ return np.flipud(image)
+
+#----------------------------------------------------------------------------
+
+class Texture:
+ def __init__(self, *, image=None, width=None, height=None, channels=None, dtype=None, bilinear=True, mipmap=True):
+ self.gl_id = None
+ self.bilinear = bilinear
+ self.mipmap = mipmap
+
+ # Determine size and dtype.
+ if image is not None:
+ image = prepare_texture_data(image)
+ self.height, self.width, self.channels = image.shape
+ self.dtype = image.dtype
+ else:
+ assert width is not None and height is not None
+ self.width = width
+ self.height = height
+ self.channels = channels if channels is not None else 3
+ self.dtype = np.dtype(dtype) if dtype is not None else np.uint8
+
+ # Validate size and dtype.
+ assert isinstance(self.width, int) and self.width >= 0
+ assert isinstance(self.height, int) and self.height >= 0
+ assert isinstance(self.channels, int) and self.channels >= 1
+ assert self.is_compatible(width=width, height=height, channels=channels, dtype=dtype)
+
+ # Create texture object.
+ self.gl_id = gl.glGenTextures(1)
+ with self.bind():
+ gl.glTexParameterf(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_WRAP_S, gl.GL_CLAMP_TO_EDGE)
+ gl.glTexParameterf(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_WRAP_T, gl.GL_CLAMP_TO_EDGE)
+ gl.glTexParameterf(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR if self.bilinear else gl.GL_NEAREST)
+ gl.glTexParameterf(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR_MIPMAP_LINEAR if self.mipmap else gl.GL_NEAREST)
+ self.update(image)
+
+ def delete(self):
+ if self.gl_id is not None:
+ gl.glDeleteTextures([self.gl_id])
+ self.gl_id = None
+
+ def __del__(self):
+ try:
+ self.delete()
+ except:
+ pass
+
+ @contextlib.contextmanager
+ def bind(self):
+ prev_id = gl.glGetInteger(gl.GL_TEXTURE_BINDING_2D)
+ gl.glBindTexture(gl.GL_TEXTURE_2D, self.gl_id)
+ yield
+ gl.glBindTexture(gl.GL_TEXTURE_2D, prev_id)
+
+ def update(self, image):
+ if image is not None:
+ image = prepare_texture_data(image)
+ assert self.is_compatible(image=image)
+ with self.bind():
+ fmt = get_texture_format(self.dtype, self.channels)
+ gl.glPushClientAttrib(gl.GL_CLIENT_PIXEL_STORE_BIT)
+ gl.glPixelStorei(gl.GL_UNPACK_ALIGNMENT, 1)
+ gl.glTexImage2D(gl.GL_TEXTURE_2D, 0, fmt.internalformat, self.width, self.height, 0, fmt.format, fmt.type, image)
+ if self.mipmap:
+ gl.glGenerateMipmap(gl.GL_TEXTURE_2D)
+ gl.glPopClientAttrib()
+
+ def draw(self, *, pos=0, zoom=1, align=0, rint=False, color=1, alpha=1, rounding=0):
+ zoom = np.broadcast_to(np.asarray(zoom, dtype='float32'), [2])
+ size = zoom * [self.width, self.height]
+ with self.bind():
+ gl.glPushAttrib(gl.GL_ENABLE_BIT)
+ gl.glEnable(gl.GL_TEXTURE_2D)
+ draw_rect(pos=pos, size=size, align=align, rint=rint, color=color, alpha=alpha, rounding=rounding)
+ gl.glPopAttrib()
+
+ def is_compatible(self, *, image=None, width=None, height=None, channels=None, dtype=None): # pylint: disable=too-many-return-statements
+ if image is not None:
+ if image.ndim != 3:
+ return False
+ ih, iw, ic = image.shape
+ if not self.is_compatible(width=iw, height=ih, channels=ic, dtype=image.dtype):
+ return False
+ if width is not None and self.width != width:
+ return False
+ if height is not None and self.height != height:
+ return False
+ if channels is not None and self.channels != channels:
+ return False
+ if dtype is not None and self.dtype != dtype:
+ return False
+ return True
+
+#----------------------------------------------------------------------------
+
+class Framebuffer:
+ def __init__(self, *, texture=None, width=None, height=None, channels=None, dtype=None, msaa=0):
+ self.texture = texture
+ self.gl_id = None
+ self.gl_color = None
+ self.gl_depth_stencil = None
+ self.msaa = msaa
+
+ # Determine size and dtype.
+ if texture is not None:
+ assert isinstance(self.texture, Texture)
+ self.width = texture.width
+ self.height = texture.height
+ self.channels = texture.channels
+ self.dtype = texture.dtype
+ else:
+ assert width is not None and height is not None
+ self.width = width
+ self.height = height
+ self.channels = channels if channels is not None else 4
+ self.dtype = np.dtype(dtype) if dtype is not None else np.float32
+
+ # Validate size and dtype.
+ assert isinstance(self.width, int) and self.width >= 0
+ assert isinstance(self.height, int) and self.height >= 0
+ assert isinstance(self.channels, int) and self.channels >= 1
+ assert width is None or width == self.width
+ assert height is None or height == self.height
+ assert channels is None or channels == self.channels
+ assert dtype is None or dtype == self.dtype
+
+ # Create framebuffer object.
+ self.gl_id = gl.glGenFramebuffers(1)
+ with self.bind():
+
+ # Setup color buffer.
+ if self.texture is not None:
+ assert self.msaa == 0
+ gl.glFramebufferTexture2D(gl.GL_FRAMEBUFFER, gl.GL_COLOR_ATTACHMENT0, gl.GL_TEXTURE_2D, self.texture.gl_id, 0)
+ else:
+ fmt = get_texture_format(self.dtype, self.channels)
+ self.gl_color = gl.glGenRenderbuffers(1)
+ gl.glBindRenderbuffer(gl.GL_RENDERBUFFER, self.gl_color)
+ gl.glRenderbufferStorageMultisample(gl.GL_RENDERBUFFER, self.msaa, fmt.internalformat, self.width, self.height)
+ gl.glFramebufferRenderbuffer(gl.GL_FRAMEBUFFER, gl.GL_COLOR_ATTACHMENT0, gl.GL_RENDERBUFFER, self.gl_color)
+
+ # Setup depth/stencil buffer.
+ self.gl_depth_stencil = gl.glGenRenderbuffers(1)
+ gl.glBindRenderbuffer(gl.GL_RENDERBUFFER, self.gl_depth_stencil)
+ gl.glRenderbufferStorageMultisample(gl.GL_RENDERBUFFER, self.msaa, gl.GL_DEPTH24_STENCIL8, self.width, self.height)
+ gl.glFramebufferRenderbuffer(gl.GL_FRAMEBUFFER, gl.GL_DEPTH_STENCIL_ATTACHMENT, gl.GL_RENDERBUFFER, self.gl_depth_stencil)
+
+ def delete(self):
+ if self.gl_id is not None:
+ gl.glDeleteFramebuffers([self.gl_id])
+ self.gl_id = None
+ if self.gl_color is not None:
+ gl.glDeleteRenderbuffers(1, [self.gl_color])
+ self.gl_color = None
+ if self.gl_depth_stencil is not None:
+ gl.glDeleteRenderbuffers(1, [self.gl_depth_stencil])
+ self.gl_depth_stencil = None
+
+ def __del__(self):
+ try:
+ self.delete()
+ except:
+ pass
+
+ @contextlib.contextmanager
+ def bind(self):
+ prev_fbo = gl.glGetInteger(gl.GL_FRAMEBUFFER_BINDING)
+ prev_rbo = gl.glGetInteger(gl.GL_RENDERBUFFER_BINDING)
+ gl.glBindFramebuffer(gl.GL_FRAMEBUFFER, self.gl_id)
+ if self.width is not None and self.height is not None:
+ gl.glViewport(0, 0, self.width, self.height)
+ yield
+ gl.glBindFramebuffer(gl.GL_FRAMEBUFFER, prev_fbo)
+ gl.glBindRenderbuffer(gl.GL_RENDERBUFFER, prev_rbo)
+
+ def blit(self, dst=None):
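+ # Copy this framebuffer's color buffer into 'dst', or into the default framebuffer when dst is None.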
+ assert dst is None or isinstance(dst, Framebuffer)
+ with self.bind():
+ gl.glBindFramebuffer(gl.GL_DRAW_FRAMEBUFFER, 0 if dst is None else dst.gl_id)
+ gl.glBlitFramebuffer(0, 0, self.width, self.height, 0, 0, self.width, self.height, gl.GL_COLOR_BUFFER_BIT, gl.GL_NEAREST)
+
+#----------------------------------------------------------------------------
+
+def draw_shape(vertices, *, mode=gl.GL_TRIANGLE_FAN, pos=0, size=1, color=1, alpha=1):
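+ # Draw a 2D shape with the fixed-function pipeline using client-side vertex arrays; the RGB
+ # color is premultiplied by alpha to match the GL_ONE / GL_ONE_MINUS_SRC_ALPHA blend mode.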
+ assert vertices.ndim == 2 and vertices.shape[1] == 2
+ pos = np.broadcast_to(np.asarray(pos, dtype='float32'), [2])
+ size = np.broadcast_to(np.asarray(size, dtype='float32'), [2])
+ color = np.broadcast_to(np.asarray(color, dtype='float32'), [3])
+ alpha = np.clip(np.broadcast_to(np.asarray(alpha, dtype='float32'), []), 0, 1)
+
+ gl.glPushClientAttrib(gl.GL_CLIENT_VERTEX_ARRAY_BIT)
+ gl.glPushAttrib(gl.GL_CURRENT_BIT | gl.GL_TRANSFORM_BIT)
+ gl.glMatrixMode(gl.GL_MODELVIEW)
+ gl.glPushMatrix()
+
+ gl.glEnableClientState(gl.GL_VERTEX_ARRAY)
+ gl.glEnableClientState(gl.GL_TEXTURE_COORD_ARRAY)
+ gl.glVertexPointer(2, gl.GL_FLOAT, 0, vertices)
+ gl.glTexCoordPointer(2, gl.GL_FLOAT, 0, vertices)
+ gl.glTranslate(pos[0], pos[1], 0)
+ gl.glScale(size[0], size[1], 1)
+ gl.glColor4f(color[0] * alpha, color[1] * alpha, color[2] * alpha, alpha)
+ gl.glDrawArrays(mode, 0, vertices.shape[0])
+
+ gl.glPopMatrix()
+ gl.glPopAttrib()
+ gl.glPopClientAttrib()
+
+#----------------------------------------------------------------------------
+
+def draw_rect(*, pos=0, pos2=None, size=None, align=0, rint=False, color=1, alpha=1, rounding=0):
+ assert pos2 is None or size is None
+ pos = np.broadcast_to(np.asarray(pos, dtype='float32'), [2])
+ pos2 = np.broadcast_to(np.asarray(pos2, dtype='float32'), [2]) if pos2 is not None else None
+ size = np.broadcast_to(np.asarray(size, dtype='float32'), [2]) if size is not None else None
+ size = size if size is not None else pos2 - pos if pos2 is not None else np.array([1, 1], dtype='float32')
+ pos = pos - size * align
+ if rint:
+ pos = np.rint(pos)
+ rounding = np.broadcast_to(np.asarray(rounding, dtype='float32'), [2])
+ rounding = np.minimum(np.abs(rounding) / np.maximum(np.abs(size), 1e-8), 0.5)
+ if np.min(rounding) == 0:
+ rounding *= 0
+ vertices = _setup_rect(float(rounding[0]), float(rounding[1]))
+ draw_shape(vertices, mode=gl.GL_TRIANGLE_FAN, pos=pos, size=size, color=color, alpha=alpha)
+
+@functools.lru_cache(maxsize=10000)
+def _setup_rect(rx, ry):
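+ # Vertices of a unit square [0, 1]^2 whose corners are rounded by quarter-circle arcs with
+ # relative radii (rx, ry); degenerates to the four plain corners when both radii are zero.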
+ t = np.linspace(0, np.pi / 2, 1 if max(rx, ry) == 0 else 64)
+ s = 1 - np.sin(t); c = 1 - np.cos(t)
+ x = [c * rx, 1 - s * rx, 1 - c * rx, s * rx]
+ y = [s * ry, c * ry, 1 - s * ry, 1 - c * ry]
+ v = np.stack([x, y], axis=-1).reshape(-1, 2)
+ return v.astype('float32')
+
+#----------------------------------------------------------------------------
+
+def draw_circle(*, center=0, radius=100, hole=0, color=1, alpha=1):
+ hole = np.broadcast_to(np.asarray(hole, dtype='float32'), [])
+ vertices = _setup_circle(float(hole))
+ draw_shape(vertices, mode=gl.GL_TRIANGLE_STRIP, pos=center, size=radius, color=color, alpha=alpha)
+
+@functools.lru_cache(maxsize=10000)
+def _setup_circle(hole):
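+ # Triangle strip covering an annulus with outer radius 1 and inner radius 'hole' (a full
+ # disk when hole == 0), sampled at 128 angles.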
+ t = np.linspace(0, np.pi * 2, 128)
+ s = np.sin(t); c = np.cos(t)
+ v = np.stack([c, s, c * hole, s * hole], axis=-1).reshape(-1, 2)
+ return v.astype('float32')
+
+#----------------------------------------------------------------------------
diff --git a/gui_utils/glfw_window.py b/gui_utils/glfw_window.py
new file mode 100644
index 0000000000000000000000000000000000000000..83264eb89a855ec5038cf255994ee2b4b3ddb5ee
--- /dev/null
+++ b/gui_utils/glfw_window.py
@@ -0,0 +1,229 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import time
+import glfw
+import OpenGL.GL as gl
+from . import gl_utils
+
+#----------------------------------------------------------------------------
+
+class GlfwWindow: # pylint: disable=too-many-public-methods
+ def __init__(self, *, title='GlfwWindow', window_width=1920, window_height=1080, deferred_show=True, close_on_esc=True):
+ self._glfw_window = None
+ self._drawing_frame = False
+ self._frame_start_time = None
+ self._frame_delta = 0
+ self._fps_limit = None
+ self._vsync = None
+ self._skip_frames = 0
+ self._deferred_show = deferred_show
+ self._close_on_esc = close_on_esc
+ self._esc_pressed = False
+ self._drag_and_drop_paths = None
+ self._capture_next_frame = False
+ self._captured_frame = None
+
+ # Create window.
+ glfw.init()
+ glfw.window_hint(glfw.VISIBLE, False)
+ self._glfw_window = glfw.create_window(width=window_width, height=window_height, title=title, monitor=None, share=None)
+ self._attach_glfw_callbacks()
+ self.make_context_current()
+
+ # Adjust window.
+ self.set_vsync(False)
+ self.set_window_size(window_width, window_height)
+ if not self._deferred_show:
+ glfw.show_window(self._glfw_window)
+
+ def close(self):
+ if self._drawing_frame:
+ self.end_frame()
+ if self._glfw_window is not None:
+ glfw.destroy_window(self._glfw_window)
+ self._glfw_window = None
+ #glfw.terminate() # Commented out to play nice with other glfw clients.
+
+ def __del__(self):
+ try:
+ self.close()
+ except:
+ pass
+
+ @property
+ def window_width(self):
+ return self.content_width
+
+ @property
+ def window_height(self):
+ return self.content_height + self.title_bar_height
+
+ @property
+ def content_width(self):
+ width, _height = glfw.get_window_size(self._glfw_window)
+ return width
+
+ @property
+ def content_height(self):
+ _width, height = glfw.get_window_size(self._glfw_window)
+ return height
+
+ @property
+ def title_bar_height(self):
+ _left, top, _right, _bottom = glfw.get_window_frame_size(self._glfw_window)
+ return top
+
+ @property
+ def monitor_width(self):
+ _, _, width, _height = glfw.get_monitor_workarea(glfw.get_primary_monitor())
+ return width
+
+ @property
+ def monitor_height(self):
+ _, _, _width, height = glfw.get_monitor_workarea(glfw.get_primary_monitor())
+ return height
+
+ @property
+ def frame_delta(self):
+ return self._frame_delta
+
+ def set_title(self, title):
+ glfw.set_window_title(self._glfw_window, title)
+
+ def set_window_size(self, width, height):
+ width = min(width, self.monitor_width)
+ height = min(height, self.monitor_height)
+ glfw.set_window_size(self._glfw_window, width, max(height - self.title_bar_height, 0))
+ if width == self.monitor_width and height == self.monitor_height:
+ self.maximize()
+
+ def set_content_size(self, width, height):
+ self.set_window_size(width, height + self.title_bar_height)
+
+ def maximize(self):
+ glfw.maximize_window(self._glfw_window)
+
+ def set_position(self, x, y):
+ glfw.set_window_pos(self._glfw_window, x, y + self.title_bar_height)
+
+ def center(self):
+ self.set_position((self.monitor_width - self.window_width) // 2, (self.monitor_height - self.window_height) // 2)
+
+ def set_vsync(self, vsync):
+ vsync = bool(vsync)
+ if vsync != self._vsync:
+ glfw.swap_interval(1 if vsync else 0)
+ self._vsync = vsync
+
+ def set_fps_limit(self, fps_limit):
+ self._fps_limit = int(fps_limit)
+
+ def should_close(self):
+ return glfw.window_should_close(self._glfw_window) or (self._close_on_esc and self._esc_pressed)
+
+ def skip_frame(self):
+ self.skip_frames(1)
+
+ def skip_frames(self, num): # Do not update window for the next N frames.
+ self._skip_frames = max(self._skip_frames, int(num))
+
+ def is_skipping_frames(self):
+ return self._skip_frames > 0
+
+ def capture_next_frame(self):
+ self._capture_next_frame = True
+
+ def pop_captured_frame(self):
+ frame = self._captured_frame
+ self._captured_frame = None
+ return frame
+
+ def pop_drag_and_drop_paths(self):
+ paths = self._drag_and_drop_paths
+ self._drag_and_drop_paths = None
+ return paths
+
+ def draw_frame(self): # To be overridden by subclass.
+ self.begin_frame()
+ # Rendering code goes here.
+ self.end_frame()
+
+ def make_context_current(self):
+ if self._glfw_window is not None:
+ glfw.make_context_current(self._glfw_window)
+
+ def begin_frame(self):
+ # End previous frame.
+ if self._drawing_frame:
+ self.end_frame()
+
+ # Apply FPS limit.
+ if self._frame_start_time is not None and self._fps_limit is not None:
+ delay = self._frame_start_time - time.perf_counter() + 1 / self._fps_limit
+ if delay > 0:
+ time.sleep(delay)
+ cur_time = time.perf_counter()
+ if self._frame_start_time is not None:
+ self._frame_delta = cur_time - self._frame_start_time
+ self._frame_start_time = cur_time
+
+ # Process events.
+ glfw.poll_events()
+
+ # Begin frame.
+ self._drawing_frame = True
+ self.make_context_current()
+
+ # Initialize GL state.
+ gl.glViewport(0, 0, self.content_width, self.content_height)
+ gl.glMatrixMode(gl.GL_PROJECTION)
+ gl.glLoadIdentity()
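+ # Map window pixels to clip space: (0, 0) becomes the top-left corner and
+ # (content_width, content_height) the bottom-right, i.e. the y-axis points down.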
+ gl.glTranslate(-1, 1, 0)
+ gl.glScale(2 / max(self.content_width, 1), -2 / max(self.content_height, 1), 1)
+ gl.glMatrixMode(gl.GL_MODELVIEW)
+ gl.glLoadIdentity()
+ gl.glEnable(gl.GL_BLEND)
+ gl.glBlendFunc(gl.GL_ONE, gl.GL_ONE_MINUS_SRC_ALPHA) # Pre-multiplied alpha.
+
+ # Clear.
+ gl.glClearColor(0, 0, 0, 1)
+ gl.glClear(gl.GL_COLOR_BUFFER_BIT | gl.GL_DEPTH_BUFFER_BIT)
+
+ def end_frame(self):
+ assert self._drawing_frame
+ self._drawing_frame = False
+
+ # Skip frames if requested.
+ if self._skip_frames > 0:
+ self._skip_frames -= 1
+ return
+
+ # Capture frame if requested.
+ if self._capture_next_frame:
+ self._captured_frame = gl_utils.read_pixels(self.content_width, self.content_height)
+ self._capture_next_frame = False
+
+ # Update window.
+ if self._deferred_show:
+ glfw.show_window(self._glfw_window)
+ self._deferred_show = False
+ glfw.swap_buffers(self._glfw_window)
+
+ def _attach_glfw_callbacks(self):
+ glfw.set_key_callback(self._glfw_window, self._glfw_key_callback)
+ glfw.set_drop_callback(self._glfw_window, self._glfw_drop_callback)
+
+ def _glfw_key_callback(self, _window, key, _scancode, action, _mods):
+ if action == glfw.PRESS and key == glfw.KEY_ESCAPE:
+ self._esc_pressed = True
+
+ def _glfw_drop_callback(self, _window, paths):
+ self._drag_and_drop_paths = paths
+
+#----------------------------------------------------------------------------
diff --git a/gui_utils/imgui_utils.py b/gui_utils/imgui_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..333024bd6999bf2b18a5cb96766c4da3798666a2
--- /dev/null
+++ b/gui_utils/imgui_utils.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import contextlib
+import imgui
+
+#----------------------------------------------------------------------------
+
+def set_default_style(color_scheme='dark', spacing=9, indent=23, scrollbar=27):
+ s = imgui.get_style()
+ s.window_padding = [spacing, spacing]
+ s.item_spacing = [spacing, spacing]
+ s.item_inner_spacing = [spacing, spacing]
+ s.columns_min_spacing = spacing
+ s.indent_spacing = indent
+ s.scrollbar_size = scrollbar
+ s.frame_padding = [4, 3]
+ s.window_border_size = 1
+ s.child_border_size = 1
+ s.popup_border_size = 1
+ s.frame_border_size = 1
+ s.window_rounding = 0
+ s.child_rounding = 0
+ s.popup_rounding = 3
+ s.frame_rounding = 3
+ s.scrollbar_rounding = 3
+ s.grab_rounding = 3
+
+ getattr(imgui, f'style_colors_{color_scheme}')(s)
+ c0 = s.colors[imgui.COLOR_MENUBAR_BACKGROUND]
+ c1 = s.colors[imgui.COLOR_FRAME_BACKGROUND]
+ s.colors[imgui.COLOR_POPUP_BACKGROUND] = [x * 0.7 + y * 0.3 for x, y in zip(c0, c1)][:3] + [1]
+
+#----------------------------------------------------------------------------
+
+@contextlib.contextmanager
+def grayed_out(cond=True):
+ if cond:
+ s = imgui.get_style()
+ text = s.colors[imgui.COLOR_TEXT_DISABLED]
+ grab = s.colors[imgui.COLOR_SCROLLBAR_GRAB]
+ back = s.colors[imgui.COLOR_MENUBAR_BACKGROUND]
+ imgui.push_style_color(imgui.COLOR_TEXT, *text)
+ imgui.push_style_color(imgui.COLOR_CHECK_MARK, *grab)
+ imgui.push_style_color(imgui.COLOR_SLIDER_GRAB, *grab)
+ imgui.push_style_color(imgui.COLOR_SLIDER_GRAB_ACTIVE, *grab)
+ imgui.push_style_color(imgui.COLOR_FRAME_BACKGROUND, *back)
+ imgui.push_style_color(imgui.COLOR_FRAME_BACKGROUND_HOVERED, *back)
+ imgui.push_style_color(imgui.COLOR_FRAME_BACKGROUND_ACTIVE, *back)
+ imgui.push_style_color(imgui.COLOR_BUTTON, *back)
+ imgui.push_style_color(imgui.COLOR_BUTTON_HOVERED, *back)
+ imgui.push_style_color(imgui.COLOR_BUTTON_ACTIVE, *back)
+ imgui.push_style_color(imgui.COLOR_HEADER, *back)
+ imgui.push_style_color(imgui.COLOR_HEADER_HOVERED, *back)
+ imgui.push_style_color(imgui.COLOR_HEADER_ACTIVE, *back)
+ imgui.push_style_color(imgui.COLOR_POPUP_BACKGROUND, *back)
+ yield
+ imgui.pop_style_color(14)
+ else:
+ yield
+
+#----------------------------------------------------------------------------
+
+@contextlib.contextmanager
+def item_width(width=None):
+ if width is not None:
+ imgui.push_item_width(width)
+ yield
+ imgui.pop_item_width()
+ else:
+ yield
+
+#----------------------------------------------------------------------------
+
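+# Decorator for widget-drawing methods: wraps each call in imgui.push_id(id(self)) / imgui.pop_id()
+# so that several instances of the same class can reuse identical widget labels without ID clashes.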
+def scoped_by_object_id(method):
+ def decorator(self, *args, **kwargs):
+ imgui.push_id(str(id(self)))
+ res = method(self, *args, **kwargs)
+ imgui.pop_id()
+ return res
+ return decorator
+
+#----------------------------------------------------------------------------
+
+def button(label, width=0, enabled=True):
+ with grayed_out(not enabled):
+ clicked = imgui.button(label, width=width)
+ clicked = clicked and enabled
+ return clicked
+
+#----------------------------------------------------------------------------
+
+def collapsing_header(text, visible=None, flags=0, default=False, enabled=True, show=True):
+ expanded = False
+ if show:
+ if default:
+ flags |= imgui.TREE_NODE_DEFAULT_OPEN
+ if not enabled:
+ flags |= imgui.TREE_NODE_LEAF
+ with grayed_out(not enabled):
+ expanded, visible = imgui.collapsing_header(text, visible=visible, flags=flags)
+ expanded = expanded and enabled
+ return expanded, visible
+
+#----------------------------------------------------------------------------
+
+def popup_button(label, width=0, enabled=True):
+ if button(label, width, enabled):
+ imgui.open_popup(label)
+ opened = imgui.begin_popup(label)
+ return opened
+
+#----------------------------------------------------------------------------
+
+def input_text(label, value, buffer_length, flags, width=None, help_text=''):
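+ # Render 'help_text' as a dimmed placeholder while 'value' is empty; the placeholder itself
+ # is never returned as the value.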
+ old_value = value
+ color = list(imgui.get_style().colors[imgui.COLOR_TEXT])
+ if value == '':
+ color[-1] *= 0.5
+ with item_width(width):
+ imgui.push_style_color(imgui.COLOR_TEXT, *color)
+ value = value if value != '' else help_text
+ changed, value = imgui.input_text(label, value, buffer_length, flags)
+ value = value if value != help_text else ''
+ imgui.pop_style_color(1)
+ if not flags & imgui.INPUT_TEXT_ENTER_RETURNS_TRUE:
+ changed = (value != old_value)
+ return changed, value
+
+#----------------------------------------------------------------------------
+
+def drag_previous_control(enabled=True):
+ dragging = False
+ dx = 0
+ dy = 0
+ if imgui.begin_drag_drop_source(imgui.DRAG_DROP_SOURCE_NO_PREVIEW_TOOLTIP):
+ if enabled:
+ dragging = True
+ dx, dy = imgui.get_mouse_drag_delta()
+ imgui.reset_mouse_drag_delta()
+ imgui.end_drag_drop_source()
+ return dragging, dx, dy
+
+#----------------------------------------------------------------------------
+
+def drag_button(label, width=0, enabled=True):
+ clicked = button(label, width=width, enabled=enabled)
+ dragging, dx, dy = drag_previous_control(enabled=enabled)
+ return clicked, dragging, dx, dy
+
+#----------------------------------------------------------------------------
+
+def drag_hidden_window(label, x, y, width, height, enabled=True):
+ imgui.push_style_color(imgui.COLOR_WINDOW_BACKGROUND, 0, 0, 0, 0)
+ imgui.push_style_color(imgui.COLOR_BORDER, 0, 0, 0, 0)
+ imgui.set_next_window_position(x, y)
+ imgui.set_next_window_size(width, height)
+ imgui.begin(label, closable=False, flags=(imgui.WINDOW_NO_TITLE_BAR | imgui.WINDOW_NO_RESIZE | imgui.WINDOW_NO_MOVE))
+ dragging, dx, dy = drag_previous_control(enabled=enabled)
+ imgui.end()
+ imgui.pop_style_color(2)
+ return dragging, dx, dy
+
+#----------------------------------------------------------------------------
diff --git a/gui_utils/imgui_window.py b/gui_utils/imgui_window.py
new file mode 100644
index 0000000000000000000000000000000000000000..30d539a1382def526050c83978d1118348ac77ad
--- /dev/null
+++ b/gui_utils/imgui_window.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import os
+import imgui
+import imgui.integrations.glfw
+
+from . import glfw_window
+from . import imgui_utils
+from . import text_utils
+
+#----------------------------------------------------------------------------
+
+class ImguiWindow(glfw_window.GlfwWindow):
+ def __init__(self, *, title='ImguiWindow', font=None, font_sizes=range(14,24), **glfw_kwargs):
+ if font is None:
+ font = text_utils.get_default_font()
+ font_sizes = {int(size) for size in font_sizes}
+ super().__init__(title=title, **glfw_kwargs)
+
+ # Init fields.
+ self._imgui_context = None
+ self._imgui_renderer = None
+ self._imgui_fonts = None
+ self._cur_font_size = max(font_sizes)
+
+ # Delete leftover imgui.ini to avoid unexpected behavior.
+ if os.path.isfile('imgui.ini'):
+ os.remove('imgui.ini')
+
+ # Init ImGui.
+ self._imgui_context = imgui.create_context()
+ self._imgui_renderer = _GlfwRenderer(self._glfw_window)
+ self._attach_glfw_callbacks()
+ imgui.get_io().ini_saving_rate = 0 # Disable creating imgui.ini at runtime.
+ imgui.get_io().mouse_drag_threshold = 0 # Improve behavior with the imgui_utils drag helpers (e.g. drag_button()).
+ self._imgui_fonts = {size: imgui.get_io().fonts.add_font_from_file_ttf(font, size) for size in font_sizes}
+ self._imgui_renderer.refresh_font_texture()
+
+ def close(self):
+ self.make_context_current()
+ self._imgui_fonts = None
+ if self._imgui_renderer is not None:
+ self._imgui_renderer.shutdown()
+ self._imgui_renderer = None
+ if self._imgui_context is not None:
+ #imgui.destroy_context(self._imgui_context) # Commented out to avoid creating imgui.ini at the end.
+ self._imgui_context = None
+ super().close()
+
+ def _glfw_key_callback(self, *args):
+ super()._glfw_key_callback(*args)
+ self._imgui_renderer.keyboard_callback(*args)
+
+ @property
+ def font_size(self):
+ return self._cur_font_size
+
+ @property
+ def spacing(self):
+ return round(self._cur_font_size * 0.4)
+
+ def set_font_size(self, target): # Applied on next frame.
+ self._cur_font_size = min((abs(key - target), key) for key in self._imgui_fonts.keys())[1]
+
+ def begin_frame(self):
+ # Begin glfw frame.
+ super().begin_frame()
+
+ # Process imgui events.
+ self._imgui_renderer.mouse_wheel_multiplier = self._cur_font_size / 10
+ if self.content_width > 0 and self.content_height > 0:
+ self._imgui_renderer.process_inputs()
+
+ # Begin imgui frame.
+ imgui.new_frame()
+ imgui.push_font(self._imgui_fonts[self._cur_font_size])
+ imgui_utils.set_default_style(spacing=self.spacing, indent=self.font_size, scrollbar=self.font_size+4)
+
+ def end_frame(self):
+ imgui.pop_font()
+ imgui.render()
+ imgui.end_frame()
+ self._imgui_renderer.render(imgui.get_draw_data())
+ super().end_frame()
+
+#----------------------------------------------------------------------------
+# Wrapper class for GlfwRenderer to fix a mouse wheel bug on Linux.
+
+class _GlfwRenderer(imgui.integrations.glfw.GlfwRenderer):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.mouse_wheel_multiplier = 1
+
+ def scroll_callback(self, window, x_offset, y_offset):
+ self.io.mouse_wheel += y_offset * self.mouse_wheel_multiplier
+
+#----------------------------------------------------------------------------
diff --git a/gui_utils/text_utils.py b/gui_utils/text_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..35e5e4a16dc62c4be80df5432208bce5d386bf16
--- /dev/null
+++ b/gui_utils/text_utils.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import functools
+from typing import Optional
+
+import dnnlib
+import numpy as np
+import PIL.Image
+import PIL.ImageFont
+import scipy.ndimage
+
+from . import gl_utils
+
+#----------------------------------------------------------------------------
+
+def get_default_font():
+ url = 'http://fonts.gstatic.com/s/opensans/v17/mem8YaGs126MiZpBA-U1UpcaXcl0Aw.ttf' # Open Sans regular
+ return dnnlib.util.open_url(url, return_filename=True)
+
+#----------------------------------------------------------------------------
+
+@functools.lru_cache(maxsize=None)
+def get_pil_font(font=None, size=32):
+ if font is None:
+ font = get_default_font()
+ return PIL.ImageFont.truetype(font=font, size=size)
+
+#----------------------------------------------------------------------------
+
+def get_array(string, *, dropshadow_radius: int=None, **kwargs):
+ if dropshadow_radius is not None:
+ offset_x = int(np.ceil(dropshadow_radius*2/3))
+ offset_y = int(np.ceil(dropshadow_radius*2/3))
+ return _get_array_priv(string, dropshadow_radius=dropshadow_radius, offset_x=offset_x, offset_y=offset_y, **kwargs)
+ else:
+ return _get_array_priv(string, **kwargs)
+
+@functools.lru_cache(maxsize=10000)
+def _get_array_priv(
+ string: str, *,
+ size: int = 32,
+ max_width: Optional[int]=None,
+ max_height: Optional[int]=None,
+ min_size=10,
+ shrink_coef=0.8,
+ dropshadow_radius: int=None,
+ offset_x: int=None,
+ offset_y: int=None,
+ **kwargs
+):
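+ # Render the string at 'size' and, if it exceeds max_width / max_height, keep shrinking the
+ # font by 'shrink_coef' until it fits or the size reaches 'min_size'.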
+ cur_size = size
+ array = None
+ while True:
+ if dropshadow_radius is not None:
+ # separate implementation for dropshadow text rendering
+ array = _get_array_impl_dropshadow(string, size=cur_size, radius=dropshadow_radius, offset_x=offset_x, offset_y=offset_y, **kwargs)
+ else:
+ array = _get_array_impl(string, size=cur_size, **kwargs)
+ height, width, _ = array.shape
+ if (max_width is None or width <= max_width) and (max_height is None or height <= max_height) or (cur_size <= min_size):
+ break
+ cur_size = max(int(cur_size * shrink_coef), min_size)
+ return array
+
+#----------------------------------------------------------------------------
+
+@functools.lru_cache(maxsize=10000)
+def _get_array_impl(string, *, font=None, size=32, outline=0, outline_pad=3, outline_coef=3, outline_exp=2, line_pad: int=None):
+ pil_font = get_pil_font(font=font, size=size)
+ lines = [pil_font.getmask(line, 'L') for line in string.split('\n')]
+ lines = [np.array(line, dtype=np.uint8).reshape([line.size[1], line.size[0]]) for line in lines]
+ width = max(line.shape[1] for line in lines)
+ lines = [np.pad(line, ((0, 0), (0, width - line.shape[1])), mode='constant') for line in lines]
+ line_spacing = line_pad if line_pad is not None else size // 2
+ lines = [np.pad(line, ((0, line_spacing), (0, 0)), mode='constant') for line in lines[:-1]] + lines[-1:]
+ mask = np.concatenate(lines, axis=0)
+ alpha = mask
+ if outline > 0:
+ mask = np.pad(mask, int(np.ceil(outline * outline_pad)), mode='constant', constant_values=0)
+ alpha = mask.astype(np.float32) / 255
+ alpha = scipy.ndimage.gaussian_filter(alpha, outline)
+ alpha = 1 - np.maximum(1 - alpha * outline_coef, 0) ** outline_exp
+ alpha = (alpha * 255 + 0.5).clip(0, 255).astype(np.uint8)
+ alpha = np.maximum(alpha, mask)
+ return np.stack([mask, alpha], axis=-1)
+
+#----------------------------------------------------------------------------
+
+@functools.lru_cache(maxsize=10000)
+def _get_array_impl_dropshadow(string, *, font=None, size=32, radius: int, offset_x: int, offset_y: int, line_pad: int=None, **kwargs):
+ assert (offset_x > 0) and (offset_y > 0)
+ pil_font = get_pil_font(font=font, size=size)
+ lines = [pil_font.getmask(line, 'L') for line in string.split('\n')]
+ lines = [np.array(line, dtype=np.uint8).reshape([line.size[1], line.size[0]]) for line in lines]
+ width = max(line.shape[1] for line in lines)
+ lines = [np.pad(line, ((0, 0), (0, width - line.shape[1])), mode='constant') for line in lines]
+ line_spacing = line_pad if line_pad is not None else size // 2
+ lines = [np.pad(line, ((0, line_spacing), (0, 0)), mode='constant') for line in lines[:-1]] + lines[-1:]
+ mask = np.concatenate(lines, axis=0)
+ alpha = mask
+
+ mask = np.pad(mask, 2*radius + max(abs(offset_x), abs(offset_y)), mode='constant', constant_values=0)
+ alpha = mask.astype(np.float32) / 255
+ alpha = scipy.ndimage.gaussian_filter(alpha, radius)
+ alpha = 1 - np.maximum(1 - alpha * 1.5, 0) ** 1.4
+ alpha = (alpha * 255 + 0.5).clip(0, 255).astype(np.uint8)
+ alpha = np.pad(alpha, [(offset_y, 0), (offset_x, 0)], mode='constant')[:-offset_y, :-offset_x]
+ alpha = np.maximum(alpha, mask)
+ return np.stack([mask, alpha], axis=-1)
+
+#----------------------------------------------------------------------------
+
+@functools.lru_cache(maxsize=10000)
+def get_texture(string, bilinear=True, mipmap=True, **kwargs):
+ return gl_utils.Texture(image=get_array(string, **kwargs), bilinear=bilinear, mipmap=mipmap)
+
+#----------------------------------------------------------------------------
diff --git a/launcher.py b/launcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..5296a93c256b48ae43cf57eb7876e3fb3b3dfe68
--- /dev/null
+++ b/launcher.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import random, shlex, datetime
+import os, sys, subprocess, shutil
+from glob import iglob
+
+
+def copy_all_python_files(
+ source, snapshot_main_dir, code_snapshot_hash, recurse_dirs="fairseq"
+):
+ """
+ Copies the following files from source to destination:
+ a) all *.py files directly under the source directory.
+ b) all *.py, *.so, and *.yaml files found recursively under each directory in the
+ comma-separated recurse_dirs (default: "fairseq").
+ """
+ os.makedirs(snapshot_main_dir, exist_ok=True)
+ destination = os.path.join(snapshot_main_dir, code_snapshot_hash)
+ assert not os.path.exists(destination), "Code snapshot: {0} already exists".format(
+ code_snapshot_hash
+ )
+ os.makedirs(destination)
+
+ def all_pys(recurse_dirs):
+ yield from iglob(os.path.join(source, "*.py"))
+ for d in recurse_dirs.split(","):
+ yield from iglob(os.path.join(source, d, "**/*.py"), recursive=True)
+ yield from iglob(os.path.join(source, d, "**/*.so"), recursive=True)
+ yield from iglob(os.path.join(source, d, "**/*.yaml"), recursive=True)
+
+ for filepath in all_pys(recurse_dirs):
+ directory, filename = os.path.split(filepath)
+ if directory:
+ os.makedirs(os.path.join(destination, directory), exist_ok=True)
+ shutil.copy2(
+ os.path.join(source, filepath), os.path.join(destination, filepath)
+ )
+ return destination
+
+def launch_cluster(slurm_args, model_args):
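+ # Build the training command and either print the sbatch command ('dry-run'), run training
+ # locally ('local', optionally tee-ing stdout into the train log), or submit it to SLURM via
+ # sbatch and return the parsed job id (None if it cannot be parsed).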
+ # prepare
+ jobname = slurm_args.get('job-name', 'test')
+ if slurm_args.get('workplace') is not None:
+ os.makedirs(slurm_args['workplace'], exist_ok=True)
+ train_log = os.path.join(slurm_args['workplace'], 'train.%A.out')
+ train_stderr = os.path.join(slurm_args['workplace'], 'train.%A.stderr.%j')
+ else:
+ train_log = train_stderr = None
+ nodes, gpus = slurm_args.get('nodes', 1), slurm_args.get('gpus', 8)
+ if not slurm_args.get('local', False):
+ assert (train_log is not None) and (train_stderr is not None)
+ # parse slurm
+
+ destination = ""
+ # if slurm_args.get('workplace', None) is not None:
+ # # Currently hash is just the current time in ISO format.
+ # # Remove colons since they cannot be escaped in POSIX PATH env vars.
+ # code_snapshot_hash = datetime.datetime.now().isoformat().replace(":", "_")
+ # destination = copy_all_python_files(
+ # ".",
+ # os.path.join(slurm_args['workplace'], "slurm_snapshot_code"),
+ # code_snapshot_hash,
+ # 'fairseq',
+ # )
+ # os.environ["PYTHONPATH"] = destination + ":" + os.environ.get("PYTHONPATH", "")
+ # print('create snapshot at {}'.format(destination))
+
+ train_cmd = ['python', os.path.join(destination, 'run_train.py'), ]
+ train_cmd.extend([f'gpus={nodes * gpus}'])
+ train_cmd.extend([f'port={get_random_port()}'])
+ train_cmd += model_args
+
+ base_srun_cmd = [
+ 'srun',
+ '--job-name', jobname,
+ '--output', train_log,
+ '--error', train_stderr,
+ '--open-mode', 'append',
+ '--unbuffered',
+ ]
+ srun_cmd = base_srun_cmd + train_cmd
+ srun_cmd_str = ' '.join(map(shlex.quote, srun_cmd))
+ srun_cmd_str = srun_cmd_str + ' &'
+
+ sbatch_cmd = [
+ 'sbatch',
+ '--job-name', jobname,
+ '--partition', slurm_args.get('partition', 'learnfair'),
+ '--gres', 'gpu:volta:{}'.format(gpus),
+ '--nodes', str(nodes),
+ '--ntasks-per-node', '1',
+ '--cpus-per-task', '20',
+ '--output', train_log,
+ '--error', train_stderr,
+ '--open-mode', 'append',
+ '--signal', 'B:USR1@180',
+ '--time', slurm_args.get('time', '4320'),
+ '--mem', slurm_args.get('mem', '500gb'),
+ '--exclusive',
+ '--exclude', 'learnfair5035,learnfair5289,learnfair5088,learnfair5028,learnfair5032,learnfair5033,learnfair5056,learnfair5098,learnfair5122,learnfair5124,learnfair5156,learnfair5036,learnfair5258,learnfair5205,learnfair5201,learnfair5240,learnfair5087,learnfair5119,learnfair5246,learnfair7474,learnfair7585,learnfair5150,learnfair5166,learnfair5215,learnfair5142,learnfair5070,learnfair5236,learnfair7523'
+ ]
+ if 'constraint' in slurm_args:
+ sbatch_cmd += ['-C', slurm_args.get('constraint')]
+ if 'comment' in slurm_args:
+ sbatch_cmd += ['--comment', slurm_args.get('comment')]
+
+ wrapped_cmd = requeue_support() + '\n' + srun_cmd_str + ' \n wait $! \n sleep 610 & \n wait $!'
+ sbatch_cmd += ['--wrap', wrapped_cmd]
+ sbatch_cmd_str = ' '.join(map(shlex.quote, sbatch_cmd))
+
+ # start training
+ env = os.environ.copy()
+ env['OMP_NUM_THREADS'] = '2'
+ env['NCCL_SOCKET_IFNAME'] = ''
+
+ if env.get('SLURM_ARGS', None) is not None:
+ del env['SLURM_ARGS']
+
+ if nodes > 1:
+ env['NCCL_SOCKET_IFNAME'] = '^docker0,lo'
+ env['NCCL_DEBUG'] = 'INFO'
+
+ if slurm_args.get('dry-run', False):
+ print(sbatch_cmd_str)
+
+ elif slurm_args.get('local', False):
+ assert nodes == 1, 'distributed training cannot be combined with local'
+ if 'CUDA_VISIBLE_DEVICES' not in env:
+ env['CUDA_VISIBLE_DEVICES'] = ','.join(map(str, range(gpus)))
+ env['NCCL_DEBUG'] = 'INFO'
+
+ if train_log is not None:
+ train_proc = subprocess.Popen(train_cmd, env=env, stdout=subprocess.PIPE)
+ tee_proc = subprocess.Popen(['tee', '-a', train_log], stdin=train_proc.stdout)
+ train_proc.stdout.close()
+ train_proc.wait()
+ tee_proc.wait()
+ else:
+ train_proc = subprocess.Popen(train_cmd, env=env)
+ train_proc.wait()
+ else:
+ with open(train_log, 'a') as train_log_h:
+ print(f'running command: {sbatch_cmd_str}\n')
+ with subprocess.Popen(sbatch_cmd, stdout=subprocess.PIPE, env=env) as train_proc:
+ stdout = train_proc.stdout.read().decode('utf-8')
+ print(stdout, file=train_log_h)
+ try:
+ job_id = int(stdout.rstrip().split()[-1])
+ return job_id
+ except IndexError:
+ return None
+
+
+def launch(slurm_args, model_args):
+ job_id = launch_cluster(slurm_args, model_args)
+ if job_id is not None:
+ print('Launched {}'.format(job_id))
+ else:
+ print('Failed.')
+
+
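+# Bash prologue prepended to the sbatch --wrap command: the job requeues itself when SLURM
+# delivers USR1 (requested via --signal=B:USR1@180 above) and deliberately ignores SIGTERM.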
+def requeue_support():
+ return """
+ trap_handler () {
+ echo "Caught signal: " $1
+ # SIGTERM must be bypassed
+ if [ "$1" = "TERM" ]; then
+ echo "bypass sigterm"
+ else
+ # Submit a new job to the queue
+ echo "Requeuing " $SLURM_JOB_ID
+ scontrol requeue $SLURM_JOB_ID
+ fi
+ }
+
+
+ # Install signal handler
+ trap 'trap_handler USR1' USR1
+ trap 'trap_handler TERM' TERM
+ """
+
+
+def get_random_port():
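+ # Pick a random port while preserving the global RNG state, so any seeding done by the
+ # training code is not affected.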
+ old_state = random.getstate()
+ random.seed()
+ port = random.randint(10000, 20000)
+ random.setstate(old_state)
+ return port
diff --git a/legacy.py b/legacy.py
new file mode 100755
index 0000000000000000000000000000000000000000..4c25534aa034ba3eb221efd51d6498c2c35adf10
--- /dev/null
+++ b/legacy.py
@@ -0,0 +1,320 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import click
+import pickle
+import re
+import copy
+import numpy as np
+import torch
+import dnnlib
+from torch_utils import misc
+
+#----------------------------------------------------------------------------
+
+def load_network_pkl(f, force_fp16=False):
+ data = _LegacyUnpickler(f).load()
+
+ # Legacy TensorFlow pickle => convert.
+ if isinstance(data, tuple) and len(data) == 3 and all(isinstance(net, _TFNetworkStub) for net in data):
+ tf_G, tf_D, tf_Gs = data
+ G = convert_tf_generator(tf_G)
+ D = convert_tf_discriminator(tf_D)
+ G_ema = convert_tf_generator(tf_Gs)
+ data = dict(G=G, D=D, G_ema=G_ema)
+
+ # Add missing fields.
+ if 'training_set_kwargs' not in data:
+ data['training_set_kwargs'] = None
+ if 'augment_pipe' not in data:
+ data['augment_pipe'] = None
+
+ # Validate contents.
+ # assert isinstance(data['G'], torch.nn.Module)
+ # assert isinstance(data['D'], torch.nn.Module)
+ # assert isinstance(data['G_ema'], torch.nn.Module)
+ # assert isinstance(data['training_set_kwargs'], (dict, type(None)))
+ # assert isinstance(data['augment_pipe'], (torch.nn.Module, type(None)))
+
+ # Force FP16.
+ if force_fp16:
+ for key in ['G', 'D', 'G_ema']:
+ old = data[key]
+ kwargs = copy.deepcopy(old.init_kwargs)
+ if key.startswith('G'):
+ kwargs.synthesis_kwargs = dnnlib.EasyDict(kwargs.get('synthesis_kwargs', {}))
+ kwargs.synthesis_kwargs.num_fp16_res = 4
+ kwargs.synthesis_kwargs.conv_clamp = 256
+ if key.startswith('D'):
+ kwargs.num_fp16_res = 4
+ kwargs.conv_clamp = 256
+ if kwargs != old.init_kwargs:
+ new = type(old)(**kwargs).eval().requires_grad_(False)
+ misc.copy_params_and_buffers(old, new, require_all=True)
+ data[key] = new
+ return data
+
+#----------------------------------------------------------------------------
+
+class _TFNetworkStub(dnnlib.EasyDict):
+ pass
+
+class _LegacyUnpickler(pickle.Unpickler):
+ def find_class(self, module, name):
+ if module == 'dnnlib.tflib.network' and name == 'Network':
+ return _TFNetworkStub
+ return super().find_class(module, name)
+
+#----------------------------------------------------------------------------
+
+def _collect_tf_params(tf_net):
+ # pylint: disable=protected-access
+ tf_params = dict()
+ def recurse(prefix, tf_net):
+ for name, value in tf_net.variables:
+ tf_params[prefix + name] = value
+ for name, comp in tf_net.components.items():
+ recurse(prefix + name + '/', comp)
+ recurse('', tf_net)
+ return tf_params
+
+#----------------------------------------------------------------------------
+
+def _populate_module_params(module, *patterns):
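+ # 'patterns' is a flat sequence of (regex, value_fn) pairs; the first regex that fully matches
+ # a parameter/buffer name determines its value, with the regex groups passed to value_fn.
+ # A value_fn of None leaves the tensor untouched (e.g. the fixed resample filters).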
+ for name, tensor in misc.named_params_and_buffers(module):
+ found = False
+ value = None
+ for pattern, value_fn in zip(patterns[0::2], patterns[1::2]):
+ match = re.fullmatch(pattern, name)
+ if match:
+ found = True
+ if value_fn is not None:
+ value = value_fn(*match.groups())
+ break
+ try:
+ assert found
+ if value is not None:
+ tensor.copy_(torch.from_numpy(np.array(value)))
+ except:
+ print(name, list(tensor.shape))
+ raise
+
+#----------------------------------------------------------------------------
+
+def convert_tf_generator(tf_G):
+ if tf_G.version < 4:
+ raise ValueError('TensorFlow pickle version too low')
+
+ # Collect kwargs.
+ tf_kwargs = tf_G.static_kwargs
+ known_kwargs = set()
+ def kwarg(tf_name, default=None, none=None):
+ known_kwargs.add(tf_name)
+ val = tf_kwargs.get(tf_name, default)
+ return val if val is not None else none
+
+ # Convert kwargs.
+ kwargs = dnnlib.EasyDict(
+ z_dim = kwarg('latent_size', 512),
+ c_dim = kwarg('label_size', 0),
+ w_dim = kwarg('dlatent_size', 512),
+ img_resolution = kwarg('resolution', 1024),
+ img_channels = kwarg('num_channels', 3),
+ mapping_kwargs = dnnlib.EasyDict(
+ num_layers = kwarg('mapping_layers', 8),
+ embed_features = kwarg('label_fmaps', None),
+ layer_features = kwarg('mapping_fmaps', None),
+ activation = kwarg('mapping_nonlinearity', 'lrelu'),
+ lr_multiplier = kwarg('mapping_lrmul', 0.01),
+ w_avg_beta = kwarg('w_avg_beta', 0.995, none=1),
+ ),
+ synthesis_kwargs = dnnlib.EasyDict(
+ channel_base = kwarg('fmap_base', 16384) * 2,
+ channel_max = kwarg('fmap_max', 512),
+ num_fp16_res = kwarg('num_fp16_res', 0),
+ conv_clamp = kwarg('conv_clamp', None),
+ architecture = kwarg('architecture', 'skip'),
+ resample_filter = kwarg('resample_kernel', [1,3,3,1]),
+ use_noise = kwarg('use_noise', True),
+ activation = kwarg('nonlinearity', 'lrelu'),
+ ),
+ )
+
+ # Check for unknown kwargs.
+ kwarg('truncation_psi')
+ kwarg('truncation_cutoff')
+ kwarg('style_mixing_prob')
+ kwarg('structure')
+ unknown_kwargs = list(set(tf_kwargs.keys()) - known_kwargs)
+ if len(unknown_kwargs) > 0:
+ raise ValueError('Unknown TensorFlow kwarg', unknown_kwargs[0])
+
+ # Collect params.
+ tf_params = _collect_tf_params(tf_G)
+ for name, value in list(tf_params.items()):
+ match = re.fullmatch(r'ToRGB_lod(\d+)/(.*)', name)
+ if match:
+ r = kwargs.img_resolution // (2 ** int(match.group(1)))
+ tf_params[f'{r}x{r}/ToRGB/{match.group(2)}'] = value
+ kwargs.synthesis_kwargs.architecture = 'orig'
+ #for name, value in tf_params.items(): print(f'{name:<50s}{list(value.shape)}')
+
+ # Convert params.
+ from training import networks
+ G = networks.Generator(**kwargs).eval().requires_grad_(False)
+ # pylint: disable=unnecessary-lambda
+ _populate_module_params(G,
+ r'mapping\.w_avg', lambda: tf_params[f'dlatent_avg'],
+ r'mapping\.embed\.weight', lambda: tf_params[f'mapping/LabelEmbed/weight'].transpose(),
+ r'mapping\.embed\.bias', lambda: tf_params[f'mapping/LabelEmbed/bias'],
+ r'mapping\.fc(\d+)\.weight', lambda i: tf_params[f'mapping/Dense{i}/weight'].transpose(),
+ r'mapping\.fc(\d+)\.bias', lambda i: tf_params[f'mapping/Dense{i}/bias'],
+ r'synthesis\.b4\.const', lambda: tf_params[f'synthesis/4x4/Const/const'][0],
+ r'synthesis\.b4\.conv1\.weight', lambda: tf_params[f'synthesis/4x4/Conv/weight'].transpose(3, 2, 0, 1),
+ r'synthesis\.b4\.conv1\.bias', lambda: tf_params[f'synthesis/4x4/Conv/bias'],
+ r'synthesis\.b4\.conv1\.noise_const', lambda: tf_params[f'synthesis/noise0'][0, 0],
+ r'synthesis\.b4\.conv1\.noise_strength', lambda: tf_params[f'synthesis/4x4/Conv/noise_strength'],
+ r'synthesis\.b4\.conv1\.affine\.weight', lambda: tf_params[f'synthesis/4x4/Conv/mod_weight'].transpose(),
+ r'synthesis\.b4\.conv1\.affine\.bias', lambda: tf_params[f'synthesis/4x4/Conv/mod_bias'] + 1,
+ r'synthesis\.b(\d+)\.conv0\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/weight'][::-1, ::-1].transpose(3, 2, 0, 1),
+ r'synthesis\.b(\d+)\.conv0\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/bias'],
+ r'synthesis\.b(\d+)\.conv0\.noise_const', lambda r: tf_params[f'synthesis/noise{int(np.log2(int(r)))*2-5}'][0, 0],
+ r'synthesis\.b(\d+)\.conv0\.noise_strength', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/noise_strength'],
+ r'synthesis\.b(\d+)\.conv0\.affine\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/mod_weight'].transpose(),
+ r'synthesis\.b(\d+)\.conv0\.affine\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/mod_bias'] + 1,
+ r'synthesis\.b(\d+)\.conv1\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/weight'].transpose(3, 2, 0, 1),
+ r'synthesis\.b(\d+)\.conv1\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/bias'],
+ r'synthesis\.b(\d+)\.conv1\.noise_const', lambda r: tf_params[f'synthesis/noise{int(np.log2(int(r)))*2-4}'][0, 0],
+ r'synthesis\.b(\d+)\.conv1\.noise_strength', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/noise_strength'],
+ r'synthesis\.b(\d+)\.conv1\.affine\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/mod_weight'].transpose(),
+ r'synthesis\.b(\d+)\.conv1\.affine\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/mod_bias'] + 1,
+ r'synthesis\.b(\d+)\.torgb\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/ToRGB/weight'].transpose(3, 2, 0, 1),
+ r'synthesis\.b(\d+)\.torgb\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/ToRGB/bias'],
+ r'synthesis\.b(\d+)\.torgb\.affine\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/ToRGB/mod_weight'].transpose(),
+ r'synthesis\.b(\d+)\.torgb\.affine\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/ToRGB/mod_bias'] + 1,
+ r'synthesis\.b(\d+)\.skip\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Skip/weight'][::-1, ::-1].transpose(3, 2, 0, 1),
+ r'.*\.resample_filter', None,
+ )
+ return G
+
+#----------------------------------------------------------------------------
+
+def convert_tf_discriminator(tf_D):
+ if tf_D.version < 4:
+ raise ValueError('TensorFlow pickle version too low')
+
+ # Collect kwargs.
+ tf_kwargs = tf_D.static_kwargs
+ known_kwargs = set()
+ def kwarg(tf_name, default=None):
+ known_kwargs.add(tf_name)
+ return tf_kwargs.get(tf_name, default)
+
+ # Convert kwargs.
+ kwargs = dnnlib.EasyDict(
+ c_dim = kwarg('label_size', 0),
+ img_resolution = kwarg('resolution', 1024),
+ img_channels = kwarg('num_channels', 3),
+ architecture = kwarg('architecture', 'resnet'),
+ channel_base = kwarg('fmap_base', 16384) * 2,
+ channel_max = kwarg('fmap_max', 512),
+ num_fp16_res = kwarg('num_fp16_res', 0),
+ conv_clamp = kwarg('conv_clamp', None),
+ cmap_dim = kwarg('mapping_fmaps', None),
+ block_kwargs = dnnlib.EasyDict(
+ activation = kwarg('nonlinearity', 'lrelu'),
+ resample_filter = kwarg('resample_kernel', [1,3,3,1]),
+ freeze_layers = kwarg('freeze_layers', 0),
+ ),
+ mapping_kwargs = dnnlib.EasyDict(
+ num_layers = kwarg('mapping_layers', 0),
+ embed_features = kwarg('mapping_fmaps', None),
+ layer_features = kwarg('mapping_fmaps', None),
+ activation = kwarg('nonlinearity', 'lrelu'),
+ lr_multiplier = kwarg('mapping_lrmul', 0.1),
+ ),
+ epilogue_kwargs = dnnlib.EasyDict(
+ mbstd_group_size = kwarg('mbstd_group_size', None),
+ mbstd_num_channels = kwarg('mbstd_num_features', 1),
+ activation = kwarg('nonlinearity', 'lrelu'),
+ ),
+ )
+
+ # Check for unknown kwargs.
+ kwarg('structure')
+ unknown_kwargs = list(set(tf_kwargs.keys()) - known_kwargs)
+ if len(unknown_kwargs) > 0:
+ raise ValueError('Unknown TensorFlow kwarg', unknown_kwargs[0])
+
+ # Collect params.
+ tf_params = _collect_tf_params(tf_D)
+ for name, value in list(tf_params.items()):
+ match = re.fullmatch(r'FromRGB_lod(\d+)/(.*)', name)
+ if match:
+ r = kwargs.img_resolution // (2 ** int(match.group(1)))
+ tf_params[f'{r}x{r}/FromRGB/{match.group(2)}'] = value
+ kwargs.architecture = 'orig'
+ #for name, value in tf_params.items(): print(f'{name:<50s}{list(value.shape)}')
+
+ # Convert params.
+ from training import networks
+ D = networks.Discriminator(**kwargs).eval().requires_grad_(False)
+ # pylint: disable=unnecessary-lambda
+ _populate_module_params(D,
+ r'b(\d+)\.fromrgb\.weight', lambda r: tf_params[f'{r}x{r}/FromRGB/weight'].transpose(3, 2, 0, 1),
+ r'b(\d+)\.fromrgb\.bias', lambda r: tf_params[f'{r}x{r}/FromRGB/bias'],
+ r'b(\d+)\.conv(\d+)\.weight', lambda r, i: tf_params[f'{r}x{r}/Conv{i}{["","_down"][int(i)]}/weight'].transpose(3, 2, 0, 1),
+ r'b(\d+)\.conv(\d+)\.bias', lambda r, i: tf_params[f'{r}x{r}/Conv{i}{["","_down"][int(i)]}/bias'],
+ r'b(\d+)\.skip\.weight', lambda r: tf_params[f'{r}x{r}/Skip/weight'].transpose(3, 2, 0, 1),
+ r'mapping\.embed\.weight', lambda: tf_params[f'LabelEmbed/weight'].transpose(),
+ r'mapping\.embed\.bias', lambda: tf_params[f'LabelEmbed/bias'],
+ r'mapping\.fc(\d+)\.weight', lambda i: tf_params[f'Mapping{i}/weight'].transpose(),
+ r'mapping\.fc(\d+)\.bias', lambda i: tf_params[f'Mapping{i}/bias'],
+ r'b4\.conv\.weight', lambda: tf_params[f'4x4/Conv/weight'].transpose(3, 2, 0, 1),
+ r'b4\.conv\.bias', lambda: tf_params[f'4x4/Conv/bias'],
+ r'b4\.fc\.weight', lambda: tf_params[f'4x4/Dense0/weight'].transpose(),
+ r'b4\.fc\.bias', lambda: tf_params[f'4x4/Dense0/bias'],
+ r'b4\.out\.weight', lambda: tf_params[f'Output/weight'].transpose(),
+ r'b4\.out\.bias', lambda: tf_params[f'Output/bias'],
+ r'.*\.resample_filter', None,
+ )
+ return D
+
+#----------------------------------------------------------------------------
+
+@click.command()
+@click.option('--source', help='Input pickle', required=True, metavar='PATH')
+@click.option('--dest', help='Output pickle', required=True, metavar='PATH')
+@click.option('--force-fp16', help='Force the networks to use FP16', type=bool, default=False, metavar='BOOL', show_default=True)
+def convert_network_pickle(source, dest, force_fp16):
+ """Convert legacy network pickle into the native PyTorch format.
+
+ The tool can load the main network configurations exported using the TensorFlow version of StyleGAN2 or StyleGAN2-ADA.
+ It does not support e.g. StyleGAN2-ADA comparison methods, StyleGAN2 configs A-D, or StyleGAN1 networks.
+
+ Example:
+
+ \b
+ python legacy.py \\
+ --source=https://nvlabs-fi-cdn.nvidia.com/stylegan2/networks/stylegan2-cat-config-f.pkl \\
+ --dest=stylegan2-cat-config-f.pkl
+ """
+ print(f'Loading "{source}"...')
+ with dnnlib.util.open_url(source) as f:
+ data = load_network_pkl(f, force_fp16=force_fp16)
+ print(f'Saving "{dest}"...')
+ with open(dest, 'wb') as f:
+ pickle.dump(data, f)
+ print('Done.')
+
+#----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+ convert_network_pickle() # pylint: disable=no-value-for-parameter
+
+#----------------------------------------------------------------------------
diff --git a/metrics/__init__.py b/metrics/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..e1e1a5ba99e56a56ecaa14f7d4fa41777789c0cf
--- /dev/null
+++ b/metrics/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+# empty
diff --git a/metrics/frechet_inception_distance.py b/metrics/frechet_inception_distance.py
new file mode 100755
index 0000000000000000000000000000000000000000..7ba3a6fe166fe798e0870f5caa5d34010c640b6e
--- /dev/null
+++ b/metrics/frechet_inception_distance.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Frechet Inception Distance (FID) from the paper
+"GANs trained by a two time-scale update rule converge to a local Nash
+equilibrium". Matches the original implementation by Heusel et al. at
+https://github.com/bioinf-jku/TTUR/blob/master/fid.py"""
+
+import numpy as np
+import scipy.linalg
+from . import metric_utils
+
+#----------------------------------------------------------------------------
+
+def compute_fid(opts, max_real, num_gen):
+ # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
+ detector_url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/inception-2015-12-05.pt'
+ detector_kwargs = dict(return_features=True) # Return raw features before the softmax layer.
+ mu_real, sigma_real = metric_utils.compute_feature_stats_for_dataset(
+ opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs,
+ rel_lo=0, rel_hi=0, capture_mean_cov=True, max_items=max_real).get_mean_cov()
+
+ mu_gen, sigma_gen = metric_utils.compute_feature_stats_for_generator(
+ opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs,
+ rel_lo=0, rel_hi=1, capture_mean_cov=True, max_items=num_gen).get_mean_cov()
+
+ if opts.rank != 0:
+ return float('nan')
+
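+ # FID between the two Gaussians N(mu_real, sigma_real) and N(mu_gen, sigma_gen):
+ # ||mu_gen - mu_real||^2 + Tr(sigma_gen + sigma_real - 2 * sqrtm(sigma_gen @ sigma_real)).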
+ m = np.square(mu_gen - mu_real).sum()
+ s, _ = scipy.linalg.sqrtm(np.dot(sigma_gen, sigma_real), disp=False) # pylint: disable=no-member
+ fid = np.real(m + np.trace(sigma_gen + sigma_real - s * 2))
+ return float(fid)
+
+#----------------------------------------------------------------------------
diff --git a/metrics/inception_score.py b/metrics/inception_score.py
new file mode 100755
index 0000000000000000000000000000000000000000..4158667c73a4b84b9e3fa749af959ebdc8688411
--- /dev/null
+++ b/metrics/inception_score.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Inception Score (IS) from the paper "Improved techniques for training
+GANs". Matches the original implementation by Salimans et al. at
+https://github.com/openai/improved-gan/blob/master/inception_score/model.py"""
+
+import numpy as np
+from . import metric_utils
+
+#----------------------------------------------------------------------------
+
+def compute_is(opts, num_gen, num_splits):
+ # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
+ detector_url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/inception-2015-12-05.pt'
+ detector_kwargs = dict(no_output_bias=True) # Match the original implementation by not applying bias in the softmax layer.
+
+ gen_probs = metric_utils.compute_feature_stats_for_generator(
+ opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs,
+ capture_all=True, max_items=num_gen).get_all()
+
+ if opts.rank != 0:
+ return float('nan'), float('nan')
+
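+ # IS = exp(E_x[KL(p(y|x) || p(y))]), estimated independently on each of the num_splits splits;
+ # the mean and standard deviation over the splits are reported.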
+ scores = []
+ for i in range(num_splits):
+ part = gen_probs[i * num_gen // num_splits : (i + 1) * num_gen // num_splits]
+ kl = part * (np.log(part) - np.log(np.mean(part, axis=0, keepdims=True)))
+ kl = np.mean(np.sum(kl, axis=1))
+ scores.append(np.exp(kl))
+ return float(np.mean(scores)), float(np.std(scores))
+
+#----------------------------------------------------------------------------
diff --git a/metrics/kernel_inception_distance.py b/metrics/kernel_inception_distance.py
new file mode 100755
index 0000000000000000000000000000000000000000..12b4a95e6c628a45f0e8a618c7e943e04fa62d69
--- /dev/null
+++ b/metrics/kernel_inception_distance.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Kernel Inception Distance (KID) from the paper "Demystifying MMD
+GANs". Matches the original implementation by Binkowski et al. at
+https://github.com/mbinkowski/MMD-GAN/blob/master/gan/compute_scores.py"""
+
+import numpy as np
+from . import metric_utils
+
+#----------------------------------------------------------------------------
+
+def compute_kid(opts, max_real, num_gen, num_subsets, max_subset_size):
+ # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
+ detector_url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/inception-2015-12-05.pt'
+ detector_kwargs = dict(return_features=True) # Return raw features before the softmax layer.
+
+ real_features = metric_utils.compute_feature_stats_for_dataset(
+ opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs,
+ rel_lo=0, rel_hi=0, capture_all=True, max_items=max_real).get_all()
+
+ gen_features = metric_utils.compute_feature_stats_for_generator(
+ opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs,
+ rel_lo=0, rel_hi=1, capture_all=True, max_items=num_gen).get_all()
+
+ if opts.rank != 0:
+ return float('nan')
+
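+ # Unbiased MMD^2 estimate with the cubic polynomial kernel k(x, y) = (x @ y / n + 1) ** 3,
+ # where n is the feature dimension, averaged over 'num_subsets' random subsets of 'm' samples each.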
+ n = real_features.shape[1]
+ m = min(min(real_features.shape[0], gen_features.shape[0]), max_subset_size)
+ t = 0
+ for _subset_idx in range(num_subsets):
+ x = gen_features[np.random.choice(gen_features.shape[0], m, replace=False)]
+ y = real_features[np.random.choice(real_features.shape[0], m, replace=False)]
+ a = (x @ x.T / n + 1) ** 3 + (y @ y.T / n + 1) ** 3
+ b = (x @ y.T / n + 1) ** 3
+ t += (a.sum() - np.diag(a).sum()) / (m - 1) - b.sum() * 2 / m
+ kid = t / num_subsets / m
+ return float(kid)
+
+#----------------------------------------------------------------------------
diff --git a/metrics/metric_main.py b/metrics/metric_main.py
new file mode 100755
index 0000000000000000000000000000000000000000..7f658ae9e5bde8a5176e842ca5a9a618b00e0faa
--- /dev/null
+++ b/metrics/metric_main.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import os
+import time
+import json
+import torch
+import dnnlib
+
+from . import metric_utils
+from . import frechet_inception_distance
+from . import kernel_inception_distance
+from . import precision_recall
+from . import perceptual_path_length
+from . import inception_score
+
+#----------------------------------------------------------------------------
+
+_metric_dict = dict() # name => fn
+
+def register_metric(fn):
+ assert callable(fn)
+ _metric_dict[fn.__name__] = fn
+ return fn
+
+def is_valid_metric(metric):
+ return metric in _metric_dict
+
+def list_valid_metrics():
+ return list(_metric_dict.keys())
+
+#----------------------------------------------------------------------------
+
+def calc_metric(metric, **kwargs): # See metric_utils.MetricOptions for the full list of arguments.
+ assert is_valid_metric(metric)
+ opts = metric_utils.MetricOptions(**kwargs)
+
+ # Calculate.
+ start_time = time.time()
+ results = _metric_dict[metric](opts)
+ total_time = time.time() - start_time
+
+ # Broadcast results.
+ for key, value in list(results.items()):
+ if opts.num_gpus > 1:
+ value = torch.as_tensor(value, dtype=torch.float64, device=opts.device)
+ torch.distributed.broadcast(tensor=value, src=0)
+ value = float(value.cpu())
+ results[key] = value
+
+ # Decorate with metadata.
+ return dnnlib.EasyDict(
+ results = dnnlib.EasyDict(results),
+ metric = metric,
+ total_time = total_time,
+ total_time_str = dnnlib.util.format_time(total_time),
+ num_gpus = opts.num_gpus,
+ )
+
+#----------------------------------------------------------------------------
+
+def report_metric(result_dict, run_dir=None, snapshot_pkl=None):
+ metric = result_dict['metric']
+ assert is_valid_metric(metric)
+ if run_dir is not None and snapshot_pkl is not None:
+ snapshot_pkl = os.path.relpath(snapshot_pkl, run_dir)
+
+ jsonl_line = json.dumps(dict(result_dict, snapshot_pkl=snapshot_pkl, timestamp=time.time()))
+ print(jsonl_line)
+ if run_dir is not None and os.path.isdir(run_dir):
+ with open(os.path.join(run_dir, f'metric-{metric}.jsonl'), 'at') as f:
+ f.write(jsonl_line + '\n')
+
+#----------------------------------------------------------------------------
+# Primary metrics.
+
+@register_metric
+def fid50k_full(opts):
+ opts.dataset_kwargs.update(max_size=None, xflip=False)
+ fid = frechet_inception_distance.compute_fid(opts, max_real=None, num_gen=50000)
+ return dict(fid50k_full=fid)
+
+@register_metric
+def kid50k_full(opts):
+ opts.dataset_kwargs.update(max_size=None, xflip=False)
+ kid = kernel_inception_distance.compute_kid(opts, max_real=1000000, num_gen=50000, num_subsets=100, max_subset_size=1000)
+ return dict(kid50k_full=kid)
+
+@register_metric
+def pr50k3_full(opts):
+ opts.dataset_kwargs.update(max_size=None, xflip=False)
+ precision, recall = precision_recall.compute_pr(opts, max_real=200000, num_gen=50000, nhood_size=3, row_batch_size=10000, col_batch_size=10000)
+ return dict(pr50k3_full_precision=precision, pr50k3_full_recall=recall)
+
+@register_metric
+def ppl2_wend(opts):
+ ppl = perceptual_path_length.compute_ppl(opts, num_samples=50000, epsilon=1e-4, space='w', sampling='end', crop=False, batch_size=2)
+ return dict(ppl2_wend=ppl)
+
+@register_metric
+def is50k(opts):
+ opts.dataset_kwargs.update(max_size=None, xflip=False)
+ mean, std = inception_score.compute_is(opts, num_gen=50000, num_splits=10)
+ return dict(is50k_mean=mean, is50k_std=std)
+
+#----------------------------------------------------------------------------
+# Legacy metrics.
+
+@register_metric
+def fid50k(opts):
+ opts.dataset_kwargs.update(max_size=None)
+ fid = frechet_inception_distance.compute_fid(opts, max_real=50000, num_gen=50000)
+ return dict(fid50k=fid)
+
+@register_metric
+def kid50k(opts):
+ opts.dataset_kwargs.update(max_size=None)
+ kid = kernel_inception_distance.compute_kid(opts, max_real=50000, num_gen=50000, num_subsets=100, max_subset_size=1000)
+ return dict(kid50k=kid)
+
+@register_metric
+def pr50k3(opts):
+ opts.dataset_kwargs.update(max_size=None)
+ precision, recall = precision_recall.compute_pr(opts, max_real=50000, num_gen=50000, nhood_size=3, row_batch_size=10000, col_batch_size=10000)
+ return dict(pr50k3_precision=precision, pr50k3_recall=recall)
+
+@register_metric
+def ppl_zfull(opts):
+ ppl = perceptual_path_length.compute_ppl(opts, num_samples=50000, epsilon=1e-4, space='z', sampling='full', crop=True, batch_size=2)
+ return dict(ppl_zfull=ppl)
+
+@register_metric
+def ppl_wfull(opts):
+ ppl = perceptual_path_length.compute_ppl(opts, num_samples=50000, epsilon=1e-4, space='w', sampling='full', crop=True, batch_size=2)
+ return dict(ppl_wfull=ppl)
+
+@register_metric
+def ppl_zend(opts):
+ ppl = perceptual_path_length.compute_ppl(opts, num_samples=50000, epsilon=1e-4, space='z', sampling='end', crop=True, batch_size=2)
+ return dict(ppl_zend=ppl)
+
+@register_metric
+def ppl_wend(opts):
+ ppl = perceptual_path_length.compute_ppl(opts, num_samples=50000, epsilon=1e-4, space='w', sampling='end', crop=True, batch_size=2)
+ return dict(ppl_wend=ppl)
+
+#----------------------------------------------------------------------------
diff --git a/metrics/metric_utils.py b/metrics/metric_utils.py
new file mode 100755
index 0000000000000000000000000000000000000000..e48785f9ebeb784a695529515872ee30e9a4512e
--- /dev/null
+++ b/metrics/metric_utils.py
@@ -0,0 +1,305 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import os
+import time
+import hashlib
+import pickle
+import copy
+import uuid
+import numpy as np
+import torch
+import dnnlib
+import glob

+#----------------------------------------------------------------------------
+
+class MetricOptions:
+ def __init__(self, G=None, G_kwargs={}, dataset_kwargs={}, num_gpus=1, rank=0, device=None, progress=None, cache=True):
+ assert 0 <= rank < num_gpus
+ self.G = G
+ self.G_kwargs = dnnlib.EasyDict(G_kwargs)
+ self.dataset_kwargs = dnnlib.EasyDict(dataset_kwargs)
+ self.num_gpus = num_gpus
+ self.rank = rank
+ self.device = device if device is not None else torch.device('cuda', rank)
+ self.progress = progress.sub() if progress is not None and rank == 0 else ProgressMonitor()
+ self.cache = cache
+
+#----------------------------------------------------------------------------
+
+_feature_detector_cache = dict()
+
+def get_feature_detector_name(url):
+ return os.path.splitext(url.split('/')[-1])[0]
+
+def get_feature_detector(url, device=torch.device('cpu'), num_gpus=1, rank=0, verbose=False):
+ assert 0 <= rank < num_gpus
+ key = (url, device)
+ if key not in _feature_detector_cache:
+ is_leader = (rank == 0)
+ if not is_leader and num_gpus > 1:
+ torch.distributed.barrier() # leader goes first
+ with dnnlib.util.open_url(url, verbose=(verbose and is_leader)) as f:
+ _feature_detector_cache[key] = torch.jit.load(f).eval().to(device)
+ if is_leader and num_gpus > 1:
+ torch.distributed.barrier() # others follow
+ return _feature_detector_cache[key]
+
+#----------------------------------------------------------------------------
+
+class FeatureStats:
+ def __init__(self, capture_all=False, capture_mean_cov=False, max_items=None):
+ self.capture_all = capture_all
+ self.capture_mean_cov = capture_mean_cov
+ self.max_items = max_items
+ self.num_items = 0
+ self.num_features = None
+ self.all_features = None
+ self.raw_mean = None
+ self.raw_cov = None
+
+ def set_num_features(self, num_features):
+ if self.num_features is not None:
+ assert num_features == self.num_features
+ else:
+ self.num_features = num_features
+ self.all_features = []
+ self.raw_mean = np.zeros([num_features], dtype=np.float64)
+ self.raw_cov = np.zeros([num_features, num_features], dtype=np.float64)
+
+ def is_full(self):
+ return (self.max_items is not None) and (self.num_items >= self.max_items)
+
+ def append(self, x):
+ x = np.asarray(x, dtype=np.float32)
+ assert x.ndim == 2
+ if (self.max_items is not None) and (self.num_items + x.shape[0] > self.max_items):
+ if self.num_items >= self.max_items:
+ return
+ x = x[:self.max_items - self.num_items]
+
+ self.set_num_features(x.shape[1])
+ self.num_items += x.shape[0]
+ if self.capture_all:
+ self.all_features.append(x)
+ if self.capture_mean_cov:
+ x64 = x.astype(np.float64)
+ self.raw_mean += x64.sum(axis=0)
+ self.raw_cov += x64.T @ x64
+
+ def append_torch(self, x, num_gpus=1, rank=0):
+ assert isinstance(x, torch.Tensor) and x.ndim == 2
+ assert 0 <= rank < num_gpus
+ if num_gpus > 1:
+ ys = []
+ for src in range(num_gpus):
+ y = x.clone()
+ torch.distributed.broadcast(y, src=src)
+ ys.append(y)
+ x = torch.stack(ys, dim=1).flatten(0, 1) # interleave samples
+ self.append(x.cpu().numpy())
+
+ def get_all(self):
+ assert self.capture_all
+ return np.concatenate(self.all_features, axis=0)
+
+ def get_all_torch(self):
+ return torch.from_numpy(self.get_all())
+
+ def get_mean_cov(self):
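+ # Convert the accumulated raw sums into the sample mean and covariance
+ # (cov = E[x x^T] - mean mean^T), the statistics consumed by FID.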
+ assert self.capture_mean_cov
+ mean = self.raw_mean / self.num_items
+ cov = self.raw_cov / self.num_items
+ cov = cov - np.outer(mean, mean)
+ return mean, cov
+
+ def save(self, pkl_file):
+ with open(pkl_file, 'wb') as f:
+ pickle.dump(self.__dict__, f)
+
+ @staticmethod
+ def load(pkl_file):
+ with open(pkl_file, 'rb') as f:
+ s = dnnlib.EasyDict(pickle.load(f))
+ obj = FeatureStats(capture_all=s.capture_all, max_items=s.max_items)
+ obj.__dict__.update(s)
+ return obj
+
+#----------------------------------------------------------------------------
+
+class ProgressMonitor:
+ def __init__(self, tag=None, num_items=None, flush_interval=1000, verbose=False, progress_fn=None, pfn_lo=0, pfn_hi=1000, pfn_total=1000):
+ self.tag = tag
+ self.num_items = num_items
+ self.verbose = verbose
+ self.flush_interval = flush_interval
+ self.progress_fn = progress_fn
+ self.pfn_lo = pfn_lo
+ self.pfn_hi = pfn_hi
+ self.pfn_total = pfn_total
+ self.start_time = time.time()
+ self.batch_time = self.start_time
+ self.batch_items = 0
+ if self.progress_fn is not None:
+ self.progress_fn(self.pfn_lo, self.pfn_total)
+
+ def update(self, cur_items):
+ assert (self.num_items is None) or (cur_items <= self.num_items)
+ if (cur_items < self.batch_items + self.flush_interval) and (self.num_items is None or cur_items < self.num_items):
+ return
+ cur_time = time.time()
+ total_time = cur_time - self.start_time
+ time_per_item = (cur_time - self.batch_time) / max(cur_items - self.batch_items, 1)
+ if (self.verbose) and (self.tag is not None):
+ print(f'{self.tag:<19s} items {cur_items:<7d} time {dnnlib.util.format_time(total_time):<12s} ms/item {time_per_item*1e3:.2f}')
+ self.batch_time = cur_time
+ self.batch_items = cur_items
+
+ if (self.progress_fn is not None) and (self.num_items is not None):
+ self.progress_fn(self.pfn_lo + (self.pfn_hi - self.pfn_lo) * (cur_items / self.num_items), self.pfn_total)
+
+ def sub(self, tag=None, num_items=None, flush_interval=1000, rel_lo=0, rel_hi=1):
+ return ProgressMonitor(
+ tag = tag,
+ num_items = num_items,
+ flush_interval = flush_interval,
+ verbose = self.verbose,
+ progress_fn = self.progress_fn,
+ pfn_lo = self.pfn_lo + (self.pfn_hi - self.pfn_lo) * rel_lo,
+ pfn_hi = self.pfn_lo + (self.pfn_hi - self.pfn_lo) * rel_hi,
+ pfn_total = self.pfn_total,
+ )
+
+#----------------------------------------------------------------------------
+
+def compute_feature_stats_for_dataset(opts, detector_url, detector_kwargs, rel_lo=0, rel_hi=1, batch_size=64, data_loader_kwargs=None, max_items=None, **stats_kwargs):
+ dataset = dnnlib.util.construct_class_by_name(**opts.dataset_kwargs)
+ if data_loader_kwargs is None:
+ data_loader_kwargs = dict(pin_memory=True, num_workers=3, prefetch_factor=2)
+
+ # Try to lookup from cache.
+ cache_file = None
+ if opts.cache:
+ # Choose cache file name.
+ args = dict(dataset_kwargs=opts.dataset_kwargs, detector_url=detector_url, detector_kwargs=detector_kwargs, stats_kwargs=stats_kwargs)
+ md5 = hashlib.md5(repr(sorted(args.items())).encode('utf-8'))
+ cache_tag = f'{dataset.name}-{get_feature_detector_name(detector_url)}-{md5.hexdigest()}'
+ cache_file = dnnlib.make_cache_dir_path('gan-metrics', cache_tag + '.pkl')
+
+ # Check if the file exists (all processes must agree).
+ flag = os.path.isfile(cache_file) if opts.rank == 0 else False
+ if opts.num_gpus > 1:
+ flag = torch.as_tensor(flag, dtype=torch.float32, device=opts.device)
+ torch.distributed.broadcast(tensor=flag, src=0)
+ flag = (float(flag.cpu()) != 0)
+
+ # Load.
+ if flag:
+ return FeatureStats.load(cache_file)
+
+ # Initialize.
+ num_items = len(dataset)
+ if max_items is not None:
+ num_items = min(num_items, max_items)
+ stats = FeatureStats(max_items=num_items, **stats_kwargs)
+ progress = opts.progress.sub(tag='dataset features', num_items=num_items, rel_lo=rel_lo, rel_hi=rel_hi)
+ detector = get_feature_detector(url=detector_url, device=opts.device, num_gpus=opts.num_gpus, rank=opts.rank, verbose=progress.verbose)
+
+ # Main loop.
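+ # Shard dataset items round-robin across GPUs; each rank processes every num_gpus-th
+ # item and the per-rank features are re-interleaved in append_torch().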
+ item_subset = [(i * opts.num_gpus + opts.rank) % num_items for i in range((num_items - 1) // opts.num_gpus + 1)]
+ for images, _labels, _indices in torch.utils.data.DataLoader(dataset=dataset, sampler=item_subset, batch_size=batch_size, **data_loader_kwargs):
+ if images.shape[1] == 1:
+ images = images.repeat([1, 3, 1, 1])
+ features = detector(images.to(opts.device), **detector_kwargs)
+ stats.append_torch(features, num_gpus=opts.num_gpus, rank=opts.rank)
+ progress.update(stats.num_items)
+
+ # Save to cache.
+ if cache_file is not None and opts.rank == 0:
+ os.makedirs(os.path.dirname(cache_file), exist_ok=True)
+ temp_file = cache_file + '.' + uuid.uuid4().hex
+ stats.save(temp_file)
+ os.replace(temp_file, cache_file) # atomic
+ return stats
+
+#----------------------------------------------------------------------------
+
+def compute_feature_stats_for_generator(opts, detector_url, detector_kwargs, rel_lo=0, rel_hi=1, batch_size=64, batch_gen=None, jit=False, **stats_kwargs):
+ if batch_gen is None:
+ batch_gen = min(batch_size, 4)
+ assert batch_size % batch_gen == 0
+
+ # Setup generator and load labels.
+ G = copy.deepcopy(opts.G).eval().requires_grad_(False).to(opts.device)
+ dataset = dnnlib.util.construct_class_by_name(**opts.dataset_kwargs)
+
+ # Image generation func.
+ def run_generator(z, c):
+ if hasattr(G, 'get_final_output'):
+ img = G.get_final_output(z=z, c=c, **opts.G_kwargs)
+ else:
+ img = G(z=z, c=c, **opts.G_kwargs)
+ img = (img * 127.5 + 128).clamp(0, 255).to(torch.uint8)
+ return img
+
+ # JIT.
+ if jit:
+ z = torch.zeros([batch_gen, G.z_dim], device=opts.device)
+ c = torch.zeros([batch_gen, G.c_dim], device=opts.device)
+ run_generator = torch.jit.trace(run_generator, [z, c], check_trace=False)
+
+ # Initialize.
+ stats = FeatureStats(**stats_kwargs)
+ assert stats.max_items is not None
+ progress = opts.progress.sub(tag='generator features', num_items=stats.max_items, rel_lo=rel_lo, rel_hi=rel_hi)
+ detector = get_feature_detector(url=detector_url, device=opts.device, num_gpus=opts.num_gpus, rank=opts.rank, verbose=progress.verbose)
+
+ # Main loop.
+ while not stats.is_full():
+ images = []
+ for _i in range(batch_size // batch_gen):
+ z = torch.randn([batch_gen, G.z_dim], device=opts.device)
+ c = [dataset.get_label(np.random.randint(len(dataset))) for _i in range(batch_gen)]
+ c = torch.from_numpy(np.stack(c)).pin_memory().to(opts.device)
+ img = run_generator(z, c)
+ images.append(img)
+ images = torch.cat(images)
+
+ if images.shape[1] == 1:
+ images = images.repeat([1, 3, 1, 1])
+ features = detector(images, **detector_kwargs)
+ stats.append_torch(features, num_gpus=opts.num_gpus, rank=opts.rank)
+ progress.update(stats.num_items)
+ return stats
+
+#----------------------------------------------------------------------------
diff --git a/metrics/perceptual_path_length.py b/metrics/perceptual_path_length.py
new file mode 100755
index 0000000000000000000000000000000000000000..8d2c3a44aececa58a7c5602e14a24d424e51bf14
--- /dev/null
+++ b/metrics/perceptual_path_length.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Perceptual Path Length (PPL) from the paper "A Style-Based Generator
+Architecture for Generative Adversarial Networks". Matches the original
+implementation by Karras et al. at
+https://github.com/NVlabs/stylegan/blob/master/metrics/perceptual_path_length.py"""
+
+import copy
+import numpy as np
+import torch
+import dnnlib
+from . import metric_utils
+
+#----------------------------------------------------------------------------
+
+# Spherical interpolation of a batch of vectors.
+def slerp(a, b, t):
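+ # Normalize both endpoints, measure the angle between them via the dot product, and
+ # rotate a towards b by a fraction t of that angle within their common plane.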
+ a = a / a.norm(dim=-1, keepdim=True)
+ b = b / b.norm(dim=-1, keepdim=True)
+ d = (a * b).sum(dim=-1, keepdim=True)
+ p = t * torch.acos(d)
+ c = b - d * a
+ c = c / c.norm(dim=-1, keepdim=True)
+ d = a * torch.cos(p) + c * torch.sin(p)
+ d = d / d.norm(dim=-1, keepdim=True)
+ return d
+
+#----------------------------------------------------------------------------
+
+class PPLSampler(torch.nn.Module):
+ def __init__(self, G, G_kwargs, epsilon, space, sampling, crop, vgg16):
+ assert space in ['z', 'w']
+ assert sampling in ['full', 'end']
+ super().__init__()
+ self.G = copy.deepcopy(G)
+ self.G_kwargs = G_kwargs
+ self.epsilon = epsilon
+ self.space = space
+ self.sampling = sampling
+ self.crop = crop
+ self.vgg16 = copy.deepcopy(vgg16)
+
+ def forward(self, c):
+ # Generate random latents and interpolation t-values.
+ t = torch.rand([c.shape[0]], device=c.device) * (1 if self.sampling == 'full' else 0)
+ z0, z1 = torch.randn([c.shape[0] * 2, self.G.z_dim], device=c.device).chunk(2)
+
+ # Interpolate in W or Z.
+ if self.space == 'w':
+ w0, w1 = self.G.mapping(z=torch.cat([z0,z1]), c=torch.cat([c,c])).chunk(2)
+ wt0 = w0.lerp(w1, t.unsqueeze(1).unsqueeze(2))
+ wt1 = w0.lerp(w1, t.unsqueeze(1).unsqueeze(2) + self.epsilon)
+ else: # space == 'z'
+ zt0 = slerp(z0, z1, t.unsqueeze(1))
+ zt1 = slerp(z0, z1, t.unsqueeze(1) + self.epsilon)
+ wt0, wt1 = self.G.mapping(z=torch.cat([zt0,zt1]), c=torch.cat([c,c])).chunk(2)
+
+ # Randomize noise buffers.
+ for name, buf in self.G.named_buffers():
+ if name.endswith('.noise_const'):
+ buf.copy_(torch.randn_like(buf))
+
+ # Generate images.
+ img = self.G.synthesis(ws=torch.cat([wt0,wt1]), noise_mode='const', force_fp32=True, **self.G_kwargs)
+
+ # Center crop.
+ if self.crop:
+ assert img.shape[2] == img.shape[3]
+ c = img.shape[2] // 8
+ img = img[:, :, c*3 : c*7, c*2 : c*6]
+
+ # Downsample to 256x256.
+ factor = self.G.img_resolution // 256
+ if factor > 1:
+ img = img.reshape([-1, img.shape[1], img.shape[2] // factor, factor, img.shape[3] // factor, factor]).mean([3, 5])
+
+ # Scale dynamic range from [-1,1] to [0,255].
+ img = (img + 1) * (255 / 2)
+ if self.G.img_channels == 1:
+ img = img.repeat([1, 3, 1, 1])
+
+ # Evaluate differential LPIPS.
+ lpips_t0, lpips_t1 = self.vgg16(img, resize_images=False, return_lpips=True).chunk(2)
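+ # Finite-difference PPL: squared LPIPS distance between the two images generated from
+ # latents epsilon apart, scaled by 1/epsilon^2.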
+ dist = (lpips_t0 - lpips_t1).square().sum(1) / self.epsilon ** 2
+ return dist
+
+#----------------------------------------------------------------------------
+
+def compute_ppl(opts, num_samples, epsilon, space, sampling, crop, batch_size, jit=False):
+ dataset = dnnlib.util.construct_class_by_name(**opts.dataset_kwargs)
+ vgg16_url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/vgg16.pt'
+ vgg16 = metric_utils.get_feature_detector(vgg16_url, num_gpus=opts.num_gpus, rank=opts.rank, verbose=opts.progress.verbose)
+
+ # Setup sampler.
+ sampler = PPLSampler(G=opts.G, G_kwargs=opts.G_kwargs, epsilon=epsilon, space=space, sampling=sampling, crop=crop, vgg16=vgg16)
+ sampler.eval().requires_grad_(False).to(opts.device)
+ if jit:
+ c = torch.zeros([batch_size, opts.G.c_dim], device=opts.device)
+ sampler = torch.jit.trace(sampler, [c], check_trace=False)
+
+ # Sampling loop.
+ dist = []
+ progress = opts.progress.sub(tag='ppl sampling', num_items=num_samples)
+ for batch_start in range(0, num_samples, batch_size * opts.num_gpus):
+ progress.update(batch_start)
+ c = [dataset.get_label(np.random.randint(len(dataset))) for _i in range(batch_size)]
+ c = torch.from_numpy(np.stack(c)).pin_memory().to(opts.device)
+ x = sampler(c)
+ for src in range(opts.num_gpus):
+ y = x.clone()
+ if opts.num_gpus > 1:
+ torch.distributed.broadcast(y, src=src)
+ dist.append(y)
+ progress.update(num_samples)
+
+ # Compute PPL.
+ if opts.rank != 0:
+ return float('nan')
+ dist = torch.cat(dist)[:num_samples].cpu().numpy()
+ lo = np.percentile(dist, 1, interpolation='lower')
+ hi = np.percentile(dist, 99, interpolation='higher')
+ ppl = np.extract(np.logical_and(dist >= lo, dist <= hi), dist).mean()
+ return float(ppl)
+
+#----------------------------------------------------------------------------
diff --git a/metrics/precision_recall.py b/metrics/precision_recall.py
new file mode 100755
index 0000000000000000000000000000000000000000..9b4b98574f9cf8d23ac14831471db2e1021ba501
--- /dev/null
+++ b/metrics/precision_recall.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Precision/Recall (PR) from the paper "Improved Precision and Recall
+Metric for Assessing Generative Models". Matches the original implementation
+by Kynkaanniemi et al. at
+https://github.com/kynkaat/improved-precision-and-recall-metric/blob/master/precision_recall.py"""
+
+import torch
+from . import metric_utils
+
+#----------------------------------------------------------------------------
+
+def compute_distances(row_features, col_features, num_gpus, rank, col_batch_size):
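+ # Pairwise distances between row_features and all column features, computed in column
+ # batches that are sharded round-robin across GPUs and gathered back on rank 0.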
+ assert 0 <= rank < num_gpus
+ num_cols = col_features.shape[0]
+ num_batches = ((num_cols - 1) // col_batch_size // num_gpus + 1) * num_gpus
+ col_batches = torch.nn.functional.pad(col_features, [0, 0, 0, -num_cols % num_batches]).chunk(num_batches)
+ dist_batches = []
+ for col_batch in col_batches[rank :: num_gpus]:
+ dist_batch = torch.cdist(row_features.unsqueeze(0), col_batch.unsqueeze(0))[0]
+ for src in range(num_gpus):
+ dist_broadcast = dist_batch.clone()
+ if num_gpus > 1:
+ torch.distributed.broadcast(dist_broadcast, src=src)
+ dist_batches.append(dist_broadcast.cpu() if rank == 0 else None)
+ return torch.cat(dist_batches, dim=1)[:, :num_cols] if rank == 0 else None
+
+#----------------------------------------------------------------------------
+
+def compute_pr(opts, max_real, num_gen, nhood_size, row_batch_size, col_batch_size):
+ detector_url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/vgg16.pt'
+ detector_kwargs = dict(return_features=True)
+
+ real_features = metric_utils.compute_feature_stats_for_dataset(
+ opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs,
+ rel_lo=0, rel_hi=0, capture_all=True, max_items=max_real).get_all_torch().to(torch.float16).to(opts.device)
+
+ gen_features = metric_utils.compute_feature_stats_for_generator(
+ opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs,
+ rel_lo=0, rel_hi=1, capture_all=True, max_items=num_gen).get_all_torch().to(torch.float16).to(opts.device)
+
+ results = dict()
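+ # Precision: fraction of generated samples that fall within the k-NN radius (k = nhood_size)
+ # of some real sample; recall: the same with the roles of real and generated features swapped.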
+ for name, manifold, probes in [('precision', real_features, gen_features), ('recall', gen_features, real_features)]:
+ kth = []
+ for manifold_batch in manifold.split(row_batch_size):
+ dist = compute_distances(row_features=manifold_batch, col_features=manifold, num_gpus=opts.num_gpus, rank=opts.rank, col_batch_size=col_batch_size)
+ kth.append(dist.to(torch.float32).kthvalue(nhood_size + 1).values.to(torch.float16) if opts.rank == 0 else None)
+ kth = torch.cat(kth) if opts.rank == 0 else None
+ pred = []
+ for probes_batch in probes.split(row_batch_size):
+ dist = compute_distances(row_features=probes_batch, col_features=manifold, num_gpus=opts.num_gpus, rank=opts.rank, col_batch_size=col_batch_size)
+ pred.append((dist <= kth).any(dim=1) if opts.rank == 0 else None)
+ results[name] = float(torch.cat(pred).to(torch.float32).mean() if opts.rank == 0 else 'nan')
+ return results['precision'], results['recall']
+
+#----------------------------------------------------------------------------
diff --git a/renderer.py b/renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..54d497b6c49492b1c62c83c81852af6505e1871e
--- /dev/null
+++ b/renderer.py
@@ -0,0 +1,322 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+
+"""Wrap the generator to render a sequence of images"""
+import torch
+import torch.nn.functional as F
+import numpy as np
+import tqdm
+import copy
+import trimesh
+
+
+class Renderer(object):
+
+ def __init__(self, generator, discriminator=None, program=None):
+ self.generator = generator
+ self.discriminator = discriminator
+ self.sample_tmp = 0.65
+ self.program = program
+ self.seed = 0
+
+ if (program is not None) and (len(program.split(':')) == 2):
+ from training.dataset import ImageFolderDataset
+ self.image_data = ImageFolderDataset(program.split(':')[1])
+ self.program = program.split(':')[0]
+ else:
+ self.image_data = None
+
+ def set_random_seed(self, seed):
+ self.seed = seed
+ torch.manual_seed(seed)
+ np.random.seed(seed)
+
+ def __call__(self, *args, **kwargs):
+ self.generator.eval() # eval mode...
+
+ if self.program is None:
+ if hasattr(self.generator, 'get_final_output'):
+ return self.generator.get_final_output(*args, **kwargs)
+ return self.generator(*args, **kwargs)
+
+ if self.image_data is not None:
+ batch_size = 1
+ indices = (np.random.rand(batch_size) * len(self.image_data)).tolist()
+ rimages = np.stack([self.image_data._load_raw_image(int(i)) for i in indices], 0)
+ rimages = torch.from_numpy(rimages).float().to(kwargs['z'].device) / 127.5 - 1
+ kwargs['img'] = rimages
+
+ outputs = getattr(self, f"render_{self.program}")(*args, **kwargs)
+
+ if self.image_data is not None:
+ imgs = outputs if not isinstance(outputs, tuple) else outputs[0]
+ size = imgs[0].size(-1)
+ rimg = F.interpolate(rimages, (size, size), mode='bicubic', align_corners=False)
+ imgs = [torch.cat([img, rimg], 0) for img in imgs]
+ outputs = imgs if not isinstance(outputs, tuple) else (imgs, outputs[1])
+ return outputs
+
+ def get_additional_params(self, ws, t=0):
+ gen = self.generator.synthesis
+ batch_size = ws.size(0)
+
+ kwargs = {}
+ if not hasattr(gen, 'get_latent_codes'):
+ return kwargs
+
+ s_val, t_val, r_val = [[0, 0, 0]], [[0.5, 0.5, 0.5]], [0.]
+ # kwargs["transformations"] = gen.get_transformations(batch_size=batch_size, mode=[s_val, t_val, r_val], device=ws.device)
+ # kwargs["bg_rotation"] = gen.get_bg_rotation(batch_size, device=ws.device)
+ # kwargs["light_dir"] = gen.get_light_dir(batch_size, device=ws.device)
+ kwargs["latent_codes"] = gen.get_latent_codes(batch_size, tmp=self.sample_tmp, device=ws.device)
+ kwargs["camera_matrices"] = self.get_camera_traj(t, ws.size(0), device=ws.device)
+ return kwargs
+
+ def get_camera_traj(self, t, batch_size=1, traj_type='pigan', device='cpu'):
+ gen = self.generator.synthesis
+ if traj_type == 'pigan':
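+ # pi-GAN style trajectory: pitch and yaw oscillate sinusoidally around the frontal view
+ # and are then normalized into the generator's (u, v) camera ranges.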
+ range_u, range_v = gen.C.range_u, gen.C.range_v
+ pitch = 0.2 * np.cos(t * 2 * np.pi) + np.pi/2
+ yaw = 0.4 * np.sin(t * 2 * np.pi)
+ u = (yaw - range_u[0]) / (range_u[1] - range_u[0])
+ v = (pitch - range_v[0]) / (range_v[1] - range_v[0])
+ cam = gen.get_camera(batch_size=batch_size, mode=[u, v, 0.5], device=device)
+ else:
+ raise NotImplementedError
+ return cam
+
+ def render_rotation_camera(self, *args, **kwargs):
+ batch_size, n_steps = 2, kwargs["n_steps"]
+ gen = self.generator.synthesis
+
+ if 'img' not in kwargs:
+ ws = self.generator.mapping(*args, **kwargs)
+ else:
+ ws, _ = self.generator.encoder(kwargs['img'])
+ # ws = ws.repeat(batch_size, 1, 1)
+
+ # kwargs["not_render_background"] = True
+ if hasattr(gen, 'get_latent_codes'):
+ kwargs["latent_codes"] = gen.get_latent_codes(batch_size, tmp=self.sample_tmp, device=ws.device)
+ kwargs.pop('img', None)
+
+ out = []
+ cameras = []
+ relative_range_u = kwargs['relative_range_u']
+ u_samples = np.linspace(relative_range_u[0], relative_range_u[1], n_steps)
+ for step in tqdm.tqdm(range(n_steps)):
+ # Set Camera
+ u = u_samples[step]
+ kwargs["camera_matrices"] = gen.get_camera(batch_size=batch_size, mode=[u, 0.5, 0.5], device=ws.device)
+ cameras.append(gen.get_camera(batch_size=batch_size, mode=[u, 0.5, 0.5], device=ws.device))
+ with torch.no_grad():
+ out_i = gen(ws, **kwargs)
+ if isinstance(out_i, dict):
+ out_i = out_i['img']
+ out.append(out_i)
+
+ if 'return_cameras' in kwargs and kwargs["return_cameras"]:
+ return out, cameras
+ else:
+ return out
+
+ def render_rotation_camera3(self, styles=None, *args, **kwargs):
+ gen = self.generator.synthesis
+ n_steps = 36 # 120
+
+ if styles is None:
+ batch_size = 2
+ if 'img' not in kwargs:
+ ws = self.generator.mapping(*args, **kwargs)
+ else:
+ ws = self.generator.encoder(kwargs['img'])['ws']
+ # ws = ws.repeat(batch_size, 1, 1)
+ else:
+ ws = styles
+ batch_size = ws.size(0)
+
+ # kwargs["not_render_background"] = True
+ # Get Random codes and bg rotation
+ self.sample_tmp = 0.72
+ if hasattr(gen, 'get_latent_codes'):
+ kwargs["latent_codes"] = gen.get_latent_codes(batch_size, tmp=self.sample_tmp, device=ws.device)
+ kwargs.pop('img', None)
+
+ # if getattr(gen, "use_noise", False):
+ # from dnnlib.geometry import extract_geometry
+ # kwargs['meshes'] = {}
+ # low_res, high_res = gen.resolution_vol, gen.img_resolution
+ # res = low_res * 2
+ # while res <= high_res:
+ # kwargs['meshes'][res] = [trimesh.Trimesh(*extract_geometry(gen, ws, resolution=res, threshold=30.))]
+ # kwargs['meshes'][res] += [
+ # torch.randn(len(kwargs['meshes'][res][0].vertices),
+ # 2, device=ws.device)[kwargs['meshes'][res][0].faces]]
+ # res = res * 2
+ # if getattr(gen, "use_noise", False):
+ # kwargs['voxel_noise'] = gen.get_voxel_field(styles=ws, n_vols=2048, return_noise=True, sphere_noise=True)
+ # if getattr(gen, "use_voxel_noise", False):
+ # kwargs['voxel_noise'] = gen.get_voxel_field(styles=ws, n_vols=128, return_noise=True)
+ kwargs['noise_mode'] = 'const'
+
+ out = []
+ tspace = np.linspace(0, 1, n_steps)
+ range_u, range_v = gen.C.range_u, gen.C.range_v
+
+ for step in tqdm.tqdm(range(n_steps)):
+ t = tspace[step]
+ pitch = 0.2 * np.cos(t * 2 * np.pi) + np.pi/2
+ yaw = 0.4 * np.sin(t * 2 * np.pi)
+ u = (yaw - range_u[0]) / (range_u[1] - range_u[0])
+ v = (pitch - range_v[0]) / (range_v[1] - range_v[0])
+
+ kwargs["camera_matrices"] = gen.get_camera(
+ batch_size=batch_size, mode=[u, v, t], device=ws.device)
+
+ with torch.no_grad():
+ out_i = gen(ws, **kwargs)
+ if isinstance(out_i, dict):
+ out_i = out_i['img']
+ out.append(out_i)
+ return out
+
+ def render_rotation_both(self, *args, **kwargs):
+ gen = self.generator.synthesis
+ batch_size, n_steps = 1, 36
+ if 'img' not in kwargs:
+ ws = self.generator.mapping(*args, **kwargs)
+ else:
+ ws, _ = self.generator.encoder(kwargs['img'])
+ ws = ws.repeat(batch_size, 1, 1)
+
+ # kwargs["not_render_background"] = True
+ # Get Random codes and bg rotation
+ kwargs["latent_codes"] = gen.get_latent_codes(batch_size, tmp=self.sample_tmp, device=ws.device)
+ kwargs.pop('img', None)
+
+ out = []
+ tspace = np.linspace(0, 1, n_steps)
+ range_u, range_v = gen.C.range_u, gen.C.range_v
+
+ for step in tqdm.tqdm(range(n_steps)):
+ t = tspace[step]
+ pitch = 0.2 * np.cos(t * 2 * np.pi) + np.pi/2
+ yaw = 0.4 * np.sin(t * 2 * np.pi)
+ u = (yaw - range_u[0]) / (range_u[1] - range_u[0])
+ v = (pitch - range_v[0]) / (range_v[1] - range_v[0])
+
+ kwargs["camera_matrices"] = gen.get_camera(
+ batch_size=batch_size, mode=[u, v, 0.5], device=ws.device)
+
+ with torch.no_grad():
+ out_i = gen(ws, **kwargs)
+ if isinstance(out_i, dict):
+ out_i = out_i['img']
+
+ kwargs_n = copy.deepcopy(kwargs)
+ kwargs_n.update({'render_option': 'early,no_background,up64,depth,normal'})
+ out_n = gen(ws, **kwargs_n)
+ out_n = F.interpolate(out_n,
+ size=(out_i.size(-1), out_i.size(-1)),
+ mode='bicubic', align_corners=True)
+ out_i = torch.cat([out_i, out_n], 0)
+ out.append(out_i)
+ return out
+
+ def render_rotation_grid(self, styles=None, return_cameras=False, *args, **kwargs):
+ gen = self.generator.synthesis
+ if styles is None:
+ batch_size = 1
+ ws = self.generator.mapping(*args, **kwargs)
+ ws = ws.repeat(batch_size, 1, 1)
+ else:
+ ws = styles
+ batch_size = ws.size(0)
+
+ kwargs["latent_codes"] = gen.get_latent_codes(batch_size, tmp=self.sample_tmp, device=ws.device)
+ kwargs.pop('img', None)
+
+ if getattr(gen, "use_voxel_noise", False):
+ kwargs['voxel_noise'] = gen.get_voxel_field(styles=ws, n_vols=128, return_noise=True)
+
+ out = []
+ cameras = []
+ range_u, range_v = gen.C.range_u, gen.C.range_v
+
+ a_steps, b_steps = 6, 3
+ aspace = np.linspace(-0.4, 0.4, a_steps)
+ bspace = np.linspace(-0.2, 0.2, b_steps) * -1
+ for b in tqdm.tqdm(range(b_steps)):
+ for a in range(a_steps):
+ t_a = aspace[a]
+ t_b = bspace[b]
+ camera_mat = gen.camera_matrix.repeat(batch_size, 1, 1).to(ws.device)
+ loc_x = np.cos(t_b) * np.cos(t_a)
+ loc_y = np.cos(t_b) * np.sin(t_a)
+ loc_z = np.sin(t_b)
+ loc = torch.tensor([[loc_x, loc_y, loc_z]], dtype=torch.float32).to(ws.device)
+ from dnnlib.camera import look_at
+ R = look_at(loc)
+ RT = torch.eye(4).reshape(1, 4, 4).repeat(batch_size, 1, 1)
+ RT[:, :3, :3] = R
+ RT[:, :3, -1] = loc
+
+ world_mat = RT.to(ws.device)
+ #kwargs["camera_matrices"] = gen.get_camera(
+ # batch_size=batch_size, mode=[u, v, 0.5], device=ws.device)
+ kwargs["camera_matrices"] = (camera_mat, world_mat, "random", None)
+
+ with torch.no_grad():
+ out_i = gen(ws, **kwargs)
+ if isinstance(out_i, dict):
+ out_i = out_i['img']
+
+ # kwargs_n = copy.deepcopy(kwargs)
+ # kwargs_n.update({'render_option': 'early,no_background,up64,depth,normal'})
+ # out_n = gen(ws, **kwargs_n)
+ # out_n = F.interpolate(out_n,
+ # size=(out_i.size(-1), out_i.size(-1)),
+ # mode='bicubic', align_corners=True)
+ # out_i = torch.cat([out_i, out_n], 0)
+ out.append(out_i)
+
+ if return_cameras:
+ return out, cameras
+ else:
+ return out
+
+ def render_rotation_camera_grid(self, *args, **kwargs):
+ batch_size, n_steps = 1, 60
+ gen = self.generator.synthesis
+
+ ws = self.generator.mapping(*args, **kwargs)
+ ws = ws.repeat(batch_size, 1, 1)
+
+ # Get Random codes and bg rotation
+ kwargs["latent_codes"] = gen.get_latent_codes(batch_size, tmp=self.sample_tmp, device=ws.device)
+ kwargs.pop('render_option', None)
+
+ out = []
+ for v in [0.15, 0.5, 1.05]:
+ for step in tqdm.tqdm(range(n_steps)):
+ # Set Camera
+ u = step * 1.0 / (n_steps - 1) - 1.0
+ kwargs["camera_matrices"] = gen.get_camera(batch_size=batch_size, mode=[u, v, 0.5], device=ws.device)
+ with torch.no_grad():
+ out_i = gen(ws, render_option=None, **kwargs)
+ if isinstance(out_i, dict):
+ out_i = out_i['img']
+ # option_n = 'early,no_background,up64,depth,direct_depth'
+ # option_n = 'early,up128,no_background,depth,normal'
+ # out_n = gen(ws, render_option=option_n, **kwargs)
+ # out_n = F.interpolate(out_n,
+ # size=(out_i.size(-1), out_i.size(-1)),
+ # mode='bicubic', align_corners=True)
+ # out_i = torch.cat([out_i, out_n], 0)
+
+ out.append(out_i)
+
+ # out += out[::-1]
+ return out
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6d4abb4eaff3b311ea9f18b26c7033ea92814c3f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,29 @@
+albumentations==1.0.3
+click==8.0.3
+clip-by-openai==1.1
+einops==0.3.0
+glfw==2.5.0
+gradio==2.8.13
+imageio==2.9.0
+imgui==1.4.1
+kornia==0.5.10
+lmdb==0.98
+lpips==0.1.4
+matplotlib==3.4.3
+numpy==1.21.2
+hydra-core==1.1
+opencv_python_headless==4.5.1.48
+Pillow==9.0.1
+psutil==5.8.0
+PyMCubes==0.1.2
+PyOpenGL==3.1.6
+pyspng==0.1.0
+requests==2.26.0
+scipy==1.7.1
+submitit==1.1.5
+tensorboardX==2.5
+torch==1.7.1
+torchvision==0.8.2
+tqdm==4.62.2
+trimesh==3.9.8
+imageio-ffmpeg==0.4.5
\ No newline at end of file
diff --git a/run_train.py b/run_train.py
new file mode 100755
index 0000000000000000000000000000000000000000..9d50f941854b2a54cdefdee9bf4504431da9faa6
--- /dev/null
+++ b/run_train.py
@@ -0,0 +1,398 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+
+import sys
+import os
+import click
+import re
+import json
+import glob
+import tempfile
+import torch
+import dnnlib
+import hydra
+
+from datetime import date
+from training import training_loop
+from metrics import metric_main
+from torch_utils import training_stats, custom_ops, distributed_utils
+from torch_utils.distributed_utils import get_init_file, get_shared_folder
+from omegaconf import DictConfig, OmegaConf
+
+#----------------------------------------------------------------------------
+
+class UserError(Exception):
+ pass
+
+#----------------------------------------------------------------------------
+
+def setup_training_loop_kwargs(cfg):
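+ # Translate the Hydra config into the flat keyword-argument dict consumed by
+ # training_loop.training_loop(), and build a run description string for the output directory name.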
+ args = OmegaConf.create({})
+
+ # ------------------------------------------
+ # General options: gpus, snap, metrics, seed
+ # ------------------------------------------
+ args.rank = 0
+ args.gpu = 0
+ args.num_gpus = torch.cuda.device_count() if cfg.gpus is None else cfg.gpus
+ args.nodes = cfg.nodes if cfg.nodes is not None else 1
+ args.world_size = 1
+
+ args.dist_url = 'env://'
+ args.launcher = cfg.launcher
+ args.partition = cfg.partition
+ args.comment = cfg.comment
+ args.timeout = 4320 if cfg.timeout is None else cfg.timeout
+ args.job_dir = ''
+
+ if cfg.snap is None:
+ cfg.snap = 50
+ assert isinstance(cfg.snap, int)
+ if cfg.snap < 1:
+ raise UserError('snap must be at least 1')
+ args.image_snapshot_ticks = cfg.imgsnap
+ args.network_snapshot_ticks = cfg.snap
+ if hasattr(cfg, 'ucp'):
+ args.update_cam_prior_ticks = cfg.ucp
+
+ if cfg.metrics is None:
+ cfg.metrics = ['fid50k_full']
+ cfg.metrics = list(cfg.metrics)
+ if not all(metric_main.is_valid_metric(metric) for metric in cfg.metrics):
+ raise UserError('\n'.join(['metrics can only contain the following values:'] + metric_main.list_valid_metrics()))
+ args.metrics = cfg.metrics
+
+ if cfg.seed is None:
+ cfg.seed = 0
+ assert isinstance(cfg.seed, int)
+ args.random_seed = cfg.seed
+
+ # -----------------------------------
+ # Dataset: data, cond, subset, mirror
+ # -----------------------------------
+
+ assert cfg.data is not None
+ assert isinstance(cfg.data, str)
+ args.update({"training_set_kwargs": dict(class_name='training.dataset.ImageFolderDataset', path=cfg.data, resolution=cfg.resolution, use_labels=True, max_size=None, xflip=False)})
+ args.update({"data_loader_kwargs": dict(pin_memory=True, num_workers=3, prefetch_factor=2)})
+ args.generation_with_image = getattr(cfg, 'generate_with_image', False)
+ try:
+ training_set = dnnlib.util.construct_class_by_name(**args.training_set_kwargs) # subclass of training.dataset.Dataset
+ args.training_set_kwargs.resolution = training_set.resolution # be explicit about resolution
+ args.training_set_kwargs.use_labels = training_set.has_labels # be explicit about labels
+ args.training_set_kwargs.max_size = len(training_set) # be explicit about dataset size
+ desc = training_set.name
+ del training_set # conserve memory
+ except IOError as err:
+ raise UserError(f'data: {err}')
+
+ if cfg.cond is None:
+ cfg.cond = False
+ assert isinstance(cfg.cond, bool)
+ if cfg.cond:
+ if not args.training_set_kwargs.use_labels:
+ raise UserError('cond=True requires labels specified in dataset.json')
+ desc += '-cond'
+ else:
+ args.training_set_kwargs.use_labels = False
+
+ if cfg.subset is not None:
+ assert isinstance(cfg.subset, int)
+ if not 1 <= cfg.subset <= args.training_set_kwargs.max_size:
+ raise UserError(f'subset must be between 1 and {args.training_set_kwargs.max_size}')
+ desc += f'-subset{cfg.subset}'
+ if cfg.subset < args.training_set_kwargs.max_size:
+ args.training_set_kwargs.max_size = cfg.subset
+ args.training_set_kwargs.random_seed = args.random_seed
+
+ if cfg.mirror is None:
+ cfg.mirror = False
+ assert isinstance(cfg.mirror, bool)
+ if cfg.mirror:
+ desc += '-mirror'
+ args.training_set_kwargs.xflip = True
+
+ # ------------------------------------
+ # Base config: cfg, model, gamma, kimg, batch
+ # ------------------------------------
+ if cfg.auto:
+ cfg.spec.name = 'auto'
+ desc += f'-{cfg.spec.name}'
+ desc += f'-{cfg.model.name}'
+ if cfg.spec.name == 'auto':
+ res = args.training_set_kwargs.resolution
+ cfg.spec.fmaps = 1 if res >= 512 else 0.5
+ cfg.spec.lrate = 0.002 if res >= 1024 else 0.0025
+ cfg.spec.gamma = 0.0002 * (res ** 2) / cfg.spec.mb # heuristic formula
+ cfg.spec.ema = cfg.spec.mb * 10 / 32
+
+ if getattr(cfg.spec, 'lrate_disc', None) is None:
+ cfg.spec.lrate_disc = cfg.spec.lrate # use the same learning rate for discriminator
+
+ # model (generator, discriminator)
+ args.update({"G_kwargs": dict(**cfg.model.G_kwargs)})
+ args.update({"D_kwargs": dict(**cfg.model.D_kwargs)})
+ args.update({"G_opt_kwargs": dict(class_name='torch.optim.Adam', lr=cfg.spec.lrate, betas=[0,0.99], eps=1e-8)})
+ args.update({"D_opt_kwargs": dict(class_name='torch.optim.Adam', lr=cfg.spec.lrate_disc, betas=[0,0.99], eps=1e-8)})
+ args.update({"loss_kwargs": dict(class_name='training.loss.StyleGAN2Loss', r1_gamma=cfg.spec.gamma, **cfg.model.loss_kwargs)})
+
+ if cfg.spec.name == 'cifar':
+ args.loss_kwargs.pl_weight = 0 # disable path length regularization
+ args.loss_kwargs.style_mixing_prob = 0 # disable style mixing
+ args.D_kwargs.architecture = 'orig' # disable residual skip connections
+
+ # kimg data config
+ args.spec = cfg.spec # just keep the dict.
+ args.total_kimg = cfg.spec.kimg
+ args.batch_size = cfg.spec.mb
+ args.batch_gpu = cfg.spec.mbstd
+ args.ema_kimg = cfg.spec.ema
+ args.ema_rampup = cfg.spec.ramp
+
+ # ---------------------------------------------------
+ # Discriminator augmentation: aug, p, target, augpipe
+ # ---------------------------------------------------
+ if cfg.aug is None:
+ cfg.aug = 'ada'
+ else:
+ assert isinstance(cfg.aug, str)
+ desc += f'-{cfg.aug}'
+
+ if cfg.aug == 'ada':
+ args.ada_target = 0.6
+ elif cfg.aug == 'noaug':
+ pass
+ elif cfg.aug == 'fixed':
+ if cfg.p is None:
+ raise UserError(f'--aug={cfg.aug} requires specifying --p')
+ else:
+ raise UserError(f'--aug={cfg.aug} not supported')
+
+ if cfg.p is not None:
+ assert isinstance(cfg.p, float)
+ if cfg.aug != 'fixed':
+ raise UserError('--p can only be specified with --aug=fixed')
+ if not 0 <= cfg.p <= 1:
+ raise UserError('--p must be between 0 and 1')
+ desc += f'-p{cfg.p:g}'
+ args.augment_p = cfg.p
+
+ if cfg.target is not None:
+ assert isinstance(cfg.target, float)
+ if cfg.aug != 'ada':
+ raise UserError('--target can only be specified with --aug=ada')
+ if not 0 <= cfg.target <= 1:
+ raise UserError('--target must be between 0 and 1')
+ desc += f'-target{cfg.target:g}'
+ args.ada_target = cfg.target
+
+ assert cfg.augpipe is None or isinstance(cfg.augpipe, str)
+ if cfg.augpipe is None:
+ cfg.augpipe = 'bgc'
+ else:
+ if cfg.aug == 'noaug':
+ raise UserError('--augpipe cannot be specified with --aug=noaug')
+ desc += f'-{cfg.augpipe}'
+
+ augpipe_specs = {
+ 'blit': dict(xflip=1, rotate90=1, xint=1),
+ 'geom': dict(scale=1, rotate=1, aniso=1, xfrac=1),
+ 'color': dict(brightness=1, contrast=1, lumaflip=1, hue=1, saturation=1),
+ 'filter': dict(imgfilter=1),
+ 'noise': dict(noise=1),
+ 'cutout': dict(cutout=1),
+ 'bgc0': dict(xint=1, scale=1, aniso=1, xfrac=1, brightness=1, contrast=1, lumaflip=1, hue=1, saturation=1),
+ 'bg': dict(xflip=1, rotate90=1, xint=1, scale=1, rotate=1, aniso=1, xfrac=1),
+ 'bgc': dict(xflip=1, rotate90=1, xint=1, scale=1, rotate=1, aniso=1, xfrac=1, brightness=1, contrast=1, lumaflip=1, hue=1, saturation=1),
+ 'bgcf': dict(xflip=1, rotate90=1, xint=1, scale=1, rotate=1, aniso=1, xfrac=1, brightness=1, contrast=1, lumaflip=1, hue=1, saturation=1, imgfilter=1),
+ 'bgcfn': dict(xflip=1, rotate90=1, xint=1, scale=1, rotate=1, aniso=1, xfrac=1, brightness=1, contrast=1, lumaflip=1, hue=1, saturation=1, imgfilter=1, noise=1),
+ 'bgcfnc': dict(xflip=1, rotate90=1, xint=1, scale=1, rotate=1, aniso=1, xfrac=1, brightness=1, contrast=1, lumaflip=1, hue=1, saturation=1, imgfilter=1, noise=1, cutout=1),
+ }
+ assert cfg.augpipe in augpipe_specs
+ if cfg.aug != 'noaug':
+ args.update({"augment_kwargs": dict(class_name='training.augment.AugmentPipe', **augpipe_specs[cfg.augpipe])})
+
+ # ----------------------------------
+ # Transfer learning: resume, freezed
+ # ----------------------------------
+
+ resume_specs = {
+ 'ffhq256': 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/transfer-learning-source-nets/ffhq-res256-mirror-paper256-noaug.pkl',
+ 'ffhq512': 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/transfer-learning-source-nets/ffhq-res512-mirror-stylegan2-noaug.pkl',
+ 'ffhq1024': 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/transfer-learning-source-nets/ffhq-res1024-mirror-stylegan2-noaug.pkl',
+ 'celebahq256': 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/transfer-learning-source-nets/celebahq-res256-mirror-paper256-kimg100000-ada-target0.5.pkl',
+ 'lsundog256': 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/transfer-learning-source-nets/lsundog-res256-paper256-kimg100000-noaug.pkl',
+ }
+
+ assert cfg.resume is None or isinstance(cfg.resume, str)
+ if cfg.resume is None:
+ cfg.resume = 'noresume'
+ elif cfg.resume == 'noresume':
+ desc += '-noresume'
+ elif cfg.resume in resume_specs:
+ desc += f'-resume{cfg.resume}'
+ args.resume_pkl = resume_specs[cfg.resume] # predefined url
+ else:
+ desc += '-resumecustom'
+ args.resume_pkl = cfg.resume # custom path or url
+
+ if cfg.resume != 'noresume':
+ args.ada_kimg = 100 # make ADA react faster at the beginning
+ args.ema_rampup = None # disable EMA rampup
+
+ if cfg.freezed is not None:
+ assert isinstance(cfg.freezed, int)
+ if not cfg.freezed >= 0:
+ raise UserError('--freezed must be non-negative')
+ desc += f'-freezed{cfg.freezed:d}'
+ args.D_kwargs.block_kwargs.freeze_layers = cfg.freezed
+
+ # -------------------------------------------------
+ # Performance options: fp32, nhwc, nobench, workers
+ # -------------------------------------------------
+ args.num_fp16_res = cfg.num_fp16_res
+ if cfg.fp32 is None:
+ cfg.fp32 = False
+ assert isinstance(cfg.fp32, bool)
+ if cfg.fp32:
+ args.G_kwargs.synthesis_kwargs.num_fp16_res = args.D_kwargs.num_fp16_res = 0
+ args.G_kwargs.synthesis_kwargs.conv_clamp = args.D_kwargs.conv_clamp = None
+
+ if cfg.nhwc is None:
+ cfg.nhwc = False
+ assert isinstance(cfg.nhwc, bool)
+ if cfg.nhwc:
+ args.G_kwargs.synthesis_kwargs.fp16_channels_last = args.D_kwargs.block_kwargs.fp16_channels_last = True
+
+ if cfg.nobench is None:
+ cfg.nobench = False
+ assert isinstance(cfg.nobench, bool)
+ if cfg.nobench:
+ args.cudnn_benchmark = False
+
+ if cfg.allow_tf32 is None:
+ cfg.allow_tf32 = False
+ assert isinstance(cfg.allow_tf32, bool)
+ args.allow_tf32 = cfg.allow_tf32
+
+ if cfg.workers is not None:
+ assert isinstance(cfg.workers, int)
+ if not cfg.workers >= 1:
+ raise UserError('--workers must be at least 1')
+ args.data_loader_kwargs.num_workers = cfg.workers
+
+ args.debug = cfg.debug
+ if getattr(cfg, "prefix", None) is not None:
+ desc = cfg.prefix + '-' + desc
+ return desc, args
+
+#----------------------------------------------------------------------------
+
+def subprocess_fn(rank, args):
+ if not args.debug:
+ dnnlib.util.Logger(file_name=os.path.join(args.run_dir, 'log.txt'), file_mode='a', should_flush=True)
+
+ # Init torch.distributed.
+ distributed_utils.init_distributed_mode(rank, args)
+ if args.rank != 0:
+ custom_ops.verbosity = 'none'
+
+ # Execute training loop.
+ training_loop.training_loop(**args)
+
+#----------------------------------------------------------------------------
+
+class CommaSeparatedList(click.ParamType):
+ name = 'list'
+
+ def convert(self, value, param, ctx):
+ _ = param, ctx
+ if value is None or value.lower() == 'none' or value == '':
+ return []
+ return value.split(',')
+
+
+@hydra.main(config_path="conf", config_name="config")
+def main(cfg: DictConfig):
+
+ outdir = cfg.outdir
+
+ # Setup training options
+ run_desc, args = setup_training_loop_kwargs(cfg)
+
+ # Pick output directory.
+ prev_run_dirs = []
+ if os.path.isdir(outdir):
+ prev_run_dirs = [x for x in os.listdir(outdir) if os.path.isdir(os.path.join(outdir, x))]
+
+ if cfg.resume_run is None:
+ prev_run_ids = [re.match(r'^\d+', x) for x in prev_run_dirs]
+ prev_run_ids = [int(x.group()) for x in prev_run_ids if x is not None]
+ cur_run_id = max(prev_run_ids, default=-1) + 1
+ else:
+ cur_run_id = cfg.resume_run
+
+ args.run_dir = os.path.join(outdir, f'{cur_run_id:05d}-{run_desc}')
+ print(outdir, args.run_dir)
+
+ if cfg.resume_run is not None:
+ pkls = sorted(glob.glob(args.run_dir + '/network*.pkl'))
+ if len(pkls) > 0:
+ args.resume_pkl = pkls[-1]
+ args.resume_start = int(args.resume_pkl.split('-')[-1][:-4]) * 1000
+ else:
+ args.resume_start = 0
+
+ # Print options.
+ print()
+ print('Training options:')
+ print(OmegaConf.to_yaml(args))
+ print()
+ print(f'Output directory: {args.run_dir}')
+ print(f'Training data: {args.training_set_kwargs.path}')
+ print(f'Training duration: {args.total_kimg} kimg')
+ print(f'Number of images: {args.training_set_kwargs.max_size}')
+ print(f'Image resolution: {args.training_set_kwargs.resolution}')
+ print(f'Conditional model: {args.training_set_kwargs.use_labels}')
+ print(f'Dataset x-flips: {args.training_set_kwargs.xflip}')
+ print()
+
+ # Dry run?
+ if cfg.dry_run:
+ print('Dry run; exiting.')
+ return
+
+ # Create output directory.
+ print('Creating output directory...')
+ if not os.path.exists(args.run_dir):
+ os.makedirs(args.run_dir)
+ with open(os.path.join(args.run_dir, 'training_options.yaml'), 'wt') as fp:
+ OmegaConf.save(config=args, f=fp)
+
+ # Launch processes.
+ print('Launching processes...')
+ if (args.launcher == 'spawn') and (args.num_gpus > 1):
+ args.dist_url = distributed_utils.get_init_file().as_uri()
+ torch.multiprocessing.set_start_method('spawn')
+ torch.multiprocessing.spawn(fn=subprocess_fn, args=(args,), nprocs=args.num_gpus)
+ else:
+ subprocess_fn(rank=0, args=args)
+
+#----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+ if os.getenv('SLURM_ARGS') is not None:
+ # deprecated launcher for slurm jobs.
+ slurm_arg = eval(os.getenv('SLURM_ARGS'))
+ all_args = sys.argv[1:]
+ print(slurm_arg)
+ print(all_args)
+
+ from launcher import launch
+ launch(slurm_arg, all_args)
+
+ else:
+ main() # pylint: disable=no-value-for-parameter
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/__init__.py b/torch_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..939e7c6c8f94c4ea1141885c3c3295fe083b06aa
--- /dev/null
+++ b/torch_utils/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+# empty
diff --git a/torch_utils/custom_ops.py b/torch_utils/custom_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd7cc046e925f58602154be9bdf678ca9d76f59f
--- /dev/null
+++ b/torch_utils/custom_ops.py
@@ -0,0 +1,157 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import glob
+import hashlib
+import importlib
+import os
+import re
+import shutil
+import uuid
+
+import torch
+import torch.utils.cpp_extension
+from torch.utils.file_baton import FileBaton
+
+#----------------------------------------------------------------------------
+# Global options.
+
+verbosity = 'brief' # Verbosity level: 'none', 'brief', 'full'
+
+#----------------------------------------------------------------------------
+# Internal helper funcs.
+
+def _find_compiler_bindir():
+ patterns = [
+ 'C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64',
+ 'C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64',
+ 'C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64',
+ 'C:/Program Files (x86)/Microsoft Visual Studio */vc/bin',
+ ]
+ for pattern in patterns:
+ matches = sorted(glob.glob(pattern))
+ if len(matches):
+ return matches[-1]
+ return None
+
+#----------------------------------------------------------------------------
+
+def _get_mangled_gpu_name():
+ name = torch.cuda.get_device_name().lower()
+ out = []
+ for c in name:
+ if re.match('[a-z0-9_-]+', c):
+ out.append(c)
+ else:
+ out.append('-')
+ return ''.join(out)
+
+#----------------------------------------------------------------------------
+# Main entry point for compiling and loading C++/CUDA plugins.
+
+_cached_plugins = dict()
+
+def get_plugin(module_name, sources, headers=None, source_dir=None, **build_kwargs):
+ assert verbosity in ['none', 'brief', 'full']
+ if headers is None:
+ headers = []
+ if source_dir is not None:
+ sources = [os.path.join(source_dir, fname) for fname in sources]
+ headers = [os.path.join(source_dir, fname) for fname in headers]
+
+ # Already cached?
+ if module_name in _cached_plugins:
+ return _cached_plugins[module_name]
+
+ # Print status.
+ if verbosity == 'full':
+ print(f'Setting up PyTorch plugin "{module_name}"...')
+ elif verbosity == 'brief':
+ print(f'Setting up PyTorch plugin "{module_name}"... ', end='', flush=True)
+ verbose_build = (verbosity == 'full')
+
+ # Compile and load.
+ try: # pylint: disable=too-many-nested-blocks
+ # Make sure we can find the necessary compiler binaries.
+ if os.name == 'nt' and os.system("where cl.exe >nul 2>nul") != 0:
+ compiler_bindir = _find_compiler_bindir()
+ if compiler_bindir is None:
+ raise RuntimeError(f'Could not find MSVC/GCC/CLANG installation on this computer. Check _find_compiler_bindir() in "{__file__}".')
+ os.environ['PATH'] += ';' + compiler_bindir
+
+ # Some containers set TORCH_CUDA_ARCH_LIST to a list that can either
+ # break the build or unnecessarily restrict what's available to nvcc.
+ # Unset it to let nvcc decide based on what's available on the
+ # machine.
+ os.environ['TORCH_CUDA_ARCH_LIST'] = ''
+
+ # Incremental build md5sum trickery. Copies all the input source files
+ # into a cached build directory under a combined md5 digest of the input
+ # source files. Copying is done only if the combined digest has changed.
+ # This keeps input file timestamps and filenames the same as in previous
+ # extension builds, allowing for fast incremental rebuilds.
+ #
+ # This optimization is done only in case all the source files reside in
+ # a single directory (just for simplicity) and if the TORCH_EXTENSIONS_DIR
+ # environment variable is set (we take this as a signal that the user
+ # actually cares about this.)
+ #
+ # EDIT: We now do it regardless of TORCH_EXTENSIONS_DIR, in order to work
+ # around the *.cu dependency bug in ninja config.
+ #
+ all_source_files = sorted(sources + headers)
+ all_source_dirs = set(os.path.dirname(fname) for fname in all_source_files)
+ if len(all_source_dirs) == 1: # and ('TORCH_EXTENSIONS_DIR' in os.environ):
+
+ # Compute combined hash digest for all source files.
+ hash_md5 = hashlib.md5()
+ for src in all_source_files:
+ with open(src, 'rb') as f:
+ hash_md5.update(f.read())
+
+ # Select cached build directory name.
+ source_digest = hash_md5.hexdigest()
+ build_top_dir = torch.utils.cpp_extension._get_build_directory(module_name, verbose=verbose_build) # pylint: disable=protected-access
+ cached_build_dir = os.path.join(build_top_dir, f'{source_digest}-{_get_mangled_gpu_name()}')
+
+ if not os.path.isdir(cached_build_dir):
+ tmpdir = f'{build_top_dir}/srctmp-{uuid.uuid4().hex}'
+ os.makedirs(tmpdir)
+ for src in all_source_files:
+ shutil.copyfile(src, os.path.join(tmpdir, os.path.basename(src)))
+ try:
+ os.replace(tmpdir, cached_build_dir) # atomic
+ except OSError:
+ # source directory already exists, delete tmpdir and its contents.
+ shutil.rmtree(tmpdir)
+ if not os.path.isdir(cached_build_dir): raise
+
+ # Compile.
+ cached_sources = [os.path.join(cached_build_dir, os.path.basename(fname)) for fname in sources]
+ torch.utils.cpp_extension.load(name=module_name, build_directory=cached_build_dir,
+ verbose=verbose_build, sources=cached_sources, **build_kwargs)
+ else:
+ torch.utils.cpp_extension.load(name=module_name, verbose=verbose_build, sources=sources, **build_kwargs)
+
+ # Load.
+ module = importlib.import_module(module_name)
+
+ except:
+ if verbosity == 'brief':
+ print('Failed!')
+ raise
+
+ # Print status and add to cache dict.
+ if verbosity == 'full':
+ print(f'Done setting up PyTorch plugin "{module_name}".')
+ elif verbosity == 'brief':
+ print('Done.')
+ _cached_plugins[module_name] = module
+ return module
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/distributed_utils.py b/torch_utils/distributed_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9983e4595618e080a15796670c98037cf691c3b
--- /dev/null
+++ b/torch_utils/distributed_utils.py
@@ -0,0 +1,213 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import logging
+import os
+import pickle
+import random
+import socket
+import struct
+import subprocess
+import warnings
+import tempfile
+import uuid
+
+
+from datetime import date
+from pathlib import Path
+from collections import OrderedDict
+from typing import Any, Dict, Mapping
+
+import torch
+import torch.distributed as dist
+
+
+logger = logging.getLogger(__name__)
+
+
+def is_master(args):
+ return args.distributed_rank == 0
+
+
+def init_distributed_mode(rank, args):
+ if "WORLD_SIZE" in os.environ:
+ args.world_size = int(os.environ["WORLD_SIZE"])
+
+ if args.launcher == 'spawn': # single node with multiprocessing.spawn
+ args.world_size = args.num_gpus
+ args.rank = rank
+ args.gpu = rank
+
+ elif 'RANK' in os.environ:
+ args.rank = int(os.environ["RANK"])
+ args.gpu = int(os.environ['LOCAL_RANK'])
+
+ elif 'SLURM_PROCID' in os.environ:
+ args.rank = int(os.environ['SLURM_PROCID'])
+ args.gpu = args.rank % torch.cuda.device_count()
+
+ if args.world_size == 1:
+ return
+
+ if 'MASTER_ADDR' in os.environ:
+ args.dist_url = 'tcp://{}:{}'.format(os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
+
+ print(f'gpu={args.gpu}, rank={args.rank}, world_size={args.world_size}')
+ args.distributed = True
+ torch.cuda.set_device(args.gpu)
+ args.dist_backend = 'nccl'
+ print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True)
+
+ torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
+ world_size=args.world_size, rank=args.rank)
+ torch.distributed.barrier()
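+
+# Launch sketch: with launcher == 'spawn', run_train.py starts one process per GPU via
+# torch.multiprocessing.spawn(fn=subprocess_fn, args=(args,), nprocs=args.num_gpus),
+# and each subprocess calls init_distributed_mode(rank, args) before training begins.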
+
+
+def gather_list_and_concat(tensor):
+ gather_t = [torch.ones_like(tensor) for _ in range(dist.get_world_size())]
+ dist.all_gather(gather_t, tensor)
+ return torch.cat(gather_t)
+
+
+def get_rank():
+ return dist.get_rank()
+
+
+def get_world_size():
+ return dist.get_world_size()
+
+
+def get_default_group():
+ return dist.group.WORLD
+
+
+def all_gather_list(data, group=None, max_size=16384):
+ """Gathers arbitrary data from all nodes into a list.
+
+ Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python
+ data. Note that *data* must be picklable.
+
+ Args:
+ data (Any): data from the local worker to be gathered on other workers
+ group (optional): group of the collective
+ max_size (int, optional): maximum size of the data to be gathered
+ across workers
+ """
+ rank = get_rank()
+ world_size = get_world_size()
+
+ buffer_size = max_size * world_size
+ if not hasattr(all_gather_list, '_buffer') or \
+ all_gather_list._buffer.numel() < buffer_size:
+ all_gather_list._buffer = torch.cuda.ByteTensor(buffer_size)
+ all_gather_list._cpu_buffer = torch.ByteTensor(max_size).pin_memory()
+ buffer = all_gather_list._buffer
+ buffer.zero_()
+ cpu_buffer = all_gather_list._cpu_buffer
+
+ data = data.cpu()
+ enc = pickle.dumps(data)
+ enc_size = len(enc)
+ header_size = 4 # size of header that contains the length of the encoded data
+ size = header_size + enc_size
+ if size > max_size:
+ raise ValueError('encoded data size ({}) exceeds max_size ({})'.format(size, max_size))
+
+ header = struct.pack(">I", enc_size)
+ cpu_buffer[:size] = torch.ByteTensor(list(header + enc))
+ start = rank * max_size
+ buffer[start:start + size].copy_(cpu_buffer[:size])
+
+ dist.all_reduce(buffer, group=group if group is not None else get_default_group())
+
+ buffer = buffer.cpu()
+ try:
+ result = []
+ for i in range(world_size):
+ out_buffer = buffer[i * max_size:(i + 1) * max_size]
+ enc_size, = struct.unpack(">I", bytes(out_buffer[:header_size].tolist()))
+ if enc_size > 0:
+ result.append(pickle.loads(bytes(out_buffer[header_size:header_size + enc_size].tolist())))
+ return result
+ except pickle.UnpicklingError:
+ raise Exception(
+ 'Unable to unpickle data from other workers. all_gather_list requires all '
+ 'workers to enter the function together, so this error usually indicates '
+ 'that the workers have fallen out of sync somehow. Workers can fall out of '
+ 'sync if one of them runs out of memory, or if there are other conditions '
+ 'in your training script that can cause one worker to finish an epoch '
+ 'while other workers are still iterating over their portions of the data. '
+ 'Try rerunning with --ddp-backend=no_c10d and see if that helps.'
+ )
+
+
+def all_reduce_dict(
+ data: Mapping[str, Any],
+ device,
+ group=None,
+) -> Dict[str, Any]:
+ """
+ AllReduce a dictionary of values across workers. We separately
+ reduce items that are already on the device and items on CPU for
+ better performance.
+
+ Args:
+ data (Mapping[str, Any]): dictionary of data to all-reduce, but
+ cannot be a nested dictionary
+ device (torch.device): device for the reduction
+ group (optional): group of the collective
+ """
+ data_keys = list(data.keys())
+
+ # We want to separately reduce items that are already on the
+ # device and items on CPU for performance reasons.
+ cpu_data = OrderedDict()
+ device_data = OrderedDict()
+ for k in data_keys:
+ t = data[k]
+ if not torch.is_tensor(t):
+ cpu_data[k] = torch.tensor(t, dtype=torch.double)
+ elif t.device.type != device.type:
+ cpu_data[k] = t.to(dtype=torch.double)
+ else:
+ device_data[k] = t.to(dtype=torch.double)
+
+ def _all_reduce_dict(data: OrderedDict):
+ if len(data) == 0:
+ return data
+ buf = torch.stack(list(data.values())).to(device=device)
+ dist.all_reduce(buf, group=group if group is not None else get_default_group())
+ return {k: buf[i] for i, k in enumerate(data)}
+
+ cpu_data = _all_reduce_dict(cpu_data)
+ device_data = _all_reduce_dict(device_data)
+
+ def get_from_stack(key):
+ if key in cpu_data:
+ return cpu_data[key]
+ elif key in device_data:
+ return device_data[key]
+ raise KeyError
+
+ return OrderedDict([(key, get_from_stack(key)) for key in data_keys])
+
+
+def get_shared_folder() -> Path:
+ user = os.getenv("USER")
+ if Path("/checkpoint/").is_dir():
+ p = Path(f"/checkpoint/{user}/experiments")
+ p.mkdir(exist_ok=True)
+ return p
+ else:
+ p = Path(f"/tmp/experiments")
+ p.mkdir(exist_ok=True)
+ return p
+
+
+def get_init_file():
+ # Init file must not exist, but its parent dir must exist.
+ os.makedirs(str(get_shared_folder()), exist_ok=True)
+ init_file = Path(str(get_shared_folder()) + f"/{uuid.uuid4().hex}_init")
+ if init_file.exists():
+ os.remove(str(init_file))
+ return init_file
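+
+# Usage sketch (not part of the upstream API): the returned path can be used as a
+# file:// init_method, e.g. init_method=get_init_file().as_uri() when calling
+# torch.distributed.init_process_group.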
+
diff --git a/torch_utils/misc.py b/torch_utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..281c7eda1201042832efefd0abaaa9e444dfe0a0
--- /dev/null
+++ b/torch_utils/misc.py
@@ -0,0 +1,303 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import re
+import contextlib
+import numpy as np
+import torch
+import warnings
+import dnnlib
+
+#----------------------------------------------------------------------------
+# Cached construction of constant tensors. Avoids CPU=>GPU copy when the
+# same constant is used multiple times.
+
+_constant_cache = dict()
+
+def constant(value, shape=None, dtype=None, device=None, memory_format=None):
+ value = np.asarray(value)
+ if shape is not None:
+ shape = tuple(shape)
+ if dtype is None:
+ dtype = torch.get_default_dtype()
+ if device is None:
+ device = torch.device('cpu')
+ if memory_format is None:
+ memory_format = torch.contiguous_format
+
+ key = (value.shape, value.dtype, value.tobytes(), shape, dtype, device, memory_format)
+ tensor = _constant_cache.get(key, None)
+ if tensor is None:
+ tensor = torch.as_tensor(value.copy(), dtype=dtype, device=device)
+ if shape is not None:
+ tensor, _ = torch.broadcast_tensors(tensor, torch.empty(shape))
+ tensor = tensor.contiguous(memory_format=memory_format)
+ _constant_cache[key] = tensor
+ return tensor
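+
+# Usage sketch: repeated calls with identical arguments return the cached tensor, e.g.
+#   zero = constant(0.0, device=device)   # `device` is a placeholder torch.device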
+
+#----------------------------------------------------------------------------
+# Replace NaN/Inf with specified numerical values.
+
+try:
+ nan_to_num = torch.nan_to_num # 1.8.0a0
+except AttributeError:
+ def nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None): # pylint: disable=redefined-builtin
+ assert isinstance(input, torch.Tensor)
+ if posinf is None:
+ posinf = torch.finfo(input.dtype).max
+ if neginf is None:
+ neginf = torch.finfo(input.dtype).min
+ assert nan == 0
+ return torch.clamp(input.unsqueeze(0).nansum(0), min=neginf, max=posinf, out=out)
+
+#----------------------------------------------------------------------------
+# Symbolic assert.
+
+try:
+ symbolic_assert = torch._assert # 1.8.0a0 # pylint: disable=protected-access
+except AttributeError:
+ symbolic_assert = torch.Assert # 1.7.0
+
+#----------------------------------------------------------------------------
+# Context manager to temporarily suppress known warnings in torch.jit.trace().
+# Note: Cannot use catch_warnings because of https://bugs.python.org/issue29672
+
+@contextlib.contextmanager
+def suppress_tracer_warnings():
+ flt = ('ignore', None, torch.jit.TracerWarning, None, 0)
+ warnings.filters.insert(0, flt)
+ yield
+ warnings.filters.remove(flt)
+
+#----------------------------------------------------------------------------
+# Assert that the shape of a tensor matches the given list of integers.
+# None indicates that the size of a dimension is allowed to vary.
+# Performs symbolic assertion when used in torch.jit.trace().
+
+def assert_shape(tensor, ref_shape):
+ if tensor.ndim != len(ref_shape):
+ raise AssertionError(f'Wrong number of dimensions: got {tensor.ndim}, expected {len(ref_shape)}')
+ for idx, (size, ref_size) in enumerate(zip(tensor.shape, ref_shape)):
+ if ref_size is None:
+ pass
+ elif isinstance(ref_size, torch.Tensor):
+ with suppress_tracer_warnings(): # as_tensor results are registered as constants
+ symbolic_assert(torch.equal(torch.as_tensor(size), ref_size), f'Wrong size for dimension {idx}')
+ elif isinstance(size, torch.Tensor):
+ with suppress_tracer_warnings(): # as_tensor results are registered as constants
+ symbolic_assert(torch.equal(size, torch.as_tensor(ref_size)), f'Wrong size for dimension {idx}: expected {ref_size}')
+ elif size != ref_size:
+ raise AssertionError(f'Wrong size for dimension {idx}: got {size}, expected {ref_size}')
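+
+# e.g. assert_shape(x, [None, 3, 256, 256]) accepts any batch size but pins the
+# remaining dimensions (shapes here are illustrative).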
+
+#----------------------------------------------------------------------------
+# Function decorator that calls torch.autograd.profiler.record_function().
+
+def profiled_function(fn):
+ def decorator(*args, **kwargs):
+ with torch.autograd.profiler.record_function(fn.__name__):
+ return fn(*args, **kwargs)
+ decorator.__name__ = fn.__name__
+ return decorator
+
+#----------------------------------------------------------------------------
+# Sampler for torch.utils.data.DataLoader that loops over the dataset
+# indefinitely, shuffling items as it goes.
+
+class InfiniteSampler(torch.utils.data.Sampler):
+ def __init__(self, dataset, rank=0, num_replicas=1, shuffle=True, seed=0, window_size=0.5):
+ assert len(dataset) > 0
+ assert num_replicas > 0
+ assert 0 <= rank < num_replicas
+ assert 0 <= window_size <= 1
+ super().__init__(dataset)
+ self.dataset = dataset
+ self.rank = rank
+ self.num_replicas = num_replicas
+ self.shuffle = shuffle
+ self.seed = seed
+ self.window_size = window_size
+
+ def __iter__(self):
+ order = np.arange(len(self.dataset))
+ rnd = None
+ window = 0
+ if self.shuffle:
+ rnd = np.random.RandomState(self.seed)
+ rnd.shuffle(order)
+ window = int(np.rint(order.size * self.window_size))
+
+ idx = 0
+ while True:
+ i = idx % order.size
+ if idx % self.num_replicas == self.rank:
+ yield order[i]
+ if window >= 2:
+ j = (i - rnd.randint(window)) % order.size
+ order[i], order[j] = order[j], order[i]
+ idx += 1
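+
+# DataLoader wiring sketch (dataset and batch size are placeholders):
+#   sampler = InfiniteSampler(dataset, rank=rank, num_replicas=num_gpus, seed=0)
+#   loader = iter(torch.utils.data.DataLoader(dataset, sampler=sampler, batch_size=4))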
+
+#----------------------------------------------------------------------------
+# Utilities for operating with torch.nn.Module parameters and buffers.
+
+def params_and_buffers(module):
+ assert isinstance(module, torch.nn.Module)
+ return list(module.parameters()) + list(module.buffers())
+
+def named_params_and_buffers(module):
+ assert isinstance(module, torch.nn.Module)
+ return list(module.named_parameters()) + list(module.named_buffers())
+
+def copy_params_and_buffers(src_module, dst_module, require_all=False):
+ assert isinstance(src_module, torch.nn.Module)
+ assert isinstance(dst_module, torch.nn.Module)
+ src_tensors = dict(named_params_and_buffers(src_module))
+ for name, tensor in named_params_and_buffers(dst_module):
+ assert (name in src_tensors) or (not require_all)
+ if name in src_tensors:
+ try:
+ tensor.copy_(src_tensors[name].detach()).requires_grad_(tensor.requires_grad)
+ except Exception as e:
+ print(f'Error loading: {name} {src_tensors[name].shape} {tensor.shape}')
+ raise e
+#----------------------------------------------------------------------------
+# Context manager for easily enabling/disabling DistributedDataParallel
+# synchronization.
+
+@contextlib.contextmanager
+def ddp_sync(module, sync):
+ assert isinstance(module, torch.nn.Module)
+ if sync or not isinstance(module, torch.nn.parallel.DistributedDataParallel):
+ yield
+ else:
+ with module.no_sync():
+ yield
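+
+# Typical pattern (sketch, names illustrative): skip gradient sync on all but the last
+# accumulation round.
+#   with ddp_sync(ddp_module, sync=is_last_accum_round):
+#       loss.backward()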
+
+#----------------------------------------------------------------------------
+# Check DistributedDataParallel consistency across processes.
+
+def check_ddp_consistency(module, ignore_regex=None):
+ assert isinstance(module, torch.nn.Module)
+ for name, tensor in named_params_and_buffers(module):
+ fullname = type(module).__name__ + '.' + name
+ if ignore_regex is not None and re.fullmatch(ignore_regex, fullname):
+ continue
+ tensor = tensor.detach()
+ if tensor.is_floating_point():
+ tensor = nan_to_num(tensor)
+ other = tensor.clone()
+ torch.distributed.broadcast(tensor=other, src=0)
+ assert (tensor == other).all(), fullname
+
+#----------------------------------------------------------------------------
+# Print summary table of module hierarchy.
+
+def print_module_summary(module, inputs, max_nesting=3, skip_redundant=True):
+ assert isinstance(module, torch.nn.Module)
+ assert not isinstance(module, torch.jit.ScriptModule)
+ assert isinstance(inputs, (tuple, list))
+
+ # Register hooks.
+ entries = []
+ nesting = [0]
+ def pre_hook(_mod, _inputs):
+ nesting[0] += 1
+ def post_hook(mod, _inputs, outputs):
+ nesting[0] -= 1
+ if nesting[0] <= max_nesting:
+ outputs = list(outputs) if isinstance(outputs, (tuple, list)) else [outputs]
+ outputs = [t for t in outputs if isinstance(t, torch.Tensor)]
+ entries.append(dnnlib.EasyDict(mod=mod, outputs=outputs))
+ hooks = [mod.register_forward_pre_hook(pre_hook) for mod in module.modules()]
+ hooks += [mod.register_forward_hook(post_hook) for mod in module.modules()]
+
+ # Run module.
+ outputs = module(*inputs)
+ for hook in hooks:
+ hook.remove()
+
+ # Identify unique outputs, parameters, and buffers.
+ tensors_seen = set()
+ for e in entries:
+ e.unique_params = [t for t in e.mod.parameters() if id(t) not in tensors_seen]
+ e.unique_buffers = [t for t in e.mod.buffers() if id(t) not in tensors_seen]
+ e.unique_outputs = [t for t in e.outputs if id(t) not in tensors_seen]
+ tensors_seen |= {id(t) for t in e.unique_params + e.unique_buffers + e.unique_outputs}
+
+ # Filter out redundant entries.
+ if skip_redundant:
+ entries = [e for e in entries if len(e.unique_params) or len(e.unique_buffers) or len(e.unique_outputs)]
+
+ # Construct table.
+ rows = [[type(module).__name__, 'Parameters', 'Buffers', 'Output shape', 'Datatype']]
+ rows += [['---'] * len(rows[0])]
+ param_total = 0
+ buffer_total = 0
+ submodule_names = {mod: name for name, mod in module.named_modules()}
+ for e in entries:
+ name = '' if e.mod is module else submodule_names[e.mod]
+ param_size = sum(t.numel() for t in e.unique_params)
+ buffer_size = sum(t.numel() for t in e.unique_buffers)
+ output_shapes = [str(list(t.shape)) for t in e.outputs]
+ output_dtypes = [str(t.dtype).split('.')[-1] for t in e.outputs]
+ rows += [[
+ name + (':0' if len(e.outputs) >= 2 else ''),
+ str(param_size) if param_size else '-',
+ str(buffer_size) if buffer_size else '-',
+ (output_shapes + ['-'])[0],
+ (output_dtypes + ['-'])[0],
+ ]]
+ for idx in range(1, len(e.outputs)):
+ rows += [[name + f':{idx}', '-', '-', output_shapes[idx], output_dtypes[idx]]]
+ param_total += param_size
+ buffer_total += buffer_size
+ rows += [['---'] * len(rows[0])]
+ rows += [['Total', str(param_total), str(buffer_total), '-', '-']]
+
+ # Print table.
+ widths = [max(len(cell) for cell in column) for column in zip(*rows)]
+ print()
+ for row in rows:
+ print(' '.join(cell + ' ' * (width - len(cell)) for cell, width in zip(row, widths)))
+ print()
+ return outputs
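+
+# e.g. print_module_summary(G, [z, c]) prints one row per submodule with parameter and
+# buffer counts plus output shapes for a single forward pass (G, z, c are placeholders).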
+
+#----------------------------------------------------------------------------
+
+def get_ddp_func(m, func_name):
+ if hasattr(m, func_name):
+ return getattr(m, func_name)
+ if hasattr(m.module, func_name):
+ return getattr(m.module, func_name)
+ return None
+
+
+#----------------------------------------------------------------------------
+
+@contextlib.contextmanager
+def cuda_time(prefix=""):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ try:
+ yield
+ finally:
+ end.record()
+ torch.cuda.synchronize()
+ print(f'{prefix}: {start.elapsed_time(end)} ms')
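+
+# e.g. `with cuda_time('G forward'): img = G(z, c)` prints the elapsed GPU time in ms
+# (G, z, c are placeholder names).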
+
+# ---------------------------------------------------------------------------
+
+def get_func(m, f):
+ if hasattr(m, f):
+ return getattr(m, f)
+ elif hasattr(m.module, f):
+ return getattr(m.module, f)
+ else:
+ raise NotImplementedError
diff --git a/torch_utils/ops/__init__.py b/torch_utils/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..939e7c6c8f94c4ea1141885c3c3295fe083b06aa
--- /dev/null
+++ b/torch_utils/ops/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+# empty
diff --git a/torch_utils/ops/bias_act.cpp b/torch_utils/ops/bias_act.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3adaeee2ae44e96655d354c2bdfb81de8ebfe6c6
--- /dev/null
+++ b/torch_utils/ops/bias_act.cpp
@@ -0,0 +1,99 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "bias_act.h"
+
+//------------------------------------------------------------------------
+
+static bool has_same_layout(torch::Tensor x, torch::Tensor y)
+{
+ if (x.dim() != y.dim())
+ return false;
+ for (int64_t i = 0; i < x.dim(); i++)
+ {
+ if (x.size(i) != y.size(i))
+ return false;
+ if (x.size(i) >= 2 && x.stride(i) != y.stride(i))
+ return false;
+ }
+ return true;
+}
+
+//------------------------------------------------------------------------
+
+static torch::Tensor bias_act(torch::Tensor x, torch::Tensor b, torch::Tensor xref, torch::Tensor yref, torch::Tensor dy, int grad, int dim, int act, float alpha, float gain, float clamp)
+{
+ // Validate arguments.
+ TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
+ TORCH_CHECK(b.numel() == 0 || (b.dtype() == x.dtype() && b.device() == x.device()), "b must have the same dtype and device as x");
+ TORCH_CHECK(xref.numel() == 0 || (xref.sizes() == x.sizes() && xref.dtype() == x.dtype() && xref.device() == x.device()), "xref must have the same shape, dtype, and device as x");
+ TORCH_CHECK(yref.numel() == 0 || (yref.sizes() == x.sizes() && yref.dtype() == x.dtype() && yref.device() == x.device()), "yref must have the same shape, dtype, and device as x");
+ TORCH_CHECK(dy.numel() == 0 || (dy.sizes() == x.sizes() && dy.dtype() == x.dtype() && dy.device() == x.device()), "dy must have the same dtype and device as x");
+ TORCH_CHECK(x.numel() <= INT_MAX, "x is too large");
+ TORCH_CHECK(b.dim() == 1, "b must have rank 1");
+ TORCH_CHECK(b.numel() == 0 || (dim >= 0 && dim < x.dim()), "dim is out of bounds");
+ TORCH_CHECK(b.numel() == 0 || b.numel() == x.size(dim), "b has wrong number of elements");
+ TORCH_CHECK(grad >= 0, "grad must be non-negative");
+
+ // Validate layout.
+ TORCH_CHECK(x.is_non_overlapping_and_dense(), "x must be non-overlapping and dense");
+ TORCH_CHECK(b.is_contiguous(), "b must be contiguous");
+ TORCH_CHECK(xref.numel() == 0 || has_same_layout(xref, x), "xref must have the same layout as x");
+ TORCH_CHECK(yref.numel() == 0 || has_same_layout(yref, x), "yref must have the same layout as x");
+ TORCH_CHECK(dy.numel() == 0 || has_same_layout(dy, x), "dy must have the same layout as x");
+
+ // Create output tensor.
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+ torch::Tensor y = torch::empty_like(x);
+ TORCH_CHECK(has_same_layout(y, x), "y must have the same layout as x");
+
+ // Initialize CUDA kernel parameters.
+ bias_act_kernel_params p;
+ p.x = x.data_ptr();
+ p.b = (b.numel()) ? b.data_ptr() : NULL;
+ p.xref = (xref.numel()) ? xref.data_ptr() : NULL;
+ p.yref = (yref.numel()) ? yref.data_ptr() : NULL;
+ p.dy = (dy.numel()) ? dy.data_ptr() : NULL;
+ p.y = y.data_ptr();
+ p.grad = grad;
+ p.act = act;
+ p.alpha = alpha;
+ p.gain = gain;
+ p.clamp = clamp;
+ p.sizeX = (int)x.numel();
+ p.sizeB = (int)b.numel();
+ p.stepB = (b.numel()) ? (int)x.stride(dim) : 1;
+
+ // Choose CUDA kernel.
+ void* kernel;
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&]
+ {
+ kernel = choose_bias_act_kernel(p);
+ });
+ TORCH_CHECK(kernel, "no CUDA kernel found for the specified activation func");
+
+ // Launch CUDA kernel.
+ p.loopX = 4;
+ int blockSize = 4 * 32;
+ int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1;
+ void* args[] = {&p};
+ AT_CUDA_CHECK(cudaLaunchKernel(kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream()));
+ return y;
+}
+
+//------------------------------------------------------------------------
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+ m.def("bias_act", &bias_act);
+}
+
+//------------------------------------------------------------------------
diff --git a/torch_utils/ops/bias_act.cu b/torch_utils/ops/bias_act.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ed1d16f14eadd1344939e074ace1375cfd936cea
--- /dev/null
+++ b/torch_utils/ops/bias_act.cu
@@ -0,0 +1,173 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <c10/util/Half.h>
+#include "bias_act.h"
+
+//------------------------------------------------------------------------
+// Helpers.
+
+template <class T> struct InternalType;
+template <> struct InternalType<double>     { typedef double scalar_t; };
+template <> struct InternalType<float>      { typedef float  scalar_t; };
+template <> struct InternalType<c10::Half>  { typedef float  scalar_t; };
+
+//------------------------------------------------------------------------
+// CUDA kernel.
+
+template <class T, int A>
+__global__ void bias_act_kernel(bias_act_kernel_params p)
+{
+ typedef typename InternalType<T>::scalar_t scalar_t;
+ int G = p.grad;
+ scalar_t alpha = (scalar_t)p.alpha;
+ scalar_t gain = (scalar_t)p.gain;
+ scalar_t clamp = (scalar_t)p.clamp;
+ scalar_t one = (scalar_t)1;
+ scalar_t two = (scalar_t)2;
+ scalar_t expRange = (scalar_t)80;
+ scalar_t halfExpRange = (scalar_t)40;
+ scalar_t seluScale = (scalar_t)1.0507009873554804934193349852946;
+ scalar_t seluAlpha = (scalar_t)1.6732632423543772848170429916717;
+
+ // Loop over elements.
+ int xi = blockIdx.x * p.loopX * blockDim.x + threadIdx.x;
+ for (int loopIdx = 0; loopIdx < p.loopX && xi < p.sizeX; loopIdx++, xi += blockDim.x)
+ {
+ // Load.
+ scalar_t x = (scalar_t)((const T*)p.x)[xi];
+ scalar_t b = (p.b) ? (scalar_t)((const T*)p.b)[(xi / p.stepB) % p.sizeB] : 0;
+ scalar_t xref = (p.xref) ? (scalar_t)((const T*)p.xref)[xi] : 0;
+ scalar_t yref = (p.yref) ? (scalar_t)((const T*)p.yref)[xi] : 0;
+ scalar_t dy = (p.dy) ? (scalar_t)((const T*)p.dy)[xi] : one;
+ scalar_t yy = (gain != 0) ? yref / gain : 0;
+ scalar_t y = 0;
+
+ // Apply bias.
+ ((G == 0) ? x : xref) += b;
+
+ // linear
+ if (A == 1)
+ {
+ if (G == 0) y = x;
+ if (G == 1) y = x;
+ }
+
+ // relu
+ if (A == 2)
+ {
+ if (G == 0) y = (x > 0) ? x : 0;
+ if (G == 1) y = (yy > 0) ? x : 0;
+ }
+
+ // lrelu
+ if (A == 3)
+ {
+ if (G == 0) y = (x > 0) ? x : x * alpha;
+ if (G == 1) y = (yy > 0) ? x : x * alpha;
+ }
+
+ // tanh
+ if (A == 4)
+ {
+ if (G == 0) { scalar_t c = exp(x); scalar_t d = one / c; y = (x < -expRange) ? -one : (x > expRange) ? one : (c - d) / (c + d); }
+ if (G == 1) y = x * (one - yy * yy);
+ if (G == 2) y = x * (one - yy * yy) * (-two * yy);
+ }
+
+ // sigmoid
+ if (A == 5)
+ {
+ if (G == 0) y = (x < -expRange) ? 0 : one / (exp(-x) + one);
+ if (G == 1) y = x * yy * (one - yy);
+ if (G == 2) y = x * yy * (one - yy) * (one - two * yy);
+ }
+
+ // elu
+ if (A == 6)
+ {
+ if (G == 0) y = (x >= 0) ? x : exp(x) - one;
+ if (G == 1) y = (yy >= 0) ? x : x * (yy + one);
+ if (G == 2) y = (yy >= 0) ? 0 : x * (yy + one);
+ }
+
+ // selu
+ if (A == 7)
+ {
+ if (G == 0) y = (x >= 0) ? seluScale * x : (seluScale * seluAlpha) * (exp(x) - one);
+ if (G == 1) y = (yy >= 0) ? x * seluScale : x * (yy + seluScale * seluAlpha);
+ if (G == 2) y = (yy >= 0) ? 0 : x * (yy + seluScale * seluAlpha);
+ }
+
+ // softplus
+ if (A == 8)
+ {
+ if (G == 0) y = (x > expRange) ? x : log(exp(x) + one);
+ if (G == 1) y = x * (one - exp(-yy));
+ if (G == 2) { scalar_t c = exp(-yy); y = x * c * (one - c); }
+ }
+
+ // swish
+ if (A == 9)
+ {
+ if (G == 0)
+ y = (x < -expRange) ? 0 : x / (exp(-x) + one);
+ else
+ {
+ scalar_t c = exp(xref);
+ scalar_t d = c + one;
+ if (G == 1)
+ y = (xref > halfExpRange) ? x : x * c * (xref + d) / (d * d);
+ else
+ y = (xref > halfExpRange) ? 0 : x * c * (xref * (two - d) + two * d) / (d * d * d);
+ yref = (xref < -expRange) ? 0 : xref / (exp(-xref) + one) * gain;
+ }
+ }
+
+ // Apply gain.
+ y *= gain * dy;
+
+ // Clamp.
+ if (clamp >= 0)
+ {
+ if (G == 0)
+ y = (y > -clamp & y < clamp) ? y : (y >= 0) ? clamp : -clamp;
+ else
+ y = (yref > -clamp & yref < clamp) ? y : 0;
+ }
+
+ // Store.
+ ((T*)p.y)[xi] = (T)y;
+ }
+}
+
+//------------------------------------------------------------------------
+// CUDA kernel selection.
+
+template <class T> void* choose_bias_act_kernel(const bias_act_kernel_params& p)
+{
+ if (p.act == 1) return (void*)bias_act_kernel<T, 1>;
+ if (p.act == 2) return (void*)bias_act_kernel<T, 2>;
+ if (p.act == 3) return (void*)bias_act_kernel<T, 3>;
+ if (p.act == 4) return (void*)bias_act_kernel<T, 4>;
+ if (p.act == 5) return (void*)bias_act_kernel<T, 5>;
+ if (p.act == 6) return (void*)bias_act_kernel<T, 6>;
+ if (p.act == 7) return (void*)bias_act_kernel<T, 7>;
+ if (p.act == 8) return (void*)bias_act_kernel<T, 8>;
+ if (p.act == 9) return (void*)bias_act_kernel<T, 9>;
+ return NULL;
+}
+
+//------------------------------------------------------------------------
+// Template specializations.
+
+template void* choose_bias_act_kernel<double>    (const bias_act_kernel_params& p);
+template void* choose_bias_act_kernel<float>     (const bias_act_kernel_params& p);
+template void* choose_bias_act_kernel<c10::Half> (const bias_act_kernel_params& p);
+
+//------------------------------------------------------------------------
diff --git a/torch_utils/ops/bias_act.h b/torch_utils/ops/bias_act.h
new file mode 100644
index 0000000000000000000000000000000000000000..60b81c6058d54638a6d74a13046fa388442d767d
--- /dev/null
+++ b/torch_utils/ops/bias_act.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+// CUDA kernel parameters.
+
+struct bias_act_kernel_params
+{
+ const void* x; // [sizeX]
+ const void* b; // [sizeB] or NULL
+ const void* xref; // [sizeX] or NULL
+ const void* yref; // [sizeX] or NULL
+ const void* dy; // [sizeX] or NULL
+ void* y; // [sizeX]
+
+ int grad;
+ int act;
+ float alpha;
+ float gain;
+ float clamp;
+
+ int sizeX;
+ int sizeB;
+ int stepB;
+ int loopX;
+};
+
+//------------------------------------------------------------------------
+// CUDA kernel selection.
+
+template <class T> void* choose_bias_act_kernel(const bias_act_kernel_params& p);
+
+//------------------------------------------------------------------------
diff --git a/torch_utils/ops/bias_act.py b/torch_utils/ops/bias_act.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c485c0027570decab26f0b6602a363a432b851f
--- /dev/null
+++ b/torch_utils/ops/bias_act.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Custom PyTorch ops for efficient bias and activation."""
+
+import os
+import numpy as np
+import torch
+import dnnlib
+
+from .. import custom_ops
+from .. import misc
+
+#----------------------------------------------------------------------------
+
+activation_funcs = {
+ 'linear': dnnlib.EasyDict(func=lambda x, **_: x, def_alpha=0, def_gain=1, cuda_idx=1, ref='', has_2nd_grad=False),
+ 'relu': dnnlib.EasyDict(func=lambda x, **_: torch.nn.functional.relu(x), def_alpha=0, def_gain=np.sqrt(2), cuda_idx=2, ref='y', has_2nd_grad=False),
+ 'lrelu': dnnlib.EasyDict(func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha), def_alpha=0.2, def_gain=np.sqrt(2), cuda_idx=3, ref='y', has_2nd_grad=False),
+ 'tanh': dnnlib.EasyDict(func=lambda x, **_: torch.tanh(x), def_alpha=0, def_gain=1, cuda_idx=4, ref='y', has_2nd_grad=True),
+ 'sigmoid': dnnlib.EasyDict(func=lambda x, **_: torch.sigmoid(x), def_alpha=0, def_gain=1, cuda_idx=5, ref='y', has_2nd_grad=True),
+ 'elu': dnnlib.EasyDict(func=lambda x, **_: torch.nn.functional.elu(x), def_alpha=0, def_gain=1, cuda_idx=6, ref='y', has_2nd_grad=True),
+ 'selu': dnnlib.EasyDict(func=lambda x, **_: torch.nn.functional.selu(x), def_alpha=0, def_gain=1, cuda_idx=7, ref='y', has_2nd_grad=True),
+ 'softplus': dnnlib.EasyDict(func=lambda x, **_: torch.nn.functional.softplus(x), def_alpha=0, def_gain=1, cuda_idx=8, ref='y', has_2nd_grad=True),
+ 'swish': dnnlib.EasyDict(func=lambda x, **_: torch.sigmoid(x) * x, def_alpha=0, def_gain=np.sqrt(2), cuda_idx=9, ref='x', has_2nd_grad=True),
+}
+
+#----------------------------------------------------------------------------
+
+_plugin = None
+_null_tensor = torch.empty([0])
+
+def _init():
+ global _plugin
+ if _plugin is None:
+ _plugin = custom_ops.get_plugin(
+ module_name='bias_act_plugin',
+ sources=['bias_act.cpp', 'bias_act.cu'],
+ headers=['bias_act.h'],
+ source_dir=os.path.dirname(__file__),
+ extra_cuda_cflags=['--use_fast_math'],
+ )
+ return True
+
+#----------------------------------------------------------------------------
+
+def bias_act(x, b=None, dim=1, act='linear', alpha=None, gain=None, clamp=None, impl='cuda'):
+ r"""Fused bias and activation function.
+
+ Adds bias `b` to activation tensor `x`, evaluates activation function `act`,
+ and scales the result by `gain`. Each of the steps is optional. In most cases,
+ the fused op is considerably more efficient than performing the same calculation
+ using standard PyTorch ops. It supports first and second order gradients,
+ but not third order gradients.
+
+ Args:
+ x: Input activation tensor. Can be of any shape.
+ b: Bias vector, or `None` to disable. Must be a 1D tensor of the same type
+ as `x`. The shape must be known, and it must match the dimension of `x`
+ corresponding to `dim`.
+ dim: The dimension in `x` corresponding to the elements of `b`.
+ The value of `dim` is ignored if `b` is not specified.
+ act: Name of the activation function to evaluate, or `"linear"` to disable.
+ Can be e.g. `"relu"`, `"lrelu"`, `"tanh"`, `"sigmoid"`, `"swish"`, etc.
+ See `activation_funcs` for a full list. `None` is not allowed.
+ alpha: Shape parameter for the activation function, or `None` to use the default.
+ gain: Scaling factor for the output tensor, or `None` to use default.
+ See `activation_funcs` for the default scaling of each activation function.
+ If unsure, consider specifying 1.
+ clamp: Clamp the output values to `[-clamp, +clamp]`, or `None` to disable
+ the clamping (default).
+ impl: Name of the implementation to use. Can be `"ref"` or `"cuda"` (default).
+
+ Returns:
+ Tensor of the same shape and datatype as `x`.
+ """
+ assert isinstance(x, torch.Tensor)
+ assert impl in ['ref', 'cuda']
+ if impl == 'cuda' and x.device.type == 'cuda' and _init():
+ return _bias_act_cuda(dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp).apply(x, b)
+ return _bias_act_ref(x=x, b=b, dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp)
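+
+# Usage sketch (shapes illustrative): fused bias + leaky ReLU on an NCHW tensor.
+#   y = bias_act(x, b=torch.zeros(x.shape[1], device=x.device), act='lrelu')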
+
+#----------------------------------------------------------------------------
+
+@misc.profiled_function
+def _bias_act_ref(x, b=None, dim=1, act='linear', alpha=None, gain=None, clamp=None):
+ """Slow reference implementation of `bias_act()` using standard TensorFlow ops.
+ """
+ assert isinstance(x, torch.Tensor)
+ assert clamp is None or clamp >= 0
+ spec = activation_funcs[act]
+ alpha = float(alpha if alpha is not None else spec.def_alpha)
+ gain = float(gain if gain is not None else spec.def_gain)
+ clamp = float(clamp if clamp is not None else -1)
+
+ # Add bias.
+ if b is not None:
+ assert isinstance(b, torch.Tensor) and b.ndim == 1
+ assert 0 <= dim < x.ndim
+ assert b.shape[0] == x.shape[dim]
+ x = x + b.reshape([-1 if i == dim else 1 for i in range(x.ndim)])
+
+ # Evaluate activation function.
+ alpha = float(alpha)
+ x = spec.func(x, alpha=alpha)
+
+ # Scale by gain.
+ gain = float(gain)
+ if gain != 1:
+ x = x * gain
+
+ # Clamp.
+ if clamp >= 0:
+ x = x.clamp(-clamp, clamp) # pylint: disable=invalid-unary-operand-type
+ return x
+
+#----------------------------------------------------------------------------
+
+_bias_act_cuda_cache = dict()
+
+def _bias_act_cuda(dim=1, act='linear', alpha=None, gain=None, clamp=None):
+ """Fast CUDA implementation of `bias_act()` using custom ops.
+ """
+ # Parse arguments.
+ assert clamp is None or clamp >= 0
+ spec = activation_funcs[act]
+ alpha = float(alpha if alpha is not None else spec.def_alpha)
+ gain = float(gain if gain is not None else spec.def_gain)
+ clamp = float(clamp if clamp is not None else -1)
+
+ # Lookup from cache.
+ key = (dim, act, alpha, gain, clamp)
+ if key in _bias_act_cuda_cache:
+ return _bias_act_cuda_cache[key]
+
+ # Forward op.
+ class BiasActCuda(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, x, b): # pylint: disable=arguments-differ
+ ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride(1) == 1 else torch.contiguous_format
+ x = x.contiguous(memory_format=ctx.memory_format)
+ b = b.contiguous() if b is not None else _null_tensor
+ y = x
+ if act != 'linear' or gain != 1 or clamp >= 0 or b is not _null_tensor:
+ y = _plugin.bias_act(x, b, _null_tensor, _null_tensor, _null_tensor, 0, dim, spec.cuda_idx, alpha, gain, clamp)
+ ctx.save_for_backward(
+ x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor,
+ b if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor,
+ y if 'y' in spec.ref else _null_tensor)
+ return y
+
+ @staticmethod
+ def backward(ctx, dy): # pylint: disable=arguments-differ
+ dy = dy.contiguous(memory_format=ctx.memory_format)
+ x, b, y = ctx.saved_tensors
+ dx = None
+ db = None
+
+ if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
+ dx = dy
+ if act != 'linear' or gain != 1 or clamp >= 0:
+ dx = BiasActCudaGrad.apply(dy, x, b, y)
+
+ if ctx.needs_input_grad[1]:
+ db = dx.sum([i for i in range(dx.ndim) if i != dim])
+
+ return dx, db
+
+ # Backward op.
+ class BiasActCudaGrad(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, dy, x, b, y): # pylint: disable=arguments-differ
+ ctx.memory_format = torch.channels_last if dy.ndim > 2 and dy.stride(1) == 1 else torch.contiguous_format
+ dx = _plugin.bias_act(dy, b, x, y, _null_tensor, 1, dim, spec.cuda_idx, alpha, gain, clamp)
+ ctx.save_for_backward(
+ dy if spec.has_2nd_grad else _null_tensor,
+ x, b, y)
+ return dx
+
+ @staticmethod
+ def backward(ctx, d_dx): # pylint: disable=arguments-differ
+ d_dx = d_dx.contiguous(memory_format=ctx.memory_format)
+ dy, x, b, y = ctx.saved_tensors
+ d_dy = None
+ d_x = None
+ d_b = None
+ d_y = None
+
+ if ctx.needs_input_grad[0]:
+ d_dy = BiasActCudaGrad.apply(d_dx, x, b, y)
+
+ if spec.has_2nd_grad and (ctx.needs_input_grad[1] or ctx.needs_input_grad[2]):
+ d_x = _plugin.bias_act(d_dx, b, x, y, dy, 2, dim, spec.cuda_idx, alpha, gain, clamp)
+
+ if spec.has_2nd_grad and ctx.needs_input_grad[2]:
+ d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim])
+
+ return d_dy, d_x, d_b, d_y
+
+ # Add to cache.
+ _bias_act_cuda_cache[key] = BiasActCuda
+ return BiasActCuda
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/ops/conv2d_gradfix.py b/torch_utils/ops/conv2d_gradfix.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2cf8727edbb5106a88a139b34943229487c9988
--- /dev/null
+++ b/torch_utils/ops/conv2d_gradfix.py
@@ -0,0 +1,200 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Custom replacement for `torch.nn.functional.conv2d` that supports
+arbitrarily high order gradients with zero performance penalty."""
+
+import contextlib
+import torch
+
+# pylint: disable=redefined-builtin
+# pylint: disable=arguments-differ
+# pylint: disable=protected-access
+
+#----------------------------------------------------------------------------
+
+enabled = False # Enable the custom op by setting this to true.
+weight_gradients_disabled = False # Forcefully disable computation of gradients with respect to the weights.
+
+@contextlib.contextmanager
+def no_weight_gradients(disable=True):
+ global weight_gradients_disabled
+ old = weight_gradients_disabled
+ if disable:
+ weight_gradients_disabled = True
+ yield
+ weight_gradients_disabled = old
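+
+# Usage sketch: set `conv2d_gradfix.enabled = True` once at startup so conv2d() /
+# conv_transpose2d() below act as drop-in replacements for the torch.nn.functional
+# versions, and wrap backward passes that do not need dL/dweight (e.g. certain
+# regularization terms) in `with no_weight_gradients(): ...`.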
+
+#----------------------------------------------------------------------------
+
+def conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
+ if _should_use_custom_op(input):
+ return _conv2d_gradfix(transpose=False, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=0, dilation=dilation, groups=groups).apply(input, weight, bias)
+ return torch.nn.functional.conv2d(input=input, weight=weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
+
+def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1):
+ if _should_use_custom_op(input):
+ return _conv2d_gradfix(transpose=True, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation).apply(input, weight, bias)
+ return torch.nn.functional.conv_transpose2d(input=input, weight=weight, bias=bias, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation)
+
+#----------------------------------------------------------------------------
+
+def _should_use_custom_op(input):
+ assert isinstance(input, torch.Tensor)
+ if (not enabled) or (not torch.backends.cudnn.enabled):
+ return False
+ if input.device.type != 'cuda':
+ return False
+ return True
+
+def _tuple_of_ints(xs, ndim):
+ xs = tuple(xs) if isinstance(xs, (tuple, list)) else (xs,) * ndim
+ assert len(xs) == ndim
+ assert all(isinstance(x, int) for x in xs)
+ return xs
+
+#----------------------------------------------------------------------------
+
+_conv2d_gradfix_cache = dict()
+_null_tensor = torch.empty([0])
+
+def _conv2d_gradfix(transpose, weight_shape, stride, padding, output_padding, dilation, groups):
+ # Parse arguments.
+ ndim = 2
+ weight_shape = tuple(weight_shape)
+ stride = _tuple_of_ints(stride, ndim)
+ padding = _tuple_of_ints(padding, ndim)
+ output_padding = _tuple_of_ints(output_padding, ndim)
+ dilation = _tuple_of_ints(dilation, ndim)
+
+ # Lookup from cache.
+ key = (transpose, weight_shape, stride, padding, output_padding, dilation, groups)
+ if key in _conv2d_gradfix_cache:
+ return _conv2d_gradfix_cache[key]
+
+ # Validate arguments.
+ assert groups >= 1
+ assert len(weight_shape) == ndim + 2
+ assert all(stride[i] >= 1 for i in range(ndim))
+ assert all(padding[i] >= 0 for i in range(ndim))
+ assert all(dilation[i] >= 0 for i in range(ndim))
+ if not transpose:
+ assert all(output_padding[i] == 0 for i in range(ndim))
+ else: # transpose
+ assert all(0 <= output_padding[i] < max(stride[i], dilation[i]) for i in range(ndim))
+
+ # Helpers.
+ common_kwargs = dict(stride=stride, padding=padding, dilation=dilation, groups=groups)
+ def calc_output_padding(input_shape, output_shape):
+ if transpose:
+ return [0, 0]
+ return [
+ input_shape[i + 2]
+ - (output_shape[i + 2] - 1) * stride[i]
+ - (1 - 2 * padding[i])
+ - dilation[i] * (weight_shape[i + 2] - 1)
+ for i in range(ndim)
+ ]
+
+ # Forward & backward.
+ class Conv2d(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, input, weight, bias):
+ assert weight.shape == weight_shape
+ ctx.save_for_backward(
+ input if weight.requires_grad else _null_tensor,
+ weight if input.requires_grad else _null_tensor,
+ )
+ ctx.input_shape = input.shape
+
+ # Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere).
+ if weight_shape[2:] == stride == dilation == (1, 1) and padding == (0, 0) and torch.cuda.get_device_capability(input.device) < (8, 0):
+ a = weight.reshape(groups, weight_shape[0] // groups, weight_shape[1])
+ b = input.reshape(input.shape[0], groups, input.shape[1] // groups, -1)
+ c = (a.transpose(1, 2) if transpose else a) @ b.permute(1, 2, 0, 3).flatten(2)
+ c = c.reshape(-1, input.shape[0], *input.shape[2:]).transpose(0, 1)
+ c = c if bias is None else c + bias.unsqueeze(0).unsqueeze(2).unsqueeze(3)
+ return c.contiguous(memory_format=(torch.channels_last if input.stride(1) == 1 else torch.contiguous_format))
+
+ # General case => cuDNN.
+ if transpose:
+ return torch.nn.functional.conv_transpose2d(input=input, weight=weight, bias=bias, output_padding=output_padding, **common_kwargs)
+ return torch.nn.functional.conv2d(input=input, weight=weight, bias=bias, **common_kwargs)
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ input, weight = ctx.saved_tensors
+ input_shape = ctx.input_shape
+ grad_input = None
+ grad_weight = None
+ grad_bias = None
+
+ if ctx.needs_input_grad[0]:
+ p = calc_output_padding(input_shape=input_shape, output_shape=grad_output.shape)
+ op = _conv2d_gradfix(transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs)
+ grad_input = op.apply(grad_output, weight, None)
+ assert grad_input.shape == input_shape
+
+ if ctx.needs_input_grad[1] and not weight_gradients_disabled:
+ grad_weight = Conv2dGradWeight.apply(grad_output, input)
+ assert grad_weight.shape == weight_shape
+
+ if ctx.needs_input_grad[2]:
+ grad_bias = grad_output.sum([0, 2, 3])
+
+ return grad_input, grad_weight, grad_bias
+
+ # Gradient with respect to the weights.
+ class Conv2dGradWeight(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, grad_output, input):
+ ctx.save_for_backward(
+ grad_output if input.requires_grad else _null_tensor,
+ input if grad_output.requires_grad else _null_tensor,
+ )
+ ctx.grad_output_shape = grad_output.shape
+ ctx.input_shape = input.shape
+
+ # Simple 1x1 convolution => cuBLAS (on both Volta and Ampere).
+ if weight_shape[2:] == stride == dilation == (1, 1) and padding == (0, 0):
+ a = grad_output.reshape(grad_output.shape[0], groups, grad_output.shape[1] // groups, -1).permute(1, 2, 0, 3).flatten(2)
+ b = input.reshape(input.shape[0], groups, input.shape[1] // groups, -1).permute(1, 2, 0, 3).flatten(2)
+ c = (b @ a.transpose(1, 2) if transpose else a @ b.transpose(1, 2)).reshape(weight_shape)
+ return c.contiguous(memory_format=(torch.channels_last if input.stride(1) == 1 else torch.contiguous_format))
+
+ # General case => cuDNN.
+ name = 'aten::cudnn_convolution_transpose_backward_weight' if transpose else 'aten::cudnn_convolution_backward_weight'
+ flags = [torch.backends.cudnn.benchmark, torch.backends.cudnn.deterministic, torch.backends.cudnn.allow_tf32]
+ return torch._C._jit_get_operation(name)(weight_shape, grad_output, input, padding, stride, dilation, groups, *flags)
+
+ @staticmethod
+ def backward(ctx, grad2_grad_weight):
+ grad_output, input = ctx.saved_tensors
+ grad_output_shape = ctx.grad_output_shape
+ input_shape = ctx.input_shape
+ grad2_grad_output = None
+ grad2_input = None
+
+ if ctx.needs_input_grad[0]:
+ grad2_grad_output = Conv2d.apply(input, grad2_grad_weight, None)
+ assert grad2_grad_output.shape == grad_output_shape
+
+ if ctx.needs_input_grad[1]:
+ p = calc_output_padding(input_shape=input_shape, output_shape=grad_output_shape)
+ op = _conv2d_gradfix(transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs)
+ grad2_input = op.apply(grad_output, grad2_grad_weight, None)
+ assert grad2_input.shape == input_shape
+
+ return grad2_grad_output, grad2_input
+
+ _conv2d_gradfix_cache[key] = Conv2d
+ return Conv2d
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/ops/conv2d_resample.py b/torch_utils/ops/conv2d_resample.py
new file mode 100644
index 0000000000000000000000000000000000000000..d646cb01ec45be01097e69fe56591e7c5a9a3e66
--- /dev/null
+++ b/torch_utils/ops/conv2d_resample.py
@@ -0,0 +1,145 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""2D convolution with optional up/downsampling."""
+
+import torch
+
+from .. import misc
+from . import conv2d_gradfix
+from . import upfirdn2d
+from .upfirdn2d import _parse_padding
+from .upfirdn2d import _get_filter_size
+
+#----------------------------------------------------------------------------
+
+def _get_weight_shape(w):
+ with misc.suppress_tracer_warnings(): # this value will be treated as a constant
+ shape = [int(sz) for sz in w.shape]
+ misc.assert_shape(w, shape)
+ return shape
+
+#----------------------------------------------------------------------------
+
+def _conv2d_wrapper(x, w, stride=1, padding=0, groups=1, transpose=False, flip_weight=True):
+ """Wrapper for the underlying `conv2d()` and `conv_transpose2d()` implementations.
+ """
+ _out_channels, _in_channels_per_group, kh, kw = _get_weight_shape(w)
+
+ # Flip weight if requested.
+ # Note: conv2d() actually performs correlation (flip_weight=True) not convolution (flip_weight=False).
+ if not flip_weight and (kw > 1 or kh > 1):
+ w = w.flip([2, 3])
+
+ # Execute using conv2d_gradfix.
+ op = conv2d_gradfix.conv_transpose2d if transpose else conv2d_gradfix.conv2d
+ return op(x, w, stride=stride, padding=padding, groups=groups)
+
+#----------------------------------------------------------------------------
+
+@misc.profiled_function
+def conv2d_resample(x, w, f=None, up=1, down=1, padding=0, groups=1, flip_weight=True, flip_filter=False):
+ r"""2D convolution with optional up/downsampling.
+
+ Padding is performed only once at the beginning, not between the operations.
+
+ Args:
+ x: Input tensor of shape
+ `[batch_size, in_channels, in_height, in_width]`.
+ w: Weight tensor of shape
+ `[out_channels, in_channels//groups, kernel_height, kernel_width]`.
+ f: Low-pass filter for up/downsampling. Must be prepared beforehand by
+ calling upfirdn2d.setup_filter(). None = identity (default).
+ up: Integer upsampling factor (default: 1).
+ down: Integer downsampling factor (default: 1).
+ padding: Padding with respect to the upsampled image. Can be a single number
+ or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
+ (default: 0).
+ groups: Split input channels into N groups (default: 1).
+ flip_weight: False = convolution, True = correlation (default: True).
+ flip_filter: False = convolution, True = correlation (default: False).
+
+ Returns:
+ Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
+ """
+ # Validate arguments.
+ assert isinstance(x, torch.Tensor) and (x.ndim == 4)
+ assert isinstance(w, torch.Tensor) and (w.ndim == 4) and (w.dtype == x.dtype)
+ assert f is None or (isinstance(f, torch.Tensor) and f.ndim in [1, 2] and f.dtype == torch.float32)
+ assert isinstance(up, int) and (up >= 1)
+ assert isinstance(down, int) and (down >= 1)
+ assert isinstance(groups, int) and (groups >= 1)
+ out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w)
+ fw, fh = _get_filter_size(f)
+ px0, px1, py0, py1 = _parse_padding(padding)
+
+ # Adjust padding to account for up/downsampling.
+ if up > 1:
+ px0 += (fw + up - 1) // 2
+ px1 += (fw - up) // 2
+ py0 += (fh + up - 1) // 2
+ py1 += (fh - up) // 2
+ if down > 1:
+ px0 += (fw - down + 1) // 2
+ px1 += (fw - down) // 2
+ py0 += (fh - down + 1) // 2
+ py1 += (fh - down) // 2
+
+ # Fast path: 1x1 convolution with downsampling only => downsample first, then convolve.
+ if kw == 1 and kh == 1 and (down > 1 and up == 1):
+ x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, padding=[px0,px1,py0,py1], flip_filter=flip_filter)
+ x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
+ return x
+
+ # Fast path: 1x1 convolution with upsampling only => convolve first, then upsample.
+ if kw == 1 and kh == 1 and (up > 1 and down == 1):
+ x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
+ x = upfirdn2d.upfirdn2d(x=x, f=f, up=up, padding=[px0,px1,py0,py1], gain=up**2, flip_filter=flip_filter)
+ return x
+
+ # Fast path: downsampling only => use strided convolution.
+ if down > 1 and up == 1:
+ x = upfirdn2d.upfirdn2d(x=x, f=f, padding=[px0,px1,py0,py1], flip_filter=flip_filter)
+ x = _conv2d_wrapper(x=x, w=w, stride=down, groups=groups, flip_weight=flip_weight)
+ return x
+
+ # Fast path: upsampling with optional downsampling => use transpose strided convolution.
+ if up > 1:
+ if groups == 1:
+ w = w.transpose(0, 1)
+ else:
+ w = w.reshape(groups, out_channels // groups, in_channels_per_group, kh, kw)
+ w = w.transpose(1, 2)
+ w = w.reshape(groups * in_channels_per_group, out_channels // groups, kh, kw)
+ px0 -= kw - 1
+ px1 -= kw - up
+ py0 -= kh - 1
+ py1 -= kh - up
+ pxt = max(min(-px0, -px1), 0)
+ pyt = max(min(-py0, -py1), 0)
+ x = _conv2d_wrapper(x=x, w=w, stride=up, padding=[pyt,pxt], groups=groups, transpose=True, flip_weight=(not flip_weight))
+ x = upfirdn2d.upfirdn2d(x=x, f=f, padding=[px0+pxt,px1+pxt,py0+pyt,py1+pyt], gain=up**2, flip_filter=flip_filter)
+ if down > 1:
+ x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
+ return x
+
+ # Fast path: no up/downsampling, padding supported by the underlying implementation => use plain conv2d.
+ if up == 1 and down == 1:
+ if px0 == px1 and py0 == py1 and px0 >= 0 and py0 >= 0:
+ return _conv2d_wrapper(x=x, w=w, padding=[py0,px0], groups=groups, flip_weight=flip_weight)
+
+ # Fallback: Generic reference implementation.
+ x = upfirdn2d.upfirdn2d(x=x, f=(f if up > 1 else None), up=up, padding=[px0,px1,py0,py1], gain=up**2, flip_filter=flip_filter)
+ x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
+ if down > 1:
+ x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
+ return x
+
+#----------------------------------------------------------------------------
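+
+# Minimal usage sketch (illustrative only -- the shapes, the [1,3,3,1] filter taps
+# and the expected output size below are assumptions for this example, not part of
+# the library API). Guarded so that importing this module stays side-effect free;
+# run it as `python -m torch_utils.ops.conv2d_resample` because of the relative imports.
+
+if __name__ == '__main__':
+    _x = torch.randn(1, 64, 32, 32)                          # [N, C_in, H, W]
+    _w = torch.randn(128, 64, 3, 3)                          # [C_out, C_in//groups, kh, kw]
+    _f = upfirdn2d.setup_filter([1, 3, 3, 1])                # separable low-pass filter
+    _y = conv2d_resample(x=_x, w=_w, f=_f, up=2, padding=1)  # 2x upsampled 3x3 convolution
+    print(_y.shape)                                          # expected: [1, 128, 64, 64]
+
+#----------------------------------------------------------------------------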
diff --git a/torch_utils/ops/filtered_lrelu.cpp b/torch_utils/ops/filtered_lrelu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ff4149b8b46b54d2f400ae10e44d19f20503ba1f
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu.cpp
@@ -0,0 +1,300 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "filtered_lrelu.h"
+
+//------------------------------------------------------------------------
+
+static std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu(
+ torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b, torch::Tensor si,
+ int up, int down, int px0, int px1, int py0, int py1, int sx, int sy, float gain, float slope, float clamp, bool flip_filters, bool writeSigns)
+{
+ // Set CUDA device.
+ TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+
+ // Validate arguments.
+ TORCH_CHECK(fu.device() == x.device() && fd.device() == x.device() && b.device() == x.device(), "all input tensors must reside on the same device");
+ TORCH_CHECK(fu.dtype() == torch::kFloat && fd.dtype() == torch::kFloat, "fu and fd must be float32");
+ TORCH_CHECK(b.dtype() == x.dtype(), "x and b must have the same dtype");
+ TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat, "x and b must be float16 or float32");
+ TORCH_CHECK(x.dim() == 4, "x must be rank 4");
+ TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX && x.size(3) <= INT_MAX, "x is too large");
+ TORCH_CHECK(x.numel() > 0, "x is empty");
+ TORCH_CHECK((fu.dim() == 1 || fu.dim() == 2) && (fd.dim() == 1 || fd.dim() == 2), "fu and fd must be rank 1 or 2");
+ TORCH_CHECK(fu.size(0) <= INT_MAX && fu.size(-1) <= INT_MAX, "fu is too large");
+ TORCH_CHECK(fd.size(0) <= INT_MAX && fd.size(-1) <= INT_MAX, "fd is too large");
+ TORCH_CHECK(fu.numel() > 0, "fu is empty");
+ TORCH_CHECK(fd.numel() > 0, "fd is empty");
+ TORCH_CHECK(b.dim() == 1 && b.size(0) == x.size(1), "b must be a vector with the same number of channels as x");
+ TORCH_CHECK(up >= 1 && down >= 1, "up and down must be at least 1");
+
+ // Figure out how much shared memory is available on the device.
+ int maxSharedBytes = 0;
+ AT_CUDA_CHECK(cudaDeviceGetAttribute(&maxSharedBytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, x.device().index()));
+ int sharedKB = maxSharedBytes >> 10;
+
+ // Populate enough launch parameters to check if a CUDA kernel exists.
+ filtered_lrelu_kernel_params p;
+ p.up = up;
+ p.down = down;
+ p.fuShape = make_int2((int)fu.size(-1), fu.dim() == 2 ? (int)fu.size(0) : 0); // shape [n, 0] indicates separable filter.
+ p.fdShape = make_int2((int)fd.size(-1), fd.dim() == 2 ? (int)fd.size(0) : 0);
+    filtered_lrelu_kernel_spec test_spec = choose_filtered_lrelu_kernel<float, int32_t, false, false>(p, sharedKB);
+ if (!test_spec.exec)
+ {
+ // No kernel found - return empty tensors and indicate missing kernel with return code of -1.
+ return std::make_tuple(torch::Tensor(), torch::Tensor(), -1);
+ }
+
+ // Input/output element size.
+ int64_t sz = (x.dtype() == torch::kHalf) ? 2 : 4;
+
+ // Input sizes.
+ int64_t xw = (int)x.size(3);
+ int64_t xh = (int)x.size(2);
+ int64_t fut_w = (int)fu.size(-1) - 1;
+ int64_t fut_h = (int)fu.size(0) - 1;
+ int64_t fdt_w = (int)fd.size(-1) - 1;
+ int64_t fdt_h = (int)fd.size(0) - 1;
+
+ // Logical size of upsampled buffer.
+ int64_t cw = xw * up + (px0 + px1) - fut_w;
+ int64_t ch = xh * up + (py0 + py1) - fut_h;
+ TORCH_CHECK(cw > fdt_w && ch > fdt_h, "upsampled buffer must be at least the size of downsampling filter");
+ TORCH_CHECK(cw <= INT_MAX && ch <= INT_MAX, "upsampled buffer is too large");
+
+ // Compute output size and allocate.
+ int64_t yw = (cw - fdt_w + (down - 1)) / down;
+ int64_t yh = (ch - fdt_h + (down - 1)) / down;
+ TORCH_CHECK(yw > 0 && yh > 0, "output must be at least 1x1");
+ TORCH_CHECK(yw <= INT_MAX && yh <= INT_MAX, "output is too large");
+ torch::Tensor y = torch::empty({x.size(0), x.size(1), yh, yw}, x.options(), x.suggest_memory_format());
+
+ // Allocate sign tensor.
+ torch::Tensor so;
+ torch::Tensor s = si;
+ bool readSigns = !!s.numel();
+ int64_t sw_active = 0; // Active width of sign tensor.
+ if (writeSigns)
+ {
+ sw_active = yw * down - (down - 1) + fdt_w; // Active width in elements.
+ int64_t sh = yh * down - (down - 1) + fdt_h; // Height = active height.
+ int64_t sw = (sw_active + 15) & ~15; // Width = active width in elements, rounded up to multiple of 16.
+ TORCH_CHECK(sh <= INT_MAX && (sw >> 2) <= INT_MAX, "signs is too large");
+ s = so = torch::empty({x.size(0), x.size(1), sh, sw >> 2}, x.options().dtype(torch::kUInt8), at::MemoryFormat::Contiguous);
+ }
+ else if (readSigns)
+ sw_active = s.size(3) << 2;
+
+ // Validate sign tensor if in use.
+ if (readSigns || writeSigns)
+ {
+ TORCH_CHECK(s.is_contiguous(), "signs must be contiguous");
+ TORCH_CHECK(s.dtype() == torch::kUInt8, "signs must be uint8");
+ TORCH_CHECK(s.device() == x.device(), "signs must reside on the same device as x");
+ TORCH_CHECK(s.dim() == 4, "signs must be rank 4");
+ TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1), "signs must have same batch & channels as x");
+ TORCH_CHECK(s.size(2) <= INT_MAX && s.size(3) <= INT_MAX, "signs is too large");
+ }
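+
+    // Sign tensor layout (as implied by the checks above and the kernels below):
+    // each element of the upsampled activation stores a 2-bit code (bit 0 = value
+    // was negative and the leaky slope was applied, bit 1 = value was clamped), so
+    // four elements pack into one uint8 and the stored width is sw >> 2 bytes.
+    // Rounding the active width up to a multiple of 16 elements makes each row a
+    // whole number of 4-byte words, which lets the kernels coalesce sign writes.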
+
+ // Populate rest of CUDA kernel parameters.
+ p.x = x.data_ptr();
+ p.y = y.data_ptr();
+ p.b = b.data_ptr();
+ p.s = (readSigns || writeSigns) ? s.data_ptr() : 0;
+ p.fu = fu.data_ptr();
+ p.fd = fd.data_ptr();
+ p.pad0 = make_int2(px0, py0);
+ p.gain = gain;
+ p.slope = slope;
+ p.clamp = clamp;
+ p.flip = (flip_filters) ? 1 : 0;
+ p.xShape = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
+ p.yShape = make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));
+ p.sShape = (readSigns || writeSigns) ? make_int2((int)s.size(3), (int)s.size(2)) : make_int2(0, 0); // Width is in bytes. Contiguous.
+ p.sOfs = make_int2(sx, sy);
+ p.swLimit = (sw_active + 3) >> 2; // Rounded up to bytes.
+
+ // x, y, b strides are in bytes.
+ p.xStride = make_longlong4(sz * x.stride(3), sz * x.stride(2), sz * x.stride(1), sz * x.stride(0));
+ p.yStride = make_longlong4(sz * y.stride(3), sz * y.stride(2), sz * y.stride(1), sz * y.stride(0));
+ p.bStride = sz * b.stride(0);
+
+ // fu, fd strides are in elements.
+ p.fuStride = make_longlong3(fu.stride(-1), fu.dim() == 2 ? fu.stride(0) : 0, 0);
+ p.fdStride = make_longlong3(fd.stride(-1), fd.dim() == 2 ? fd.stride(0) : 0, 0);
+
+ // Determine if indices don't fit in int32. Support negative strides although Torch currently never produces those.
+ bool index64b = false;
+ if (std::abs(p.bStride * x.size(1)) > INT_MAX) index64b = true;
+ if (std::min(x.size(0) * p.xStride.w, 0ll) + std::min(x.size(1) * p.xStride.z, 0ll) + std::min(x.size(2) * p.xStride.y, 0ll) + std::min(x.size(3) * p.xStride.x, 0ll) < -INT_MAX) index64b = true;
+ if (std::max(x.size(0) * p.xStride.w, 0ll) + std::max(x.size(1) * p.xStride.z, 0ll) + std::max(x.size(2) * p.xStride.y, 0ll) + std::max(x.size(3) * p.xStride.x, 0ll) > INT_MAX) index64b = true;
+ if (std::min(y.size(0) * p.yStride.w, 0ll) + std::min(y.size(1) * p.yStride.z, 0ll) + std::min(y.size(2) * p.yStride.y, 0ll) + std::min(y.size(3) * p.yStride.x, 0ll) < -INT_MAX) index64b = true;
+ if (std::max(y.size(0) * p.yStride.w, 0ll) + std::max(y.size(1) * p.yStride.z, 0ll) + std::max(y.size(2) * p.yStride.y, 0ll) + std::max(y.size(3) * p.yStride.x, 0ll) > INT_MAX) index64b = true;
+ if (s.numel() > INT_MAX) index64b = true;
+
+ // Choose CUDA kernel.
+ filtered_lrelu_kernel_spec spec = { 0 };
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "filtered_lrelu_cuda", [&]
+ {
+ if constexpr (sizeof(scalar_t) <= 4) // Exclude doubles. constexpr prevents template instantiation.
+ {
+ // Choose kernel based on index type, datatype and sign read/write modes.
+            if      (!index64b &&  writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, true,  false>(p, sharedKB);
+            else if (!index64b && !writeSigns &&  readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, false, true >(p, sharedKB);
+            else if (!index64b && !writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, false, false>(p, sharedKB);
+            else if ( index64b &&  writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, true,  false>(p, sharedKB);
+            else if ( index64b && !writeSigns &&  readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, false, true >(p, sharedKB);
+            else if ( index64b && !writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, false, false>(p, sharedKB);
+ }
+ });
+    TORCH_CHECK(spec.exec, "internal error - CUDA kernel not found"); // This should not happen because we tested earlier that the kernel exists.
+
+ // Launch CUDA kernel.
+ void* args[] = {&p};
+ int bx = spec.numWarps * 32;
+ int gx = (p.yShape.x - 1) / spec.tileOut.x + 1;
+ int gy = (p.yShape.y - 1) / spec.tileOut.y + 1;
+ int gz = p.yShape.z * p.yShape.w;
+
+ // Repeat multiple horizontal tiles in a CTA?
+ if (spec.xrep)
+ {
+ p.tilesXrep = spec.xrep;
+ p.tilesXdim = gx;
+
+ gx = (gx + p.tilesXrep - 1) / p.tilesXrep;
+ std::swap(gx, gy);
+ }
+ else
+ {
+ p.tilesXrep = 0;
+ p.tilesXdim = 0;
+ }
+
+ // Launch filter setup kernel.
+ AT_CUDA_CHECK(cudaLaunchKernel(spec.setup, 1, 1024, args, 0, at::cuda::getCurrentCUDAStream()));
+
+ // Copy kernels to constant memory.
+    if      ( writeSigns && !readSigns) AT_CUDA_CHECK((copy_filters<true,  false>(at::cuda::getCurrentCUDAStream())));
+    else if (!writeSigns &&  readSigns) AT_CUDA_CHECK((copy_filters<false, true >(at::cuda::getCurrentCUDAStream())));
+    else if (!writeSigns && !readSigns) AT_CUDA_CHECK((copy_filters<false, false>(at::cuda::getCurrentCUDAStream())));
+
+ // Set cache and shared memory configurations for main kernel.
+ AT_CUDA_CHECK(cudaFuncSetCacheConfig(spec.exec, cudaFuncCachePreferShared));
+ if (spec.dynamicSharedKB) // Need dynamically allocated shared memory?
+ AT_CUDA_CHECK(cudaFuncSetAttribute(spec.exec, cudaFuncAttributeMaxDynamicSharedMemorySize, spec.dynamicSharedKB << 10));
+ AT_CUDA_CHECK(cudaFuncSetSharedMemConfig(spec.exec, cudaSharedMemBankSizeFourByte));
+
+ // Launch main kernel.
+ const int maxSubGz = 65535; // CUDA maximum for block z dimension.
+ for (int zofs=0; zofs < gz; zofs += maxSubGz) // Do multiple launches if gz is too big.
+ {
+ p.blockZofs = zofs;
+ int subGz = std::min(maxSubGz, gz - zofs);
+ AT_CUDA_CHECK(cudaLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args, spec.dynamicSharedKB << 10, at::cuda::getCurrentCUDAStream()));
+ }
+
+ // Done.
+ return std::make_tuple(y, so, 0);
+}
+
+//------------------------------------------------------------------------
+
+static torch::Tensor filtered_lrelu_act(torch::Tensor x, torch::Tensor si, int sx, int sy, float gain, float slope, float clamp, bool writeSigns)
+{
+ // Set CUDA device.
+ TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+
+ // Validate arguments.
+ TORCH_CHECK(x.dim() == 4, "x must be rank 4");
+ TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX && x.size(3) <= INT_MAX, "x is too large");
+ TORCH_CHECK(x.numel() > 0, "x is empty");
+ TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat || x.dtype() == torch::kDouble, "x must be float16, float32 or float64");
+
+ // Output signs if we don't have sign input.
+ torch::Tensor so;
+ torch::Tensor s = si;
+ bool readSigns = !!s.numel();
+ if (writeSigns)
+ {
+ int64_t sw = x.size(3);
+ sw = (sw + 15) & ~15; // Round to a multiple of 16 for coalescing.
+ s = so = torch::empty({x.size(0), x.size(1), x.size(2), sw >> 2}, x.options().dtype(torch::kUInt8), at::MemoryFormat::Contiguous);
+ }
+
+ // Validate sign tensor if in use.
+ if (readSigns || writeSigns)
+ {
+ TORCH_CHECK(s.is_contiguous(), "signs must be contiguous");
+ TORCH_CHECK(s.dtype() == torch::kUInt8, "signs must be uint8");
+ TORCH_CHECK(s.device() == x.device(), "signs must reside on the same device as x");
+ TORCH_CHECK(s.dim() == 4, "signs must be rank 4");
+ TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1), "signs must have same batch & channels as x");
+ TORCH_CHECK(s.size(2) <= INT_MAX && (s.size(3) << 2) <= INT_MAX, "signs tensor is too large");
+ }
+
+ // Initialize CUDA kernel parameters.
+ filtered_lrelu_act_kernel_params p;
+ p.x = x.data_ptr();
+ p.s = (readSigns || writeSigns) ? s.data_ptr() : 0;
+ p.gain = gain;
+ p.slope = slope;
+ p.clamp = clamp;
+ p.xShape = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
+ p.xStride = make_longlong4(x.stride(3), x.stride(2), x.stride(1), x.stride(0));
+ p.sShape = (readSigns || writeSigns) ? make_int2((int)s.size(3) << 2, (int)s.size(2)) : make_int2(0, 0); // Width is in elements. Contiguous.
+ p.sOfs = make_int2(sx, sy);
+
+ // Choose CUDA kernel.
+ void* func = 0;
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "filtered_lrelu_act_cuda", [&]
+ {
+ if (writeSigns)
+            func = choose_filtered_lrelu_act_kernel<scalar_t, true, false>();
+        else if (readSigns)
+            func = choose_filtered_lrelu_act_kernel<scalar_t, false, true>();
+        else
+            func = choose_filtered_lrelu_act_kernel<scalar_t, false, false>();
+ });
+ TORCH_CHECK(func, "internal error - CUDA kernel not found");
+
+ // Launch CUDA kernel.
+ void* args[] = {&p};
+ int bx = 128; // 4 warps per block.
+
+ // Logical size of launch = writeSigns ? p.s : p.x
+ uint32_t gx = writeSigns ? p.sShape.x : p.xShape.x;
+ uint32_t gy = writeSigns ? p.sShape.y : p.xShape.y;
+ uint32_t gz = p.xShape.z * p.xShape.w; // Same as in p.sShape if signs are in use.
+ gx = (gx - 1) / bx + 1;
+
+ // Make sure grid y and z dimensions are within CUDA launch limits. Kernel loops internally to do the rest.
+ const uint32_t gmax = 65535;
+ gy = std::min(gy, gmax);
+ gz = std::min(gz, gmax);
+
+ // Launch.
+ AT_CUDA_CHECK(cudaLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0, at::cuda::getCurrentCUDAStream()));
+ return so;
+}
+
+//------------------------------------------------------------------------
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+ m.def("filtered_lrelu", &filtered_lrelu); // The whole thing.
+ m.def("filtered_lrelu_act_", &filtered_lrelu_act); // Activation and sign tensor handling only. Modifies data tensor in-place.
+}
+
+//------------------------------------------------------------------------
diff --git a/torch_utils/ops/filtered_lrelu.cu b/torch_utils/ops/filtered_lrelu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8e6f47f873d42f7181a0faf64779377e70be3012
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu.cu
@@ -0,0 +1,1284 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <c10/util/Half.h>
+#include "filtered_lrelu.h"
+#include <cstdint>
+
+//------------------------------------------------------------------------
+// Helpers.
+
+enum // Filter modes.
+{
+ MODE_SUSD = 0, // Separable upsampling, separable downsampling.
+ MODE_FUSD = 1, // Full upsampling, separable downsampling.
+ MODE_SUFD = 2, // Separable upsampling, full downsampling.
+ MODE_FUFD = 3, // Full upsampling, full downsampling.
+};
+
+template <class T> struct InternalType;
+template <> struct InternalType<double>
+{
+ typedef double scalar_t; typedef double2 vec2_t; typedef double4 vec4_t;
+ __device__ __forceinline__ static vec2_t zero_vec2(void) { return make_double2(0, 0); }
+ __device__ __forceinline__ static vec4_t zero_vec4(void) { return make_double4(0, 0, 0, 0); }
+ __device__ __forceinline__ static double clamp(double x, double c) { return fmin(fmax(x, -c), c); }
+};
+template <> struct InternalType<float>
+{
+ typedef float scalar_t; typedef float2 vec2_t; typedef float4 vec4_t;
+ __device__ __forceinline__ static vec2_t zero_vec2(void) { return make_float2(0, 0); }
+ __device__ __forceinline__ static vec4_t zero_vec4(void) { return make_float4(0, 0, 0, 0); }
+ __device__ __forceinline__ static float clamp(float x, float c) { return fminf(fmaxf(x, -c), c); }
+};
+template <> struct InternalType<c10::Half>
+{
+ typedef float scalar_t; typedef float2 vec2_t; typedef float4 vec4_t;
+ __device__ __forceinline__ static vec2_t zero_vec2(void) { return make_float2(0, 0); }
+ __device__ __forceinline__ static vec4_t zero_vec4(void) { return make_float4(0, 0, 0, 0); }
+ __device__ __forceinline__ static float clamp(float x, float c) { return fminf(fmaxf(x, -c), c); }
+};
+
+#define MIN(A, B) ((A) < (B) ? (A) : (B))
+#define MAX(A, B) ((A) > (B) ? (A) : (B))
+#define CEIL_DIV(A, B) (((B)==1) ? (A) : \
+ ((B)==2) ? ((int)((A)+1) >> 1) : \
+ ((B)==4) ? ((int)((A)+3) >> 2) : \
+ (((A) + ((A) > 0 ? (B) - 1 : 0)) / (B)))
+
+// This works only up to blocks of size 256 x 256 and for all N that are powers of two.
+template <int N> __device__ __forceinline__ void fast_div_mod(int& x, int& y, unsigned int i)
+{
+ if ((N & (N-1)) && N <= 256)
+ y = (i * ((1<<24)/N + 1)) >> 24; // Assumes N <= 256, i < N*256.
+ else
+ y = i/N;
+
+ x = i - y*N;
+}
+
+// Type cast stride before reading it.
+template <class T> __device__ __forceinline__ T get_stride(const int64_t& x)
+{
+    return *reinterpret_cast<const T*>(&x);
+}
+
+//------------------------------------------------------------------------
+// Filters, setup kernel, copying function.
+
+#define MAX_FILTER_SIZE 32
+
+// Combined up/down filter buffers so that transfer can be done with one copy.
+__device__ float g_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE]; // Filters in global memory, written by setup kernel.
+__device__ __constant__ float c_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE]; // Filters in constant memory, read by main kernel.
+
+// Accessors to combined buffers to index up/down filters individually.
+#define c_fu (c_fbuf)
+#define c_fd (c_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)
+#define g_fu (g_fbuf)
+#define g_fd (g_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)
+
+// Set up filters into global memory buffer.
+static __global__ void setup_filters_kernel(filtered_lrelu_kernel_params p)
+{
+ for (int idx = threadIdx.x; idx < MAX_FILTER_SIZE * MAX_FILTER_SIZE; idx += blockDim.x)
+ {
+ int x, y;
+        fast_div_mod<MAX_FILTER_SIZE>(x, y, idx);
+
+ int fu_x = p.flip ? x : (p.fuShape.x - 1 - x);
+ int fu_y = p.flip ? y : (p.fuShape.y - 1 - y);
+ if (p.fuShape.y > 0)
+ g_fu[idx] = (x >= p.fuShape.x || y >= p.fuShape.y) ? 0.0f : p.fu[fu_x * p.fuStride.x + fu_y * p.fuStride.y];
+ else
+ g_fu[idx] = (x >= p.fuShape.x || y > 0) ? 0.0f : p.fu[fu_x * p.fuStride.x];
+
+ int fd_x = p.flip ? x : (p.fdShape.x - 1 - x);
+ int fd_y = p.flip ? y : (p.fdShape.y - 1 - y);
+ if (p.fdShape.y > 0)
+ g_fd[idx] = (x >= p.fdShape.x || y >= p.fdShape.y) ? 0.0f : p.fd[fd_x * p.fdStride.x + fd_y * p.fdStride.y];
+ else
+ g_fd[idx] = (x >= p.fdShape.x || y > 0) ? 0.0f : p.fd[fd_x * p.fdStride.x];
+ }
+}
+
+// Host function to copy filters written by setup kernel into constant buffer for main kernel.
+template <bool, bool> static cudaError_t copy_filters(cudaStream_t stream)
+{
+ void* src = 0;
+ cudaError_t err = cudaGetSymbolAddress(&src, g_fbuf);
+ if (err) return err;
+ return cudaMemcpyToSymbolAsync(c_fbuf, src, 2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream);
+}
+
+//------------------------------------------------------------------------
+// Coordinate spaces:
+// - Relative to input tensor: inX, inY, tileInX, tileInY
+// - Relative to input tile: relInX, relInY, tileInW, tileInH
+// - Relative to upsampled tile: relUpX, relUpY, tileUpW, tileUpH
+// - Relative to output tile: relOutX, relOutY, tileOutW, tileOutH
+// - Relative to output tensor: outX, outY, tileOutX, tileOutY
+//
+// Relationships between coordinate spaces:
+// - inX = tileInX + relInX
+// - inY = tileInY + relInY
+// - relUpX = relInX * up + phaseInX
+// - relUpY = relInY * up + phaseInY
+// - relUpX = relOutX * down
+// - relUpY = relOutY * down
+// - outX = tileOutX + relOutX
+// - outY = tileOutY + relOutY
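+//
+// Worked example (illustrative numbers only): with up = 2, down = 2 and
+// phaseInX = 1, input column relInX = 3 lands at upsampled column
+// relUpX = 3 * 2 + 1 = 7, while output column relOutX = 4 starts reading the
+// upsampled tile at relUpX = 4 * 2 = 8.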
+
+extern __shared__ char s_buf_raw[]; // When sharedKB <= 48, allocate shared memory statically inside the kernel, otherwise use the externally allocated shared memory buffer.
+
+template <class T, class index_t, int sharedKB, bool signWrite, bool signRead, int filterMode, int up, int fuSize, int down, int fdSize, int tileOutW, int tileOutH, int threadsPerBlock, bool enableXrep, bool enableWriteSkip>
+static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p)
+{
+ // Check that we don't try to support non-existing filter modes.
+ static_assert(up == 1 || up == 2 || up == 4, "only up=1, up=2, up=4 scales supported");
+ static_assert(down == 1 || down == 2 || down == 4, "only down=1, down=2, down=4 scales supported");
+ static_assert(fuSize >= up, "upsampling filter size must be at least upsampling factor");
+ static_assert(fdSize >= down, "downsampling filter size must be at least downsampling factor");
+ static_assert(fuSize % up == 0, "upsampling filter size must be divisible with upsampling factor");
+ static_assert(fdSize % down == 0, "downsampling filter size must be divisible with downsampling factor");
+ static_assert(fuSize <= MAX_FILTER_SIZE && fdSize <= MAX_FILTER_SIZE, "filter size greater than MAX_FILTER_SIZE");
+ static_assert(up != 1 || (fuSize == 1 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)), "up=1 supported only for 1x1 full filters");
+ static_assert(down != 1 || (fdSize == 1 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)), "down=1 supported only for 1x1 full filters");
+ static_assert(!(up == 4 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)), "full filters not supported for up=4");
+ static_assert(!(down == 4 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)), "full filters not supported for down=4");
+
+ // Static definitions.
+    typedef typename InternalType<T>::scalar_t scalar_t;
+    typedef typename InternalType<T>::vec2_t vec2_t;
+    typedef typename InternalType<T>::vec4_t vec4_t;
+ const int tileUpW = (tileOutW * down + (fdSize - 1) - (down - 1) + 3) & ~3; // Upsampled tile width, rounded up to multiple of 4.
+ const int tileUpH = tileOutH * down + (fdSize - 1) - (down - 1); // Upsampled tile height.
+ const int tileInW = CEIL_DIV(tileUpW + (fuSize - 1), up); // Input tile width.
+ const int tileInH = CEIL_DIV(tileUpH + (fuSize - 1), up); // Input tile height.
+ const int tileUpH_up = CEIL_DIV(tileUpH, up) * up; // Upsampled tile height rounded up to a multiple of up.
+ const int tileInH_up = CEIL_DIV(tileUpH_up + (fuSize - 1), up); // For allocations only, to avoid shared memory read overruns with up=2 and up=4.
+
+ // Merge 1x1 downsampling into last upsampling step for upf1 and ups2.
+ const bool downInline = (down == 1) && ((up == 1 && filterMode == MODE_FUFD) || (up == 2 && filterMode == MODE_SUFD));
+
+ // Sizes of logical buffers.
+ const int szIn = tileInH_up * tileInW;
+ const int szUpX = tileInH_up * tileUpW;
+ const int szUpXY = downInline ? 0 : (tileUpH * tileUpW);
+ const int szDownX = tileUpH * tileOutW;
+
+ // Sizes for shared memory arrays.
+ const int s_buf0_size_base =
+ (filterMode == MODE_SUSD) ? MAX(szIn, szUpXY) :
+ (filterMode == MODE_FUSD) ? MAX(szIn, szDownX) :
+ (filterMode == MODE_SUFD) ? MAX(szIn, szUpXY) :
+ (filterMode == MODE_FUFD) ? szIn :
+ -1;
+ const int s_buf1_size_base =
+ (filterMode == MODE_SUSD) ? MAX(szUpX, szDownX) :
+ (filterMode == MODE_FUSD) ? szUpXY :
+ (filterMode == MODE_SUFD) ? szUpX :
+ (filterMode == MODE_FUFD) ? szUpXY :
+ -1;
+
+ // Ensure U128 alignment.
+ const int s_buf0_size = (s_buf0_size_base + 3) & ~3;
+ const int s_buf1_size = (s_buf1_size_base + 3) & ~3;
+
+ // Check at compile time that we don't use too much shared memory.
+ static_assert((s_buf0_size + s_buf1_size) * sizeof(scalar_t) <= (sharedKB << 10), "shared memory overflow");
+
+ // Declare shared memory arrays.
+ scalar_t* s_buf0;
+ scalar_t* s_buf1;
+ if (sharedKB <= 48)
+ {
+ // Allocate shared memory arrays here.
+ __shared__ scalar_t s_buf0_st[(sharedKB > 48) ? (1<<24) : (s_buf0_size + s_buf1_size)]; // Prevent launching if this isn't optimized away when unused.
+ s_buf0 = s_buf0_st;
+ s_buf1 = s_buf0 + s_buf0_size;
+ }
+ else
+ {
+ // Use the dynamically allocated shared memory array.
+ s_buf0 = (scalar_t*)s_buf_raw;
+ s_buf1 = s_buf0 + s_buf0_size;
+ }
+
+ // Pointers to the buffers.
+ scalar_t* s_tileIn; // Input tile: [relInX * tileInH + relInY]
+ scalar_t* s_tileUpX; // After horizontal upsampling: [relInY * tileUpW + relUpX]
+ scalar_t* s_tileUpXY; // After upsampling: [relUpY * tileUpW + relUpX]
+ scalar_t* s_tileDownX; // After horizontal downsampling: [relUpY * tileOutW + relOutX]
+ if (filterMode == MODE_SUSD)
+ {
+ s_tileIn = s_buf0;
+ s_tileUpX = s_buf1;
+ s_tileUpXY = s_buf0;
+ s_tileDownX = s_buf1;
+ }
+ else if (filterMode == MODE_FUSD)
+ {
+ s_tileIn = s_buf0;
+ s_tileUpXY = s_buf1;
+ s_tileDownX = s_buf0;
+ }
+ else if (filterMode == MODE_SUFD)
+ {
+ s_tileIn = s_buf0;
+ s_tileUpX = s_buf1;
+ s_tileUpXY = s_buf0;
+ }
+ else if (filterMode == MODE_FUFD)
+ {
+ s_tileIn = s_buf0;
+ s_tileUpXY = s_buf1;
+ }
+
+ // Allow large grids in z direction via per-launch offset.
+ int channelIdx = blockIdx.z + p.blockZofs;
+ int batchIdx = channelIdx / p.yShape.z;
+ channelIdx -= batchIdx * p.yShape.z;
+
+ // Offset to output feature map. In bytes.
+    index_t mapOfsOut = channelIdx * get_stride<index_t>(p.yStride.z) + batchIdx * get_stride<index_t>(p.yStride.w);
+
+ // Sign shift amount.
+ uint32_t signXo = ((threadIdx.x + p.sOfs.x) << 1) & 6;
+
+ // Inner tile loop.
+ #pragma unroll 1
+ for (int tileIdx = 0; !enableXrep || (tileIdx < MIN(p.tilesXrep, p.tilesXdim - p.tilesXrep * blockIdx.y)); tileIdx++)
+ {
+ // Locate output tile.
+ int tileX = enableXrep ? blockIdx.y * p.tilesXrep + tileIdx : blockIdx.x;
+ int tileOutX = tileX * tileOutW;
+ int tileOutY = (enableXrep ? blockIdx.x : blockIdx.y) * tileOutH;
+
+ // Locate input tile.
+ int tmpX = tileOutX * down - p.pad0.x;
+ int tmpY = tileOutY * down - p.pad0.y;
+ int tileInX = CEIL_DIV(tmpX, up);
+ int tileInY = CEIL_DIV(tmpY, up);
+ const int phaseInX = tileInX * up - tmpX;
+ const int phaseInY = tileInY * up - tmpY;
+
+ // Extra sync if input and output buffers are the same and we are not on first tile.
+ if (enableXrep && tileIdx > 0 && (filterMode == MODE_FUSD || (filterMode == MODE_SUFD && !downInline) || (filterMode == MODE_FUFD && downInline)))
+ __syncthreads();
+
+ // Load input tile & apply bias. Unrolled.
+        scalar_t b = (scalar_t)*(const T*)((const char*)p.b + (channelIdx * get_stride<index_t>(p.bStride)));
+        index_t mapOfsIn = channelIdx * get_stride<index_t>(p.xStride.z) + batchIdx * get_stride<index_t>(p.xStride.w);
+ int idx = threadIdx.x;
+ const int loopCountIN = CEIL_DIV(tileInW * tileInH, threadsPerBlock);
+ #pragma unroll
+ for (int loop = 0; loop < loopCountIN; loop++)
+ {
+ int relInX, relInY;
+            fast_div_mod<tileInW>(relInX, relInY, idx);
+ int inX = tileInX + relInX;
+ int inY = tileInY + relInY;
+ scalar_t v = 0;
+
+ if ((uint32_t)inX < p.xShape.x && (uint32_t)inY < p.xShape.y)
+                v = (scalar_t)*((const T*)((const char*)p.x + (inX * get_stride<index_t>(p.xStride.x) + inY * get_stride<index_t>(p.xStride.y) + mapOfsIn))) + b;
+
+ bool skip = (loop == loopCountIN-1) && (idx >= tileInW * tileInH);
+ if (!skip)
+ s_tileIn[idx] = v;
+
+ idx += threadsPerBlock;
+ }
+
+ if (filterMode == MODE_SUSD || filterMode == MODE_SUFD) // Separable upsampling filter.
+ {
+ // Horizontal upsampling.
+ __syncthreads();
+ if (up == 4)
+ {
+ for (int idx = threadIdx.x*up; idx < tileUpW * tileInH; idx += blockDim.x*up)
+ {
+ int relUpX0, relInY;
+                    fast_div_mod<tileUpW>(relUpX0, relInY, idx);
+ int relInX0 = relUpX0 / up;
+ int src0 = relInX0 + tileInW * relInY;
+ int dst = relInY * tileUpW + relUpX0;
+                    vec4_t v = InternalType<T>::zero_vec4();
+ scalar_t a = s_tileIn[src0];
+ if (phaseInX == 0)
+ {
+ #pragma unroll
+ for (int step = 0; step < fuSize / up; step++)
+ {
+ v.x += a * (scalar_t)c_fu[step * up + 0];
+ a = s_tileIn[src0 + step + 1];
+ v.y += a * (scalar_t)c_fu[step * up + 3];
+ v.z += a * (scalar_t)c_fu[step * up + 2];
+ v.w += a * (scalar_t)c_fu[step * up + 1];
+ }
+ }
+ else if (phaseInX == 1)
+ {
+ #pragma unroll
+ for (int step = 0; step < fuSize / up; step++)
+ {
+ v.x += a * (scalar_t)c_fu[step * up + 1];
+ v.y += a * (scalar_t)c_fu[step * up + 0];
+ a = s_tileIn[src0 + step + 1];
+ v.z += a * (scalar_t)c_fu[step * up + 3];
+ v.w += a * (scalar_t)c_fu[step * up + 2];
+ }
+ }
+ else if (phaseInX == 2)
+ {
+ #pragma unroll
+ for (int step = 0; step < fuSize / up; step++)
+ {
+ v.x += a * (scalar_t)c_fu[step * up + 2];
+ v.y += a * (scalar_t)c_fu[step * up + 1];
+ v.z += a * (scalar_t)c_fu[step * up + 0];
+ a = s_tileIn[src0 + step + 1];
+ v.w += a * (scalar_t)c_fu[step * up + 3];
+ }
+ }
+ else // (phaseInX == 3)
+ {
+ #pragma unroll
+ for (int step = 0; step < fuSize / up; step++)
+ {
+ v.x += a * (scalar_t)c_fu[step * up + 3];
+ v.y += a * (scalar_t)c_fu[step * up + 2];
+ v.z += a * (scalar_t)c_fu[step * up + 1];
+ v.w += a * (scalar_t)c_fu[step * up + 0];
+ a = s_tileIn[src0 + step + 1];
+ }
+ }
+ s_tileUpX[dst+0] = v.x;
+ s_tileUpX[dst+1] = v.y;
+ s_tileUpX[dst+2] = v.z;
+ s_tileUpX[dst+3] = v.w;
+ }
+ }
+ else if (up == 2)
+ {
+ bool p0 = (phaseInX == 0);
+ for (int idx = threadIdx.x*up; idx < tileUpW * tileInH; idx += blockDim.x*up)
+ {
+ int relUpX0, relInY;
+                    fast_div_mod<tileUpW>(relUpX0, relInY, idx);
+ int relInX0 = relUpX0 / up;
+ int src0 = relInX0 + tileInW * relInY;
+ int dst = relInY * tileUpW + relUpX0;
+                    vec2_t v = InternalType<T>::zero_vec2();
+ scalar_t a = s_tileIn[src0];
+ if (p0) // (phaseInX == 0)
+ {
+ #pragma unroll
+ for (int step = 0; step < fuSize / up; step++)
+ {
+ v.x += a * (scalar_t)c_fu[step * up + 0];
+ a = s_tileIn[src0 + step + 1];
+ v.y += a * (scalar_t)c_fu[step * up + 1];
+ }
+ }
+ else // (phaseInX == 1)
+ {
+ #pragma unroll
+ for (int step = 0; step < fuSize / up; step++)
+ {
+ v.x += a * (scalar_t)c_fu[step * up + 1];
+ v.y += a * (scalar_t)c_fu[step * up + 0];
+ a = s_tileIn[src0 + step + 1];
+ }
+ }
+ s_tileUpX[dst+0] = v.x;
+ s_tileUpX[dst+1] = v.y;
+ }
+ }
+
+ // Vertical upsampling & nonlinearity.
+
+ __syncthreads();
+ int groupMask = 15 << ((threadIdx.x & 31) & ~3);
+ int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH : 0; // Skip already written signs.
+ int sShapeMaxY = MIN(p.sShape.y, tileOutY * down + tileUpH); // Avoid out-of-tile sign writes.
+ if (up == 4)
+ {
+ minY -= 3; // Adjust according to block height.
+ for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up; idx += blockDim.x)
+ {
+ int relUpX, relInY0;
+                    fast_div_mod<tileUpW>(relUpX, relInY0, idx);
+ int relUpY0 = relInY0 * up;
+ int src0 = relInY0 * tileUpW + relUpX;
+ int dst = relUpY0 * tileUpW + relUpX;
+                    vec4_t v = InternalType<T>::zero_vec4();
+
+ scalar_t a = s_tileUpX[src0];
+ if (phaseInY == 0)
+ {
+ #pragma unroll
+ for (int step = 0; step < fuSize / up; step++)
+ {
+ v.x += a * (scalar_t)c_fu[step * up + 0];
+ a = s_tileUpX[src0 + (step + 1) * tileUpW];
+ v.y += a * (scalar_t)c_fu[step * up + 3];
+ v.z += a * (scalar_t)c_fu[step * up + 2];
+ v.w += a * (scalar_t)c_fu[step * up + 1];
+ }
+ }
+ else if (phaseInY == 1)
+ {
+ #pragma unroll
+ for (int step = 0; step < fuSize / up; step++)
+ {
+ v.x += a * (scalar_t)c_fu[step * up + 1];
+ v.y += a * (scalar_t)c_fu[step * up + 0];
+ a = s_tileUpX[src0 + (step + 1) * tileUpW];
+ v.z += a * (scalar_t)c_fu[step * up + 3];
+ v.w += a * (scalar_t)c_fu[step * up + 2];
+ }
+ }
+ else if (phaseInY == 2)
+ {
+ #pragma unroll
+ for (int step = 0; step < fuSize / up; step++)
+ {
+ v.x += a * (scalar_t)c_fu[step * up + 2];
+ v.y += a * (scalar_t)c_fu[step * up + 1];
+ v.z += a * (scalar_t)c_fu[step * up + 0];
+ a = s_tileUpX[src0 + (step + 1) * tileUpW];
+ v.w += a * (scalar_t)c_fu[step * up + 3];
+ }
+ }
+ else // (phaseInY == 3)
+ {
+ #pragma unroll
+ for (int step = 0; step < fuSize / up; step++)
+ {
+ v.x += a * (scalar_t)c_fu[step * up + 3];
+ v.y += a * (scalar_t)c_fu[step * up + 2];
+ v.z += a * (scalar_t)c_fu[step * up + 1];
+ v.w += a * (scalar_t)c_fu[step * up + 0];
+ a = s_tileUpX[src0 + (step + 1) * tileUpW];
+ }
+ }
+
+ int x = tileOutX * down + relUpX;
+ int y = tileOutY * down + relUpY0;
+ int signX = x + p.sOfs.x;
+ int signY = y + p.sOfs.y;
+ int signZ = blockIdx.z + p.blockZofs;
+ int signXb = signX >> 2;
+ index_t si0 = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
+ index_t si1 = si0 + p.sShape.x;
+ index_t si2 = si0 + p.sShape.x * 2;
+ index_t si3 = si0 + p.sShape.x * 3;
+
+ v.x *= (scalar_t)((float)up * (float)up * p.gain);
+ v.y *= (scalar_t)((float)up * (float)up * p.gain);
+ v.z *= (scalar_t)((float)up * (float)up * p.gain);
+ v.w *= (scalar_t)((float)up * (float)up * p.gain);
+
+ if (signWrite)
+ {
+ if (!enableWriteSkip)
+ {
+ // Determine and write signs.
+ int sx = __float_as_uint(v.x) >> 31 << 0;
+ int sy = __float_as_uint(v.y) >> 31 << 8;
+ int sz = __float_as_uint(v.z) >> 31 << 16;
+ int sw = __float_as_uint(v.w) >> 31 << 24;
+ if (sx) v.x *= p.slope;
+ if (sy) v.y *= p.slope;
+ if (sz) v.z *= p.slope;
+ if (sw) v.w *= p.slope;
+                            if (fabsf(v.x) > p.clamp) { sx = 2 << 0;  v.x = InternalType<T>::clamp(v.x, p.clamp); }
+                            if (fabsf(v.y) > p.clamp) { sy = 2 << 8;  v.y = InternalType<T>::clamp(v.y, p.clamp); }
+                            if (fabsf(v.z) > p.clamp) { sz = 2 << 16; v.z = InternalType<T>::clamp(v.z, p.clamp); }
+                            if (fabsf(v.w) > p.clamp) { sw = 2 << 24; v.w = InternalType<T>::clamp(v.w, p.clamp); }
+
+ if ((uint32_t)signXb < p.swLimit && signY >= minY)
+ {
+ // Combine signs.
+ uint32_t s = sx + sy + sw + sz;
+ s <<= (signX & 3) << 1;
+ s |= __shfl_xor_sync(groupMask, s, 1);
+ s |= __shfl_xor_sync(groupMask, s, 2);
+
+ // Write signs.
+ if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >> 0); }
+ if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >> 8); }
+ if ((uint32_t)(signY + 2) < sShapeMaxY) { p.s[si2] = (unsigned char)(s >> 16); }
+ if ((uint32_t)(signY + 3) < sShapeMaxY) { p.s[si3] = (unsigned char)(s >> 24); }
+ }
+ }
+ else
+ {
+ // Determine and write signs.
+ if ((uint32_t)signXb < p.swLimit && signY >= minY)
+ {
+ int sx = __float_as_uint(v.x) >> 31 << 0;
+ int sy = __float_as_uint(v.y) >> 31 << 8;
+ int sz = __float_as_uint(v.z) >> 31 << 16;
+ int sw = __float_as_uint(v.w) >> 31 << 24;
+ if (sx) v.x *= p.slope;
+ if (sy) v.y *= p.slope;
+ if (sz) v.z *= p.slope;
+ if (sw) v.w *= p.slope;
+                                if (fabsf(v.x) > p.clamp) { sx = 2 << 0;  v.x = InternalType<T>::clamp(v.x, p.clamp); }
+                                if (fabsf(v.y) > p.clamp) { sy = 2 << 8;  v.y = InternalType<T>::clamp(v.y, p.clamp); }
+                                if (fabsf(v.z) > p.clamp) { sz = 2 << 16; v.z = InternalType<T>::clamp(v.z, p.clamp); }
+                                if (fabsf(v.w) > p.clamp) { sw = 2 << 24; v.w = InternalType<T>::clamp(v.w, p.clamp); }
+
+ // Combine signs.
+ uint32_t s = sx + sy + sw + sz;
+ s <<= (signX & 3) << 1;
+ s |= __shfl_xor_sync(groupMask, s, 1);
+ s |= __shfl_xor_sync(groupMask, s, 2);
+
+ // Write signs.
+ if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >> 0); }
+ if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >> 8); }
+ if ((uint32_t)(signY + 2) < sShapeMaxY) { p.s[si2] = (unsigned char)(s >> 16); }
+ if ((uint32_t)(signY + 3) < sShapeMaxY) { p.s[si3] = (unsigned char)(s >> 24); }
+ }
+ else
+ {
+ // Just compute the values.
+                                if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
+                                if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
+                                if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
+                                if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
+ }
+ }
+ }
+ else if (signRead) // Read signs and apply.
+ {
+ if ((uint32_t)signXb < p.swLimit)
+ {
+ int ss = (signX & 3) << 1;
+ if ((uint32_t)(signY + 0) < p.sShape.y) { int s = p.s[si0] >> ss; if (s & 1) v.x *= p.slope; if (s & 2) v.x = 0.f; }
+ if ((uint32_t)(signY + 1) < p.sShape.y) { int s = p.s[si1] >> ss; if (s & 1) v.y *= p.slope; if (s & 2) v.y = 0.f; }
+ if ((uint32_t)(signY + 2) < p.sShape.y) { int s = p.s[si2] >> ss; if (s & 1) v.z *= p.slope; if (s & 2) v.z = 0.f; }
+ if ((uint32_t)(signY + 3) < p.sShape.y) { int s = p.s[si3] >> ss; if (s & 1) v.w *= p.slope; if (s & 2) v.w = 0.f; }
+ }
+ }
+ else // Forward pass with no sign write.
+ {
+                        if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
+                        if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
+                        if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
+                        if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
+ }
+
+ s_tileUpXY[dst + 0 * tileUpW] = v.x;
+ if (relUpY0 + 1 < tileUpH) s_tileUpXY[dst + 1 * tileUpW] = v.y;
+ if (relUpY0 + 2 < tileUpH) s_tileUpXY[dst + 2 * tileUpW] = v.z;
+ if (relUpY0 + 3 < tileUpH) s_tileUpXY[dst + 3 * tileUpW] = v.w;
+ }
+ }
+ else if (up == 2)
+ {
+ minY -= 1; // Adjust according to block height.
+ for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up; idx += blockDim.x)
+ {
+ int relUpX, relInY0;
+                    fast_div_mod<tileUpW>(relUpX, relInY0, idx);
+ int relUpY0 = relInY0 * up;
+ int src0 = relInY0 * tileUpW + relUpX;
+ int dst = relUpY0 * tileUpW + relUpX;
+                    vec2_t v = InternalType<T>::zero_vec2();
+
+ scalar_t a = s_tileUpX[src0];
+ if (phaseInY == 0)
+ {
+ #pragma unroll
+ for (int step = 0; step < fuSize / up; step++)
+ {
+ v.x += a * (scalar_t)c_fu[step * up + 0];
+ a = s_tileUpX[src0 + (step + 1) * tileUpW];
+ v.y += a * (scalar_t)c_fu[step * up + 1];
+ }
+ }
+ else // (phaseInY == 1)
+ {
+ #pragma unroll
+ for (int step = 0; step < fuSize / up; step++)
+ {
+ v.x += a * (scalar_t)c_fu[step * up + 1];
+ v.y += a * (scalar_t)c_fu[step * up + 0];
+ a = s_tileUpX[src0 + (step + 1) * tileUpW];
+ }
+ }
+
+ int x = tileOutX * down + relUpX;
+ int y = tileOutY * down + relUpY0;
+ int signX = x + p.sOfs.x;
+ int signY = y + p.sOfs.y;
+ int signZ = blockIdx.z + p.blockZofs;
+ int signXb = signX >> 2;
+ index_t si0 = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
+ index_t si1 = si0 + p.sShape.x;
+
+ v.x *= (scalar_t)((float)up * (float)up * p.gain);
+ v.y *= (scalar_t)((float)up * (float)up * p.gain);
+
+ if (signWrite)
+ {
+ if (!enableWriteSkip)
+ {
+ // Determine and write signs.
+ int sx = __float_as_uint(v.x) >> 31 << 0;
+ int sy = __float_as_uint(v.y) >> 31 << 8;
+ if (sx) v.x *= p.slope;
+ if (sy) v.y *= p.slope;
+                            if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
+                            if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType<T>::clamp(v.y, p.clamp); }
+
+ if ((uint32_t)signXb < p.swLimit && signY >= minY)
+ {
+ // Combine signs.
+ int s = sx + sy;
+ s <<= signXo;
+ s |= __shfl_xor_sync(groupMask, s, 1);
+ s |= __shfl_xor_sync(groupMask, s, 2);
+
+ // Write signs.
+ if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >> 0); }
+ if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >> 8); }
+ }
+ }
+ else
+ {
+ // Determine and write signs.
+ if ((uint32_t)signXb < p.swLimit && signY >= minY)
+ {
+ int sx = __float_as_uint(v.x) >> 31 << 0;
+ int sy = __float_as_uint(v.y) >> 31 << 8;
+ if (sx) v.x *= p.slope;
+ if (sy) v.y *= p.slope;
+                                if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
+                                if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType<T>::clamp(v.y, p.clamp); }
+
+ // Combine signs.
+ int s = sx + sy;
+ s <<= signXo;
+ s |= __shfl_xor_sync(groupMask, s, 1);
+ s |= __shfl_xor_sync(groupMask, s, 2);
+
+ // Write signs.
+ if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >> 0); }
+ if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >> 8); }
+ }
+ else
+ {
+ // Just compute the values.
+                                if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
+                                if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
+ }
+ }
+ }
+ else if (signRead) // Read signs and apply.
+ {
+ if ((uint32_t)signXb < p.swLimit)
+ {
+ if ((uint32_t)(signY + 0) < p.sShape.y) { int s = p.s[si0] >> signXo; if (s & 1) v.x *= p.slope; if (s & 2) v.x = 0.f; }
+ if ((uint32_t)(signY + 1) < p.sShape.y) { int s = p.s[si1] >> signXo; if (s & 1) v.y *= p.slope; if (s & 2) v.y = 0.f; }
+ }
+ }
+ else // Forward pass with no sign write.
+ {
+                        if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
+                        if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
+ }
+
+ if (!downInline)
+ {
+ // Write into temporary buffer.
+ s_tileUpXY[dst] = v.x;
+ if (relUpY0 < tileUpH - 1)
+ s_tileUpXY[dst + tileUpW] = v.y;
+ }
+ else
+ {
+ // Write directly into output buffer.
+ if ((uint32_t)x < p.yShape.x)
+ {
+ int ymax = MIN(p.yShape.y, tileUpH + tileOutY * down);
+                            index_t ofs = x * get_stride<index_t>(p.yStride.x) + y * get_stride<index_t>(p.yStride.y) + mapOfsOut;
+                            if ((uint32_t)y + 0 < p.yShape.y) *((T*)((char*)p.y + ofs)) = (T)(v.x * (scalar_t)c_fd[0]);
+                            if ((uint32_t)y + 1 < ymax)       *((T*)((char*)p.y + ofs + get_stride<index_t>(p.yStride.y))) = (T)(v.y * (scalar_t)c_fd[0]);
+ }
+ }
+ }
+ }
+ }
+ else if (filterMode == MODE_FUSD || filterMode == MODE_FUFD)
+ {
+ // Full upsampling filter.
+
+ if (up == 2)
+ {
+ // 2 x 2-wide.
+ __syncthreads();
+ int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH + p.sOfs.y : 0; // Skip already written signs.
+ for (int idx = threadIdx.x * 4; idx < tileUpW * tileUpH; idx += blockDim.x * 4)
+ {
+ int relUpX0, relUpY0;
+                fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);
+ int relInX0 = CEIL_DIV(relUpX0 - phaseInX, up);
+ int relInY0 = CEIL_DIV(relUpY0 - phaseInY, up);
+ int src0 = relInX0 + tileInW * relInY0;
+ int tap0y = (relInY0 * up + phaseInY - relUpY0);
+
+ #define X_LOOP(TAPY, PX) \
+ for (int sx = 0; sx < fuSize / up; sx++) \
+ { \
+ v.x += a * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
+ v.z += b * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; if ((PX) == 0) { a = b; b = s_tileIn[src0 + 2 + sx + sy * tileInW]; } \
+ v.y += a * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
+ v.w += b * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; if ((PX) == 1) { a = b; b = s_tileIn[src0 + 2 + sx + sy * tileInW]; } \
+ }
+
+                vec4_t v = InternalType<T>::zero_vec4();
+ if (tap0y == 0 && phaseInX == 0)
+ #pragma unroll
+ for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
+ #pragma unroll
+ X_LOOP(0, 0) }
+ if (tap0y == 0 && phaseInX == 1)
+ #pragma unroll
+ for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
+ #pragma unroll
+ X_LOOP(0, 1) }
+ if (tap0y == 1 && phaseInX == 0)
+ #pragma unroll
+ for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
+ #pragma unroll
+ X_LOOP(1, 0) }
+ if (tap0y == 1 && phaseInX == 1)
+ #pragma unroll
+ for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
+ #pragma unroll
+ X_LOOP(1, 1) }
+
+ #undef X_LOOP
+
+ int x = tileOutX * down + relUpX0;
+ int y = tileOutY * down + relUpY0;
+ int signX = x + p.sOfs.x;
+ int signY = y + p.sOfs.y;
+ int signZ = blockIdx.z + p.blockZofs;
+ int signXb = signX >> 2;
+ index_t si = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
+
+ v.x *= (scalar_t)((float)up * (float)up * p.gain);
+ v.y *= (scalar_t)((float)up * (float)up * p.gain);
+ v.z *= (scalar_t)((float)up * (float)up * p.gain);
+ v.w *= (scalar_t)((float)up * (float)up * p.gain);
+
+ if (signWrite)
+ {
+ if (!enableWriteSkip)
+ {
+ // Determine and write signs.
+ int sx = __float_as_uint(v.x) >> 31;
+ int sy = __float_as_uint(v.y) >> 31;
+ int sz = __float_as_uint(v.z) >> 31;
+ int sw = __float_as_uint(v.w) >> 31;
+                        if (sx) v.x *= p.slope; if (fabsf(v.x) > p.clamp) { sx = 2; v.x = InternalType<T>::clamp(v.x, p.clamp); }
+                        if (sy) v.y *= p.slope; if (fabsf(v.y) > p.clamp) { sy = 2; v.y = InternalType<T>::clamp(v.y, p.clamp); }
+                        if (sz) v.z *= p.slope; if (fabsf(v.z) > p.clamp) { sz = 2; v.z = InternalType<T>::clamp(v.z, p.clamp); }
+                        if (sw) v.w *= p.slope; if (fabsf(v.w) > p.clamp) { sw = 2; v.w = InternalType<T>::clamp(v.w, p.clamp); }
+
+ if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
+ {
+ p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);
+ }
+ }
+ else
+ {
+ // Determine and write signs.
+ if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
+ {
+ int sx = __float_as_uint(v.x) >> 31;
+ int sy = __float_as_uint(v.y) >> 31;
+ int sz = __float_as_uint(v.z) >> 31;
+ int sw = __float_as_uint(v.w) >> 31;
+                            if (sx) v.x *= p.slope; if (fabsf(v.x) > p.clamp) { sx = 2; v.x = InternalType<T>::clamp(v.x, p.clamp); }
+                            if (sy) v.y *= p.slope; if (fabsf(v.y) > p.clamp) { sy = 2; v.y = InternalType<T>::clamp(v.y, p.clamp); }
+                            if (sz) v.z *= p.slope; if (fabsf(v.z) > p.clamp) { sz = 2; v.z = InternalType<T>::clamp(v.z, p.clamp); }
+                            if (sw) v.w *= p.slope; if (fabsf(v.w) > p.clamp) { sw = 2; v.w = InternalType<T>::clamp(v.w, p.clamp); }
+
+ p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);
+ }
+ else
+ {
+ // Just compute the values.
+                            if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
+                            if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
+                            if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
+                            if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
+ }
+ }
+ }
+ else if (signRead) // Read sign and apply.
+ {
+ if ((uint32_t)signY < p.sShape.y)
+ {
+ int s = 0;
+ if ((uint32_t)signXb < p.swLimit) s = p.s[si];
+ if ((uint32_t)signXb + 1 < p.swLimit) s |= p.s[si + 1] << 8;
+ s >>= (signX & 3) << 1;
+ if (s & 0x01) v.x *= p.slope; if (s & 0x02) v.x = 0.f;
+ if (s & 0x04) v.y *= p.slope; if (s & 0x08) v.y = 0.f;
+ if (s & 0x10) v.z *= p.slope; if (s & 0x20) v.z = 0.f;
+ if (s & 0x40) v.w *= p.slope; if (s & 0x80) v.w = 0.f;
+ }
+ }
+ else // Forward pass with no sign write.
+ {
+                    if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
+                    if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
+                    if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
+                    if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
+ }
+
+ s_tileUpXY[idx + 0] = v.x;
+ s_tileUpXY[idx + 1] = v.y;
+ s_tileUpXY[idx + 2] = v.z;
+ s_tileUpXY[idx + 3] = v.w;
+ }
+ }
+ else if (up == 1)
+ {
+ __syncthreads();
+ uint32_t groupMask = 15 << ((threadIdx.x & 31) & ~3);
+ int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH : 0; // Skip already written signs.
+ for (int idx = threadIdx.x; idx < tileUpW * tileUpH; idx += blockDim.x)
+ {
+ int relUpX0, relUpY0;
+                fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);
+ scalar_t v = s_tileIn[idx] * (scalar_t)c_fu[0]; // 1x1 filter.
+
+ int x = tileOutX * down + relUpX0;
+ int y = tileOutY * down + relUpY0;
+ int signX = x + p.sOfs.x;
+ int signY = y + p.sOfs.y;
+ int signZ = blockIdx.z + p.blockZofs;
+ int signXb = signX >> 2;
+ index_t si = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
+ v *= (scalar_t)((float)up * (float)up * p.gain);
+
+ if (signWrite)
+ {
+ if (!enableWriteSkip)
+ {
+ // Determine and write sign.
+ uint32_t s = 0;
+ uint32_t signXbit = (1u << signXo);
+ if (v < 0.f)
+ {
+ s = signXbit;
+ v *= p.slope;
+ }
+ if (fabsf(v) > p.clamp)
+ {
+ s = signXbit * 2;
+                            v = InternalType<T>::clamp(v, p.clamp);
+ }
+ if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
+ {
+ s += __shfl_xor_sync(groupMask, s, 1); // Coalesce.
+ s += __shfl_xor_sync(groupMask, s, 2); // Coalesce.
+ p.s[si] = s; // Write.
+ }
+ }
+ else
+ {
+ // Determine and write sign.
+ if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
+ {
+ uint32_t s = 0;
+ uint32_t signXbit = (1u << signXo);
+ if (v < 0.f)
+ {
+ s = signXbit;
+ v *= p.slope;
+ }
+ if (fabsf(v) > p.clamp)
+ {
+ s = signXbit * 2;
+                            v = InternalType<T>::clamp(v, p.clamp);
+ }
+ s += __shfl_xor_sync(groupMask, s, 1); // Coalesce.
+ s += __shfl_xor_sync(groupMask, s, 2); // Coalesce.
+ p.s[si] = s; // Write.
+ }
+ else
+ {
+ // Just compute the value.
+ if (v < 0.f) v *= p.slope;
+                            v = InternalType<T>::clamp(v, p.clamp);
+ }
+ }
+ }
+ else if (signRead)
+ {
+ // Read sign and apply if within sign tensor bounds.
+ if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y)
+ {
+ int s = p.s[si];
+ s >>= signXo;
+ if (s & 1) v *= p.slope;
+ if (s & 2) v = 0.f;
+ }
+ }
+ else // Forward pass with no sign write.
+ {
+ if (v < 0.f) v *= p.slope;
+                    v = InternalType<T>::clamp(v, p.clamp);
+ }
+
+ if (!downInline) // Write into temporary buffer.
+ s_tileUpXY[idx] = v;
+ else if ((uint32_t)x < p.yShape.x && (uint32_t)y < p.yShape.y) // Write directly into output buffer
+                    *((T*)((char*)p.y + (x * get_stride<index_t>(p.yStride.x) + y * get_stride<index_t>(p.yStride.y) + mapOfsOut))) = (T)(v * (scalar_t)c_fd[0]);
+ }
+ }
+ }
+
+ // Downsampling.
+ if (filterMode == MODE_SUSD || filterMode == MODE_FUSD)
+ {
+ // Horizontal downsampling.
+ __syncthreads();
+ if (down == 4 && tileOutW % 4 == 0)
+ {
+ // Calculate 4 pixels at a time.
+ for (int idx = threadIdx.x * 4; idx < tileOutW * tileUpH; idx += blockDim.x * 4)
+ {
+ int relOutX0, relUpY;
+                fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
+ int relUpX0 = relOutX0 * down;
+ int src0 = relUpY * tileUpW + relUpX0;
+                vec4_t v = InternalType<T>::zero_vec4();
+ #pragma unroll
+ for (int step = 0; step < fdSize; step++)
+ {
+ v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step];
+ v.y += s_tileUpXY[src0 + 4 + step] * (scalar_t)c_fd[step];
+ v.z += s_tileUpXY[src0 + 8 + step] * (scalar_t)c_fd[step];
+ v.w += s_tileUpXY[src0 + 12 + step] * (scalar_t)c_fd[step];
+ }
+ s_tileDownX[idx+0] = v.x;
+ s_tileDownX[idx+1] = v.y;
+ s_tileDownX[idx+2] = v.z;
+ s_tileDownX[idx+3] = v.w;
+ }
+ }
+ else if ((down == 2 || down == 4) && (tileOutW % 2 == 0))
+ {
+ // Calculate 2 pixels at a time.
+ for (int idx = threadIdx.x * 2; idx < tileOutW * tileUpH; idx += blockDim.x * 2)
+ {
+ int relOutX0, relUpY;
+                fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
+ int relUpX0 = relOutX0 * down;
+ int src0 = relUpY * tileUpW + relUpX0;
+                vec2_t v = InternalType<T>::zero_vec2();
+ #pragma unroll
+ for (int step = 0; step < fdSize; step++)
+ {
+ v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step];
+ v.y += s_tileUpXY[src0 + down + step] * (scalar_t)c_fd[step];
+ }
+ s_tileDownX[idx+0] = v.x;
+ s_tileDownX[idx+1] = v.y;
+ }
+ }
+ else
+ {
+ // Calculate 1 pixel at a time.
+ for (int idx = threadIdx.x; idx < tileOutW * tileUpH; idx += blockDim.x)
+ {
+ int relOutX0, relUpY;
+                fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
+ int relUpX0 = relOutX0 * down;
+ int src = relUpY * tileUpW + relUpX0;
+ scalar_t v = 0.f;
+ #pragma unroll
+ for (int step = 0; step < fdSize; step++)
+ v += s_tileUpXY[src + step] * (scalar_t)c_fd[step];
+ s_tileDownX[idx] = v;
+ }
+ }
+
+ // Vertical downsampling & store output tile.
+ __syncthreads();
+ for (int idx = threadIdx.x; idx < tileOutW * tileOutH; idx += blockDim.x)
+ {
+ int relOutX, relOutY0;
+            fast_div_mod<tileOutW>(relOutX, relOutY0, idx);
+ int relUpY0 = relOutY0 * down;
+ int src0 = relUpY0 * tileOutW + relOutX;
+ scalar_t v = 0;
+ #pragma unroll
+ for (int step = 0; step < fdSize; step++)
+ v += s_tileDownX[src0 + step * tileOutW] * (scalar_t)c_fd[step];
+
+ int outX = tileOutX + relOutX;
+ int outY = tileOutY + relOutY0;
+
+ if (outX < p.yShape.x & outY < p.yShape.y)
+                *((T*)((char*)p.y + (outX * get_stride<index_t>(p.yStride.x) + outY * get_stride<index_t>(p.yStride.y) + mapOfsOut))) = (T)v;
+ }
+ }
+ else if (filterMode == MODE_SUFD || filterMode == MODE_FUFD)
+ {
+ // Full downsampling filter.
+ if (down == 2)
+ {
+ // 2-wide.
+ __syncthreads();
+ for (int idx = threadIdx.x * 2; idx < tileOutW * tileOutH; idx += blockDim.x * 2)
+ {
+ int relOutX0, relOutY0;
+                fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);
+ int relUpX0 = relOutX0 * down;
+ int relUpY0 = relOutY0 * down;
+ int src0 = relUpY0 * tileUpW + relUpX0;
+                vec2_t v = InternalType<T>::zero_vec2();
+ #pragma unroll
+ for (int sy = 0; sy < fdSize; sy++)
+ #pragma unroll
+ for (int sx = 0; sx < fdSize; sx++)
+ {
+ v.x += s_tileUpXY[src0 + 0 + sx + sy * tileUpW] * (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];
+ v.y += s_tileUpXY[src0 + 2 + sx + sy * tileUpW] * (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];
+ }
+
+ int outX = tileOutX + relOutX0;
+ int outY = tileOutY + relOutY0;
+ if ((uint32_t)outY < p.yShape.y)
+ {
+                    index_t ofs = outX * get_stride<index_t>(p.yStride.x) + outY * get_stride<index_t>(p.yStride.y) + mapOfsOut;
+                    if (outX + 0 < p.yShape.x) *((T*)((char*)p.y + ofs)) = (T)v.x;
+                    if (outX + 1 < p.yShape.x) *((T*)((char*)p.y + ofs + get_stride<index_t>(p.yStride.x))) = (T)v.y;
+ }
+ }
+ }
+ else if (down == 1 && !downInline)
+ {
+ // Thread per pixel.
+ __syncthreads();
+ for (int idx = threadIdx.x; idx < tileOutW * tileOutH; idx += blockDim.x)
+ {
+ int relOutX0, relOutY0;
+                fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);
+ scalar_t v = s_tileUpXY[idx] * (scalar_t)c_fd[0]; // 1x1 filter.
+
+ int outX = tileOutX + relOutX0;
+ int outY = tileOutY + relOutY0;
+ if ((uint32_t)outX < p.yShape.x && (uint32_t)outY < p.yShape.y)
+                    *((T*)((char*)p.y + (outX * get_stride<index_t>(p.yStride.x) + outY * get_stride<index_t>(p.yStride.y) + mapOfsOut))) = (T)v;
+ }
+ }
+ }
+
+ if (!enableXrep)
+ break;
+ }
+}
+
+//------------------------------------------------------------------------
+// Compute activation function and signs for upsampled data tensor, modifying data tensor in-place. Used for accelerating the generic variant.
+// Sign tensor is known to be contiguous, and p.x and p.s have the same z, w dimensions. 64-bit indexing is always used.
+
+template <class T, bool signWrite, bool signRead>
+static __global__ void filtered_lrelu_act_kernel(filtered_lrelu_act_kernel_params p)
+{
+    typedef typename InternalType<T>::scalar_t scalar_t;
+
+ // Indexing.
+ int32_t x = threadIdx.x + blockIdx.x * blockDim.x;
+ int32_t ymax = signWrite ? p.sShape.y : p.xShape.y;
+ int32_t qmax = p.xShape.z * p.xShape.w; // Combined minibatch*channel maximum index.
+
+ // Loop to accommodate oversized tensors.
+ for (int32_t q = blockIdx.z; q < qmax; q += gridDim.z)
+ for (int32_t y = blockIdx.y; y < ymax; y += gridDim.y)
+ {
+ // Extract z and w (channel, minibatch index).
+ int32_t w = q / p.xShape.z;
+ int32_t z = q - w * p.xShape.z;
+
+ // Choose behavior based on sign read/write mode.
+ if (signWrite)
+ {
+ // Process value if in p.x.
+ uint32_t s = 0;
+ if (x < p.xShape.x && y < p.xShape.y)
+ {
+ int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w;
+ T* pv = ((T*)p.x) + ix;
+ scalar_t v = (scalar_t)(*pv);
+
+ // Gain, LReLU, clamp.
+ v *= p.gain;
+ if (v < 0.f)
+ {
+ v *= p.slope;
+ s = 1; // Sign.
+ }
+ if (fabsf(v) > p.clamp)
+ {
+                    v = InternalType<T>::clamp(v, p.clamp);
+ s = 2; // Clamp.
+ }
+
+ *pv = (T)v; // Write value.
+ }
+
+ // Coalesce into threads 0 and 16 of warp.
+ uint32_t m = (threadIdx.x & 16) ? 0xffff0000u : 0x0000ffffu;
+ s <<= ((threadIdx.x & 15) << 1); // Shift into place.
+ s |= __shfl_xor_sync(m, s, 1); // Distribute.
+ s |= __shfl_xor_sync(m, s, 2);
+ s |= __shfl_xor_sync(m, s, 4);
+ s |= __shfl_xor_sync(m, s, 8);
+
+ // Write signs if leader and in p.s.
+ if (!(threadIdx.x & 15) && x < p.sShape.x) // y is always in.
+ {
+ uint64_t is = x + p.sShape.x * (y + (int64_t)p.sShape.y * q); // Contiguous.
+ ((uint32_t*)p.s)[is >> 4] = s;
+ }
+ }
+ else if (signRead)
+ {
+ // Process value if in p.x.
+ if (x < p.xShape.x) // y is always in.
+ {
+ int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w;
+ T* pv = ((T*)p.x) + ix;
+ scalar_t v = (scalar_t)(*pv);
+ v *= p.gain;
+
+ // Apply sign buffer offset.
+ uint32_t sx = x + p.sOfs.x;
+ uint32_t sy = y + p.sOfs.y;
+
+ // Read and apply signs if we land inside valid region of sign buffer.
+ if (sx < p.sShape.x && sy < p.sShape.y)
+ {
+ uint64_t is = (sx >> 2) + (p.sShape.x >> 2) * (sy + (uint64_t)p.sShape.y * q); // Contiguous.
+ unsigned char s = p.s[is];
+ s >>= (sx & 3) << 1; // Shift into place.
+ if (s & 1) // Sign?
+ v *= p.slope;
+ if (s & 2) // Clamp?
+ v = 0.f;
+ }
+
+ *pv = (T)v; // Write value.
+ }
+ }
+ else
+ {
+ // Forward pass with no sign write. Process value if in p.x.
+ if (x < p.xShape.x) // y is always in.
+ {
+ int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w;
+ T* pv = ((T*)p.x) + ix;
+ scalar_t v = (scalar_t)(*pv);
+ v *= p.gain;
+ if (v < 0.f)
+ v *= p.slope;
+ if (fabsf(v) > p.clamp)
+ v = InternalType<T>::clamp(v, p.clamp);
+ *pv = (T)v; // Write value.
+ }
+ }
+ }
+}
+
+template <class T, bool signWrite, bool signRead> void* choose_filtered_lrelu_act_kernel(void)
+{
+ return (void*)filtered_lrelu_act_kernel<T, signWrite, signRead>;
+}
+
+//------------------------------------------------------------------------
+// CUDA kernel selection.
+
+template <class T, class index_t, bool signWrite, bool signRead> filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB)
+{
+ filtered_lrelu_kernel_spec s = { 0 };
+
+ // Return the first matching kernel.
+#define CASE(SH, U, FU, D, FD, MODE, TW, TH, W, XR, WS) \
+ if (sharedKB >= SH) \
+ if ((p.fuShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_SUFD)) || (p.fuShape.y > 0 && (MODE == MODE_FUSD || MODE == MODE_FUFD))) \
+ if ((p.fdShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_FUSD)) || (p.fdShape.y > 0 && (MODE == MODE_SUFD || MODE == MODE_FUFD))) \
+ if (p.up == U && p.fuShape.x <= FU && p.fuShape.y <= FU && p.down == D && p.fdShape.x <= FD && p.fdShape.y <= FD) \
+ { \
+ static_assert((D*TW % 4) == 0, "down * tileWidth must be divisible by 4"); \
+ static_assert(FU % U == 0, "upscaling filter size must be multiple of upscaling factor"); \
+ static_assert(FD % D == 0, "downscaling filter size must be multiple of downscaling factor"); \
+ s.setup = (void*)setup_filters_kernel; \
+ s.exec = (void*)filtered_lrelu_kernel<T, index_t, SH, signWrite, signRead, MODE, U, FU, D, FD, TW, TH, W*32, !!XR, !!WS>; \
+ s.tileOut = make_int2(TW, TH); \
+ s.numWarps = W; \
+ s.xrep = XR; \
+ s.dynamicSharedKB = (SH == 48) ? 0 : SH; \
+ return s; \
+ }
+
+ // Launch parameters for various kernel specializations.
+// Small filters must be listed before large filters, otherwise the kernel for the larger filter will always match first.
+ // Kernels that use more shared memory must be listed before those that use less, for the same reason.
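+// For example, the first entry below ("1t-upf1-downf1") matches up == 1 with a full 1-tap
+// upsampling filter and down == 1 with a full 1-tap downsampling filter (MODE_FUFD), and
+// runs with a 64x178 output tile, 32 warps per block, and no horizontal tile repetition.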
+
+ CASE(/*sharedKB*/48, /*up,fu*/1,1, /*down,fd*/1,1, /*mode*/MODE_FUFD, /*tw,th,warps,xrep,wskip*/64, 178, 32, 0, 0) // 1t-upf1-downf1
+ CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/1,1, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/152, 95, 16, 0, 0) // 4t-ups2-downf1
+ CASE(/*sharedKB*/48, /*up,fu*/1,1, /*down,fd*/2,8, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/56, 22, 16, 0, 0) // 4t-upf1-downs2
+ CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/2,8, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/56, 29, 16, 11, 0) // 4t-ups2-downs2
+ CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/2,8, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/60, 28, 16, 0, 0) // 4t-upf2-downs2
+ CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/2,8, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/56, 28, 16, 0, 0) // 4t-ups2-downf2
+ CASE(/*sharedKB*/48, /*up,fu*/4,16, /*down,fd*/2,8, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/56, 31, 16, 11, 0) // 4t-ups4-downs2
+ CASE(/*sharedKB*/48, /*up,fu*/4,16, /*down,fd*/2,8, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/56, 36, 16, 0, 0) // 4t-ups4-downf2
+ CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/4,16, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/16, 22, 16, 12, 0) // 4t-ups2-downs4
+ CASE(/*sharedKB*/48, /*up,fu*/2,8, /*down,fd*/4,16, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/29, 15, 16, 0, 0) // 4t-upf2-downs4
+ CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/1,1, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/96, 150, 28, 0, 0) // 6t-ups2-downf1
+ CASE(/*sharedKB*/48, /*up,fu*/1,1, /*down,fd*/2,12, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/32, 35, 24, 0, 0) // 6t-upf1-downs2
+ CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/2,12, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32, 46, 16, 10, 0) // 6t-ups2-downs2
+ CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/2,12, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/58, 28, 24, 8, 0) // 6t-upf2-downs2
+ CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/2,12, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/52, 28, 16, 0, 0) // 6t-ups2-downf2
+ CASE(/*sharedKB*/48, /*up,fu*/4,24, /*down,fd*/2,12, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32, 51, 16, 5, 0) // 6t-ups4-downs2
+ CASE(/*sharedKB*/48, /*up,fu*/4,24, /*down,fd*/2,12, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/32, 56, 16, 6, 0) // 6t-ups4-downf2
+ CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/4,24, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/16, 18, 16, 12, 0) // 6t-ups2-downs4
+ CASE(/*sharedKB*/96, /*up,fu*/2,12, /*down,fd*/4,24, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/27, 31, 32, 6, 0) // 6t-upf2-downs4 96kB
+ CASE(/*sharedKB*/48, /*up,fu*/2,12, /*down,fd*/4,24, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/27, 13, 24, 0, 0) // 6t-upf2-downs4
+ CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/1,1, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/148, 89, 24, 0, 0) // 8t-ups2-downf1
+ CASE(/*sharedKB*/48, /*up,fu*/1,1, /*down,fd*/2,16, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/32, 31, 16, 5, 0) // 8t-upf1-downs2
+ CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/2,16, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32, 41, 16, 9, 0) // 8t-ups2-downs2
+ CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/2,16, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/56, 26, 24, 0, 0) // 8t-upf2-downs2
+ CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/2,16, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/32, 40, 16, 0, 0) // 8t-ups2-downf2
+ CASE(/*sharedKB*/48, /*up,fu*/4,32, /*down,fd*/2,16, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/32, 46, 24, 5, 0) // 8t-ups4-downs2
+ CASE(/*sharedKB*/48, /*up,fu*/4,32, /*down,fd*/2,16, /*mode*/MODE_SUFD, /*tw,th,warps,xrep,wskip*/32, 50, 16, 0, 0) // 8t-ups4-downf2
+ CASE(/*sharedKB*/96, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/24, 24, 32, 12, 1) // 8t-ups2-downs4 96kB
+ CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_SUSD, /*tw,th,warps,xrep,wskip*/16, 13, 16, 10, 1) // 8t-ups2-downs4
+ CASE(/*sharedKB*/96, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/25, 28, 28, 4, 0) // 8t-upf2-downs4 96kB
+ CASE(/*sharedKB*/48, /*up,fu*/2,16, /*down,fd*/4,32, /*mode*/MODE_FUSD, /*tw,th,warps,xrep,wskip*/25, 10, 24, 0, 0) // 8t-upf2-downs4
+
+ #undef CASE
+ return s; // No kernel found.
+}
+
+//------------------------------------------------------------------------
diff --git a/torch_utils/ops/filtered_lrelu.h b/torch_utils/ops/filtered_lrelu.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c403e3f275f472315662321cad54dd0dbc56d00
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <cuda_runtime.h>
+
+//------------------------------------------------------------------------
+// CUDA kernel parameters.
+
+struct filtered_lrelu_kernel_params
+{
+ // These parameters decide which kernel to use.
+ int up; // upsampling ratio (1, 2, 4)
+ int down; // downsampling ratio (1, 2, 4)
+ int2 fuShape; // [size, 1] | [size, size]
+ int2 fdShape; // [size, 1] | [size, size]
+
+ int _dummy; // Alignment.
+
+ // Rest of the parameters.
+ const void* x; // Input tensor.
+ void* y; // Output tensor.
+ const void* b; // Bias tensor.
+ unsigned char* s; // Sign tensor in/out. NULL if unused.
+ const float* fu; // Upsampling filter.
+ const float* fd; // Downsampling filter.
+
+ int2 pad0; // Left/top padding.
+ float gain; // Additional gain factor.
+ float slope; // Leaky ReLU slope on negative side.
+ float clamp; // Clamp after nonlinearity.
+ int flip; // Filter kernel flip for gradient computation.
+
+ int tilesXdim; // Original number of horizontal output tiles.
+ int tilesXrep; // Number of horizontal tiles per CTA.
+ int blockZofs; // Block z offset to support large minibatch, channel dimensions.
+
+ int4 xShape; // [width, height, channel, batch]
+ int4 yShape; // [width, height, channel, batch]
+ int2 sShape; // [width, height] - width is in bytes. Contiguous. Zeros if unused.
+ int2 sOfs; // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.
+ int swLimit; // Active width of sign tensor in bytes.
+
+ longlong4 xStride; // Strides of all tensors except signs, same component order as shapes.
+ longlong4 yStride; //
+ int64_t bStride; //
+ longlong3 fuStride; //
+ longlong3 fdStride; //
+};
+
+struct filtered_lrelu_act_kernel_params
+{
+ void* x; // Input/output, modified in-place.
+ unsigned char* s; // Sign tensor in/out. NULL if unused.
+
+ float gain; // Additional gain factor.
+ float slope; // Leaky ReLU slope on negative side.
+ float clamp; // Clamp after nonlinearity.
+
+ int4 xShape; // [width, height, channel, batch]
+ longlong4 xStride; // Input/output tensor strides, same order as in shape.
+ int2 sShape; // [width, height] - width is in elements. Contiguous. Zeros if unused.
+ int2 sOfs; // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.
+};
+
+//------------------------------------------------------------------------
+// CUDA kernel specialization.
+
+struct filtered_lrelu_kernel_spec
+{
+ void* setup; // Function for filter kernel setup.
+ void* exec; // Function for main operation.
+ int2 tileOut; // Width/height of launch tile.
+ int numWarps; // Number of warps per thread block, determines launch block size.
+ int xrep; // For processing multiple horizontal tiles per thread block.
+ int dynamicSharedKB; // How much dynamic shared memory the exec kernel wants.
+};
+
+//------------------------------------------------------------------------
+// CUDA kernel selection.
+
+template <class T, class index_t, bool signWrite, bool signRead> filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(const filtered_lrelu_kernel_params& p, int sharedKB);
+template <class T, bool signWrite, bool signRead> void* choose_filtered_lrelu_act_kernel(void);
+template <bool signWrite, bool signRead> cudaError_t copy_filters(cudaStream_t stream);
+
+//------------------------------------------------------------------------
diff --git a/torch_utils/ops/filtered_lrelu.py b/torch_utils/ops/filtered_lrelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..6106c917d1cbff4f1cf637390dd6ba0c597a830f
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu.py
@@ -0,0 +1,274 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import os
+import numpy as np
+import torch
+import warnings
+
+from .. import custom_ops
+from .. import misc
+from . import upfirdn2d
+from . import bias_act
+
+#----------------------------------------------------------------------------
+
+_plugin = None
+
+def _init():
+ global _plugin
+ if _plugin is None:
+ _plugin = custom_ops.get_plugin(
+ module_name='filtered_lrelu_plugin',
+ sources=['filtered_lrelu.cpp', 'filtered_lrelu_wr.cu', 'filtered_lrelu_rd.cu', 'filtered_lrelu_ns.cu'],
+ headers=['filtered_lrelu.h', 'filtered_lrelu.cu'],
+ source_dir=os.path.dirname(__file__),
+ extra_cuda_cflags=['--use_fast_math'],
+ )
+ return True
+
+def _get_filter_size(f):
+ if f is None:
+ return 1, 1
+ assert isinstance(f, torch.Tensor)
+ assert 1 <= f.ndim <= 2
+ return f.shape[-1], f.shape[0] # width, height
+
+def _parse_padding(padding):
+ if isinstance(padding, int):
+ padding = [padding, padding]
+ assert isinstance(padding, (list, tuple))
+ assert all(isinstance(x, (int, np.integer)) for x in padding)
+ padding = [int(x) for x in padding]
+ if len(padding) == 2:
+ px, py = padding
+ padding = [px, px, py, py]
+ px0, px1, py0, py1 = padding
+ return px0, px1, py0, py1
+
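+# For reference, `_parse_padding` expands its argument to per-side values, e.g.
+# _parse_padding(2) -> (2, 2, 2, 2) and _parse_padding([1, 2]) -> (1, 1, 2, 2).
+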
+#----------------------------------------------------------------------------
+
+def filtered_lrelu(x, fu=None, fd=None, b=None, up=1, down=1, padding=0, gain=np.sqrt(2), slope=0.2, clamp=None, flip_filter=False, impl='cuda'):
+ r"""Filtered leaky ReLU for a batch of 2D images.
+
+ Performs the following sequence of operations for each channel:
+
+ 1. Add channel-specific bias if provided (`b`).
+
+ 2. Upsample the image by inserting N-1 zeros after each pixel (`up`).
+
+ 3. Pad the image with the specified number of zeros on each side (`padding`).
+ Negative padding corresponds to cropping the image.
+
+ 4. Convolve the image with the specified upsampling FIR filter (`fu`), shrinking it
+ so that the footprint of all output pixels lies within the input image.
+
+ 5. Multiply each value by the provided gain factor (`gain`).
+
+ 6. Apply leaky ReLU activation function to each value.
+
+ 7. Clamp each value between -clamp and +clamp, if `clamp` parameter is provided.
+
+ 8. Convolve the image with the specified downsampling FIR filter (`fd`), shrinking
+ it so that the footprint of all output pixels lies within the input image.
+
+ 9. Downsample the image by keeping every Nth pixel (`down`).
+
+ The fused op is considerably more efficient than performing the same calculation
+ using standard PyTorch ops. It supports gradients of arbitrary order.
+
+ Args:
+ x: Float32/float16/float64 input tensor of the shape
+ `[batch_size, num_channels, in_height, in_width]`.
+ fu: Float32 upsampling FIR filter of the shape
+ `[filter_height, filter_width]` (non-separable),
+ `[filter_taps]` (separable), or
+ `None` (identity).
+ fd: Float32 downsampling FIR filter of the shape
+ `[filter_height, filter_width]` (non-separable),
+ `[filter_taps]` (separable), or
+ `None` (identity).
+ b: Bias vector, or `None` to disable. Must be a 1D tensor of the same type
+ as `x`. The length of the vector must match the channel dimension of `x`.
+ up: Integer upsampling factor (default: 1).
+ down: Integer downsampling factor (default: 1).
+ padding: Padding with respect to the upsampled image. Can be a single number
+ or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
+ (default: 0).
+ gain: Overall scaling factor for signal magnitude (default: sqrt(2)).
+ slope: Slope on the negative side of leaky ReLU (default: 0.2).
+ clamp: Maximum magnitude for leaky ReLU output (default: None).
+ flip_filter: False = convolution, True = correlation (default: False).
+ impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`).
+
+ Returns:
+ Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
+ """
+ assert isinstance(x, torch.Tensor)
+ assert impl in ['ref', 'cuda']
+ if impl == 'cuda' and x.device.type == 'cuda' and _init():
+ return _filtered_lrelu_cuda(up=up, down=down, padding=padding, gain=gain, slope=slope, clamp=clamp, flip_filter=flip_filter).apply(x, fu, fd, b, None, 0, 0)
+ return _filtered_lrelu_ref(x, fu=fu, fd=fd, b=b, up=up, down=down, padding=padding, gain=gain, slope=slope, clamp=clamp, flip_filter=flip_filter)
+
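+# Illustrative usage (a sketch, not part of the original file; `x` and `bias` are assumed
+# NCHW activations and a per-channel bias, and the FIR taps are prepared with
+# `upfirdn2d.setup_filter` as elsewhere in this repo):
+#
+#   f = upfirdn2d.setup_filter([1, 3, 3, 1], device=x.device)
+#   y = filtered_lrelu(x, fu=f, fd=f, b=bias, up=2, down=2, padding=3, clamp=256)
+#
+# With 4-tap filters, up=down=2 and padding=3, the output keeps the input resolution.
+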
+#----------------------------------------------------------------------------
+
+@misc.profiled_function
+def _filtered_lrelu_ref(x, fu=None, fd=None, b=None, up=1, down=1, padding=0, gain=np.sqrt(2), slope=0.2, clamp=None, flip_filter=False):
+ """Slow and memory-inefficient reference implementation of `filtered_lrelu()` using
+ existing `upfirdn2d()` and `bias_act()` ops.
+ """
+ assert isinstance(x, torch.Tensor) and x.ndim == 4
+ fu_w, fu_h = _get_filter_size(fu)
+ fd_w, fd_h = _get_filter_size(fd)
+ if b is not None:
+ assert isinstance(b, torch.Tensor) and b.dtype == x.dtype
+ misc.assert_shape(b, [x.shape[1]])
+ assert isinstance(up, int) and up >= 1
+ assert isinstance(down, int) and down >= 1
+ px0, px1, py0, py1 = _parse_padding(padding)
+ assert gain == float(gain) and gain > 0
+ assert slope == float(slope) and slope >= 0
+ assert clamp is None or (clamp == float(clamp) and clamp >= 0)
+
+ # Calculate output size.
+ batch_size, channels, in_h, in_w = x.shape
+ in_dtype = x.dtype
+ out_w = (in_w * up + (px0 + px1) - (fu_w - 1) - (fd_w - 1) + (down - 1)) // down
+ out_h = (in_h * up + (py0 + py1) - (fu_h - 1) - (fd_h - 1) + (down - 1)) // down
+
+ # Compute using existing ops.
+ x = bias_act.bias_act(x=x, b=b) # Apply bias.
+ x = upfirdn2d.upfirdn2d(x=x, f=fu, up=up, padding=[px0, px1, py0, py1], gain=up**2, flip_filter=flip_filter) # Upsample.
+ x = bias_act.bias_act(x=x, act='lrelu', alpha=slope, gain=gain, clamp=clamp) # Bias, leaky ReLU, clamp.
+ x = upfirdn2d.upfirdn2d(x=x, f=fd, down=down, flip_filter=flip_filter) # Downsample.
+
+ # Check output shape & dtype.
+ misc.assert_shape(x, [batch_size, channels, out_h, out_w])
+ assert x.dtype == in_dtype
+ return x
+
+#----------------------------------------------------------------------------
+
+_filtered_lrelu_cuda_cache = dict()
+
+def _filtered_lrelu_cuda(up=1, down=1, padding=0, gain=np.sqrt(2), slope=0.2, clamp=None, flip_filter=False):
+ """Fast CUDA implementation of `filtered_lrelu()` using custom ops.
+ """
+ assert isinstance(up, int) and up >= 1
+ assert isinstance(down, int) and down >= 1
+ px0, px1, py0, py1 = _parse_padding(padding)
+ assert gain == float(gain) and gain > 0
+ gain = float(gain)
+ assert slope == float(slope) and slope >= 0
+ slope = float(slope)
+ assert clamp is None or (clamp == float(clamp) and clamp >= 0)
+ clamp = float(clamp if clamp is not None else 'inf')
+
+ # Lookup from cache.
+ key = (up, down, px0, px1, py0, py1, gain, slope, clamp, flip_filter)
+ if key in _filtered_lrelu_cuda_cache:
+ return _filtered_lrelu_cuda_cache[key]
+
+ # Forward op.
+ class FilteredLReluCuda(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, x, fu, fd, b, si, sx, sy): # pylint: disable=arguments-differ
+ assert isinstance(x, torch.Tensor) and x.ndim == 4
+
+ # Replace empty up/downsample kernels with full 1x1 kernels (faster than separable).
+ if fu is None:
+ fu = torch.ones([1, 1], dtype=torch.float32, device=x.device)
+ if fd is None:
+ fd = torch.ones([1, 1], dtype=torch.float32, device=x.device)
+ assert 1 <= fu.ndim <= 2
+ assert 1 <= fd.ndim <= 2
+
+ # Replace separable 1x1 kernels with full 1x1 kernels when scale factor is 1.
+ if up == 1 and fu.ndim == 1 and fu.shape[0] == 1:
+ fu = fu.square()[None]
+ if down == 1 and fd.ndim == 1 and fd.shape[0] == 1:
+ fd = fd.square()[None]
+
+ # Missing sign input tensor.
+ if si is None:
+ si = torch.empty([0])
+
+ # Missing bias tensor.
+ if b is None:
+ b = torch.zeros([x.shape[1]], dtype=x.dtype, device=x.device)
+
+ # Construct internal sign tensor only if gradients are needed.
+ write_signs = (si.numel() == 0) and (x.requires_grad or b.requires_grad)
+
+ # Warn if input storage strides are not in decreasing order due to e.g. channels-last layout.
+ strides = [x.stride(i) for i in range(x.ndim) if x.size(i) > 1]
+ if any(a < b for a, b in zip(strides[:-1], strides[1:])):
+ warnings.warn("low-performance memory layout detected in filtered_lrelu input", RuntimeWarning)
+
+ # Call C++/Cuda plugin if datatype is supported.
+ if x.dtype in [torch.float16, torch.float32]:
+ if torch.cuda.current_stream(x.device) != torch.cuda.default_stream(x.device):
+ warnings.warn("filtered_lrelu called with non-default cuda stream but concurrent execution is not supported", RuntimeWarning)
+ y, so, return_code = _plugin.filtered_lrelu(x, fu, fd, b, si, up, down, px0, px1, py0, py1, sx, sy, gain, slope, clamp, flip_filter, write_signs)
+ else:
+ return_code = -1
+
+ # No Cuda kernel found? Fall back to generic implementation. Still more memory efficient than the reference implementation because
+ # only the bit-packed sign tensor is retained for gradient computation.
+ if return_code < 0:
+ warnings.warn("filtered_lrelu called with parameters that have no optimized CUDA kernel, using generic fallback", RuntimeWarning)
+
+ y = x.add(b.unsqueeze(-1).unsqueeze(-1)) # Add bias.
+ y = upfirdn2d.upfirdn2d(x=y, f=fu, up=up, padding=[px0, px1, py0, py1], gain=up**2, flip_filter=flip_filter) # Upsample.
+ so = _plugin.filtered_lrelu_act_(y, si, sx, sy, gain, slope, clamp, write_signs) # Activation function and sign handling. Modifies y in-place.
+ y = upfirdn2d.upfirdn2d(x=y, f=fd, down=down, flip_filter=flip_filter) # Downsample.
+
+ # Prepare for gradient computation.
+ ctx.save_for_backward(fu, fd, (si if si.numel() else so))
+ ctx.x_shape = x.shape
+ ctx.y_shape = y.shape
+ ctx.s_ofs = sx, sy
+ return y
+
+ @staticmethod
+ def backward(ctx, dy): # pylint: disable=arguments-differ
+ fu, fd, si = ctx.saved_tensors
+ _, _, xh, xw = ctx.x_shape
+ _, _, yh, yw = ctx.y_shape
+ sx, sy = ctx.s_ofs
+ dx = None # 0
+ dfu = None; assert not ctx.needs_input_grad[1]
+ dfd = None; assert not ctx.needs_input_grad[2]
+ db = None # 3
+ dsi = None; assert not ctx.needs_input_grad[4]
+ dsx = None; assert not ctx.needs_input_grad[5]
+ dsy = None; assert not ctx.needs_input_grad[6]
+
+ if ctx.needs_input_grad[0] or ctx.needs_input_grad[3]:
+ pp = [
+ (fu.shape[-1] - 1) + (fd.shape[-1] - 1) - px0,
+ xw * up - yw * down + px0 - (up - 1),
+ (fu.shape[0] - 1) + (fd.shape[0] - 1) - py0,
+ xh * up - yh * down + py0 - (up - 1),
+ ]
+ gg = gain * (up ** 2) / (down ** 2)
+ ff = (not flip_filter)
+ sx = sx - (fu.shape[-1] - 1) + px0
+ sy = sy - (fu.shape[0] - 1) + py0
+ dx = _filtered_lrelu_cuda(up=down, down=up, padding=pp, gain=gg, slope=slope, clamp=None, flip_filter=ff).apply(dy, fd, fu, None, si, sx, sy)
+
+ if ctx.needs_input_grad[3]:
+ db = dx.sum([0, 2, 3])
+
+ return dx, dfu, dfd, db, dsi, dsx, dsy
+
+ # Add to cache.
+ _filtered_lrelu_cuda_cache[key] = FilteredLReluCuda
+ return FilteredLReluCuda
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/ops/filtered_lrelu_ns.cu b/torch_utils/ops/filtered_lrelu_ns.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ef5d948c4fdf9cb0fe8a42f6268c61aeef6b2000
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu_ns.cu
@@ -0,0 +1,27 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "filtered_lrelu.cu"
+
+// Template/kernel specializations for no signs mode (no gradients required).
+
+// Full op, 32-bit indexing.
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int32_t, false, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float, int32_t, false, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+
+// Full op, 64-bit indexing.
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int64_t, false, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float, int64_t, false, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+
+// Activation/signs only for generic variant. 64-bit indexing.
+template void* choose_filtered_lrelu_act_kernel<c10::Half, false, false>(void);
+template void* choose_filtered_lrelu_act_kernel<float, false, false>(void);
+template void* choose_filtered_lrelu_act_kernel<double, false, false>(void);
+
+// Copy filters to constant memory.
+template cudaError_t copy_filters<false, false>(cudaStream_t stream);
diff --git a/torch_utils/ops/filtered_lrelu_rd.cu b/torch_utils/ops/filtered_lrelu_rd.cu
new file mode 100644
index 0000000000000000000000000000000000000000..968347882e9aebd36204f67e201cd16226dd9132
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu_rd.cu
@@ -0,0 +1,27 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "filtered_lrelu.cu"
+
+// Template/kernel specializations for sign read mode.
+
+// Full op, 32-bit indexing.
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int32_t, false, true>(const filtered_lrelu_kernel_params& p, int sharedKB);
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float, int32_t, false, true>(const filtered_lrelu_kernel_params& p, int sharedKB);
+
+// Full op, 64-bit indexing.
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int64_t, false, true>(const filtered_lrelu_kernel_params& p, int sharedKB);
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float, int64_t, false, true>(const filtered_lrelu_kernel_params& p, int sharedKB);
+
+// Activation/signs only for generic variant. 64-bit indexing.
+template void* choose_filtered_lrelu_act_kernel<c10::Half, false, true>(void);
+template void* choose_filtered_lrelu_act_kernel<float, false, true>(void);
+template void* choose_filtered_lrelu_act_kernel<double, false, true>(void);
+
+// Copy filters to constant memory.
+template cudaError_t copy_filters<false, true>(cudaStream_t stream);
diff --git a/torch_utils/ops/filtered_lrelu_wr.cu b/torch_utils/ops/filtered_lrelu_wr.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a4c6a24aae908bc07248f7ff710cbd1a11a38bb1
--- /dev/null
+++ b/torch_utils/ops/filtered_lrelu_wr.cu
@@ -0,0 +1,27 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "filtered_lrelu.cu"
+
+// Template/kernel specializations for sign write mode.
+
+// Full op, 32-bit indexing.
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int32_t, true, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float, int32_t, true, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+
+// Full op, 64-bit indexing.
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<c10::Half, int64_t, true, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+template filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel<float, int64_t, true, false>(const filtered_lrelu_kernel_params& p, int sharedKB);
+
+// Activation/signs only for generic variant. 64-bit indexing.
+template void* choose_filtered_lrelu_act_kernel<c10::Half, true, false>(void);
+template void* choose_filtered_lrelu_act_kernel<float, true, false>(void);
+template void* choose_filtered_lrelu_act_kernel<double, true, false>(void);
+
+// Copy filters to constant memory.
+template cudaError_t copy_filters<true, false>(cudaStream_t stream);
diff --git a/torch_utils/ops/fma.py b/torch_utils/ops/fma.py
new file mode 100644
index 0000000000000000000000000000000000000000..51a45dfa0829987e8ee5214663e068cb3af2a8b9
--- /dev/null
+++ b/torch_utils/ops/fma.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Fused multiply-add, with slightly faster gradients than `torch.addcmul()`."""
+
+import torch
+
+#----------------------------------------------------------------------------
+
+def fma(a, b, c): # => a * b + c
+ return _FusedMultiplyAdd.apply(a, b, c)
+
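+# Illustrative check (not part of the original file): fma() matches the eager expression
+# while broadcasting `c` just like torch.addcmul(), e.g.
+#
+#   a, b, c = torch.randn(8, 4), torch.randn(8, 4), torch.randn(4)
+#   assert torch.allclose(fma(a, b, c), a * b + c)
+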
+#----------------------------------------------------------------------------
+
+class _FusedMultiplyAdd(torch.autograd.Function): # a * b + c
+ @staticmethod
+ def forward(ctx, a, b, c): # pylint: disable=arguments-differ
+ out = torch.addcmul(c, a, b)
+ ctx.save_for_backward(a, b)
+ ctx.c_shape = c.shape
+ return out
+
+ @staticmethod
+ def backward(ctx, dout): # pylint: disable=arguments-differ
+ a, b = ctx.saved_tensors
+ c_shape = ctx.c_shape
+ da = None
+ db = None
+ dc = None
+
+ if ctx.needs_input_grad[0]:
+ da = _unbroadcast(dout * b, a.shape)
+
+ if ctx.needs_input_grad[1]:
+ db = _unbroadcast(dout * a, b.shape)
+
+ if ctx.needs_input_grad[2]:
+ dc = _unbroadcast(dout, c_shape)
+
+ return da, db, dc
+
+#----------------------------------------------------------------------------
+
+def _unbroadcast(x, shape):
+ extra_dims = x.ndim - len(shape)
+ assert extra_dims >= 0
+ dim = [i for i in range(x.ndim) if x.shape[i] > 1 and (i < extra_dims or shape[i - extra_dims] == 1)]
+ if len(dim):
+ x = x.sum(dim=dim, keepdim=True)
+ if extra_dims:
+ x = x.reshape(-1, *x.shape[extra_dims+1:])
+ assert x.shape == shape
+ return x
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/ops/grid_sample_gradfix.py b/torch_utils/ops/grid_sample_gradfix.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9f9befa1cd7b0bc63186ca6d6456c0f11b5dc67
--- /dev/null
+++ b/torch_utils/ops/grid_sample_gradfix.py
@@ -0,0 +1,83 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Custom replacement for `torch.nn.functional.grid_sample` that
+supports arbitrarily high order gradients between the input and output.
+Works on 2D images (4D tensors) and 3D volumes (5D tensors) and assumes
+`mode='bilinear'`, `padding_mode='zeros'`, `align_corners=False`."""
+
+import torch
+
+# pylint: disable=redefined-builtin
+# pylint: disable=arguments-differ
+# pylint: disable=protected-access
+
+#----------------------------------------------------------------------------
+
+enabled = True # Enable the custom op by setting this to true.
+
+#----------------------------------------------------------------------------
+
+def grid_sample(input, grid):
+ if _should_use_custom_op():
+ return _GridSampleForward.apply(input, grid)
+ return torch.nn.functional.grid_sample(input=input, grid=grid, mode='bilinear', padding_mode='zeros', align_corners=False)
+
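+# Illustrative usage (shapes follow `torch.nn.functional.grid_sample`): for feature maps
+# `img` of shape [N, C, H, W] and sampling locations `grid` of shape [N, H_out, W_out, 2]
+# with x/y coordinates in [-1, 1],
+#
+#   out = grid_sample(img, grid)   # -> [N, C, H_out, W_out], differentiable to any order
+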
+#----------------------------------------------------------------------------
+
+def _should_use_custom_op():
+ return enabled
+
+#----------------------------------------------------------------------------
+
+class _GridSampleForward(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, input, grid):
+ assert input.ndim == 4 or input.ndim == 5
+ assert grid.ndim == 4 or grid.ndim == 5
+ output = torch.nn.functional.grid_sample(input=input, grid=grid, mode='bilinear', padding_mode='zeros', align_corners=False)
+ ctx.save_for_backward(input, grid)
+ return output
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ input, grid = ctx.saved_tensors
+ grad_input, grad_grid = _GridSampleBackward.apply(grad_output, input, grid)
+ return grad_input, grad_grid
+
+#----------------------------------------------------------------------------
+
+class _GridSampleBackward(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, grad_output, input, grid):
+ if input.ndim == 4:
+ op = torch._C._jit_get_operation('aten::grid_sampler_2d_backward')
+ else:
+ op = torch._C._jit_get_operation('aten::grid_sampler_3d_backward')
+ grad_input, grad_grid = op(grad_output, input, grid, 0, 0, False)
+ ctx.save_for_backward(grid)
+ return grad_input, grad_grid
+
+ @staticmethod
+ def backward(ctx, grad2_grad_input, grad2_grad_grid):
+ _ = grad2_grad_grid # unused
+ grid, = ctx.saved_tensors
+ grad2_grad_output = None
+ grad2_input = None
+ grad2_grid = None
+
+ if ctx.needs_input_grad[0]:
+ grad2_grad_output = _GridSampleForward.apply(grad2_grad_input, grid)
+
+ assert not ctx.needs_input_grad[2]
+ return grad2_grad_output, grad2_input, grad2_grid
+
+#----------------------------------------------------------------------------
diff --git a/torch_utils/ops/hash_sample.cpp b/torch_utils/ops/hash_sample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..93d684957e2610c2937c3d7ad3c6f62c8e9977f9
--- /dev/null
+++ b/torch_utils/ops/hash_sample.cpp
@@ -0,0 +1,69 @@
+// Copyright (c) Facebook, Inc. and its affiliates.All Rights Reserved
+
+// Please refer to original code: https://github.com/NVlabs/instant-ngp
+// and the pytorch wrapper from https://github.com/ashawkey/torch-ngp
+
+#include <stdint.h>
+#include <torch/torch.h>
+#include <torch/extension.h>
+
+#include "hash_sample.h"
+#include "utils.h"
+
+void hash_encode_forward(at::Tensor inputs, at::Tensor embeddings, at::Tensor offsets, at::Tensor outputs, const float beta, const uint32_t B, const uint32_t N, const uint32_t D, const uint32_t C, const uint32_t L, const uint32_t H, const bool calc_grad_inputs, at::Tensor dy_dx, const uint32_t mode) {
+ CHECK_CUDA(inputs);
+ CHECK_CUDA(embeddings);
+ CHECK_CUDA(offsets);
+ CHECK_CUDA(outputs);
+ CHECK_CUDA(dy_dx);
+
+ CHECK_CONTIGUOUS(inputs);
+ CHECK_CONTIGUOUS(embeddings);
+ CHECK_CONTIGUOUS(offsets);
+ CHECK_CONTIGUOUS(outputs);
+ CHECK_CONTIGUOUS(dy_dx);
+
+ CHECK_IS_FLOAT(inputs);
+ CHECK_IS_FLOAT(embeddings);
+ CHECK_IS_INT(offsets);
+ CHECK_IS_FLOAT(outputs);
+ CHECK_IS_FLOAT(dy_dx);
+
+ hash_encode_forward_cuda(inputs.data_ptr<float>(), embeddings.data_ptr<float>(), offsets.data_ptr<int>(), outputs.data_ptr<float>(), beta, B, N, D, C, L, H, calc_grad_inputs, dy_dx.data_ptr<float>(), mode);
+}
+
+void hash_encode_backward(at::Tensor grad, at::Tensor inputs, at::Tensor embeddings, at::Tensor offsets, at::Tensor grad_embeddings, const float beta, const uint32_t B, const uint32_t N, const uint32_t D, const uint32_t C, const uint32_t L, const uint32_t H, const bool calc_grad_inputs, at::Tensor dy_dx, at::Tensor grad_inputs, const uint32_t mode) {
+ CHECK_CUDA(grad);
+ CHECK_CUDA(inputs);
+ CHECK_CUDA(embeddings);
+ CHECK_CUDA(offsets);
+ CHECK_CUDA(grad_embeddings);
+ CHECK_CUDA(dy_dx);
+ CHECK_CUDA(grad_inputs);
+
+ CHECK_CONTIGUOUS(grad);
+ CHECK_CONTIGUOUS(inputs);
+ CHECK_CONTIGUOUS(embeddings);
+ CHECK_CONTIGUOUS(offsets);
+ CHECK_CONTIGUOUS(grad_embeddings);
+ CHECK_CONTIGUOUS(dy_dx);
+ CHECK_CONTIGUOUS(grad_inputs);
+
+ CHECK_IS_FLOAT(grad);
+ CHECK_IS_FLOAT(inputs);
+ CHECK_IS_FLOAT(embeddings);
+ CHECK_IS_INT(offsets);
+ CHECK_IS_FLOAT(grad_embeddings);
+ CHECK_IS_FLOAT(dy_dx);
+ CHECK_IS_FLOAT(grad_inputs);
+
+ hash_encode_backward_cuda(grad.data_ptr<float>(), inputs.data_ptr<float>(), embeddings.data_ptr<float>(), offsets.data_ptr<int>(), grad_embeddings.data_ptr<float>(), beta, B, N, D, C, L, H, calc_grad_inputs, dy_dx.data_ptr<float>(), grad_inputs.data_ptr<float>(), mode);
+}
+
+
+//------------------------------------------------------------------------
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("hash_encode_forward", &hash_encode_forward, "hash encode forward (CUDA)");
+ m.def("hash_encode_backward", &hash_encode_backward, "hash encode backward (CUDA)");
+}
diff --git a/torch_utils/ops/hash_sample.cu b/torch_utils/ops/hash_sample.cu
new file mode 100644
index 0000000000000000000000000000000000000000..27e73d86cc2977e37a046fd18367a238948681ed
--- /dev/null
+++ b/torch_utils/ops/hash_sample.cu
@@ -0,0 +1,361 @@
+// Copyright (c) Facebook, Inc. and its affiliates.All Rights Reserved
+
+
+// Please refer to original code: https://github.com/NVlabs/instant-ngp
+// and the pytorch wrapper from https://github.com/ashawkey/torch-ngp
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <stdexcept>
+
+#include <stdint.h>
+
+template <typename T>
+__host__ __device__ T div_round_up(T val, T divisor) {
+ return (val + divisor - 1) / divisor;
+}
+
+
+template <uint32_t D>
+__device__ uint32_t fast_hash(const uint32_t pos_grid[D]) {
+ static_assert(D <= 7, "fast_hash can only hash up to 7 dimensions.");
+
+ // While 1 is technically not a good prime for hashing (or a prime at all), it helps memory coherence
+ // and is sufficient for our use case of obtaining a uniformly colliding index from high-dimensional
+ // coordinates.
+ constexpr uint32_t primes[7] = { 1, 19349663, 83492791, 25165843, 6291469, 12582917, 3145739 };
+
+ uint32_t result = 0;
+ #pragma unroll
+ for (uint32_t i = 0; i < D; ++i) {
+ result ^= pos_grid[i] * primes[i];
+ }
+
+ return result;
+}
+
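+// For example, in 3D the hash of a lattice corner (x, y, z) is
+// x ^ (y * 19349663) ^ (z * 83492791); the caller reduces it modulo the level's table size.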
+
+template <uint32_t D, uint32_t C>
+__device__ uint32_t get_grid_index(const uint32_t ch, const uint32_t hashmap_size, const uint32_t resolution, const uint32_t pos_grid[D], const uint32_t mode) {
+ uint32_t stride = 1;
+ uint32_t index = 0;
+
+ switch(mode) {
+ case 0: // fast-hash
+ #pragma unroll
+ for (uint32_t d = 0; d < D && stride <= hashmap_size; d++) {
+ // printf("get_grid_index d=%d, pos_grid[d]=%d, stride=%d, reso=%d\n", d, pos_grid[d], stride, resolution);
+ index += pos_grid[d] * stride;
+ stride *= (resolution + 1);
+ }
+ if (stride > hashmap_size) {
+ //printf("hash because %d > %d\n", stride, hashmap_size);
+ index = fast_hash<D>(pos_grid);
+ //printf("hashed (%d, %d) = %d to %d in %d\n", pos_grid[0], pos_grid[1], pos_grid[0] + resolution * pos_grid[1], index % hashmap_size, hashmap_size);
+ }
+ index = index % hashmap_size; break;
+
+ case 1: // grid-hash
+ uint32_t h_res = (uint32_t)cbrtf(hashmap_size);
+ #pragma unroll
+ for (uint32_t d = 0; d < D; d++) {
+ index += (pos_grid[d] % h_res) * stride;
+ stride *= h_res;
+ }
+ break;
+ }
+ return index * C + ch;
+}
+
+
+template <uint32_t D, uint32_t C>
+__global__ void kernel_grid(
+ const float * __restrict__ inputs,
+ const float * __restrict__ grid,
+ const int * __restrict__ offsets,
+ float * outputs,
+ const float beta,
+ uint32_t B, uint32_t N,
+ uint32_t L, uint32_t H,
+ const bool calc_grad_inputs,
+ float * dy_dx,
+ uint32_t mode) {
+
+ const uint32_t b = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (b >= N) return;
+
+ const uint32_t level = blockIdx.y;
+ const uint32_t batch_id = blockIdx.z;
+ const uint32_t batch_offset_grid = offsets[L] * batch_id;
+ const uint32_t batch_offset_inputs = N * batch_id;
+
+ // locate
+ grid += ((uint32_t)offsets[level] + batch_offset_grid) * C;
+ inputs += ( b + batch_offset_inputs) * D;
+ outputs += ((b + batch_offset_inputs) * L + level) * C;
+
+ const uint32_t hashmap_size = offsets[level + 1] - offsets[level];
+
+ // const float scale = exp2f(level) * H - 1.0f;
+ const float scale = powf(beta, level) * H - 1.0f;
+ const uint32_t resolution = (uint32_t)ceil(scale) + 1;
+ // const float scale = powf(beta, level) * H;
+ // const uint32_t resolution = (uint32_t)ceil(scale);
+
+ // calculate coordinate
+ float pos[D];
+ uint32_t pos_grid[D];
+
+ #pragma unroll
+ for (uint32_t d = 0; d < D; d++) {
+ pos[d] = inputs[d] * scale + 0.5f;
+ pos_grid[d] = floorf(pos[d]);
+ pos[d] -= (float)pos_grid[d];
+ }
+
+ // printf("[b=%d, l=%d] pos=(%f, %f)+(%d, %d) scale=%f \n", b, level, pos[0], pos[1], pos_grid[0], pos_grid[1], scale);
+
+ // interpolate
+ #pragma unroll
+ for (uint32_t idx = 0; idx < (1 << D); idx++) {
+ float w = 1;
+ uint32_t pos_grid_local[D];
+
+ #pragma unroll
+ for (uint32_t d = 0; d < D; d++) {
+ if ((idx & (1 << d)) == 0) {
+ w *= 1 - pos[d];
+ pos_grid_local[d] = pos_grid[d];
+ } else {
+ w *= pos[d];
+ pos_grid_local[d] = pos_grid[d] + 1;
+ }
+ }
+
+ uint32_t index = get_grid_index<D, C>(0, hashmap_size, resolution, pos_grid_local, mode);
+
+ #pragma unroll
+ for (uint32_t ch = 0; ch < C; ch++) {
+ outputs[ch] += w * grid[index + ch];
+ }
+
+ //printf("[b=%d, l=%d] int %d, idx %d, w %f, val %f\n", b, level, idx, index, w, grid[index]);
+ }
+
+ // prepare dy_dx for calc_grad_inputs
+ if (calc_grad_inputs) {
+
+ // dy_dx += b * D * L * C + level * D * C; // B N L D C
+ dy_dx += ((b + batch_offset_inputs) * L + level) * D * C;
+
+ #pragma unroll
+ for (uint32_t gd = 0; gd < D; gd++) {
+
+ #pragma unroll
+ for (uint32_t idx = 0; idx < (1 << (D - 1)); idx++) {
+ float w = scale;
+ uint32_t pos_grid_local[D];
+
+ #pragma unroll
+ for (uint32_t nd = 0; nd < D - 1; nd++) {
+ const uint32_t d = nd > gd ? nd + 1 : nd;
+
+ if ((idx & (1 << nd)) == 0) {
+ w *= 1 - pos[d];
+ pos_grid_local[d] = pos_grid[d];
+ } else {
+ w *= pos[d];
+ pos_grid_local[d] = pos_grid[d] + 1;
+ }
+ }
+
+ pos_grid_local[gd] = pos_grid[gd];
+ uint32_t index_left = get_grid_index<D, C>(0, hashmap_size, resolution, pos_grid_local, mode);
+ pos_grid_local[gd] = pos_grid[gd] + 1;
+ uint32_t index_right = get_grid_index<D, C>(0, hashmap_size, resolution, pos_grid_local, mode);
+
+ #pragma unroll
+ for (uint32_t ch = 0; ch < C; ch++) {
+ dy_dx[gd * C + ch] += w * (grid[index_right + ch] - grid[index_left + ch]);
+ }
+ }
+ }
+ }
+}
+
+
+template <uint32_t D, uint32_t C, uint32_t N_C>
+__global__ void kernel_grid_backward(
+ const float * __restrict__ grad,
+ const float * __restrict__ inputs,
+ const float * __restrict__ grid,
+ const int * __restrict__ offsets,
+ float * grad_grid,
+ const float beta,
+ uint32_t B, uint32_t N,
+ uint32_t L, uint32_t H,
+ uint32_t mode
+) {
+ const uint32_t b = (blockIdx.x * blockDim.x + threadIdx.x) * N_C / C;
+ if (b >= N) return;
+
+ const uint32_t level = blockIdx.y;
+ const uint32_t ch = (blockIdx.x * blockDim.x + threadIdx.x) * N_C - b * C;
+ const uint32_t batch_id = blockIdx.z;
+ const uint32_t batch_offset_grid = offsets[L] * batch_id;
+ const uint32_t batch_offset_inputs = N * batch_id;
+
+ // locate
+ grad_grid += ((uint32_t)offsets[level] + batch_offset_grid) * C;
+ inputs += ( b + batch_offset_inputs) * D;
+ grad += ((b + batch_offset_inputs) * L + level) * C + ch;
+
+ const uint32_t hashmap_size = offsets[level + 1] - offsets[level];
+ // const float scale = exp2f(level) * H - 1.0f;
+ const float scale = powf(beta, level) * H - 1.0f;
+ const uint32_t resolution = (uint32_t)ceil(scale) + 1;
+
+ // calculate coordinate
+ float pos[D];
+ uint32_t pos_grid[D];
+
+ #pragma unroll
+ for (uint32_t d = 0; d < D; d++) {
+ pos[d] = inputs[d] * scale + 0.5f;
+ pos_grid[d] = floorf(pos[d]);
+ pos[d] -= (float)pos_grid[d];
+ }
+
+ // interpolate
+ #pragma unroll
+ for (uint32_t idx = 0; idx < (1 << D); idx++) {
+ float w = 1;
+ uint32_t pos_grid_local[D];
+
+ #pragma unroll
+ for (uint32_t d = 0; d < D; d++) {
+ if ((idx & (1 << d)) == 0) {
+ w *= 1 - pos[d];
+ pos_grid_local[d] = pos_grid[d];
+ } else {
+ w *= pos[d];
+ pos_grid_local[d] = pos_grid[d] + 1;
+ }
+ }
+
+ uint32_t index = get_grid_index<D, C>(ch, hashmap_size, resolution, pos_grid_local, mode);
+
+ #pragma unroll
+ for (uint32_t c = 0; c < N_C; c++) {
+ atomicAdd(&grad_grid[index + c], w * grad[c]);
+ }
+ }
+}
+
+
+template <uint32_t D, uint32_t C>
+__global__ void kernel_input_backward(
+ const float * __restrict__ grad,
+ const float * __restrict__ dy_dx,
+ float * grad_inputs,
+ uint32_t B, uint32_t N, uint32_t L
+) {
+ const uint32_t t = threadIdx.x + blockIdx.x * blockDim.x;
+ if (t >= N * D) return;
+
+ const uint32_t b = t / D;
+ const uint32_t d = t - b * D;
+ const uint32_t batch_id = blockIdx.y;
+ const uint32_t batch_offset_inputs = N * batch_id;
+
+ grad += (b + batch_offset_inputs) * L * C;
+ dy_dx += (b + batch_offset_inputs) * L * D * C;
+ grad_inputs += N * D * batch_id;
+
+ # pragma unroll
+ for (int l = 0; l < L; l++) {
+ # pragma unroll
+ for (int ch = 0; ch < C; ch++) {
+ grad_inputs[t] += grad[l * C + ch] * dy_dx[l * D * C + d * C + ch];
+ }
+ }
+}
+
+
+template <uint32_t D>
+void kernel_grid_wrapper(const float *inputs, const float *embeddings, const int *offsets, float *outputs, const float beta, const uint32_t B, const uint32_t N, const uint32_t C, const uint32_t L, const uint32_t H, const bool calc_grad_inputs, float *dy_dx, const uint32_t mode) {
+ static constexpr uint32_t N_THREAD = 512;
+ const dim3 blocks_hashgrid = { div_round_up(N, N_THREAD), L, B};
+ switch (C) {
+ case 1: kernel_grid<D, 1><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, beta, B, N, L, H, calc_grad_inputs, dy_dx, mode); break;
+ case 2: kernel_grid<D, 2><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, beta, B, N, L, H, calc_grad_inputs, dy_dx, mode); break;
+ case 4: kernel_grid<D, 4><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, beta, B, N, L, H, calc_grad_inputs, dy_dx, mode); break;
+ case 8: kernel_grid<D, 8><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, beta, B, N, L, H, calc_grad_inputs, dy_dx, mode); break;
+ case 32: kernel_grid<D, 32><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, beta, B, N, L, H, calc_grad_inputs, dy_dx, mode); break;
+ default: throw std::runtime_error{"GridEncoding: C must be 1, 2, 4, 8, 32"};
+ }
+}
+
+// inputs: [B, D], float, in [0, 1]
+// embeddings: [sO, C], float
+// offsets: [L + 1], uint32_t
+// outputs: [B, L * C], float
+// H: base resolution
+void hash_encode_forward_cuda(const float *inputs, const float *embeddings, const int *offsets, float *outputs, const float beta, const uint32_t B, const uint32_t N, const uint32_t D, const uint32_t C, const uint32_t L, const uint32_t H, const bool calc_grad_inputs, float *dy_dx, const uint32_t mode) {
+ switch (D) {
+ case 2: kernel_grid_wrapper<2>(inputs, embeddings, offsets, outputs, beta, B, N, C, L, H, calc_grad_inputs, dy_dx, mode); break;
+ case 3: kernel_grid_wrapper<3>(inputs, embeddings, offsets, outputs, beta, B, N, C, L, H, calc_grad_inputs, dy_dx, mode); break;
+ default: throw std::runtime_error{"We only support 2D or 3D data for now."};
+ }
+
+}
+
+template <uint32_t D>
+void kernel_grid_backward_wrapper(const float *grad, const float *inputs, const float *embeddings, const int *offsets, float *grad_embeddings, const float beta, const uint32_t B, const uint32_t N, const uint32_t C, const uint32_t L, const uint32_t H, const bool calc_grad_inputs, float *dy_dx, float *grad_inputs, const uint32_t mode) {
+ static constexpr uint32_t N_THREAD = 256;
+ const uint32_t N_C = std::min(2u, C); // n_features_per_thread
+ const dim3 blocks_hashgrid = {div_round_up(N * C / N_C, N_THREAD), L, B}; // batch x sample x level
+ const dim3 input_blocks_hashgrid = {div_round_up(N * D, N_THREAD), B, 1};
+ switch (C) {
+ case 1:
+ kernel_grid_backward<D, 1, 1><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, beta, B, N, L, H, mode);
+ if (calc_grad_inputs) kernel_input_backward<D, 1><<<input_blocks_hashgrid, N_THREAD>>>(grad, dy_dx, grad_inputs, B, N, L);
+ break;
+ case 2:
+ kernel_grid_backward<D, 2, 2><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, beta, B, N, L, H, mode);
+ if (calc_grad_inputs) kernel_input_backward<D, 2><<<input_blocks_hashgrid, N_THREAD>>>(grad, dy_dx, grad_inputs, B, N, L);
+ break;
+ case 4:
+ kernel_grid_backward<D, 4, 2><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, beta, B, N, L, H, mode);
+ if (calc_grad_inputs) kernel_input_backward<D, 4><<<input_blocks_hashgrid, N_THREAD>>>(grad, dy_dx, grad_inputs, B, N, L);
+ break;
+ case 8:
+ kernel_grid_backward<D, 8, 2><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, beta, B, N, L, H, mode);
+ if (calc_grad_inputs) kernel_input_backward<D, 8><<<input_blocks_hashgrid, N_THREAD>>>(grad, dy_dx, grad_inputs, B, N, L);
+ break;
+ case 32:
+ kernel_grid_backward<D, 32, 2><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, beta, B, N, L, H, mode);
+ if (calc_grad_inputs) kernel_input_backward<D, 32><<<input_blocks_hashgrid, N_THREAD>>>(grad, dy_dx, grad_inputs, B, N, L);
+ break;
+ default: throw std::runtime_error{"GridEncoding: C must be 1, 2, 4, 8, or 32."};
+ }
+}
+
+
+// grad: [B, L * C], float
+// inputs: [B, D], float, in [0, 1]
+// embeddings: [sO, C], float
+// offsets: [L + 1], uint32_t
+// grad_embeddings: [sO, C]
+// H: base resolution
+void hash_encode_backward_cuda(const float *grad, const float *inputs, const float *embeddings, const int *offsets, float *grad_embeddings, const float beta, const uint32_t B, const uint32_t N, const uint32_t D, const uint32_t C, const uint32_t L, const uint32_t H, const bool calc_grad_inputs, float *dy_dx, float *grad_inputs, const uint32_t mode) {
+ switch (D) {
+ case 2: kernel_grid_backward_wrapper<2>(grad, inputs, embeddings, offsets, grad_embeddings, beta, B, N, C, L, H, calc_grad_inputs, dy_dx, grad_inputs, mode); break;
+ case 3: kernel_grid_backward_wrapper<3>(grad, inputs, embeddings, offsets, grad_embeddings, beta, B, N, C, L, H, calc_grad_inputs, dy_dx, grad_inputs, mode); break;
+ default: throw std::runtime_error{"We only support 2D or 3D data for now."};
+ }
+}
\ No newline at end of file
diff --git a/torch_utils/ops/hash_sample.h b/torch_utils/ops/hash_sample.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1d32e6e12372fbfc8606219ac6051d8f59bd298
--- /dev/null
+++ b/torch_utils/ops/hash_sample.h
@@ -0,0 +1,23 @@
+// Copyright (c) Facebook, Inc. and its affiliates.All Rights Reserved
+
+
+// Please refer to original code: https://github.com/NVlabs/instant-ngp
+// and the pytorch wrapper from https://github.com/ashawkey/torch-ngp
+
+#ifndef _HASH_SAMPLE_H
+#define _HASH_SAMPLE_H
+
+#include <stdint.h>
+#include <torch/torch.h>
+#include <torch/extension.h>
+
+// inputs: [B, N, D], float, in [0, 1]
+// embeddings: [B, sO, C], float
+// offsets: [L + 1], uint32_t
+// outputs: [B, N, L * C], float
+// H: base resolution
+void hash_encode_forward(at::Tensor inputs, at::Tensor embeddings, at::Tensor offsets, at::Tensor outputs, const float beta, const uint32_t B, const uint32_t N, const uint32_t D, const uint32_t C, const uint32_t L, const uint32_t H, const bool calc_grad_inputs, at::Tensor dy_dx, const uint32_t mode);
+void hash_encode_backward(at::Tensor grad, at::Tensor inputs, at::Tensor embeddings, at::Tensor offsets, at::Tensor grad_embeddings, const float beta, const uint32_t B, const uint32_t N, const uint32_t D, const uint32_t C, const uint32_t L, const uint32_t H, const bool calc_grad_inputs, at::Tensor dy_dx, at::Tensor grad_inputs, const uint32_t mode);
+void hash_encode_forward_cuda(const float *inputs, const float *embeddings, const int *offsets, float *outputs, const float beta, const uint32_t B, const uint32_t N, const uint32_t D, const uint32_t C, const uint32_t L, const uint32_t H, const bool calc_grad_inputs, float *dy_dx, const uint32_t mode);
+void hash_encode_backward_cuda(const float *grad, const float *inputs, const float *embeddings, const int *offsets, float *grad_embeddings, const float beta, const uint32_t B, const uint32_t N, const uint32_t D, const uint32_t C, const uint32_t L, const uint32_t H, const bool calc_grad_inputs, float *dy_dx, float *grad_inputs, const uint32_t mode);
+#endif
\ No newline at end of file
diff --git a/torch_utils/ops/hash_sample.py b/torch_utils/ops/hash_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2a3379740e2f51aa5c74772823ef12b99c88d4e
--- /dev/null
+++ b/torch_utils/ops/hash_sample.py
@@ -0,0 +1,116 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Please refer to original code: https://github.com/NVlabs/instant-ngp
+# and the pytorch wrapper from https://github.com/ashawkey/torch-ngp
+
+import os
+import torch
+
+from .. import custom_ops
+from torch.cuda.amp import custom_bwd, custom_fwd
+
+_plugin = None
+_null_tensor = torch.empty([0])
+
+def _init():
+ global _plugin
+ if _plugin is None:
+ _plugin = custom_ops.get_plugin(
+ module_name='hash_sample_plugin',
+ sources=['hash_sample.cpp', 'hash_sample.cu'],
+ headers=['hash_sample.h', 'utils.h'],
+ source_dir=os.path.dirname(__file__),
+ extra_cuda_cflags=['--use_fast_math'],
+ )
+ return True
+
+
+def hash_sample(x, h, offsets, beta=2, base_res=16, calc_grad=True, mode='fast_hash'):
+ """Hash-table look up and d-linear interpolation
+ x: B x N x D coordinates
+ h: B x L x T x C hash-tables
+ offsets: L resolutions
+ """
+ assert x.device.type == 'cuda'
+ assert (x.size(-1) == 3) or (x.size(-1) == 2), "currently only 2D/3D is implemented"
+ _init()
+ return _hash_sample_cuda(mode).apply(x, h, offsets, beta, base_res, calc_grad)
+
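+# Illustrative call (a sketch; tensor names are hypothetical): with L levels of C-dim
+# features whose tables are concatenated into `h`,
+#
+#   feats = hash_sample(xyz, h, offsets, beta=1.5, base_res=16)   # xyz: [B, N, 3] in [0, 1]
+#
+# returns `feats` of shape [B, N, L * C], one interpolated C-vector per level, concatenated.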
+
+_hash_sample_cuda_cache = dict()
+
+def _hash_sample_cuda(mode='fast_hash'):
+ """CUDA implementation of hash-table look-up
+ """
+ if mode in _hash_sample_cuda_cache:
+ return _hash_sample_cuda_cache[mode]
+
+ if mode == 'fast_hash':
+ h_mode = 0
+ elif mode == 'grid_hash':
+ h_mode = 1
+ else:
+ raise NotImplementedError("only 'fast_hash' and 'grid_hash' modes are supported.")
+
+ class HashSampleCuda(torch.autograd.Function):
+ @staticmethod
+ @custom_fwd(cast_inputs=torch.half)
+ def forward(ctx, inputs, embeddings, offsets, beta, base_resolution, calc_grad_inputs=False):
+ # inputs: [B, N, D], float in [0, 1]
+ # embeddings: [B, sO, C], float
+ # offsets: [L + 1], int
+ # RETURN: [B, N, F], float
+
+ inputs = inputs.contiguous()
+ embeddings = embeddings.contiguous()
+ offsets = offsets.contiguous().to(inputs.device)
+
+ B, N, D = inputs.shape # batch size, # of samples, coord dim
+ L = offsets.shape[0] - 1 # level
+ C = embeddings.shape[-1] # embedding dim for each level
+ H = base_resolution # base resolution
+
+ outputs = torch.zeros(B, N, L * C, device=inputs.device, dtype=inputs.dtype)
+
+ if calc_grad_inputs:
+ dy_dx = torch.zeros(B, N, L * D * C).to(inputs.device, dtype=inputs.dtype)
+ else:
+ dy_dx = torch.zeros(1).to(inputs.device, dtype=inputs.dtype)
+
+ _plugin.hash_encode_forward(inputs, embeddings, offsets, outputs, beta, B, N, D, C, L, H, calc_grad_inputs, dy_dx, h_mode)
+
+ ctx.save_for_backward(inputs, embeddings, offsets, dy_dx)
+ ctx.dims = [B, N, D, C, L, H, beta]
+ ctx.calc_grad_inputs = calc_grad_inputs
+
+ return outputs
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, grad):
+ # grad: [B, N, L * C]
+
+ grad = grad.contiguous()
+
+ inputs, embeddings, offsets, dy_dx = ctx.saved_tensors
+ B, N, D, C, L, H, beta = ctx.dims
+ calc_grad_inputs = ctx.calc_grad_inputs
+
+ grad_embeddings = torch.zeros_like(embeddings)
+
+ if calc_grad_inputs:
+ grad_inputs = torch.zeros_like(inputs)
+ else:
+ grad_inputs = torch.zeros(1).to(inputs.device, dtype=inputs.dtype)
+
+ _plugin.hash_encode_backward(grad, inputs, embeddings, offsets, grad_embeddings, beta, B, N, D, C, L, H, calc_grad_inputs, dy_dx, grad_inputs, h_mode)
+
+ if calc_grad_inputs:
+ return grad_inputs, grad_embeddings, None, None, None, None
+ else:
+ return None, grad_embeddings, None, None, None, None
+
+
+ # Add to cache.
+ _hash_sample_cuda_cache[mode] = HashSampleCuda
+ return HashSampleCuda
\ No newline at end of file
diff --git a/torch_utils/ops/nerf_utils.cu b/torch_utils/ops/nerf_utils.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9e1330efd2b31248e4d4b5c86b3e299d87a4d919
--- /dev/null
+++ b/torch_utils/ops/nerf_utils.cu
@@ -0,0 +1,85 @@
+// Copyright (c) Facebook, Inc. and its affiliates.All Rights Reserved
+
+
+#include
+#include
+#include
+
+#include