kevinwang676 committed · Commit 9016314 · Parent(s): b40bf00

Upload 93 files

Note: this view is limited to 50 files because the commit contains too many changes; see the raw diff for the full changeset.
- .gitattributes +4 -0
- Phoneme_Hallucinator_v2/.gitignore +6 -0
- Phoneme_Hallucinator_v2/.vscode/launch.json +26 -0
- Phoneme_Hallucinator_v2/Phoneme Hallucinator DEMO.ipynb +0 -0
- Phoneme_Hallucinator_v2/README.md +36 -0
- Phoneme_Hallucinator_v2/__pycache__/__init__.cpython-310.pyc +0 -0
- Phoneme_Hallucinator_v2/datasets/__init__.py +27 -0
- Phoneme_Hallucinator_v2/datasets/__pycache__/__init__.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/datasets/__pycache__/speech.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/datasets/speech.py +278 -0
- Phoneme_Hallucinator_v2/evaluation/ASR-Eval.ipynb +0 -0
- Phoneme_Hallucinator_v2/evaluation/ASR.ipynb +0 -0
- Phoneme_Hallucinator_v2/evaluation/init +1 -0
- Phoneme_Hallucinator_v2/exp/speech_XXL_cond/params.json +33 -0
- Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/checkpoint +3 -0
- Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/last.ckpt.data-00000-of-00001 +3 -0
- Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/last.ckpt.index +0 -0
- Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/last.ckpt.meta +3 -0
- Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/params.ckpt.data-00000-of-00001 +3 -0
- Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/params.ckpt.index +0 -0
- Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/params.ckpt.meta +3 -0
- Phoneme_Hallucinator_v2/models/__init__.py +9 -0
- Phoneme_Hallucinator_v2/models/__pycache__/__init__.cpython-310.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/__init__.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/__init__.cpython-37.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/base.cpython-310.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/base.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/base.cpython-37.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/cVAE.cpython-310.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/cVAE.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/cVAE.cpython-37.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/networks.cpython-310.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/networks.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/networks.cpython-37.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/pc_acset.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/pc_acset_vae.cpython-310.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/pc_acset_vae.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/pc_acset_vae.cpython-37.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/pc_acset_vae_06.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/pc_encoder.cpython-310.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/pc_encoder.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/pc_encoder.cpython-37.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/runner.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/set_transformer.cpython-310.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/set_transformer.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/set_transformer.cpython-37.pyc +0 -0
- Phoneme_Hallucinator_v2/models/__pycache__/utils.cpython-36.pyc +0 -0
- Phoneme_Hallucinator_v2/models/base.py +103 -0
- Phoneme_Hallucinator_v2/models/cVAE.py +45 -0
- Phoneme_Hallucinator_v2/models/flow/__init__.py +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/last.ckpt.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/last.ckpt.meta filter=lfs diff=lfs merge=lfs -text
+Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/params.ckpt.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/params.ckpt.meta filter=lfs diff=lfs merge=lfs -text
Phoneme_Hallucinator_v2/.gitignore
ADDED
@@ -0,0 +1,6 @@
+.vscode
+exp/speech_XXL_cond
+*__pycache__
+*.pt
+*.npy
+*.wav
Phoneme_Hallucinator_v2/.vscode/launch.json
ADDED
@@ -0,0 +1,26 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "args": [
+                "--cfg_file",
+                "exp/speech_XXL_cond/params.json",
+                "--num_samples",
+                "5000",
+                "--path",
+                "matching_set/target.pt",
+                "--out_path",
+                "matching_set/target_expanded_5k.npy"
+            ]
+        }
+    ]
+}
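For orientation, here is a minimal sketch of the command-line interface this launch configuration appears to drive. Only the flag names and example values come from the `args` list above; the parser itself, and the expansion script it would live in, are assumptions (the script is not among the 50 files shown in this view).

```
# Hypothetical argparse front-end matching the flags in launch.json above.
import argparse

parser = argparse.ArgumentParser(description="Expand a speaker's matching set")
parser.add_argument("--cfg_file", required=True,
                    help="model hyperparameters, e.g. exp/speech_XXL_cond/params.json")
parser.add_argument("--num_samples", type=int, default=5000,
                    help="number of hallucinated feature vectors to generate")
parser.add_argument("--path", required=True,
                    help="input matching set (.pt), e.g. matching_set/target.pt")
parser.add_argument("--out_path", required=True,
                    help="where to write the expanded set (.npy)")
args = parser.parse_args()
```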
Phoneme_Hallucinator_v2/Phoneme Hallucinator DEMO.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
Phoneme_Hallucinator_v2/README.md
ADDED
@@ -0,0 +1,36 @@
+# Phoneme_Hallucinator
+This is the repository of the paper "Phoneme Hallucinator: One-shot Voice Conversion via Set Expansion", accepted at AAAI-2024. Audio samples are provided [here](https://phonemehallucinator.github.io/).
+
+## Inference Tutorial
+1. If you only want to run our VC pipeline, download `Phoneme Hallucinator DEMO.ipynb` from this repo and run it in Google Colab.
+
+## Training Tutorial
+1. Prepare the environment. Requires `Python 3.6.3` and the following packages:
+```
+pillow == 8.0.1
+torch == 1.10.2
+tensorflow == 1.15.5
+tensorflow-probability == 0.7.0
+tensorpack == 0.9.8
+h5py == 2.10.0
+numpy == 1.19.5
+pathlib == 1.0.1
+tqdm == 4.64.1
+easydict == 1.10
+matplotlib == 3.3.4
+scikit-learn == 0.24.2
+scipy == 1.5.4
+seaborn == 0.11.2
+```
+2. To prepare the training set, we need to use WavLM to extract speech representations. Go to the [kNN-VC repo](https://github.com/bshall/knn-vc) and follow its instructions to extract speech representations. Namely, after placing the LibriSpeech dataset in the correct location, run the command:
+
+`python prematch_dataset.py --librispeech_path /path/to/librispeech/root --out_path /path/where/you/want/outputs/to/go --topk 4 --matching_layer 6 --synthesis_layer 6`
+
+Note that we don't use the `--prematch` option, because we only need to extract representations, not to extract and then perform kNN regression.
+
+3. After the above step, you get an `--out_path` folder with three subfolders `train-clean-100`, `test-clean` and `dev-clean`, each containing the speech representation files (".pt").
+4. Go to `./datasets/speech.py` in our repo and change the variables `path_to_wavlm_feat` and `tfrecord_path` accordingly. `path_to_wavlm_feat` must point to where the speech representations were stored in the previous step.
+5. Start training with the following command:
+`python scripts/run.py --cfg_file=./exp/speech_XXL_cond/params.json --mode=train`
+
+If `tfrecord_path` doesn't exist, our code will create tfrecords and save them to `tfrecord_path` before training starts. Note that if you encounter numerical issues ("NaN, INF") when training starts, just re-run the command; it may take a few tries. Training logs will be saved to `./exp/speech_XXL_cond/`.
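As a quick sanity check after steps 2-3, each extracted `.pt` file should hold a `[T, 1024]` WavLM feature matrix; 1024 matches both `dimension` in `params.json` and the reshape in `datasets/speech.py`. A minimal sketch, where the file name is a placeholder:

```
# Inspect one extracted representation file; the exact path/file name will differ.
import torch

feats = torch.load("/path/where/you/want/outputs/to/go/train-clean-100/116-288045-0000.pt")
print(feats.shape)                            # torch.Size([T, 1024]), one WavLM frame per row
assert feats.dim() == 2 and feats.shape[1] == 1024
```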
Phoneme_Hallucinator_v2/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (173 Bytes).
Phoneme_Hallucinator_v2/datasets/__init__.py
ADDED
@@ -0,0 +1,27 @@
+import os
+import pickle
+
+def get_dataset(args, split):
+    if args.dataset == 'speech':
+        from .speech import Dataset
+        dataset = Dataset(split, args.batch_size, args.set_size, args.mask_type)
+    else:
+        raise ValueError()
+
+    return dataset
+
+def cache(args, split, fname):
+    if os.path.isfile(fname):
+        with open(fname, 'rb') as f:
+            batches = pickle.load(f)
+    else:
+        batches = []
+        dataset = get_dataset(args, split)
+        dataset.initialize()
+        for _ in range(dataset.num_batches):
+            batch = dataset.next_batch()
+            batches.append(batch)
+        with open(fname, 'wb') as f:
+            pickle.dump(batches, f)
+
+    return batches
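A hypothetical usage sketch for `get_dataset` and `cache`: the attribute names on `args` are exactly the ones read inside `get_dataset`, the values mirror `exp/speech_XXL_cond/params.json`, and it assumes the tfrecords referenced by `datasets/speech.py` already exist.

```
# Sketch only: requires tfrecord_path in datasets/speech.py to be populated.
from easydict import EasyDict
from datasets import get_dataset, cache

args = EasyDict(dataset='speech', batch_size=50, set_size=200,
                mask_type='arb_expand')

dataset = get_dataset(args, 'train')
dataset.initialize()
batch = dataset.next_batch()     # dict with keys 'x', 'b', 'm', 'f'
print(batch['x'].shape)          # typically (50, 200, 1024)

# Or materialize a whole split once and pickle it for reuse:
batches = cache(args, 'valid', '/tmp/valid_batches.pkl')
```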
Phoneme_Hallucinator_v2/datasets/__pycache__/__init__.cpython-36.pyc
ADDED
Binary file (863 Bytes).
Phoneme_Hallucinator_v2/datasets/__pycache__/speech.cpython-36.pyc
ADDED
Binary file (7.36 kB).
Phoneme_Hallucinator_v2/datasets/speech.py
ADDED
@@ -0,0 +1,278 @@
+import h5py
+import numpy as np
+from pathlib import Path
+import tensorflow as tf
+import torch
+from tqdm import tqdm
+import os
+import sys
+import glob
+import random
+import pdb
+np.random.seed(0)
+generate_tf_record = False
+
+tfrecord_path = "/path/to/save/your/tfrecord/"
+path_to_wavlm_feat = "/path/to/your/wavlm/feat"
+
+if not os.path.exists(tfrecord_path):
+    generate_tf_record = True
+    os.makedirs(tfrecord_path, exist_ok=True)
+train_filename = tfrecord_path + 'train'
+valid_filename = tfrecord_path + 'valid'
+test_filename = tfrecord_path + 'test'
+train_path = Path(os.path.join(path_to_wavlm_feat, "train-clean-100"))
+valid_path = Path(os.path.join(path_to_wavlm_feat, "dev-clean"))
+test_path = Path(os.path.join(path_to_wavlm_feat, "test-clean"))
+
+train_size = 27269
+valid_size = 1940
+test_size = 1850
+
+def get_filenames(path):
+    all_files = []
+    all_files.extend(list(path.rglob("**/*.pt")))
+    return all_files
+
+def length_filter(paths):
+    filtered_paths = []
+    print("filter short files")
+    for each in tqdm(paths):
+        data = torch.load(each).numpy().astype(np.float32)
+        if data.shape[0] < 200:
+            continue
+        filtered_paths.append(each)
+    return filtered_paths
+
+
+def generate_mask(x, mask_type):
+    if mask_type == b'expand':
+        m = np.zeros_like(x)
+        N = np.random.randint(x.shape[0]//8, x.shape[0])
+        ind = np.random.choice(x.shape[0], N, replace=False)
+        m[ind] = 1.
+    elif mask_type == b'few_expand':
+        m = np.zeros_like(x)
+        N = np.random.randint(x.shape[0]//8)
+        ind = np.random.choice(x.shape[0], N, replace=False)
+        m[ind] = 1.
+    elif mask_type == b'arb_expand':
+        m = np.zeros_like(x)
+        N = np.random.randint(x.shape[0])
+        ind = np.random.choice(x.shape[0], N, replace=False)
+        m[ind] = 1.
+    elif mask_type == b'det_expand':
+        m = np.zeros_like(x)
+        ind = np.random.choice(x.shape[0], 100, replace=False)
+        m[ind] = 1.
+    elif mask_type == b'complete':
+        m = np.zeros_like(x)
+        while np.sum(m[:,0]) < x.shape[0] // 8:
+            p = np.random.uniform(-0.5, 0.5, size=4)
+            xa = np.concatenate([x, np.ones([x.shape[0],1])], axis=1)
+            m = (np.dot(xa, p) > 0).astype(np.float32)
+            m = np.repeat(np.expand_dims(m, axis=1), 3, axis=1)
+    else:
+        raise ValueError()
+
+    return m
+
+
+def wrap_int64(value):
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
+def wrap_bytes(value):
+    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+def print_progress(count, total):
+    # Percentage completion.
+    pct_complete = float(count) / total
+
+    # Status message.
+    # Note the \r, which makes the line overwrite itself.
+    msg = "\r- Progress: {0:.1%}".format(pct_complete)
+
+    # Print it.
+    sys.stdout.write(msg)
+    sys.stdout.flush()
+def convert(image_paths, out_path, max_files=1000):
+    # Args:
+    # image_paths   List of file-paths for the feature files.
+    # out_path      File-path for the TFRecords output file.
+    # max_files     Maximum number of examples per output shard.
+
+    print("Converting: " + out_path)
+
+    # Number of files. Used when printing the progress.
+    num_images = len(image_paths)
+    splits = (num_images//max_files) + 1
+    if num_images%max_files == 0:
+        splits-=1
+    print(f"\nUsing {splits} shard(s) for {num_images} files, with up to {max_files} samples per shard")
+    file_count = 0
+    for i in tqdm(range(splits)):
+        # Open a TFRecordWriter for the output file.
+        with tf.io.TFRecordWriter("{}_{}_{}.tfrecords".format(out_path, i+1, splits)) as writer:
+
+            # Iterate over the feature-file paths.
+            current_shard_count = 0
+            while current_shard_count < max_files:
+                index = i*max_files+current_shard_count
+                if index == len(image_paths):
+                    break
+                current_image = image_paths[index]
+
+                # Load the WavLM features saved as a torch tensor.
+                img = torch.load(current_image).numpy().astype(np.float32)
+
+                # Convert the features to raw bytes.
+                img_bytes = img.tostring()
+
+                # Create a dict with the data we want to save in the
+                # TFRecords file. You can add more relevant data here.
+                data = \
+                    {
+                        'image': wrap_bytes(img_bytes),
+                        'length': wrap_int64(img.shape[0]),
+                        "filename": wrap_bytes(bytes(os.path.splitext(current_image.name)[0], 'utf-8'))
+                    }
+
+                # Wrap the data as TensorFlow Features.
+                feature = tf.train.Features(feature=data)
+
+                # Wrap again as a TensorFlow Example.
+                example = tf.train.Example(features=feature)
+
+                # Serialize the data.
+                serialized = example.SerializeToString()
+
+                # Write the serialized data to the TFRecords file.
+                writer.write(serialized)
+                current_shard_count+=1
+                file_count += 1
+    print(f"\nWrote {file_count} elements to TFRecord")
+
+
+if generate_tf_record:
+    train_image_paths = length_filter(get_filenames(train_path))
+    valid_image_paths = length_filter(get_filenames(valid_path))
+    test_image_paths = length_filter(get_filenames(test_path))
+    print(f"Number of training data after length filtering: {len(train_image_paths)}")
+    print(f"Number of valid data after length filtering: {len(valid_image_paths)}")
+    print(f"Number of testing data after length filtering: {len(test_image_paths)}")
+    random.Random(4).shuffle(train_image_paths)
+
+    train_size = len(train_image_paths)
+    valid_size = len(valid_image_paths)
+    test_size = len(test_image_paths)
+    convert(image_paths=train_image_paths,
+            out_path=train_filename)
+
+    convert(image_paths=valid_image_paths,
+            out_path=valid_filename)
+
+    convert(image_paths=test_image_paths,
+            out_path=test_filename)
+
+
+def parse(serialized):
+    # Define a dict with the data-names and types we expect to
+    # find in the TFRecords file.
+    # It is a bit awkward that this needs to be specified again,
+    # because it could have been written in the header of the
+    # TFRecords file instead.
+    features = \
+        {
+            'image': tf.io.FixedLenFeature([], tf.string),
+            'length': tf.io.FixedLenFeature([], tf.int64),
+            'filename': tf.io.FixedLenFeature([], tf.string),
+        }
+
+    # Parse the serialized data so we get a dict with our data.
+    parsed_example = tf.io.parse_single_example(serialized=serialized,
+                                                features=features)
+
+    # Get the features as raw bytes.
+    image_raw = parsed_example['image']
+
+    # Decode the raw bytes so they become a typed tensor.
+    image = tf.io.decode_raw(image_raw, tf.float32)
+
+    # Get the sequence length associated with the features.
+    length = parsed_example['length']
+
+    image = tf.reshape(image, [length, 1024])
+    filename = parsed_example['filename']
+
+    # image and filename are now proper TensorFlow types.
+    return image, filename
+
+def process(x, filename, set_size, mask_type):
+    x = x/10
+    ind = np.random.choice(x.shape[0], set_size, replace=False)
+    x = x[ind]
+    m = generate_mask(x, mask_type)
+    #N = np.random.randint(set_size)
+    #S = np.random.randint(x.shape[0] - set_size + 1)
+    #x = x[S:S+set_size]
+    #m = np.zeros_like(x)
+    #S = np.random.randint(set_size - N + 1)
+    #m[S:S+N] = 1.0
+    return x, m, filename
+
+
+def get_dst(split, set_size, mask_type):
+    if split == 'train':
+        files = glob.glob(train_filename+"*.tfrecords", recursive=False)
+        dst = tf.data.TFRecordDataset(files)
+        size = train_size
+        dst = dst.map(parse)
+        dst = dst.shuffle(256)
+        dst = dst.map(lambda x, y: tuple(tf.compat.v1.py_func(process, [x, y, set_size, mask_type], [tf.float32, tf.float32, tf.string])), num_parallel_calls=8)
+    elif split == 'valid':
+        files = glob.glob(valid_filename+"*.tfrecords", recursive=False)
+        dst = tf.data.TFRecordDataset(files)
+        size = valid_size
+        dst = dst.map(parse)
+        dst = dst.map(lambda x, y: tuple(tf.compat.v1.py_func(process, [x, y, set_size, mask_type], [tf.float32, tf.float32, tf.string])), num_parallel_calls=8)
+    else:
+        files = glob.glob(test_filename+"*.tfrecords", recursive=False)
+        dst = tf.data.TFRecordDataset(files)
+        size = test_size
+        dst = dst.map(parse)
+        dst = dst.map(lambda x, y: tuple(tf.compat.v1.py_func(process, [x, y, set_size, mask_type], [tf.float32, tf.float32, tf.string])), num_parallel_calls=8)
+    return dst, size
+
+class Dataset(object):
+    def __init__(self, split, batch_size, set_size, mask_type):
+        g = tf.Graph()
+        with g.as_default():
+            # open a session
+            config = tf.compat.v1.ConfigProto()
+            config.log_device_placement = True
+            config.allow_soft_placement = True
+            config.gpu_options.allow_growth = True
+            self.sess = tf.compat.v1.Session(config=config, graph=g)
+            # build dataset
+            dst, size = get_dst(split, set_size, mask_type)
+            self.size = size
+            self.num_batches = self.size // batch_size
+            dst = dst.batch(batch_size, drop_remainder=False)
+            dst = dst.prefetch(1)
+
+            dst_it = tf.compat.v1.data.make_initializable_iterator(dst)
+            x, b, filename = dst_it.get_next()
+            self.x = x
+            self.b = b
+            self.filename = filename
+            #self.x = tf.reshape(x, [batch_size, set_size, 1024])
+            #self.b = tf.reshape(b, [batch_size, set_size, 1024])
+            self.dimension = 1024
+            self.initializer = dst_it.initializer
+
+    def initialize(self):
+        self.sess.run(self.initializer)
+
+    def next_batch(self):
+        x, b, filename = self.sess.run([self.x, self.b, self.filename])
+        m = np.ones_like(b)
+        return {'x':x, 'b':b, 'm':m, "f":filename}
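A toy demonstration of `generate_mask` in isolation. Note the byte-string comparisons in the function: `mask_type` reaches `process` through `tf.compat.v1.py_func`, which hands Python strings over as `bytes`, hence `b'arb_expand'` below. Shapes follow `set_size=200` and `dimension=1024` from `params.json`.

```
# Standalone demo; importing datasets.speech runs its top-level setup code,
# so it assumes tfrecord_path/path_to_wavlm_feat have been configured.
import numpy as np
from datasets.speech import generate_mask

x = np.random.randn(200, 1024).astype(np.float32)  # one sampled set of WavLM frames
m = generate_mask(x, b'arb_expand')                 # mask_type arrives as bytes via py_func
print(m.shape)                                      # (200, 1024), same shape as x
print(int(m[:, 0].sum()), "rows of", x.shape[0], "flagged with 1")
```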
Phoneme_Hallucinator_v2/evaluation/ASR-Eval.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
Phoneme_Hallucinator_v2/evaluation/ASR.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
Phoneme_Hallucinator_v2/evaluation/init
ADDED
@@ -0,0 +1 @@
+
Phoneme_Hallucinator_v2/exp/speech_XXL_cond/params.json
ADDED
@@ -0,0 +1,33 @@
+{
+    "dataset": "speech",
+    "dimension": 1024,
+    "batch_size": 50,
+    "set_size": 200,
+    "mask_type": "arb_expand",
+    "model": "pc_acset_vae",
+    "latent_encoder_hidden": [256,256,256,256],
+    "latent_dim": 256,
+    "trans_params": {
+        "transform": ["L","LR","CP","R","L","LR","CP","R","L","LR","CP","R","L","LR","CP"],
+        "dimension": 256,
+        "coupling_hids": [256,256]
+    },
+    "vae_params": {
+        "hid_dimensions": 256,
+        "dimension": 1024,
+        "enc_dense_hids": [512,512,512,512],
+        "dec_dense_hids": [512,512,512,512]
+    },
+    "use_peq_embed": 1,
+    "set_xformer_hids": [256,256,256,256],
+    "epochs": 1000,
+    "optimizer": "adam",
+    "lr": 0.0001,
+    "decay_steps": 100000,
+    "decay_rate": 0.5,
+    "clip_gradient": 1,
+    "exp_dir": "Phoneme_Hallucinator_v2/exp/speech_XXL_cond",
+    "summ_freq": 100,
+    "eval_metrics": ["sam"]
+}
+
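These hyperparameters are consumed via attribute-style lookups (`hps.model`, `hps.lr`, ...) in `models/base.py`. A minimal loading sketch, assuming `scripts/run.py` (not shown in this 50-file view) does something equivalent with the `easydict` package from the README requirements:

```
# Minimal sketch of turning params.json into the hps object the models expect.
import json
from easydict import EasyDict

with open("Phoneme_Hallucinator_v2/exp/speech_XXL_cond/params.json") as f:
    hps = EasyDict(json.load(f))            # nested dicts also become attribute-accessible

print(hps.model, hps.batch_size, hps.set_size)  # pc_acset_vae 50 200
print(hps.vae_params.hid_dimensions)            # 256
```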
Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/checkpoint
ADDED
@@ -0,0 +1,3 @@
+model_checkpoint_path: "last.ckpt"
+all_model_checkpoint_paths: "params.ckpt"
+all_model_checkpoint_paths: "last.ckpt"
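This `checkpoint` file is TensorFlow's standard CheckpointState text proto; `tf.train.get_checkpoint_state` parses it to resolve the `last.ckpt`/`params.ckpt` names listed above. A small sketch (the directory path assumes the repo root as working directory):

```
# Reading the checkpoint state with the standard TF API.
import tensorflow as tf

state = tf.train.get_checkpoint_state(
    "Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights")
print(state.model_checkpoint_path)       # last.ckpt
print(state.all_model_checkpoint_paths)  # ['params.ckpt', 'last.ckpt']
```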
Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/last.ckpt.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f48a8699b9c871c3e4d8fa92c1f4e4c58c3c054d7f3620577286307b1cee9c22
+size 228403264
Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/last.ckpt.index
ADDED
Binary file (33.9 kB).
Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/last.ckpt.meta
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc65091d5349de08462ecf43b05ff3ed9e07c4de97160bc81c858f99a2979411
+size 7340118
Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/params.ckpt.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a10bbba728a64b8c3697e3fe546a1fe1a1a666c609e021acb24a15ccf615c740
+size 228403264
Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/params.ckpt.index
ADDED
Binary file (33.9 kB).
Phoneme_Hallucinator_v2/exp/speech_XXL_cond/weights/params.ckpt.meta
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22bf81dcaf6d4079a9151a9e797c1d4f6e7e209d493dba1f1f29b7d1ba5c2f59
+size 7340118
Phoneme_Hallucinator_v2/models/__init__.py
ADDED
@@ -0,0 +1,9 @@
+
+def get_model(hps):
+    if hps.model == 'pc_acset_vae':
+        from .pc_acset_vae import ACSetVAE
+        model = ACSetVAE(hps)
+    else:
+        raise ValueError()
+
+    return model
Phoneme_Hallucinator_v2/models/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (392 Bytes).
Phoneme_Hallucinator_v2/models/__pycache__/__init__.cpython-36.pyc
ADDED
Binary file (354 Bytes).
Phoneme_Hallucinator_v2/models/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (358 Bytes).
Phoneme_Hallucinator_v2/models/__pycache__/base.cpython-310.pyc
ADDED
Binary file (4.06 kB).
Phoneme_Hallucinator_v2/models/__pycache__/base.cpython-36.pyc
ADDED
Binary file (3.89 kB).
Phoneme_Hallucinator_v2/models/__pycache__/base.cpython-37.pyc
ADDED
Binary file (3.87 kB).
Phoneme_Hallucinator_v2/models/__pycache__/cVAE.cpython-310.pyc
ADDED
Binary file (2.09 kB).
Phoneme_Hallucinator_v2/models/__pycache__/cVAE.cpython-36.pyc
ADDED
Binary file (1.92 kB).
Phoneme_Hallucinator_v2/models/__pycache__/cVAE.cpython-37.pyc
ADDED
Binary file (1.93 kB).
Phoneme_Hallucinator_v2/models/__pycache__/networks.cpython-310.pyc
ADDED
Binary file (5 kB).
Phoneme_Hallucinator_v2/models/__pycache__/networks.cpython-36.pyc
ADDED
Binary file (4.64 kB).
Phoneme_Hallucinator_v2/models/__pycache__/networks.cpython-37.pyc
ADDED
Binary file (4.6 kB).
Phoneme_Hallucinator_v2/models/__pycache__/pc_acset.cpython-36.pyc
ADDED
Binary file (2.77 kB).
Phoneme_Hallucinator_v2/models/__pycache__/pc_acset_vae.cpython-310.pyc
ADDED
Binary file (3.07 kB).
Phoneme_Hallucinator_v2/models/__pycache__/pc_acset_vae.cpython-36.pyc
ADDED
Binary file (3.01 kB).
Phoneme_Hallucinator_v2/models/__pycache__/pc_acset_vae.cpython-37.pyc
ADDED
Binary file (2.92 kB).
Phoneme_Hallucinator_v2/models/__pycache__/pc_acset_vae_06.cpython-36.pyc
ADDED
Binary file (3.73 kB).
Phoneme_Hallucinator_v2/models/__pycache__/pc_encoder.cpython-310.pyc
ADDED
Binary file (1.94 kB).
Phoneme_Hallucinator_v2/models/__pycache__/pc_encoder.cpython-36.pyc
ADDED
Binary file (1.84 kB).
Phoneme_Hallucinator_v2/models/__pycache__/pc_encoder.cpython-37.pyc
ADDED
Binary file (1.85 kB).
Phoneme_Hallucinator_v2/models/__pycache__/runner.cpython-36.pyc
ADDED
Binary file (5.79 kB).
Phoneme_Hallucinator_v2/models/__pycache__/set_transformer.cpython-310.pyc
ADDED
Binary file (2.49 kB).
Phoneme_Hallucinator_v2/models/__pycache__/set_transformer.cpython-36.pyc
ADDED
Binary file (2.18 kB).
Phoneme_Hallucinator_v2/models/__pycache__/set_transformer.cpython-37.pyc
ADDED
Binary file (2.16 kB).
Phoneme_Hallucinator_v2/models/__pycache__/utils.cpython-36.pyc
ADDED
Binary file (4.92 kB).
Phoneme_Hallucinator_v2/models/base.py
ADDED
@@ -0,0 +1,103 @@
+import logging
+from pprint import pformat
+import numpy as np
+import tensorflow as tf
+class BaseModel(object):
+    def __init__(self, hps):
+        super(BaseModel, self).__init__()
+
+        self.hps = hps
+        g = tf.Graph()
+        with g.as_default():
+            # open a session
+            config = tf.compat.v1.ConfigProto()
+            config.log_device_placement = True
+            config.allow_soft_placement = True
+            config.gpu_options.allow_growth = True
+            self.sess = tf.compat.v1.Session(config=config, graph=g)
+            # build model
+            self.build_net()
+            self.build_ops()
+            # initialize
+            self.sess.run(tf.compat.v1.global_variables_initializer())
+            self.saver = tf.compat.v1.train.Saver()
+            self.writer = tf.compat.v1.summary.FileWriter(self.hps.exp_dir + '/summary')
+            # logging
+            total_params = 0
+            trainable_variables = tf.compat.v1.trainable_variables()
+            logging.info('=' * 20)
+            logging.info("Variables:")
+            logging.info(pformat(trainable_variables))
+            for v in trainable_variables:
+                num_params = np.prod(v.get_shape().as_list())
+                total_params += num_params
+
+            logging.info("TOTAL TENSORS: %d TOTAL PARAMS: %f[M]" % (
+                len(trainable_variables), total_params / 1e6))
+            logging.info('=' * 20)
+
+    def save(self, filename='params'):
+        fname = f'{self.hps.exp_dir}/weights/{filename}.ckpt'
+        self.saver.save(self.sess, fname)
+
+    def load(self, filename='params'):
+        fname = f'{self.hps.exp_dir}/weights/{filename}.ckpt'
+        self.saver.restore(self.sess, fname)
+
+    def build_net(self):
+        raise NotImplementedError()
+
+    def build_ops(self):
+        # optimizer
+        self.global_step = tf.compat.v1.train.get_or_create_global_step()
+        learning_rate = tf.compat.v1.train.inverse_time_decay(
+            self.hps.lr, self.global_step,
+            self.hps.decay_steps, self.hps.decay_rate,
+            staircase=True)
+        warmup_lr = tf.compat.v1.train.inverse_time_decay(
+            0.001 * self.hps.lr, self.global_step,
+            self.hps.decay_steps, self.hps.decay_rate,
+            staircase=True)
+        learning_rate = tf.cond(pred=tf.less(self.global_step, 1000), true_fn=lambda: warmup_lr, false_fn=lambda: learning_rate)
+        tf.compat.v1.summary.scalar('lr', learning_rate)
+        if self.hps.optimizer == 'adam':
+            optimizer = tf.compat.v1.train.AdamOptimizer(
+                learning_rate=learning_rate,
+                beta1=0.9, beta2=0.999, epsilon=1e-08,
+                use_locking=False, name="Adam")
+        elif self.hps.optimizer == 'rmsprop':
+            optimizer = tf.compat.v1.train.RMSPropOptimizer(
+                learning_rate=learning_rate)
+        elif self.hps.optimizer == 'mom':
+            optimizer = tf.compat.v1.train.MomentumOptimizer(
+                learning_rate=learning_rate,
+                momentum=0.9)
+        else:
+            optimizer = tf.compat.v1.train.GradientDescentOptimizer(
+                learning_rate=learning_rate)
+
+        # regularization
+        l2_reg = sum(
+            [tf.reduce_sum(input_tensor=tf.square(v)) for v in tf.compat.v1.trainable_variables()
+             if ("magnitude" in v.name) or ("rescaling_scale" in v.name)])
+        reg_loss = 0.00005 * l2_reg
+
+        # train
+        grads_and_vars = optimizer.compute_gradients(
+            self.loss+reg_loss, tf.compat.v1.trainable_variables())
+        grads, vars_ = zip(*grads_and_vars)
+        if self.hps.clip_gradient > 0:
+            grads, gradient_norm = tf.clip_by_global_norm(
+                grads, clip_norm=self.hps.clip_gradient)
+            gradient_norm = tf.debugging.check_numerics(
+                gradient_norm, "Gradient norm is NaN or Inf.")
+            tf.compat.v1.summary.scalar('gradient_norm', gradient_norm)
+        capped_grads_and_vars = zip(grads, vars_)
+        self.train_op = optimizer.apply_gradients(
+            capped_grads_and_vars, global_step=self.global_step)
+
+        # summary
+        self.summ_op = tf.compat.v1.summary.merge_all()
+
+    def execute(self, cmd, batch):
+        return self.sess.run(cmd, {self.x:batch['x'], self.b:batch['b'], self.m:batch['m']})
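The learning-rate logic in `build_ops` combines a staircase inverse-time decay with a 1000-step warm-up at 0.1% of the base rate (note the README's remark about NaN/Inf at the very start of training). Re-computed in plain numpy with the values from `exp/speech_XXL_cond/params.json`, purely as illustration:

```
# Illustration only: the schedule from build_ops with lr=1e-4,
# decay_steps=100000, decay_rate=0.5.
import numpy as np

def lr_at(step, base=1e-4, decay_steps=100_000, rate=0.5):
    decayed = base / (1.0 + rate * np.floor(step / decay_steps))  # staircase inverse time decay
    return 0.001 * decayed if step < 1000 else decayed            # tf.cond warm-up branch

for s in (0, 999, 1000, 100_000, 300_000):
    print(s, lr_at(s))   # 1e-07, 1e-07, 1e-04, ~6.7e-05, 4e-05
```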
Phoneme_Hallucinator_v2/models/cVAE.py
ADDED
@@ -0,0 +1,45 @@
+import numpy as np
+import tensorflow as tf
+import tensorflow_probability as tfp
+tfd = tfp.distributions
+
+from .networks import dense_nn, cond_dense_nn
+
+class CondVAE(object):
+    def __init__(self, hps, name="cvae"):
+        self.hps = hps
+        self.name = name
+
+    def enc(self, x, cond=None):
+        '''
+        x: [B, C]
+        cond: [B, C]
+        '''
+        B,C = tf.shape(input=x)[0], tf.shape(input=x)[1]
+        with tf.compat.v1.variable_scope(self.name, reuse=tf.compat.v1.AUTO_REUSE):
+            prior_dist = tfd.MultivariateNormalDiag(tf.zeros(self.hps['hid_dimensions']), tf.ones(self.hps['hid_dimensions']))
+            if cond is None:
+                x = dense_nn(x, self.hps['enc_dense_hids'], 2 * self.hps['hid_dimensions'], False, "enc")
+            else:
+                x = cond_dense_nn(x, cond, self.hps['enc_dense_hids'], 2 * self.hps['hid_dimensions'], False, "enc")
+            m, s = x[:, :self.hps['hid_dimensions']], tf.nn.softplus(x[:, self.hps['hid_dimensions']:])
+            posterior_dist = tfd.MultivariateNormalDiag(m, s)
+            #kl = 0.5 * tf.reduce_sum(s + m ** 2 - 1.0 - tf.log(s), axis=-1)
+            kl = - tfd.kl_divergence(posterior_dist, prior_dist)  # note: negative KL(q||p)
+            eps = prior_dist.sample(B)
+            posterior_sample = m + eps * s
+            return kl, posterior_sample
+
+    def dec(self, x, cond=None):
+        '''
+        x: [B, C]
+        '''
+        B,C = tf.shape(input=x)[0], tf.shape(input=x)[1]
+        with tf.compat.v1.variable_scope(self.name, reuse=tf.compat.v1.AUTO_REUSE):
+            if cond is None:
+                x = dense_nn(x, self.hps['dec_dense_hids'], 2 * self.hps['dimension'], False, "dec")
+            else:
+                x = cond_dense_nn(x, cond, self.hps['dec_dense_hids'], 2 * self.hps['dimension'], False, "dec")
+            m, s = x[:, :self.hps['dimension']], tf.nn.softplus(x[:, self.hps['dimension']:])
+            sample_dist = tfd.MultivariateNormalDiag(loc=m, scale_diag=s)
+            return sample_dist
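The sampling step in `enc` is the standard reparameterization trick: `softplus` keeps the predicted scale positive, and `posterior_sample = m + eps * s` draws from N(m, s²) using noise from the standard-normal prior. Note also that the returned `kl` is the negative KL divergence, so it can be added directly to a log-likelihood-style objective. A numpy rendering of just the sampling step, with `hid_dimensions = 256` as in `params.json`:

```
# Numpy sketch of the reparameterized sample in CondVAE.enc.
import numpy as np

B, H = 4, 256                                        # batch size, hid_dimensions
raw = np.random.randn(B, 2 * H).astype(np.float32)   # stand-in for the dense_nn output
m = raw[:, :H]
s = np.log1p(np.exp(raw[:, H:]))                     # softplus keeps scales positive
eps = np.random.randn(B, H).astype(np.float32)       # prior_dist.sample(B)
posterior_sample = m + eps * s                       # differentiable w.r.t. m and s
print(posterior_sample.shape)                        # (4, 256)
```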
Phoneme_Hallucinator_v2/models/flow/__init__.py
ADDED
File without changes