mrfakename committed
Commit ad63082
1 Parent(s): dd4e6c8

Sync from GitHub repo


This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space through that repo.

Files changed (6):
  1. README_REPO.md +7 -1
  2. app.py +1 -2
  3. inference-cli.py +35 -21
  4. model/utils.py +6 -7
  5. requirements.txt +2 -8
  6. requirements_eval.txt +5 -0
README_REPO.md CHANGED
@@ -62,7 +62,7 @@ An initial guidance on Finetuning [#57](https://github.com/SWivid/F5-TTS/discuss
 
 ## Inference
 
-To run inference with pretrained models, download the checkpoints from [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS), or automatically downloaded with `inference-cli` and `gradio_app`.
+The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [⭐ Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or automatically downloaded with `inference-cli` and `gradio_app`.
 
 Currently support 30s for a single generation, which is the **TOTAL** length of prompt audio and the generated. Batch inference with chunks is supported by `inference-cli` and `gradio_app`.
 - To avoid possible inference failures, make sure you have seen through the following instructions.
@@ -148,6 +148,12 @@ bash scripts/eval_infer_batch.sh
 
 ### Objective Evaluation
 
+Install packages for evaluation:
+
+```bash
+pip install -r requirements_eval.txt
+```
+
 **Some Notes**
 
 For faster-whisper with CUDA 11:
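The checkpoints referenced in the README change above can also be fetched manually rather than on first use. A minimal sketch, assuming the `huggingface-cli` tool from `huggingface_hub` is installed and using a destination directory of your own choosing (neither is prescribed by this commit):

```bash
# Optional manual pre-fetch of the released checkpoints; inference-cli and
# gradio_app will otherwise download them automatically via cached_path.
pip install -U huggingface_hub
huggingface-cli download SWivid/F5-TTS --local-dir ckpts/F5-TTS
```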
app.py CHANGED
@@ -1,4 +1,3 @@
-import os
 import re
 import torch
 import torchaudio
@@ -17,7 +16,6 @@ from model.utils import (
     save_spectrogram,
 )
 from transformers import pipeline
-import librosa
 import click
 import soundfile as sf
 
@@ -429,6 +427,7 @@ with gr.Blocks() as app_credits:
 
 * [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
 * [RootingInLoad](https://github.com/RootingInLoad) for the podcast generation
+* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation
 """)
 with gr.Blocks() as app_tts:
     gr.Markdown("# Batched TTS")
inference-cli.py CHANGED
@@ -1,26 +1,24 @@
+import argparse
+import codecs
 import re
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+import tomli
 import torch
 import torchaudio
-import numpy as np
-import tempfile
+import tqdm
+from cached_path import cached_path
 from einops import rearrange
-from vocos import Vocos
 from pydub import AudioSegment, silence
-from model import CFM, UNetT, DiT, MMDiT
-from cached_path import cached_path
-from model.utils import (
-    load_checkpoint,
-    get_tokenizer,
-    convert_char_to_pinyin,
-    save_spectrogram,
-)
 from transformers import pipeline
-import soundfile as sf
-import tomli
-import argparse
-import tqdm
-from pathlib import Path
-import codecs
+from vocos import Vocos
+
+from model import CFM, DiT, MMDiT, UNetT
+from model.utils import (convert_char_to_pinyin, get_tokenizer,
+                         load_checkpoint, save_spectrogram)
 
 parser = argparse.ArgumentParser(
     prog="python3 inference-cli.py",
@@ -73,6 +71,11 @@ parser.add_argument(
     "--remove_silence",
     help="Remove silence.",
 )
+parser.add_argument(
+    "--load_vocoder_from_local",
+    action="store_true",
+    help="load vocoder from local. Default: ../checkpoints/charactr/vocos-mel-24khz",
+)
 args = parser.parse_args()
 
 config = tomli.load(open(args.config, "rb"))
@@ -88,6 +91,7 @@ model = args.model if args.model else config["model"]
 remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
 wave_path = Path(output_dir)/"out.wav"
 spectrogram_path = Path(output_dir)/"out.png"
+vocos_local_path = "../checkpoints/charactr/vocos-mel-24khz"
 
 SPLIT_WORDS = [
     "but", "however", "nevertheless", "yet", "still",
@@ -105,7 +109,16 @@ device = (
     if torch.cuda.is_available()
     else "mps" if torch.backends.mps.is_available() else "cpu"
 )
-vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
+
+if args.load_vocoder_from_local:
+    print(f"Load vocos from local path {vocos_local_path}")
+    vocos = Vocos.from_hparams(f"{vocos_local_path}/config.yaml")
+    state_dict = torch.load(f"{vocos_local_path}/pytorch_model.bin", map_location=device)
+    vocos.load_state_dict(state_dict)
+    vocos.eval()
+else:
+    print("Donwload Vocos from huggingface charactr/vocos-mel-24khz")
+    vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
 
 print(f"Using {device} device")
 
@@ -124,8 +137,9 @@ speed = 1.0
 fix_duration = None
 
 def load_model(repo_name, exp_name, model_cls, model_cfg, ckpt_step):
-    ckpt_path = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
-    # ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt"  # .pt | .safetensors
+    ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt"  # .pt | .safetensors
+    if not Path(ckpt_path).exists():
+        ckpt_path = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
     vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
     model = CFM(
         transformer=model_cls(
@@ -385,4 +399,4 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, custom_spli
     return infer_batch((audio, sr), ref_text, gen_text_batches, model, remove_silence)
 
 
-infer(ref_audio, ref_text, gen_text, model, remove_silence, ",".join(SPLIT_WORDS))
+infer(ref_audio, ref_text, gen_text, model, remove_silence, ",".join(SPLIT_WORDS))
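Taken together, the `inference-cli.py` changes let the CLI avoid network access when its weights are already on disk: `load_model` now prefers `ckpts/{exp_name}/model_{ckpt_step}.pt` and only falls back to the Hugging Face download, and the new `--load_vocoder_from_local` flag reads Vocos from `../checkpoints/charactr/vocos-mel-24khz`. A usage sketch, assuming the default config shipped with the repo; the staging command is an assumption, not part of this commit:

```bash
# Stage the Vocos vocoder where the new flag expects it (config.yaml and
# pytorch_model.bin under ../checkpoints/charactr/vocos-mel-24khz).
huggingface-cli download charactr/vocos-mel-24khz config.yaml pytorch_model.bin \
    --local-dir ../checkpoints/charactr/vocos-mel-24khz

# Run inference with the locally staged vocoder; the model checkpoint is taken
# from ckpts/<exp_name>/model_<ckpt_step>.pt if present, otherwise downloaded.
python3 inference-cli.py --load_vocoder_from_local
```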
model/utils.py CHANGED
@@ -22,12 +22,6 @@ from einops import rearrange, reduce
 
 import jieba
 from pypinyin import lazy_pinyin, Style
-import zhconv
-from zhon.hanzi import punctuation
-from jiwer import compute_measures
-
-from funasr import AutoModel
-from faster_whisper import WhisperModel
 
 from model.ecapa_tdnn import ECAPA_TDNN_SMALL
 from model.modules import MelSpec
@@ -432,6 +426,7 @@ def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path
 
 def load_asr_model(lang, ckpt_dir = ""):
     if lang == "zh":
+        from funasr import AutoModel
         model = AutoModel(
             model = os.path.join(ckpt_dir, "paraformer-zh"),
             # vad_model = os.path.join(ckpt_dir, "fsmn-vad"),
@@ -440,6 +435,7 @@ def load_asr_model(lang, ckpt_dir = ""):
             disable_update=True,
         ) # following seed-tts setting
     elif lang == "en":
+        from faster_whisper import WhisperModel
         model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
         model = WhisperModel(model_size, device="cuda", compute_type="float16")
     return model
@@ -451,6 +447,7 @@ def run_asr_wer(args):
     rank, lang, test_set, ckpt_dir = args
 
     if lang == "zh":
+        import zhconv
         torch.cuda.set_device(rank)
     elif lang == "en":
         os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
@@ -458,10 +455,12 @@ def run_asr_wer(args):
         raise NotImplementedError("lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now.")
 
     asr_model = load_asr_model(lang, ckpt_dir = ckpt_dir)
-
+
+    from zhon.hanzi import punctuation
     punctuation_all = punctuation + string.punctuation
     wers = []
 
+    from jiwer import compute_measures
     for gen_wav, prompt_wav, truth in tqdm(test_set):
         if lang == "zh":
             res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
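The `model/utils.py` edits move the evaluation-only imports (`funasr`, `faster_whisper`, `zhconv`, `zhon.hanzi`, `jiwer`) inside the code paths that use them, so importing `model.utils` for inference no longer requires those packages. They now live in the new `requirements_eval.txt` and only need to be installed before running the objective evaluation:

```bash
pip install -r requirements_eval.txt
```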
requirements.txt CHANGED
@@ -5,25 +5,19 @@ datasets
 einops>=0.8.0
 einx>=0.3.0
 ema_pytorch>=0.5.2
-faster_whisper
-funasr
 gradio
 jieba
-jiwer
 librosa
 matplotlib
-numpy==1.23.5
+numpy<=1.26.4
 pydub
 pypinyin
 safetensors
 soundfile
-# torch>=2.0
-# torchaudio>=2.3.0
+tomli
 torchdiffeq
 tqdm>=4.65.0
 transformers
 vocos
 wandb
 x_transformers>=1.31.14
-zhconv
-zhon
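On the dependency side, the hard `numpy==1.23.5` pin is relaxed to `numpy<=1.26.4`, and `tomli`, which `inference-cli.py` already uses to parse its TOML config, is now listed explicitly, while the evaluation-only packages move to `requirements_eval.txt` below. A quick sanity check after installing; this one-liner is purely illustrative and not part of the commit:

```bash
python3 -c "import numpy, tomli; print(numpy.__version__)"
```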
 
 
requirements_eval.txt ADDED
@@ -0,0 +1,5 @@
+faster_whisper
+funasr
+jiwer
+zhconv
+zhon