"""Resample short character voice clips and transcribe them with Whisper.

For every speaker sub-directory of ``./custom_character_voice/`` each audio
file is down-mixed to mono, resampled to the sampling rate named in
``./configs/amitaro_jp_base.json``, saved as ``processed_<index>.wav`` next
to the source file and decoded as Japanese with Whisper.  One
``path|speaker|[JA]text[JA]`` line per clip is written to
``short_character_anno.txt``.  The transcript is written exactly as Whisper
returns it; no text cleaning is applied by this script.
"""
import argparse
import json
import os

import torch
import torchaudio
import whisper
from tqdm import tqdm

# Clips longer than this many seconds are reported and skipped.
MAX_CLIP_SECONDS = 20

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--whisper_size", default="large")
    args = parser.parse_args()

    # The model is loaded on the CPU, so no CUDA availability check is made.
    model = whisper.load_model(args.whisper_size, device="cpu")

    parent_dir = "./custom_character_voice/"
    # The first os.walk() entry lists the immediate sub-directories, one per
    # speaker.  A missing directory yields no entry at all; fall back to "no
    # speakers" so the warning at the end is printed instead of an IndexError.
    speaker_names = next(os.walk(parent_dir), (parent_dir, [], []))[1]
    speaker_annos = []

    # 2023/4/21: Get the target sampling rate
    with open("./configs/amitaro_jp_base.json", 'r', encoding='utf-8') as f:
        hps = json.load(f)
    target_sr = hps['data']['sampling_rate']

    # Decoding options do not depend on the clip, so build them once.
    options = whisper.DecodingOptions(beam_size=5, language="ja", fp16=False)

    for speaker in speaker_names:
        speaker_dir = parent_dir + speaker + "/"
        # Snapshot the file names before any processed_*.wav is written so the
        # files created below are not picked up by this same loop.
        filelist = next(os.walk(speaker_dir), (speaker_dir, [], []))[2]
        for i, wavfile in tqdm(enumerate(filelist), desc="Processing Audio:", total=len(filelist)):
            # Outputs of an earlier run are not inputs.
            if wavfile.startswith("processed_"):
                continue

            wav, sr = torchaudio.load(speaker_dir + wavfile, frame_offset=0, num_frames=-1,
                                      normalize=True, channels_first=True)
            # channels_first=True puts channels on dim 0: average them to mono
            # and keep a leading channel dimension of size one.
            wav = wav.mean(dim=0).unsqueeze(0)

            # Measure the duration at the original rate, before resampling
            # changes the sample count, and actually skip over-long clips.
            if wav.shape[1] / sr > MAX_CLIP_SECONDS:
                print(f"{wavfile} too long, ignoring\n")
                continue

            if sr != target_sr:
                wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)

            save_path = speaker_dir + f"processed_{i}.wav"
            torchaudio.save(save_path, wav, target_sr, channels_first=True)

            # transcribe text
            audio = whisper.load_audio(save_path)
            audio = whisper.pad_or_trim(audio)
            # make log-Mel spectrogram and move to the same device as the model
            mel = whisper.log_mel_spectrogram(audio).to(model.device)
            result = whisper.decode(model, mel, options)

            # Each annotation line carries its own trailing newline.
            text = "[JA]" + result.text + "[JA]\n"
            speaker_annos.append(save_path + "|" + speaker + "|" + text)

    # write into annotation
    if not speaker_annos:
        print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.")
        print("this IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
    with open("short_character_anno.txt", 'w', encoding='utf-8') as f:
        f.writelines(speaker_annos)