# Linly / NeRF / data_utils / deepspeech_features / extract_ds_features.py
# Last commit 79f9f38 (verified) by thepianist9: "Upload folder using huggingface_hub"
# File size: 4.36 kB
"""
Script for extracting DeepSpeech features from audio file.
"""
import os
import argparse
import numpy as np
import pandas as pd
from deepspeech_store import get_deepspeech_model_file
from deepspeech_features import conv_audios_to_deepspeech
def parse_args(argv=None):
    """
    Create python script parameters.

    Parameters
    ----------
    argv : list of str, default None
        Command-line arguments to parse. When None, argparse falls back to
        `sys.argv[1:]`, which matches calling this function without arguments.

    Returns
    -------
    argparse.Namespace
        Parsed arguments with the attributes `input`, `output`, `deepspeech`
        and `metainfo`. Only `input` is required; the others are None when
        not given on the command line.
    """
    parser = argparse.ArgumentParser(
        description="Extract DeepSpeech features from audio file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="path to input audio file or directory")
    parser.add_argument(
        "--output",
        type=str,
        help="path to output file with DeepSpeech features")
    parser.add_argument(
        "--deepspeech",
        type=str,
        help="path to DeepSpeech 0.1.0 frozen model")
    parser.add_argument(
        "--metainfo",
        type=str,
        help="path to file with meta-information")
    return parser.parse_args(argv)
def extract_features(in_audios,
                     out_files,
                     deepspeech_pb_path,
                     metainfo_file_path=None):
    """
    Extract DeepSpeech features from audio files.

    Resolves the per-file frame counts and output paths, then delegates the
    actual conversion to `conv_audios_to_deepspeech`.

    Parameters
    ----------
    in_audios : list of str
        Paths to input audio files.
    out_files : list of str
        Paths to output files with DeepSpeech features. Empty/None entries
        are replaced IN PLACE with the input path whose extension is swapped
        for ".npy".
    deepspeech_pb_path : str
        Path to DeepSpeech 0.1.0 frozen model.
    metainfo_file_path : str, default None
        Path to file with meta-information: a tab-separated table with the
        columns "Id", "File" and "Count". Its "Count" column is forwarded as
        the per-audio frame counts; when None, a None is forwarded for every
        audio instead.

    Raises
    ------
    ValueError
        If the number of rows in the meta-information file differs from the
        number of input audio files.
    """
    if metainfo_file_path is None:
        num_frames_info = [None] * len(in_audios)
    else:
        # Builtin int/str are used because the np.int / np.unicode aliases
        # were removed in NumPy 1.24 and raise AttributeError there.
        train_df = pd.read_csv(
            metainfo_file_path,
            sep="\t",
            index_col=False,
            dtype={"Id": int, "File": str, "Count": int})
        num_frames_info = train_df["Count"].values
        # Explicit raise instead of assert: asserts vanish under `python -O`.
        if len(num_frames_info) != len(in_audios):
            raise ValueError(
                "Meta-information has {} rows but {} audio files were given".format(
                    len(num_frames_info), len(in_audios)))

    # Fill in missing output paths next to the corresponding input file.
    for i, in_audio in enumerate(in_audios):
        if not out_files[i]:
            file_stem, _ = os.path.splitext(in_audio)
            out_files[i] = file_stem + ".npy"

    conv_audios_to_deepspeech(
        audios=in_audios,
        out_files=out_files,
        num_frames_info=num_frames_info,
        deepspeech_pb_path=deepspeech_pb_path)
def main():
    """
    Main body of script.

    Parses the command line, resolves the DeepSpeech model file and runs
    feature extraction either for a single audio file or for every ".wav"
    file (case-insensitive extension) found directly inside a directory.

    The model path is taken from `--deepspeech` when given, otherwise from a
    default location under `~/.tensorflow/models`. If the resulting path does
    not exist, `get_deepspeech_model_file()` is used to obtain one.

    Raises
    ------
    FileNotFoundError
        If the `--input` path does not exist.
    """
    args = parse_args()

    in_audio = os.path.expanduser(args.input)
    if not os.path.exists(in_audio):
        raise FileNotFoundError(
            "Input file/directory doesn't exist: {}".format(in_audio))

    # Honour an explicit --deepspeech argument; previously it was always
    # overwritten by this hard-coded location, which is now only the default.
    default_model_path = "~/.tensorflow/models/deepspeech-0_1_0-b90017e8.pb"
    deepspeech_pb_path = os.path.expanduser(
        args.deepspeech or default_model_path)
    if not os.path.exists(deepspeech_pb_path):
        deepspeech_pb_path = get_deepspeech_model_file()

    if os.path.isfile(in_audio):
        extract_features(
            in_audios=[in_audio],
            out_files=[args.output],
            deepspeech_pb_path=deepspeech_pb_path,
            metainfo_file_path=args.metainfo)
    else:
        # Directory mode: collect regular *.wav files in a deterministic
        # (sorted) order; --output is not used here, output paths are derived
        # from the input paths by extract_features.
        audio_file_paths = sorted(
            os.path.join(in_audio, file_name)
            for file_name in os.listdir(in_audio)
            if os.path.isfile(os.path.join(in_audio, file_name))
            and os.path.splitext(file_name)[1].lower() == ".wav")
        out_file_paths = [""] * len(audio_file_paths)
        extract_features(
            in_audios=audio_file_paths,
            out_files=out_file_paths,
            deepspeech_pb_path=deepspeech_pb_path,
            metainfo_file_path=args.metainfo)
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()