Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
import librosa | |
from huggingface_hub import hf_hub_download | |
from onnxruntime import InferenceSession | |
import numpy as np | |
# Download the ONNX weights from the Hugging Face Hub, then open an
# onnxruntime inference session on the path that the download returns.
model = hf_hub_download(
    repo_id="hon9kon9ize/yue-tonenet",
    filename="model.onnx",
)
session = InferenceSession(model)
def extract_feature(filepath: str, sampling_rate: int = 22050) -> np.ndarray:
    """Turn an audio recording into a fixed-length log-mel spectrogram.

    The recording is loaded at ``sampling_rate``, cropped by 100 ms at each
    end, trimmed of leading/trailing silence, time-stretched to 0.5 s and
    converted to a 64-band mel spectrogram in dB relative to its peak.

    Args:
        filepath: Path of the audio file to load.
        sampling_rate: Sampling rate the audio is resampled to on load.

    Returns:
        The dB-scaled mel spectrogram with 64 mel bands on the first axis
        and time frames on the second.

    Raises:
        ValueError: If nothing is left of the recording after the margins
            and the silence have been removed.
    """
    waveform, _ = librosa.load(filepath, sr=sampling_rate)

    # Drop 100 ms from both ends. The end index is written out explicitly
    # because ``waveform[0:-0]`` would be empty if the margin were zero.
    margin = int(sampling_rate * 0.1)
    waveform = waveform[margin:len(waveform) - margin]
    if waveform.size == 0:
        raise ValueError("Audio is too short to extract features from.")

    waveform, _ = librosa.effects.trim(
        waveform, top_db=40, frame_length=1024, hop_length=256)
    if waveform.size == 0:
        raise ValueError("Audio contains no signal after silence trimming.")

    # Stretching by ``duration * 2`` divides the length by that factor, so
    # every clip comes out 0.5 s long regardless of its original duration.
    duration = len(waveform) / sampling_rate
    rate = duration * 2
    stretched = librosa.effects.time_stretch(y=waveform, rate=rate)

    # The mel filterbank is limited to 50-350 Hz.
    # NOTE(review): presumably chosen to cover speaking pitch - confirm.
    mel_power = librosa.feature.melspectrogram(
        y=stretched,
        sr=sampling_rate,
        n_mels=64,
        n_fft=2048,
        hop_length=16,
        fmin=50,
        fmax=350,
    )
    return librosa.power_to_db(S=mel_power, ref=np.max)
def predict(filepath):
    """Run the ONNX model on one recording and return per-label confidences.

    Args:
        filepath: Path of the recorded audio file, or ``None`` when no
            recording was supplied.

    Returns:
        The error-message string when ``filepath`` is ``None``. Otherwise a
        dict mapping the labels ``"1"`` to ``"6"`` to softmax probabilities
        as floats rounded to two decimals (the ``dict[str, float]`` form a
        Gradio label output expects).
    """
    if filepath is None:
        return "Input Error! Please enter one audio!"

    features = extract_feature(filepath)
    # NOTE(review): reshape reinterprets the feature buffer as
    # (batch, 1, 690, 64); it does not transpose the (mel, frame) axes.
    # Left as-is on the assumption that the model was trained with this
    # exact layout - confirm before changing.
    batch = features.reshape(-1, 1, 690, 64)

    input_name = session.get_inputs()[0].name
    output_name = session.get_outputs()[0].name
    # First requested output, first item of the batch.
    logits = session.run([output_name], {input_name: batch})[0][0]

    probs = torch.softmax(torch.tensor(logits), dim=0).tolist()
    # Labels are 1-based; only the first six outputs are reported.
    return {
        str(label): round(prob, 2)
        for label, prob in enumerate(probs[:6], start=1)
    }
# Microphone-only audio input; the recording is passed to `predict` as a
# file path.
inputs = gr.Audio(
    sources="microphone",
    type="filepath",
    label="Recording",
)

# Build the interface around `predict` and start serving it.
app = gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs="label",
    title="Cantonese ToneNet",
    allow_flagging="never",
)
app.launch()