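# yue-tonenet / app.py
# Scraped Hugging Face file-page header, kept as comments so the module parses:
#   uploader avatar: indiejoseph | last commit: "Update app.py" (4a4cc1f, verified)
#   page links: raw, history, blame, contribute, delete | scan: no virus | size: 1.57 kB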
import gradio as gr
import torch
import librosa
from huggingface_hub import hf_hub_download
from onnxruntime import InferenceSession
import numpy as np
# Fetch the ONNX weights from the Hugging Face Hub (hf_hub_download returns the
# local path of the downloaded file) and open one inference session at import
# time so that every request reuses it.
model = hf_hub_download(repo_id="hon9kon9ize/yue-tonenet", filename="model.onnx")
session = InferenceSession(model)
def extract_feature(filepath, sampling_rate=22050):
    """Turn one recording into a fixed-duration log-mel spectrogram.

    The clip is loaded at ``sampling_rate``, 0.1 s is cut from each end,
    leading/trailing audio quieter than 40 dB below the peak is trimmed, and
    the remainder is time-stretched to last 0.5 s so that every clip yields
    the same number of spectrogram frames.

    Args:
        filepath: Path to an audio file readable by ``librosa.load``.
        sampling_rate: Sample rate the audio is resampled to on load.

    Returns:
        The mel power spectrogram converted to dB relative to its maximum,
        as a 2-D array of shape ``(64, n_frames)``.

    Raises:
        ValueError: If no samples are left after cutting the margins or
            trimming silence (recording too short or entirely silent).
    """
    samples = librosa.load(filepath, sr=sampling_rate)[0]

    # Drop 0.1 s at both ends. The explicit end index (rather than ``-margin``)
    # keeps the slice correct even if ``margin`` evaluates to 0.
    margin = int(sampling_rate * 0.1)
    samples = samples[margin:len(samples) - margin]
    if len(samples) == 0:
        raise ValueError(
            "Recording is too short: nothing is left after removing the "
            "0.1 s margins."
        )

    voiced, _ = librosa.effects.trim(
        samples, top_db=40, frame_length=1024, hop_length=256)
    if len(voiced) == 0:
        # Without this guard the stretch rate below would be 0, which
        # librosa rejects with a much less helpful message.
        raise ValueError(
            "Recording contains no audio after trimming silence.")

    # Stretch (or compress) to a fixed 0.5 s: rate = duration / 0.5.
    duration = len(voiced) / sampling_rate
    rate = duration * 2  # 0.5s
    stretched = librosa.effects.time_stretch(y=voiced, rate=rate)

    # Narrow 50-350 Hz band with a dense hop of 16 samples.
    # NOTE(review): the band presumably targets the voice pitch range -- confirm.
    mel_feat = librosa.feature.melspectrogram(
        y=stretched, sr=sampling_rate, n_mels=64, n_fft=2048, hop_length=16,
        fmin=50, fmax=350)
    return librosa.power_to_db(S=mel_feat, ref=np.max)
def predict(filepath):
    """Classify the tone of one recording with the ONNX model.

    Args:
        filepath: Path to the recorded audio file, or ``None`` when no audio
            was supplied.

    Returns:
        The string ``"Input Error! Please enter one audio!"`` when
        ``filepath`` is ``None``; otherwise a dict mapping the labels ``"1"``
        to ``"6"`` to their softmax probability, rounded to two decimals and
        returned as floats (the value type a Gradio label output expects).
    """
    if filepath is None:
        return "Input Error! Please enter one audio!"

    features = extract_feature(filepath)
    # NOTE(review): this is a plain reshape of the (64, n_frames) spectrogram
    # into (batch, 1, 690, 64), not a transpose -- presumably it matches how
    # the model was trained; confirm before changing.
    batch = features.reshape(-1, 1, 690, 64)

    input_name = session.get_inputs()[0].name
    output_name = session.get_outputs()[0].name
    logits = session.run([output_name], {input_name: batch})[0]

    # Only the first item of the batch is scored.
    probs = torch.softmax(torch.tensor(logits[0]), dim=0)

    # Return real floats instead of formatted strings; rounding keeps the
    # same two-decimal precision as before.
    return {str(i + 1): round(probs[i].item(), 2) for i in range(6)}
# Microphone recorder; the recording is passed to predict() as a file path.
inputs = gr.Audio(
    sources="microphone",
    type="filepath",
    label="Recording",
)

# Build the single-function interface and start serving it.
app = gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs="label",
    title="Cantonese ToneNet",
    allow_flagging="never",
)
app.launch()