talk_with_wind / app.py
aps's picture
Commit efficientat
4848335
raw
history blame
3.17 kB
import gradio as gr
import torch
import numpy as np
import librosa
from efficientat.models.MobileNetV3 import get_model as get_mobilenet, get_ensemble_model
from efficientat.models.preprocess import AugmentMelSTFT
from efficientat.helpers.utils import NAME_TO_WIDTH, labels
from torch import autocast
from contextlib import nullcontext
# Identifier of the pretrained checkpoint; it is also used to look up the
# matching width multiplier via NAME_TO_WIDTH.
MODEL_NAME = "mn40_as"

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the network once at import time so every request reuses it.
model = get_mobilenet(
    pretrained_name=MODEL_NAME,
    width_mult=NAME_TO_WIDTH(MODEL_NAME),
)
model.to(device)
# Inference only: switch the model to evaluation mode.
model.eval()
def audio_tag(
    audio_path,
    sample_rate=32000,
    window_size=800,
    hop_size=320,
    n_mels=128,
    cuda=True,
    top_k=10,
):
    """Tag an audio file and return its most probable class labels.

    The file is loaded as mono audio resampled to ``sample_rate``, converted
    to a mel spectrogram with ``AugmentMelSTFT``, and passed through the
    module-level ``model``. The sigmoid-activated scores are ranked and the
    names of the best-scoring classes are looked up in ``labels``.

    Args:
        audio_path: Path of the audio file to analyse.
        sample_rate: Sampling rate the audio is resampled to on load.
        window_size: Passed to ``AugmentMelSTFT`` as ``win_length``.
        hop_size: Passed to ``AugmentMelSTFT`` as ``hopsize``.
        n_mels: Number of mel bands requested from ``AugmentMelSTFT``.
        cuda: When true and CUDA is available, run the forward pass under
            ``torch.autocast``; otherwise run it in the default precision.
        top_k: Maximum number of labels to return (non-negative). Fewer are
            returned when the model produces fewer scores.

    Returns:
        The selected label names, highest score first, one per line.
    """
    waveform, _ = librosa.core.load(audio_path, sr=sample_rate, mono=True)

    mel = AugmentMelSTFT(
        n_mels=n_mels,
        sr=sample_rate,
        win_length=window_size,
        hopsize=hop_size,
    )
    mel.to(device)
    mel.eval()

    # Add a leading batch axis before moving the samples to the model device.
    waveform = torch.from_numpy(waveform[None, :]).to(device)

    # our models are trained in half precision mode (torch.float16)
    # run on cuda with torch.float16 to get the best performance
    # running on cpu with torch.float32 gives similar performance, using
    # torch.bfloat16 is worse
    use_autocast = cuda and torch.cuda.is_available()
    if use_autocast:
        precision_context = autocast(device_type=device.type)
    else:
        precision_context = nullcontext()

    with torch.no_grad(), precision_context:
        spec = mel(waveform)
        # The model also returns features; only the class scores are needed.
        preds, _ = model(spec.unsqueeze(0))

    # Cast back to float32 before the sigmoid so the scores are full precision.
    preds = torch.sigmoid(preds.float()).squeeze().cpu().numpy()

    # Indexes ordered from highest to lowest score; slicing clamps to the
    # number of available classes instead of raising IndexError.
    top_indexes = np.argsort(preds)[::-1][:top_k]
    return "\n".join(labels[index] for index in top_indexes)
def formatted_message(audio_length, audio_class, userText):
    """Build the role-play prompt for an audio source and send it to the API.

    The prompt consists of a fixed instruction preamble followed by a
    per-request section carrying the detected source, the audio length and
    the human's message. The two parts are joined with a newline so the
    preamble's closing "Let's begin:" does not run into the "Source:" line.

    Args:
        audio_length: Length of the audio in seconds; interpolated as-is.
        audio_class: Name of the audio source the model should embody.
        userText: The human's text input for the conversation.

    Returns:
        Whatever ``call_api`` returns for the assembled prompt.
    """
    prefix = '''You are going to act as a magical tool that allows for humans to communicate with non-human entities like
rocks, crackling fire, trees, animals, and the wind. In order to do this, we're going to provide you a data string which
represents the audio input, the source of the audio, and the human's text input for the conversation.
The goal is for you to embody the source of the audio, and use the length and variance in the signal data to produce
plausible responses to the humans input. Remember to embody the the source data. When we start the conversation,
you should generate a "personality profile" for the source and utilize that personality profile in your responses.
Let's begin:'''
    suffix = f'''Source: {audio_class}
Length of Audio in Seconds: {audio_length}
Human Input: {userText}
{audio_class} Response:'''
    # Separate the preamble from the request data with an explicit newline.
    template = f"{prefix}\n{suffix}"
    return call_api(template)
def call_api(message):
    """Return the reply for *message*; currently a stub that echoes it back.

    No remote service is contacted: the argument is returned unchanged.

    Args:
        message: The prompt text that would be sent to the API.

    Returns:
        ``message`` itself.
    """
    # TODO: wire up the real backend. The previously sketched (disabled)
    # implementation issued an HTTP GET to f'{api}?q={message}', returned the
    # response text with its first two lines dropped on status 200, and
    # otherwise returned a "quite busy right now, try again later" apology.
    # It relied on `requests` and an `api` URL, neither of which is visible
    # in this file.
    return message
# Gradio UI: one uploaded audio file in (handed to audio_tag as a file path),
# one text box out showing the predicted labels.
demo = gr.Interface(
    audio_tag,
    gr.Audio(source="upload", type="filepath", label="Your audio"),
    gr.Textbox(),
)

# Launch separately so that `demo` stays bound to the Interface object rather
# than to the value returned by launch().
demo.launch(debug=True)