Spaces:

Hatman
/

Audio-Emotion-Recognition

Running on Zero

File size: 1,673 Bytes

d1fb9a5
 
 
675e1e5
a1288b8
d1fb9a5
a1288b8
 
 
864e9d8
a1288b8
 
90f8c97
fb7a30b
675e1e5
 
 
d1fb9a5
 
a1288b8
864e9d8
675e1e5
90f8c97
a1288b8
 
 
 
 
 
675e1e5
a1288b8
864e9d8
 
 
 
 
a1288b8
 
 
d1fb9a5
864e9d8

import gradio as gr
import spaces
import torch
import torchaudio
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

# Use the first CUDA device when torch reports one; otherwise fall back to CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# Checkpoint identifier passed to both from_pretrained calls below, so the
# feature extractor and the classifier always come from the same source.
model_name = "Hemg/human-emotion-detection"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
# Module.to() moves the parameters and returns the module itself.
model = model.to(device)

def preprocess_audio(audio):
    """Load an audio file and convert it to mono samples at 16 kHz.

    Args:
        audio: Path (or other source accepted by ``torchaudio.load``) of the
            audio clip to read.

    Returns:
        A dict with two keys:
            'speech': 1-D numpy array holding the mono, resampled samples.
            'sampling_rate': the sampling rate of 'speech' (always 16000).
    """
    target_rate = 16000
    waveform, sampling_rate = torchaudio.load(audio)

    # torchaudio.load returns a (channels, frames) tensor by default.
    # Flattening a multi-channel tensor would lay the channels end to end
    # (left channel followed by right channel) instead of producing mono
    # audio, so average the channels first.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=target_rate)
    resampled_waveform = resampler(waveform)
    return {'speech': resampled_waveform.numpy().flatten(), 'sampling_rate': target_rate}

@spaces.GPU
def inference(audio):
    """Classify one audio clip and return the prediction in JSON-friendly form.

    Args:
        audio: Path of the audio clip to classify, or None when nothing was
            submitted.

    Returns:
        A 3-tuple ``(label, logits, predicted_ids)``:
            label: entry of ``model.config.id2label`` for the top class.
            logits: raw model logits as nested Python lists of floats.
            predicted_ids: argmax class indices as a Python list of ints.

    Raises:
        gr.Error: if ``audio`` is None (no clip was uploaded or recorded).
    """
    if audio is None:
        # Surface a readable message in the UI instead of an opaque loader error.
        raise gr.Error("Please upload or record an audio clip first.")

    example = preprocess_audio(audio)
    inputs = feature_extractor(
        example['speech'],
        sampling_rate=example['sampling_rate'],
        return_tensors="pt",
        padding=True,
    )
    inputs = inputs.to(device)  # Same device as the model weights.

    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Tensors (especially ones living on the GPU) are not JSON-serializable;
    # move them to the CPU and convert to plain lists before handing them to
    # the JSON output components.
    label = model.config.id2label[predicted_ids.item()]
    return label, logits.cpu().tolist(), predicted_ids.cpu().tolist()
    

# Input widget; with type="filepath" the value handed to `inference` is passed
# straight on to torchaudio.load by preprocess_audio.
audio_input = gr.Audio(type="filepath")

# Output widgets, listed in the same order as the three values that
# `inference` returns.
output_components = [
    gr.Label(label="Predicted Sentiment"),
    gr.JSON(label="Logits"),
    gr.JSON(label="Predicted ID"),
]

iface = gr.Interface(
    fn=inference,
    inputs=audio_input,
    outputs=output_components,
    title="Audio Sentiment Analysis",
    description="Upload an audio file or record one to analyze sentiment.",
)


# Start serving the interface; share=True is forwarded to Gradio unchanged.
iface.launch(share=True)