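"""Gradio demo for speech classification.

Loads a pre-trained Keras model and classifies an uploaded or recorded audio
clip into one of four Indonesian word-class labels (kata_sifat = adjective,
kata_benda = noun, kata_kerja = verb, kata_keterangan = adverb), returning the
label together with a waveform plot and a radar chart of class probabilities.
"""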
import os

import gradio as gr
import librosa
import matplotlib.pyplot as plt
import numpy as np
from keras.models import load_model

from radar_chart import radar_factory  # local helper: matplotlib radar-chart factory



# Load the pre-trained Keras model shipped with the repo.
model = load_model(os.path.join("model", "Emotion_Voice_Detection_Model_tuned_2.h5"))


def convert_class_to_emotion(pred):
    """Convert an integer class prediction into a human-readable label."""

    # Original eight emotion labels, kept for reference:
    # label_conversion = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad',
    #                     4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}

    # Indonesian word-class labels: kata_sifat = adjective, kata_benda = noun,
    # kata_kerja = verb, kata_keterangan = adverb.
    label_conversion = {0: 'kata_sifat',
                        1: 'kata_benda',
                        2: 'kata_kerja',
                        3: 'kata_keterangan'}

    return label_conversion[int(pred)]
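
# Quick sanity check of the mapping, e.g.:
#     convert_class_to_emotion(2)  # -> 'kata_kerja'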


def make_predictions(file, micro=None):
    """Load an audio clip (file upload or microphone), extract MFCC features,
    and return the predicted label plus two diagnostic plots."""
    # Prefer whichever input was actually provided.
    if file is not None:
        input_audio = file
    elif micro is not None:
        input_audio = micro
    else:
        raise ValueError("No audio provided: upload a file or record from the microphone.")

    data, sampling_rate = librosa.load(input_audio)
    print(f"Sampling rate: {sampling_rate} Hz")

    # 40 MFCC coefficients averaged over time -> feature vector of shape (40,).
    mfccs = np.mean(librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=40).T, axis=0)
    # Reshape to (1, 40, 1): a batch of one 40-step sequence with one channel.
    x = np.expand_dims(mfccs, axis=1)
    x = np.expand_dims(x, axis=0)

    # Run the model once and reuse the probabilities for both outputs.
    probabilities = model.predict(x)[0]
    prediction = np.argmax(probabilities)

    # Radar chart of the four class probabilities. The number of spokes and
    # their order must match the class order in convert_class_to_emotion.
    N = 4
    theta = radar_factory(N, frame='polygon')
    spoke_labels = np.array(['kata_sifat',
                             'kata_benda',
                             'kata_kerja',
                             'kata_keterangan'])
    fig_radar, axs = plt.subplots(figsize=(8, 8), nrows=1, ncols=1,
                                  subplot_kw=dict(projection='radar'))
    axs.plot(theta, probabilities, color="b")
    axs.fill(theta, probabilities, alpha=0.3)
    axs.set_varlabels(spoke_labels)

    # Waveform plot of the raw signal.
    fig = plt.figure()
    plt.plot(data, alpha=0.8)
    plt.xlabel("time (samples)")
    plt.ylabel("amplitude")

    return convert_class_to_emotion(prediction), fig, fig_radar
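
# Minimal offline sanity check, assuming a short clip exists at the
# hypothetical path "examples/sample.wav":
#     label, waveform_fig, radar_fig = make_predictions("examples/sample.wav")
#     print(label)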



# Build the Gradio interface: audio in (file upload or microphone),
# predicted label plus waveform and radar plots out.
iface = gr.Interface(
    fn=make_predictions,
    title="Identify the emotion of a chunk of audio speech",
    description="A simple interface to perform emotion recognition from an audio file.",
    article="Author: <a href=\"https://huggingface.co/poisso\">Poisso</a>.",
    inputs=[gr.Audio(source="upload", type="filepath", label="File"),
            gr.Audio(source="microphone", type="filepath", streaming=False, label="Microphone")],
    # Each example fills only the file input; the microphone slot is left None
    # so every example row matches the two input components.
    examples=[[os.path.join("examples", filename), None]
              for filename in os.listdir("examples")],
    outputs=[gr.Textbox(label="Text output"), gr.Plot(), gr.Plot()],
)
iface.launch(debug=True)
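# Note: launch(debug=True) blocks the process and prints full tracebacks;
# adding share=True (a standard Gradio option, not used by this app) would
# also expose a temporary public URL.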