File size: 2,270 Bytes
4f92cf0
 
 
2417027
21f6ca3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2417027
4f92cf0
21f6ca3
4f92cf0
 
 
 
 
 
2417027
21f6ca3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2417027
4f92cf0
 
21f6ca3
 
 
4f92cf0
21f6ca3
4f92cf0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import logging

import gradio as gr
from gradio_client import Client, file
from transformers import pipeline

# Module-level setup: two remote Gradio clients and two local transformers
# pipelines, all created once at import time and shared by the functions below.

# Remote client used by detect_language() for spoken-language identification.
language_classifier = Client("adrien-alloreview/speechbrain-lang-id-voxlingua107-ecapa")
# Remote client used by request_gradio() for speech transcription.
transcriber = Client("tensorlake/audio-extractors")
# Local audio-classification pipeline used by detect_emotion().
# NOTE(review): the checkpoint name suggests an Urdu ("ur") emotion model while
# the rest of the app maps audio to Russian/Kazakh — confirm this is intended.
emotion_detector = pipeline(
    "audio-classification",
    model="HowMannyMore/wav2vec2-lg-xlsr-ur-speech-emotion-recognition",
)
# Checkpoint id shared by the toxicity model and its tokenizer.
model_name_rus = "IlyaGusev/rubertconv_toxic_clf"
# Local text-classification pipeline used by detect_toxic_local(); inputs are
# truncated to at most 512 tokens.
toxic_detector = pipeline(
    "text-classification",
    model=model_name_rus,
    tokenizer=model_name_rus,
    framework="pt",
    max_length=512,
    truncation=True,
    # NOTE(review): device=0 presumably pins the model to the first GPU and is
    # likely to fail on CPU-only hosts — confirm the deployment hardware.
    device=0,
)


def detect_language(file_path):
    """Identify the spoken language of an audio file and map it to a transcription language.

    The audio is sent to the remote ``language_classifier`` endpoint. The
    language name is read from the ``"label"`` field of the response.

    Args:
        file_path: Path to the audio file to classify.

    Returns:
        ``"russian"`` when the detected language is Russian, Belarusian or
        Ukrainian; ``"kazakh"`` for every other detected language.
    """
    result = language_classifier.predict(param_0=file(file_path), api_name="/predict")
    # Labels are expected to look like "ru: Russian" (NOTE(review): confirm
    # against the Space output). Fall back to the whole label when the
    # separator is missing instead of raising IndexError.
    label_parts = result["label"].split(": ")
    language_name = label_parts[1] if len(label_parts) > 1 else label_parts[0]
    # "belarusian" is the correct spelling; the legacy misspelling
    # "belarussian" is kept so any caller relying on it still matches.
    russian_like = {"russian", "belarusian", "belarussian", "ukrainian"}
    if language_name.strip().lower() in russian_like:
        return "russian"
    return "kazakh"


def request_gradio(file_path, language):
    """Transcribe an audio file through the remote ``transcriber`` endpoint.

    Args:
        file_path: Path to the audio file to transcribe.
        language: Language name forwarded to the endpoint (e.g. ``"russian"``).

    Returns:
        Whatever the ``/transcribe`` endpoint returns, or ``None`` when the
        request fails for any reason.
    """
    try:
        return transcriber.predict(
            audio_filepath=file(file_path),
            task="transcribe",
            batch_size=24,
            chunk_length_s=30,
            sampling_rate=16000,
            language=language,
            num_speakers=2,
            min_speakers=2,
            max_speakers=2,
            assisted=False,
            api_name="/transcribe",
        )
    except Exception:
        # Best-effort call: callers receive None on failure, but the error is
        # logged with its traceback instead of being silently discarded.
        logging.getLogger(__name__).exception(
            "Transcription request failed for %s", file_path
        )
        return None


def detect_emotion(audio):
    """Classify the emotion of an audio input with ``emotion_detector``.

    Args:
        audio: Input forwarded unchanged to the audio-classification pipeline.

    Returns:
        The ``"label"`` of the first prediction returned by the pipeline
        (expected to be the highest-scoring one).
    """
    predictions = emotion_detector(audio)
    top_prediction = predictions[0]
    return top_prediction["label"]


def detect_toxic_local(text_whisper):
    """Classify a transcript as toxic or not with ``toxic_detector``.

    Args:
        text_whisper: Transcribed text to classify.

    Returns:
        ``True`` for the ``"toxic"`` label, ``False`` for ``"neutral"``, and
        ``None`` for any other label.
    """
    verdict_by_label = {"toxic": True, "neutral": False}
    predictions = toxic_detector([text_whisper])
    label = predictions[0]["label"]
    # Labels outside the table map to None, matching the fall-through case.
    return verdict_by_label.get(label)


def assessment(file_path):
    """Run the full analysis of one uploaded audio file.

    Detects the spoken language, transcribes the audio, classifies the
    emotion of the recording and the toxicity of the transcript.

    Args:
        file_path: Path to the uploaded audio file.

    Returns:
        A dict with keys ``"emotion"`` (label from ``detect_emotion``) and
        ``"toxic"`` (``True``/``False``, or ``None`` when the transcript is
        unavailable or the label is unrecognised).
    """
    language = detect_language(file_path)
    result_text = request_gradio(file_path, language)
    # The emotion model is an audio classifier, so it must receive the
    # recording itself rather than the transcript.
    result_emotion = detect_emotion(file_path)
    # request_gradio() returns None when transcription fails; skip the text
    # classifier in that case instead of feeding it None.
    if result_text is None:
        result_toxic = None
    else:
        result_toxic = detect_toxic_local(result_text)
    return {"emotion": result_emotion, "toxic": result_toxic}


# Web UI: one uploaded audio file in (handed to assessment() as a file path),
# the assessment dict out as JSON. The app is launched immediately.
gradio_app = gr.Interface(
    assessment,
    gr.Audio(sources=["upload"], type="filepath"),
    "json",
)
gradio_app.launch()