File size: 2,914 Bytes
f45be4e
e4f7433
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42924fc
e4f7433
964a95a
e4f7433
 
 
964a95a
 
 
e4f7433
 
 
 
 
 
 
964a95a
 
 
 
e4f7433
 
 
 
 
42924fc
 
 
 
e4f7433
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import gradio as gr
import librosa
import torch

from transformers import SpeechT5Processor, SpeechT5ForSpeechToText


# Load the pretrained SpeechT5 ASR checkpoint once at module import time so
# every request reuses the same processor/model instances.
# NOTE: this downloads weights from the Hugging Face Hub on first run.
checkpoint = "microsoft/speecht5_asr"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForSpeechToText.from_pretrained(checkpoint)


def process_audio(sampling_rate, waveform):
    """Prepare raw audio for the SpeechT5 processor.

    Args:
        sampling_rate: sample rate of ``waveform`` in Hz.
        waveform: numpy array of int16 samples as produced by
            ``gr.Audio(type="numpy")`` — shape ``(frames,)`` for mono or
            ``(frames, channels)`` for stereo.

    Returns:
        A 1-D float ``torch.Tensor`` at 16 kHz, truncated to at most
        30 seconds of audio.
    """
    # convert from int16 to floating point in [-1.0, 1.0]
    # (full-scale int16 is 32768; the previous divisor 32678 was a typo that
    # left samples slightly outside the expected range)
    waveform = waveform / 32768.0

    # convert to mono if stereo
    if len(waveform.shape) > 1:
        waveform = librosa.to_mono(waveform.T)

    # resample to 16 kHz if necessary
    if sampling_rate != 16000:
        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=16000)

    # limit to 30 seconds to bound generation time and memory
    waveform = waveform[:16000*30]

    # make PyTorch tensor
    waveform = torch.tensor(waveform)
    return waveform


def predict(audio, mic_audio=None):
    """Transcribe an audio clip with the SpeechT5 ASR model.

    Args:
        audio: uploaded clip as a ``(sample_rate, frames)`` tuple, where
            frames may be ``(frames,)`` or ``(frames, channels)``.
        mic_audio: microphone clip in the same format; takes precedence
            over ``audio`` when both are supplied.

    Returns:
        The decoded transcription string, or a placeholder message when
        no audio was provided.
    """
    # Prefer the microphone recording over the uploaded file.
    source = mic_audio if mic_audio is not None else audio
    if source is None:
        return "(please provide audio)"

    sampling_rate, waveform = source
    features = process_audio(sampling_rate, waveform)
    model_inputs = processor(audio=features, sampling_rate=16000, return_tensors="pt")
    generated = model.generate(**model_inputs, max_length=400)
    decoded = processor.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]


# UI copy shown by the Gradio interface.
title = " 😍🥰Prolove  🧑‍🎤 👨‍🎤 "

# Indonesian: "the prolove app helps correct the spelling of English words
# spoken by the user".
description = """aplikasi prolove merupakan aplikasi untuk membantu ejaan kata yang diucapkan oleh user dalam bahasa inggris menjadi benar"""

# HTML footer. Fixed broken markup: the citation block closed a <pre> that
# was never opened, and the credits heading used <p>...<p> instead of
# <p>...</p>.
article = """
<div style='margin:20px auto;'>
<pre>
@article{Ao2021SpeechRecog,
  title   = {PROLOVE},
  author  = {M_ALVI_ADNAN},
  archivePrefix={arXiv},
  primaryClass={eess.AS},
  year={2021}
}
</pre>
<p>Example sound credits:</p>
<ul>
<li>"i wanna tell u smth <a href="https://freesound.org/people/InspectorJ/sounds/519189/">InspectorJ</a> (CC BY 4.0 license)
<li>"let me know <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)
<li>"lets do it  <a href="https://freesound.org/people/JoyOhJoy/sounds/165348/">JoyOhJoy</a> (CC0 license)
<li>"listen to me  <a href="https://freesound.org/people/Sample_Me/sounds/610529/">Sample_Me</a> (CC0 license)
</ul>
</div>
"""

# Each entry fills (upload, microphone) inputs; microphone is left None.
examples = [
    ["examples/I wanna tell you something_alvi.wav", None],
    ["examples/Let me know_fazrin.wav", None],
    ["examples/Lets do it_arka.wav", None],
    ["examples/Listen to me_shifa.wav", None],
]

# Wire the demo UI: two optional audio inputs (file upload and microphone)
# feed `predict`, which returns a single transcription string.
# NOTE(review): `source=` on gr.Audio was removed in Gradio 4.x (replaced by
# `sources=[...]`); this code assumes a Gradio 3.x install — confirm the
# pinned version.
gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(label="Upload Speech", source="upload", type="numpy"),
        gr.Audio(label="Record Speech", source="microphone", type="numpy"),
    ],
    outputs=[
        gr.Text(label="Transcription"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()