File size: 2,709 Bytes
d347764
 
 
 
8332aca
 
 
 
 
 
 
 
 
d347764
8717fa8
d347764
8717fa8
d347764
8332aca
 
d347764
8332aca
 
 
d347764
 
 
8332aca
 
d347764
 
 
 
8332aca
 
 
 
d347764
 
 
 
8332aca
 
 
d347764
 
 
 
f805e49
 
8717fa8
8332aca
f805e49
 
 
c737803
 
 
d347764
8332aca
d347764
f805e49
 
d347764
c737803
 
 
8332aca
c737803
 
 
 
 
 
 
8332aca
 
c737803
d347764
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import gradio as gr
import numpy as np
import torch
from datasets import load_dataset
from deep_translator import GoogleTranslator
from transformers import (
    AutoTokenizer,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    VitsModel,
    pipeline,
)

# device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Force CPU inference (GPU selection above deliberately commented out,
# e.g. for a CPU-only hosting tier).
device = "cpu"
# Speech-translation front end: Whisper transcribes input audio; the
# English-translation behavior is requested per-call via
# generate_kwargs={"task": "translate"} (see translate() below).
asr_pipe = pipeline("automatic-speech-recognition",
                    model="openai/whisper-base", device=device)

# Text-to-speech back end: Meta's MMS VITS model for Indonesian.
# NOTE(review): VITS here takes text only — no speaker embeddings are
# passed anywhere in this file, despite the original comment; confirm.
model = VitsModel.from_pretrained("facebook/mms-tts-ind")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-ind")


def translate(audio):
    """Transcribe `audio` and translate it to English text via Whisper.

    `audio` is whatever the ASR pipeline accepts (here: a filepath from
    the Gradio Audio component). Returns the translated transcript.
    """
    result = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "translate"},
    )
    return result["text"]


def synthesise(text):
    """Render `text` to a speech waveform with the MMS-TTS VITS model.

    Returns a CPU float tensor reshaped to (num_samples, 1).
    """
    token_batch = tokenizer(text=text, return_tensors="pt")
    with torch.no_grad():
        waveform = model(**token_batch).waveform
    return waveform.reshape(-1, 1).cpu()


def speech_to_speech_translation(audio):
    """Full cascade: source speech -> English text -> Indonesian text -> speech.

    Returns a ``(sample_rate, int16_waveform)`` tuple as expected by
    ``gr.Audio(type="numpy")``. Sample rate 16 kHz matches the MMS-TTS
    output — TODO confirm against model.config.sampling_rate.
    """
    translated_text = translate(audio)
    google_translated = GoogleTranslator(
        source="en", target="id").translate(translated_text)
    synthesised_speech = synthesise(google_translated)
    # Clip to [-1, 1] before scaling: the synthesised float waveform can
    # slightly exceed that range, and casting e.g. 1.01 * 32767 straight
    # to int16 wraps around to a large negative value (audible click).
    clipped = np.clip(synthesised_speech.numpy(), -1.0, 1.0)
    synthesised_speech = (clipped * 32767).astype(np.int16)
    return 16000, synthesised_speech


# UI metadata shared by both interfaces (tabs) below.
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Indonesian. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech transcription, [Deep Translator](https://github.com/nidhaloff/deep-translator) for translation, and Meta's
[MMS TTS IND](https://huggingface.co/facebook/mms-tts-ind) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

demo = gr.Blocks()

# Two interfaces over the same pipeline, differing only in audio source:
# live microphone capture vs. an uploaded file.
# NOTE(review): gradio 4.x documents `sources` as a list (e.g.
# ["microphone"]) — confirm the installed version accepts a bare string.
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],  # assumes example.wav ships alongside this script — TODO confirm
    title=title,
    description=description,
)

# Present the two interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface([mic_translate, file_translate],
                       ["Microphone", "Audio File"])

demo.launch()