import gradio as gr
import numpy as np
import torch
from deep_translator import GoogleTranslator
from transformers import AutoTokenizer, VitsModel, pipeline

# device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"
# load speech translation checkpoint
asr_pipe = pipeline("automatic-speech-recognition",
model="openai/whisper-base", device=device)
# load the MMS TTS Indonesian text-to-speech model (a VITS model, so unlike
# SpeechT5 it requires no speaker embeddings)
model = VitsModel.from_pretrained("facebook/mms-tts-ind")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-ind")
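# MMS TTS generates audio at 16 kHz; instead of hard-coding the rate below,
# it can also be read from model.config.sampling_rate.
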
def translate(audio):
    # Whisper's "translate" task maps the source speech directly to English text
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
    return outputs["text"]

def synthesise(text):
    inputs = tokenizer(text=text, return_tensors="pt")
    with torch.no_grad():
        speech = model(**inputs).waveform
    # return a mono (samples, 1) waveform on the CPU
    return speech.reshape(-1, 1).cpu()

def speech_to_speech_translation(audio):
    # cascade: source speech -> English text -> Indonesian text -> Indonesian speech
    translated_text = translate(audio)
    google_translated = GoogleTranslator(source="en", target="id").translate(translated_text)
    synthesised_speech = synthesise(google_translated)
    # convert the float waveform to 16-bit PCM for Gradio's numpy audio output
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return 16000, synthesised_speech

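# A minimal standalone check (a sketch; assumes the local recording
# "./example.wav" referenced in the examples below exists):
# if __name__ == "__main__":
#     rate, waveform = speech_to_speech_translation("./example.wav")
#     print(f"{rate} Hz, {waveform.shape[0]} samples, dtype={waveform.dtype}")
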
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Indonesian. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech transcription, [Deep Translator](https://github.com/nidhaloff/deep-translator) for translation, and Meta's
[MMS TTS IND](https://huggingface.co/facebook/mms-tts-ind) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

with demo:
    gr.TabbedInterface(
        [mic_translate, file_translate], ["Microphone", "Audio File"]
    )

demo.launch()
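# Optional: pass share=True to demo.launch() to also serve a temporary public link.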