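"""Cascaded speech-to-speech translation (STST) demo.

Maps source speech in multiple languages to Mandarin Chinese speech using
Whisper (speech translation), MMS-LID (language identification) and Bark
(text-to-speech), served through a Gradio interface.
"""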
import gradio as gr
import numpy as np
import soundfile as sf
import torch
from transformers import AutoProcessor, BarkModel, pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
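
# Whisper large-v2 handles the speech-recognition/translation step.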
pipe = pipeline(
"automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)
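
# MMS-LID-126 predicts which of 126 languages is being spoken in the input.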
label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
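
# Bark provides the text-to-speech stage of the cascade.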
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
model = model.to(device)
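# Sample rate of the waveform Bark generates, read from its generation config.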
synthesised_rate = model.generation_config.sample_rate
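

# Step 1: transcribe/translate the input audio and identify its language.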
def translate(audio_file):
    audio, sampling_rate = sf.read(audio_file)
    # Forcing task="transcribe" with language="chinese" makes Whisper decode
    # directly into Mandarin, whatever the source language is.
    outputs = pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "chinese"},
    )
    # Score the input against the 126 languages the LID model knows.
    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
    label_outputs = {}
    for pred in language_prediction:
        label_outputs[pred["label"]] = pred["score"]
    return outputs["text"], label_outputs
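

# Step 2: synthesise Mandarin speech from the translated text with Bark.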
def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    inputs = processor(text_prompt, voice_preset=voice_preset)
    speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
    # Move to CPU so the tensor can be converted to NumPy even when running on GPU.
    return speech_output.cpu()
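

# Full cascade: input speech -> Mandarin text -> Mandarin speech.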
def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    translated_text, label_outputs = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)
    # Convert the float waveform to 16-bit PCM for Gradio's numpy audio output.
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return (synthesised_rate, synthesised_speech.T), translated_text, label_outputs


title = "Multilingual to Chinese (Mandarin) Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping source speech in multiple languages to target speech in Chinese (Mandarin). The demo uses OpenAI's [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, Meta's [MMS-LID-126](https://huggingface.co/facebook/mms-lid-126) model for language identification, and Suno's [Bark](https://huggingface.co/suno/bark) model for text-to-speech:

![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
examples = [
["./cs-CZ.mp3", None],
["./de-DE.mp3", None],
["./es-ES.mp3", None],
["./fr-FR.mp3", None],
["./it-IT.mp3", None],
["./ko-KR.mp3", None],
["./nl-NL.mp3", None],
["./pl-PL.mp3", None],
["./pt-PT.mp3", None],
["./ru-RU.mp3", None],
]
demo = gr.Blocks()
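
# Tab 1: run the cascade on an uploaded audio file.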
file_transcribe = gr.Interface(
fn=speech_to_speech_translation,
inputs=gr.Audio(source="upload", type="filepath"),
outputs=[
gr.Audio(label="Generated Speech", type="numpy"),
gr.Text(label="Transcription"),
gr.Label(label="Language prediction"),
],
title=title,
description=description,
examples=examples,
)
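
# Tab 2: run the cascade on speech recorded from the microphone.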
mic_transcribe = gr.Interface(
fn=speech_to_speech_translation,
inputs=gr.Audio(source="microphone", type="filepath"),
outputs=[
gr.Audio(label="Generated Speech", type="numpy"),
gr.Text(label="Transcription"),
gr.Label(label="Language prediction"),
],
title=title,
description=description,
)
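
# Present the two interfaces as tabs in a single Blocks app and launch it.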
with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["Transcribe Audio File", "Transcribe Microphone"],
    )
demo.launch(share=True)