zongxiao's picture
Update app.py
739aa98
raw
history blame
No virus
3.4 kB
import torch
import numpy as np
import soundfile as sf
from transformers import pipeline
from transformers import BarkModel
from transformers import AutoProcessor
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
"automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)
label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
model = model.to(device)
synthesised_rate = model.generation_config.sample_rate
def translate(audio_file):
audio, sampling_rate = sf.read(audio_file)
outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
label_outputs = {}
for pred in language_prediction:
label_outputs[pred["label"]] = pred["score"]
return outputs["text"],label_outputs
def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
inputs = processor(text_prompt, voice_preset=voice_preset)
speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
return speech_output
def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
translated_text, label_outputs= translate(audio)
synthesised_speech = synthesise(translated_text,voice_preset)
synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
return (synthesised_rate , synthesised_speech.T),translated_text,label_outputs
title = "外国话转中文话"
description = """
本演示调用了三个自然语言处理的大模型,一个用于将外国话翻译成中文,一个用于判断说的哪个国家的话,一个用于将中文转成语音输出。同时支持语音上传和麦克风输入转换速度比较慢因为租不起GPU的服务器(支出增加200倍),建议您通过已经缓存Examples体验效果。欢迎添加我的微信号:ESGGTP 与我的平行人交流。
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
examples = [
["./cs.wav", None],
["./de.wav", None],
["./fr.wav", None],
["./it.wav", None],
["./nl.wav", None],
["./pl.wav", None],
["./ro.wav", None],
["./hr.wav", None],
["./fi.wav", None],
["./sl.wav", None],
]
import gradio as gr
demo = gr.Blocks()
file_transcribe = gr.Interface(
fn=speech_to_speech_translation,
inputs=gr.Audio(source="upload", type="filepath"),
outputs=[
gr.Audio(label="Generated Speech", type="numpy"),
gr.Text(label="Transcription"),
gr.Label(label="Language prediction"),
],
title=title,
description=description,
examples=examples,
)
mic_transcribe = gr.Interface(
fn=speech_to_speech_translation,
inputs=gr.Audio(source="microphone", type="filepath"),
outputs=[
gr.Audio(label="Generated Speech", type="numpy"),
gr.Text(label="Transcription"),
gr.Label(label="Language prediction"),
],
title=title,
description=description,
)
with demo:
gr.TabbedInterface(
[file_transcribe, mic_transcribe],
["Transcribe Audio File", "Transcribe Microphone"],
)
demo.launch(share=True)