import torch
import numpy as np
import soundfile as sf
import gradio as gr
from transformers import AutoProcessor, BarkModel, pipeline

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Whisper handles the speech recognition/translation step, MMS-LID identifies
# the spoken language, and Bark synthesises the Chinese speech output.
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)
label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
model = model.to(device)
synthesised_rate = model.generation_config.sample_rate
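
# Optional Bark speed-ups from the transformers docs could be applied here
# (assumption: optimum and accelerate are installed), e.g.:
#
#   model = model.to_bettertransformer()  # fused attention kernels
#   model.enable_cpu_offload()            # park idle sub-models on the CPU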

def translate(audio_file):
    audio, sampling_rate = sf.read(audio_file)
    # Pass the sampling rate so the pipeline can resample to the 16 kHz
    # Whisper expects; forcing task="transcribe" with language="chinese"
    # makes Whisper emit Chinese text whatever the input language, which
    # serves as the translation step of this cascade.
    outputs = pipe(
        {"array": audio, "sampling_rate": sampling_rate},
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "chinese"},
    )
    # Independently classify which language was actually spoken.
    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
    label_outputs = {pred["label"]: pred["score"] for pred in language_prediction}
    return outputs["text"], label_outputs

def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    inputs = processor(text_prompt, voice_preset=voice_preset)
    # pad_token_id is set explicitly so generate() does not warn about a
    # missing padding token.
    speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
    return speech_output

def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    translated_text, label_outputs = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)
    # Move the waveform off the GPU and convert float [-1, 1] audio to the
    # int16 range Gradio expects.
    synthesised_speech = (synthesised_speech.cpu().numpy() * 32767).astype(np.int16)
    return (synthesised_rate, synthesised_speech.T), translated_text, label_outputs
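
# Minimal sketch of driving the pipeline without the UI (assumption: one of
# the bundled sample files listed below, e.g. "./fr.wav", is present):
#
#   (rate, waveform), text, langs = speech_to_speech_translation("./fr.wav")
#   sf.write("out.wav", waveform, rate)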

title = "外国话转中文话"
description = """
本演示调用了三个自然语言处理的大模型,一个用于将外国话翻译成中文,一个用于判断说的哪个国家的话,一个用于将中文转成语音输出。同时支持语音上传和麦克风输入转换速度比较慢因为租不起GPU的服务器(支出增加200倍),建议您通过已经缓存Examples体验效果。欢迎添加我的微信号:ESGGTP 与我的平行人交流。
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
# Bundled sample clips in various languages; each sublist supplies the single
# audio input of the file interface.
examples = [
    ["./cs.wav"],
    ["./de.wav"],
    ["./fr.wav"],
    ["./it.wav"],
    ["./nl.wav"],
    ["./pl.wav"],
    ["./ro.wav"],
    ["./hr.wav"],
    ["./fi.wav"],
    ["./sl.wav"],
]

# Two interfaces share the same pipeline: one for uploaded files (with the
# cached examples above) and one for live microphone input.
demo = gr.Blocks()
file_transcribe = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
        gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
    examples=examples,
)
mic_transcribe = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
        gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
)
with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["Transcribe Audio File", "Transcribe Microphone"],
    )

# share=True additionally prints a temporary public link for the demo.
demo.launch(share=True)