speech-to-speech

Paused

App Files Files Community

speech-to-speech / app.py

zongxiao

Update app.py

739aa98 12 months ago

raw

history blame

No virus

3.4 kB

	import torch
	import numpy as np
	import soundfile as sf
	from transformers import pipeline
	from transformers import BarkModel
	from transformers import AutoProcessor

	# Run every model on the first CUDA device when one is available, else on CPU.
	device = "cuda:0" if torch.cuda.is_available() else "cpu"

	# Whisper speech-recognition pipeline: turns the input audio into text.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model="openai/whisper-large-v2",
	    device=device,
	)

	# MMS audio-classification pipeline used for the language prediction scores.
	label = pipeline(
	    task="audio-classification",
	    model="facebook/mms-lid-126",
	    device=device,
	)

	# Bark text-to-speech: the processor prepares prompts, the model generates audio.
	processor = AutoProcessor.from_pretrained("suno/bark")
	model = BarkModel.from_pretrained("suno/bark").to(device)

	# Sample rate of the generated audio, read from Bark's generation config.
	synthesised_rate = model.generation_config.sample_rate

	def translate(audio_file):
	    """Turn a speech file into text and score which language is spoken.

	    Args:
	        audio_file: Path (or file-like object) that ``soundfile.read`` can open.

	    Returns:
	        A ``(text, label_outputs)`` tuple: the text produced by the Whisper
	        pipeline, and a dict mapping each predicted language label to its
	        score.
	    """
	    audio, sampling_rate = sf.read(audio_file)

	    # soundfile yields a (frames, channels) array for multi-channel files,
	    # while both pipelines accept only 1-D audio, so downmix to mono first.
	    if audio.ndim > 1:
	        audio = audio.mean(axis=1)

	    # Pass the real sampling rate along with the samples: a bare array is
	    # assumed to already be at the model's rate, which garbles any file
	    # recorded at a different one.
	    outputs = pipe(
	        {"raw": audio, "sampling_rate": sampling_rate},
	        max_new_tokens=256,
	        # Whisper is asked to transcribe with the output language forced
	        # to Chinese.
	        generate_kwargs={"task": "transcribe", "language": "chinese"},
	    )

	    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
	    label_outputs = {pred["label"]: pred["score"] for pred in language_prediction}
	    return outputs["text"], label_outputs
	def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
	    """Generate speech for ``text_prompt`` with the Bark model.

	    Args:
	        text_prompt: Text to be spoken.
	        voice_preset: Voice preset handed to the Bark processor.

	    Returns:
	        The output of ``model.generate`` for the prompt, returned as-is
	        (it is not moved off ``device``).
	    """
	    bark_inputs = processor(text_prompt, voice_preset=voice_preset)
	    bark_inputs = bark_inputs.to(device)
	    return model.generate(**bark_inputs, pad_token_id=10000)
	def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
	    """Run the full pipeline: input speech -> text -> synthesised speech.

	    Args:
	        audio: Audio file path forwarded to ``translate``.
	        voice_preset: Bark voice preset forwarded to ``synthesise``.

	    Returns:
	        A 3-tuple ``((sample_rate, samples), text, label_outputs)`` where
	        ``samples`` is the transposed int16 waveform, ``text`` is the text
	        returned by ``translate`` and ``label_outputs`` maps language labels
	        to scores.
	    """
	    translated_text, label_outputs = translate(audio)
	    synthesised_speech = synthesise(translated_text, voice_preset)

	    # The generated tensor lives on ``device``; calling .numpy() directly on
	    # a CUDA tensor raises, so bring it to the CPU first (a no-op on CPU).
	    waveform = synthesised_speech.cpu().numpy()

	    # Clip to [-1, 1] before scaling so out-of-range samples saturate
	    # instead of wrapping around in the int16 cast.
	    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

	    return (synthesised_rate, pcm.T), translated_text, label_outputs

	# Heading passed to both Gradio interfaces.
	title = "外国话转中文话"

	# Markdown description passed to both interfaces (user-facing text plus a
	# diagram of the cascaded speech-to-speech pipeline).
	description = """
	本演示调用了三个自然语言处理的大模型，一个用于将外国话翻译成中文，一个用于判断说的哪个国家的话，一个用于将中文转成语音输出。同时支持语音上传和麦克风输入转换速度比较慢因为租不起GPU的服务器（支出增加200倍），建议您通过已经缓存Examples体验效果。欢迎添加我的微信号：ESGGTP 与我的平行人交流。
	![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	"""

	# One example row per bundled clip: [path to "./<code>.wav", None].
	examples = [
	    [f"./{clip}.wav", None]
	    for clip in ("cs", "de", "fr", "it", "nl", "pl", "ro", "hr", "fi", "sl")
	]
	import gradio as gr

	def _build_interface(source, example_rows=None):
	    """Wrap ``speech_to_speech_translation`` in an Interface for one audio source.

	    Args:
	        source: Value for the input ``gr.Audio`` component's ``source`` argument.
	        example_rows: Optional example rows passed to the Interface.

	    Returns:
	        The configured ``gr.Interface``.
	    """
	    audio_input = gr.Audio(source=source, type="filepath")
	    result_widgets = [
	        gr.Audio(label="Generated Speech", type="numpy"),
	        gr.Text(label="Transcription"),
	        gr.Label(label="Language prediction"),
	    ]
	    return gr.Interface(
	        fn=speech_to_speech_translation,
	        inputs=audio_input,
	        outputs=result_widgets,
	        title=title,
	        description=description,
	        examples=example_rows,
	    )


	demo = gr.Blocks()

	# Two front-ends for the same function: file upload (with the example clips)
	# and live microphone recording (no examples).
	file_transcribe = _build_interface("upload", examples)
	mic_transcribe = _build_interface("microphone")

	# Show both interfaces as tabs inside a single Blocks app.
	with demo:
	    gr.TabbedInterface(
	        [file_transcribe, mic_transcribe],
	        ["Transcribe Audio File", "Transcribe Microphone"],
	    )

	demo.launch(share=True)