import torch
import numpy as np
import soundfile as sf
import gradio as gr
from transformers import pipeline, BarkModel, AutoProcessor

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Whisper large-v2 transcribes the source speech directly into Chinese text.
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)
# MMS-LID-126 identifies which language is being spoken; on a CPU-only host this
# step can be skipped to speed the demo up.
label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
# Bark turns the Chinese text into Mandarin speech.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark").to(device)
synthesised_rate = model.generation_config.sample_rate


def translate(audio_file):
    audio, sampling_rate = sf.read(audio_file)
    if audio.ndim > 1:
        # Downmix stereo uploads to mono for both models.
        audio = audio.mean(axis=1)
    outputs = pipe(
        {"array": audio, "sampling_rate": sampling_rate},
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "chinese"},
    )
    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
    label_outputs = {}
    for pred in language_prediction:
        label_outputs[pred["label"]] = pred["score"]
    return outputs["text"], label_outputs


def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    inputs = processor(text_prompt, voice_preset=voice_preset)
    speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
    return speech_output


def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    translated_text, label_outputs = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)
    # Move to CPU before converting to 16-bit PCM for Gradio.
    synthesised_speech = (synthesised_speech.cpu().numpy() * 32767).astype(np.int16)
    return (synthesised_rate, synthesised_speech.T), translated_text, label_outputs


title = "Foreign Speech to Mandarin"
description = """
As the final project for the [Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse), this demo chains three large models: one translates foreign speech into Chinese, one identifies which language is being spoken, and one turns the Chinese text into Mandarin speech. Both file upload and microphone input are supported. Conversion is slow because renting a GPU server is unaffordable (roughly 20x the cost), so the cached Examples are the best way to try it out. Feel free to add me on WeChat (ESGGTP) to chat with my parallel self.
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

examples = [
    # ["./en.mp3", None],
    # ["./de.mp3", None],
    ["./fr.mp3", None],
    ["./it.mp3", None],
    ["./nl.mp3", None],
    ["./fi.mp3", None],
    # ["./cs.mp3", None],
    # ["./pl.mp3", None],
]

demo = gr.Blocks()

file_transcribe = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
        gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
    examples=examples,
)

mic_transcribe = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
        gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
)

with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["Transcribe Audio File", "Transcribe Microphone"],
    )

demo.launch()
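# --- Optional local smoke test: a minimal sketch for debugging, assuming one of the
# example clips above (here "./fr.mp3", an illustrative path) sits next to this script.
# Comment out demo.launch() and run these lines instead to exercise the cascaded
# pipeline end to end without the Gradio UI:
#
#     (rate, audio_out), text, langs = speech_to_speech_translation("./fr.mp3")
#     sf.write("translated.wav", audio_out, rate)   # save the synthesised Mandarin audio
#     print(text)                                   # Chinese transcription from Whisper
#     print(max(langs, key=langs.get))              # most likely source language from MMS-LID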
BarkModel.from_pretrained("suno/bark") # model = model.to(device) # synthesised_rate = model.generation_config.sample_rate # def translate(audio_file): # # audio, sampling_rate = sf.read(audio_file) # outputs = pipe(audio_file, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"}) # # language_prediction = label({"array": audio, "sampling_rate": sampling_rate}) # # label_outputs = {} # # for pred in language_prediction: # # label_outputs[pred["label"]] = pred["score"] # return outputs["text"]#,label_outputs # def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"): # inputs = processor(text_prompt, voice_preset=voice_preset) # speech_output = model.generate(**inputs.to(device),pad_token_id=10000) # return speech_output # def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"): # #translated_text, label_outputs= translate(audio) # translated_text = translate(audio) # synthesised_speech = synthesise(translated_text,voice_preset) # synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16) # return (synthesised_rate , synthesised_speech.T),translated_text#,label_outputs # title = "外国话转中文话" # description = """ # 作为[Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse) 的结课大作业,本演示调用了三个自然语言处理的大模型,一个用于将外国话翻译成中文,一个用于判断说的哪个国家的话(CPU演示太慢暂时先去掉了),一个用于将中文转成语音输出。演示同时支持语音上传和麦克风输入,转换速度比较慢因为租不起GPU的服务器(支出增加20倍),建议您通过已经缓存Examples体验效果。欢迎添加我的微信号:ESGGTP 与我的平行人交流。 # ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation") # """ # examples = [ # ["./en.mp3", None], # ["./de.mp3", None], # ["./fr.mp3", None], # ["./it.mp3", None], # ["./nl.mp3", None], # ["./fi.mp3", None], # ["./cs.mp3", None], # ["./pl.mp3", None], # ] # import gradio as gr # demo = gr.Blocks() # file_transcribe = gr.Interface( # fn=speech_to_speech_translation, # inputs=gr.Audio(source="upload", type="filepath"), # outputs=[ # gr.Audio(label="Generated Speech", type="numpy"), # gr.Text(label="Transcription"), # # gr.Label(label="Language prediction"), # ], # title=title, # description=description, # examples=examples, # ) # mic_transcribe = gr.Interface( # fn=speech_to_speech_translation, # inputs=gr.Audio(source="microphone", type="filepath"), # outputs=[ # gr.Audio(label="Generated Speech", type="numpy"), # gr.Text(label="Transcription"), # # gr.Label(label="Language prediction"), # ], # title=title, # description=description, # ) # with demo: # gr.TabbedInterface( # [file_transcribe, mic_transcribe], # ["Transcribe Audio File", "Transcribe Microphone"], # ) # demo.launch()