"""Voice chat with an LLM: Whisper (speech-to-text) -> Mistral (LLM) -> XTTS (text-to-speech).

+----------------------+     +-------------------------+     +-------------------------------+     +-------------------------+
|   Step 1: Set Up     |     |  Step 2: Set Up Gradio  |     |  Step 3: Speech-to-Text       |     |  Step 4: Text-to-Speech |
|   Environment        |     |  Interface              |     |  & Language Model Processing  |     |  Output                 |
+----------------------+     +-------------------------+     +-------------------------------+     +-------------------------+
|                      |     |                         |     |                               |     |                         |
| - Import Python      |     | - Define interface      |     | - Transcribe audio            |     | - XTTS model generates  |
|   libraries          |     |   components            |     |   to text using               |     |   spoken response from  |
| - Initialize models: |---->| - Configure audio and   |---->|   Faster Whisper ASR          |---->|   LLM's text response   |
|   Whisper, Mistral,  |     |   text interaction      |     | - Transcribed text            |     |                         |
|   XTTS               |     | - Launch interface      |     |   is added to                 |     |                         |
|                      |     |                         |     |   chatbot's history           |     |                         |
|                      |     |                         |     | - Mistral LLM                 |     |                         |
|                      |     |                         |     |   processes chatbot           |     |                         |
|                      |     |                         |     |   history to generate         |     |                         |
|                      |     |                         |     |   response                    |     |                         |
+----------------------+     +-------------------------+     +-------------------------------+     +-------------------------+
"""

###### Set Up Environment ######

import os

# Set CUDA environment variable and install llama-cpp-python.
# llama-cpp-python is a python binding for the llama.cpp library, which enables
# LLM inference in pure C/C++.
# NOTE: these commands run at import time and must stay *before* the
# `from llama_cpp import Llama` import below, which needs the freshly built package.
os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
os.system('python -m unidic download')
os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11 --verbose')

# Third-party library imports
from faster_whisper import WhisperModel
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from TTS.api import TTS
from TTS.utils.manage import ModelManager
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir

# Local imports
from utils import get_sentence, wave_header_chunk, generate_speech_for_sentence

# Load Whisper ASR model
print("Loading Whisper ASR")
whisper_model = WhisperModel("large-v3", device="cpu", compute_type="float32")

# Load Mistral LLM
print("Loading Mistral LLM")
hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    local_dir=".",
    filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf",
)
mistral_model_path = "./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
# NOTE(review): `max_new_tokens` and `context_window` do not look like
# llama-cpp-python constructor options (`n_ctx` is the context size) — confirm
# whether they have any effect before relying on them.
mistral_llm = Llama(
    model_path=mistral_model_path,
    n_gpu_layers=35,
    max_new_tokens=256,
    context_window=4096,
    n_ctx=4096,
    n_batch=128,
    verbose=False,
)

# Load XTTS Model
print("Loading XTTS model")
# xtts_v2 is used because xtts_v1 raised a KeyError; v1 may still be selectable
# with an older release of the TTS package.
os.environ["COQUI_TOS_AGREED"] = "1"

# The model is placed on the CPU here; the `device` string is what gets passed
# to `xtts_model.to(...)` below.
device = "cpu"
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"

print("⏳Downloading model")
ModelManager().download_model(model_name)
model_path = os.path.join(
    get_user_data_dir("tts"), model_name.replace("/", "--")
)

config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
xtts_model = Xtts.init_from_config(config)
xtts_model.load_checkpoint(config, checkpoint_dir=model_path, eval=True)
xtts_model.to(device)

print("Loaded XTTS model")

###### Set up Gradio Interface ######

with gr.Blocks(title="Voice chat with LLM") as demo:
    DESCRIPTION = """# Voice chat with LLM"""
    gr.Markdown(DESCRIPTION)

    # Define chatbot component
    chatbot = gr.Chatbot(
        value=[(None, "Hi friend, I'm Amy, an AI coach. How can I help you today?")],  # Initial greeting from the chatbot
        elem_id="chatbot",
        avatar_images=("examples/hf-logo.png", "examples/ai-chat-logo.png"),
        bubble_full_width=False,
    )

    # Define chatbot voice component
    VOICES = ["female", "male"]
    with gr.Row():
        chatbot_voice = gr.Dropdown(
            label="Voice of the Chatbot",
            info="How should Chatbot talk like",
            choices=VOICES,
            max_choices=1,
            value=VOICES[0],
        )

    # Define text and audio record input components
    with gr.Row():
        txt_box = gr.Textbox(
            scale=3,
            show_label=False,
            placeholder="Enter text and press enter, or speak to your microphone",
            container=False,
            interactive=True,
        )
        audio_record = gr.Audio(sources=["microphone"], type="filepath", scale=4)

    # Define generated audio playback component
    with gr.Row():
        sentence = gr.Textbox(visible=False)
        audio_playback = gr.Audio(
            value=None,
            label="Generated audio response",
            streaming=True,
            autoplay=True,
            interactive=False,
            show_label=True,
        )

    def add_text(chatbot_history, text):
        """Append a typed user message to the chat history.

        Triggered on text submit; the result is then fed to ``generate_speech``.

        Args:
            chatbot_history: Current list of ``(user, bot)`` message pairs, or None.
            text: Text the user submitted in the textbox.

        Returns:
            The extended history and an update that clears and disables the textbox.
        """
        chatbot_history = [] if chatbot_history is None else chatbot_history
        chatbot_history = chatbot_history + [(text, None)]
        return chatbot_history, gr.update(value="", interactive=False)

    def add_audio(chatbot_history, audio):
        """Transcribe a recorded utterance and append it to the chat history.

        Triggered when recording stops; the result is then fed to
        ``generate_speech``.

        Args:
            chatbot_history: Current list of ``(user, bot)`` message pairs, or None.
            audio: Path of the recorded audio file (the recorder uses
                ``type="filepath"``), or None when nothing was recorded.

        Returns:
            The extended history and an update that clears and disables the textbox.
        """
        chatbot_history = [] if chatbot_history is None else chatbot_history

        if audio is None:
            # Nothing was recorded: skip transcription instead of crashing.
            text = ""
        else:
            # Join *every* transcribed segment, not just the first one, so
            # longer utterances are not truncated; an utterance with no
            # segments yields an empty string instead of raising IndexError.
            # Each segment is stripped to delete begin and end spaces.
            segments, _ = whisper_model.transcribe(audio)
            text = " ".join(segment.text.strip() for segment in segments).strip()
        print("Transcribed text:", text)

        chatbot_history = chatbot_history + [(text, None)]
        return chatbot_history, gr.update(value="", interactive=False)

    def generate_speech(chatbot_history, chatbot_voice, initial_greeting=False):
        """Stream spoken audio for the chatbot's reply, sentence by sentence.

        Args:
            chatbot_history: List of ``(user, bot)`` message pairs.
            chatbot_voice: Voice selected in the dropdown.
            initial_greeting: When True, only voice the bot messages already in
                the history instead of querying the LLM.

        Yields:
            ``(sentence, chatbot_history, audio)`` tuples matching the
            ``[sentence, chatbot, audio_playback]`` outputs.
        """
        # Start by yielding an initial empty audio to set up autoplay
        yield ("", chatbot_history, wave_header_chunk())

        # Helper function to handle the speech generation and yielding process
        def handle_speech_generation(spoken_text, history, voice):
            # Skip empty (or missing) sentences: there is nothing to synthesize.
            if not spoken_text:
                return
            print("Processing sentence")
            # generate speech by cloning a voice using default setting
            generated_speech = generate_speech_for_sentence(
                history, voice, spoken_text, xtts_model, None, return_as_byte=True
            )
            if generated_speech is not None:
                yield (spoken_text, history, generated_speech)

        if initial_greeting:
            # Process only the initial greeting if specified
            for _, bot_message in chatbot_history:
                yield from handle_speech_generation(bot_message, chatbot_history, chatbot_voice)
        else:
            # Continuously get and process sentences from a generator function
            for llm_sentence, chatbot_history in get_sentence(chatbot_history, mistral_llm):
                print("Inserting sentence to queue")
                yield from handle_speech_generation(llm_sentence, chatbot_history, chatbot_voice)

    # Text flow: record the message, stream the spoken answer, then re-enable the textbox.
    txt_msg = txt_box.submit(
        fn=add_text,
        inputs=[chatbot, txt_box],
        outputs=[chatbot, txt_box],
        queue=False,
    ).then(
        fn=generate_speech,
        inputs=[chatbot, chatbot_voice],
        outputs=[sentence, chatbot, audio_playback],
    )

    txt_msg.then(fn=lambda: gr.update(interactive=True), inputs=None, outputs=[txt_box], queue=False)

    # Voice flow: transcribe the recording, stream the spoken answer, then
    # re-enable the textbox and reset the recorder.
    audio_msg = audio_record.stop_recording(
        fn=add_audio,
        inputs=[chatbot, audio_record],
        outputs=[chatbot, txt_box],
        queue=False,
    ).then(
        fn=generate_speech,
        inputs=[chatbot, chatbot_voice],
        outputs=[sentence, chatbot, audio_playback],
    )

    audio_msg.then(
        fn=lambda: (gr.update(interactive=True), gr.update(interactive=True, value=None)),
        inputs=None,
        outputs=[txt_box, audio_record],
        queue=False,
    )

    FOOTNOTE = """
            This Space demonstrates how to speak to an llm chatbot, based solely on open accessible models.
            It relies on the following models :
            - Speech to Text Model: [Faster-Whisper-large-v3](https://huggingface.co/Systran/faster-whisper-large-v3) an ASR model, to transcribe recorded audio to text.
            - Large Language Model: [Mistral-7b-instruct-v0.1-quantized](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF) a LLM to generate the chatbot responses.
            - Text to Speech Model: [XTTS-v2](https://huggingface.co/spaces/coqui/xtts) a TTS model, to generate the voice of the chatbot.

            Note:
            - Responses generated by chat model should not be assumed correct or taken serious, as this is a demonstration example only
            - iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
    gr.Markdown(FOOTNOTE)

    # Speak the initial greeting as soon as the page loads.
    demo.load(
        fn=generate_speech,
        inputs=[chatbot, chatbot_voice, gr.State(value=True)],
        outputs=[sentence, chatbot, audio_playback],
    )

demo.queue().launch(debug=True, share=True)