"""AI Voice Assistant.

Pipeline: record speech in the browser (Gradio) -> transcribe it with Whisper
-> generate a reply with Meta-Llama-3-8B-Instruct via the Hugging Face
Inference API -> speak the reply back with SpeechT5.
"""

import os
import time

import gradio as gr
import soundfile as sf
import torch
import whisper
from datasets import load_dataset
from llama_index.core import Settings
from llama_index.llms.text_generation_inference import TextGenerationInference
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

HF_API_TOKEN = os.getenv("HF_TOKEN")

# Speech to text: Whisper
whisper_model = whisper.load_model("base")


def transcribe_audio(audio_path):
    # Load the audio and pad/trim it to fit Whisper's 30-second window
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)

    # Make a log-Mel spectrogram and move it to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Decode the audio (English transcription, greedy decoding)
    options = whisper.DecodingOptions(language="en", task="transcribe", temperature=0)
    result = whisper.decode(whisper_model, mel, options)
    return result.text


# Text to speech: SpeechT5. Load the processor, model, and vocoder once at
# startup; reloading them on every request would slow down each response.
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load an xvector containing the speaker's voice characteristics
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


def audio_response(text, output_path="speech.wav"):
    # Tokenize the input text
    inputs = tts_processor(text=text, return_tensors="pt")

    # Generate speech conditioned on the speaker embedding
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )

    # Save the audio to a file; SpeechT5 produces 16 kHz audio
    sf.write(output_path, speech.numpy(), samplerate=16000)
    return output_path


# LLM: Llama 3 served over the Hugging Face Inference API
def messages_to_prompt(messages):
    # Default system message for a chatbot
    default_system_prompt = (
        "You are an AI chatbot designed to assist with user queries "
        "in a friendly and conversational manner."
    )

    prompt = ""
    for message in messages:
        if message.role == "system":
            prompt += f"<|system|>\n{message.content}\n"
        elif message.role == "user":
            prompt += f"<|user|>\n{message.content}\n"
        elif message.role == "assistant":
            prompt += f"<|assistant|>\n{message.content}\n"

    # Ensure we start with a system message; fall back to the default one
    if not prompt.startswith("<|system|>"):
        prompt = f"<|system|>\n{default_system_prompt}\n" + prompt

    # Add a final assistant tag so the model generates the reply
    prompt = prompt + "<|assistant|>\n"
    return prompt


def completion_to_prompt(completion):
    return f"<|system|>\n\n<|user|>\n{completion}\n<|assistant|>\n"


Settings.llm = TextGenerationInference(
    model_url="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct",
    token=HF_API_TOKEN,
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
)


def text_response(text):
    time.sleep(1)  # Brief delay to avoid rate limiting on the shared endpoint
    response = Settings.llm.complete(text)
    return response.text


def transcribe_(audio_path):
    # Full pipeline: speech -> transcript -> LLM reply -> spoken reply
    transcript = transcribe_audio(audio_path)
    reply = text_response(transcript)
    reply_audio = audio_response(reply)
    return transcript, reply, reply_audio


output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="LLM Output")
output_3 = gr.Audio(label="LLM output to audio")

gr.Interface(
    title="AI Voice Assistant",
    fn=transcribe_,
    inputs=[gr.Audio(sources=["microphone"], type="filepath")],
    outputs=[output_1, output_2, output_3],
).launch(share=True)
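# Optional headless sanity check of the pipeline, without the Gradio UI.
# "sample.wav" is a hypothetical file name: this assumes a short English WAV
# clip on disk, HF_TOKEN set in the environment, and ffmpeg installed (Whisper
# needs it to load audio). Run it in place of the .launch() call above:
#
#   transcript, reply, reply_audio = transcribe_("sample.wav")
#   print(transcript)
#   print(reply)
#   print("Spoken reply written to:", reply_audio)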