pratham0011 committed
Commit 245387f
1 Parent(s): 68f8169

Upload app__.py

Files changed (1): app__.py (+111 −0)
app__.py ADDED
@@ -0,0 +1,111 @@
+ import os
+ import time
+
+ import gradio as gr
+ import soundfile as sf
+ import torch
+ import whisper
+ from datasets import load_dataset
+ from llama_index.core import Settings
+ from llama_index.llms.text_generation_inference import TextGenerationInference
+ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
+
+ # Whisper model for speech-to-text, loaded once at startup
+ model = whisper.load_model("base")
+
+ # Hugging Face API token for the hosted inference endpoint
+ HF_API_TOKEN = os.getenv("HF_TOKEN")
+
+ def translate_audio(audio):
+     # Load the audio and pad/trim it to fit 30 seconds
+     audio = whisper.load_audio(audio)
+     audio = whisper.pad_or_trim(audio)
+
+     # Make a log-Mel spectrogram and move it to the same device as the model
+     mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+     # Decode the audio with greedy (temperature=0) English transcription
+     options = whisper.DecodingOptions(language="en", task="transcribe", temperature=0)
+     result = whisper.decode(model, mel, options)
+     return result.text
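+
+ # Gradio hands translate_audio() a file path (the microphone input below uses
+ # type="filepath"); whisper.load_audio() reads it with ffmpeg and resamples
+ # to the 16 kHz mono format Whisper expects.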
+
+ def audio_response(text, output_path="speech.wav"):
+     # Load the processor, model, and vocoder
+     # (tts_model, so the global Whisper "model" is not shadowed)
+     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+     tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+     # Tokenize the input text
+     inputs = processor(text=text, return_tensors="pt")
+
+     # Load an x-vector containing a reference speaker's voice characteristics
+     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+     speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+     # Generate speech
+     with torch.no_grad():
+         speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+
+     # Save the audio to a file (SpeechT5 generates 16 kHz audio)
+     sf.write(output_path, speech.numpy(), samplerate=16000)
+
+     return output_path
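+
+ # Note: the SpeechT5 processor, model, and vocoder above are re-created on
+ # every call; from_pretrained() caches the weights on disk, so only object
+ # construction repeats, but hoisting them to module level (like the Whisper
+ # model) would reduce per-request latency.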
+
+ def messages_to_prompt(messages):
+     # Default system message for the chatbot
+     default_system_prompt = "You are an AI chatbot designed to assist with user queries in a friendly and conversational manner."
+
+     prompt = default_system_prompt + "\n"
+
+     for message in messages:
+         if message.role == 'system':
+             prompt += f"<|system|>\n{message.content}</s>\n"
+         elif message.role == 'user':
+             prompt += f"<|user|>\n{message.content}</s>\n"
+         elif message.role == 'assistant':
+             prompt += f"<|assistant|>\n{message.content}</s>\n"
+
+     # Ensure we start with a system prompt, insert a blank one if needed
+     if not prompt.startswith("<|system|>"):
+         prompt = "<|system|>\n</s>\n" + prompt
+
+     # Add the final assistant prompt
+     prompt = prompt + "<|assistant|>\n"
+
+     return prompt
+
+ def completion_to_prompt(completion):
+     return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"
+
+ # Route all llama-index completions to Meta-Llama-3-8B-Instruct on the
+ # Hugging Face hosted text-generation-inference (TGI) endpoint
+ Settings.llm = TextGenerationInference(
+     model_url="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct",
+     token=HF_API_TOKEN,
+     messages_to_prompt=messages_to_prompt,
+     completion_to_prompt=completion_to_prompt,
+ )
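+
+ # For illustration, with a hypothetical input "What is RAG?",
+ # completion_to_prompt() above produces the raw string
+ #   "<|system|>\n</s>\n<|user|>\nWhat is RAG?</s>\n<|assistant|>\n"
+ # which is a Zephyr-style chat template sent to the endpoint as plain text
+ # (Llama 3 Instruct's own chat template uses different special tokens).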
+
+ def text_response(t):
+     time.sleep(1)  # Brief pause before calling the LLM; adjust the delay as needed
+     response = Settings.llm.complete(t)
+     return response.text
+
+ def transcribe_(a):
+     # Full pipeline: recorded speech -> transcript -> LLM reply -> spoken reply
+     t1 = translate_audio(a)
+     t2 = text_response(t1)
+     t3 = audio_response(t2)
+     return (t1, t2, t3)
+
+ output_1 = gr.Textbox(label="Speech to Text")
+ output_2 = gr.Textbox(label="LLM Output")
+ output_3 = gr.Audio(label="LLM Output as Audio")
+
+ gr.Interface(
+     title="AI Voice Assistant",
+     fn=transcribe_,
+     inputs=[
+         gr.Audio(sources=["microphone"], type="filepath"),
+     ],
+     outputs=[
+         output_1, output_2, output_3
+     ],
+ ).launch(share=True)
+
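For reference, a minimal sketch of exercising the same pipeline without the Gradio UI, assuming it is appended to app__.py in place of the gr.Interface block; "sample.wav" is a hypothetical input recording:

    transcript = translate_audio("sample.wav")   # Whisper: speech to English text
    reply = text_response(transcript)            # Llama 3 via TGI: transcript to reply
    audio_path = audio_response(reply)           # SpeechT5: reply to speech.wav
    print("Transcript:", transcript)
    print("Reply:", reply)
    print("Spoken reply saved to:", audio_path)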