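"""Gradio demo: transcribe or translate audio with a user-selected Whisper checkpoint using 🤗 Transformers."""
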
import time

import gradio as gr
import spaces
import torch
from transformers import pipeline
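
# Default checkpoint and the batch size used for chunked long-form decoding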
DEFAULT_MODEL_NAME = "distil-whisper/distil-large-v3"
BATCH_SIZE = 8
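# transformers pipelines accept an integer CUDA device index or the string "cpu"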
device = 0 if torch.cuda.is_available() else "cpu"
def load_pipeline(model_name):
    return pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        device=device,
    )
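
# Load the default checkpoint once at startup so the first request does not pay model-load latency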
pipe = load_pipeline(DEFAULT_MODEL_NAME)
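
# On Hugging Face Spaces, the @spaces.GPU decorator requests ZeroGPU hardware for the duration of each call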
@spaces.GPU
def transcribe(inputs, task, model_name):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    global pipe
    # Rebuild the pipeline only when the requested checkpoint differs from the loaded one
    if model_name != pipe.model.name_or_path:
        pipe = load_pipeline(model_name)
    start_time = time.time()  # Record the start time
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    end_time = time.time()  # Record the end time
    transcription_time = end_time - start_time  # Calculate the transcription time
    # Create the transcription time output with additional information
    transcription_time_output = (
        f"Transcription Time: {transcription_time:.2f} seconds\n"
        f"Model Used: {model_name}\n"
        f"Device Used: {'GPU' if torch.cuda.is_available() else 'CPU'}"
    )
    return text, transcription_time_output
demo = gr.Blocks()
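
# Two near-identical interfaces, one per input style, are mounted as tabs below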
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        gr.Textbox(
            label="Model Name",
            value=DEFAULT_MODEL_NAME,
            placeholder="Enter the model name",
            # CTranslate2 checkpoints (e.g. Systran/faster-whisper-*) cannot be loaded by
            # transformers.pipeline, so only Transformers-format checkpoints are suggested
            info="Some available models: distil-whisper/distil-large-v3, distil-whisper/distil-medium.en, openai/whisper-tiny, openai/whisper-base, openai/whisper-medium, openai/whisper-large-v3",
        ),
    ],
    outputs=[gr.TextArea(label="Transcription"), gr.TextArea(label="Transcription Info")],
    theme="huggingface",
    title="Whisper Transcription",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the specified OpenAI Whisper"
        " checkpoint and 🤗 Transformers to transcribe audio files of arbitrary length."
    ),
    allow_flagging="never",
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload"], type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        gr.Textbox(
            label="Model Name",
            value=DEFAULT_MODEL_NAME,
            placeholder="Enter the model name",
            info="Some available models: openai/whisper-tiny, openai/whisper-base, openai/whisper-medium, openai/whisper-large-v2",
        ),
    ],
    outputs=[gr.TextArea(label="Transcription"), gr.TextArea(label="Transcription Info")],
    theme="huggingface",
    title="Whisper Transcription",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the specified OpenAI Whisper"
        " checkpoint and 🤗 Transformers to transcribe audio files of arbitrary length."
    ),
    allow_flagging="never",
)
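
# Mount both interfaces as tabs inside the Blocks container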
with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"])

demo.launch(share=True)