Is it possible to set the temperature and a prompt?
Hi,
is there a way to set the temperature and a prompt?
Something like this:
"temperature": "0.0",
"prompt": "Hello, welcome to my lecture. Today, we will discuss various topics. Let's begin."
I just want to get very precise transcriptions and also make the model always respond with punctuation.
Any suggestions?
Thanks.
Hey @jeffuli755, you can achieve this with the following:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "distil-whisper/distil-large-v3"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)
# load a dummy audio sample from the Hub
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = dataset[0]["audio"]
# add a prompt
prompt = "Hello, welcome to my lecture. Today, we will discuss various topics. Let's begin."
prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(device)
result = pipe(sample.copy(), generate_kwargs={"prompt_ids": prompt_ids})
# change the temperature and enable sampling
result = pipe(sample.copy(), generate_kwargs={"prompt_ids": prompt_ids, "do_sample": True, "temperature": 1.0})
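In both cases the pipeline returns a dict with the transcription under the "text" key, so the output can be printed directly:
# the pipeline output is a dict; the transcription is under "text"
print(result["text"])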
Hi @sanchit-gandhi,
I am currently attempting to use a prompt to correct surnames, but this approach does not seem to work with the distil-large-v3 model. Interestingly, the same code works seamlessly with the openai/whisper-large-v3 model.
Are there any specific parameters or configurations that differ between the two models and could explain this behaviour?
Thank you for your assistance.
My code:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import soundfile as sf
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# model_id = "openai/whisper-large-v3"
model_id = "distil-whisper/distil-large-v3"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)
# Load a local audio file (replace with your actual file path)
audio_path = "tests/audio.wav"
# Read the audio file
audio_input, sample_rate = sf.read(audio_path)
# Prepare the input format expected by the pipeline
sample = {"array": audio_input, "sampling_rate": sample_rate}
# add a prompt
prompt = "Replace any incorrect words in the transcription with the correct ones using the following vocabulary list: Surname1, Surname2"
prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(device)
# Run the pipeline with prompt
result = pipe(sample.copy(), generate_kwargs={"prompt_ids": prompt_ids})
print(result)
# Change the temperature and enable sampling
result = pipe(sample.copy(), generate_kwargs={"prompt_ids": prompt_ids, "do_sample": True, "temperature": 1.0})
# Print or process the result
print(result)
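One way to start narrowing this down might be to diff the generation configs of the two checkpoints and see which defaults differ; a minimal diagnostic sketch (an assumption about where the difference lies, not a confirmed explanation), assuming only that both configs are published on the Hub:
# Diagnostic sketch: compare generation-config defaults between the two checkpoints.
from transformers import GenerationConfig

cfg_openai = GenerationConfig.from_pretrained("openai/whisper-large-v3").to_dict()
cfg_distil = GenerationConfig.from_pretrained("distil-whisper/distil-large-v3").to_dict()

# print every key whose value differs between the two configs
for key in sorted(set(cfg_openai) | set(cfg_distil)):
    if cfg_openai.get(key) != cfg_distil.get(key):
        print(f"{key}: openai={cfg_openai.get(key)!r} distil={cfg_distil.get(key)!r}")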