Single word transcription for a audio file with ~1.5m frames
#155
by
KevalRx
- opened
import torchaudio
processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3")
sample_wav = download_asset(audio_file)
print(torchaudio.info(sample_wav))
AudioMetaData(sample_rate=8000, num_frames=1564224, num_channels=1, bits_per_sample=8, encoding=PCM_U)
# Resample to match Whisper sampling rate
target_sample_rate = processor.feature_extractor.sampling_rate
print(f'Whisper sampling rate: {target_sample_rate}')
if orig_sample_rate != target_sample_rate:
transform = torchaudio.transforms.Resample(orig_freq=orig_sample_rate, new_freq=target_sample_rate)
waveform = transform(waveform)
print(f"waveform original shape 2D: {waveform.shape}")
waveform1 = waveform.squeeze(0)
print(f"waveform modified shape 1D: {waveform1.shape}")
waveform original shape 2D: torch.Size([1, 3128448])
waveform modified shape 1D: torch.Size([3128448])
# Load and preprocess the audio data
inputs = processor(waveform1, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt").input_features.to(device)
# Generate transcription
predicted_ids = model.generate(inputs, language="en")
# Decode the transcription
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
# Print the transcription
print(transcription)
[' Thank you.']
The audio is 1.5m frames. How come I only get transcription of a single-word? Does it not auto-regressively decode each input id?