How to get accuracy of transcription from the model?
I have the same issue. I'd like to have the probability of each chunk. I set generate_kwargs={"language": language, "output_scores": True, "output_logits": True} when calling pipe, but it does not return any probabilities. Please let me know if you found a solution.
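One workaround that seems to work is to skip the pipeline and call model.generate() directly, which does return the per-step scores. A minimal sketch for short-form (under 30 seconds) audio, assuming audio_array is a placeholder for your own 16 kHz mono waveform:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

model_id = "openai/whisper-large-v3"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)

# audio_array is a placeholder for a 16 kHz mono numpy array
inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt")

# keep the per-step scores instead of only the token ids
out = model.generate(
    inputs.input_features,
    return_dict_in_generate=True,
    output_scores=True,
)

# log-probability of each generated token, renormalized over the vocabulary
transition_scores = model.compute_transition_scores(
    out.sequences, out.scores, normalize_logits=True
)

# pair every decoded token with its probability
gen_tokens = out.sequences[0, -transition_scores.shape[1]:]
for token_id, log_prob in zip(gen_tokens, transition_scores[0]):
    print(processor.decode(token_id), float(log_prob.exp()))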
Try a model like this one: https://github.com/thomasmol/cog-whisper-diarization
You can evaluate the model with the word-error rate (WER) metric using the following example. First, install the Python dependencies:
pip install --upgrade pip
pip install --upgrade transformers datasets[audio] evaluate jiwer
Then, run the following code snippet to evaluate the model on the LibriSpeech ASR dataset:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from datasets import load_dataset
from evaluate import load
import torch
from tqdm import tqdm
# define our torch configuration
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "openai/whisper-large-v3"
# load the model + processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, use_safetensors=True, low_cpu_mem_usage=True)
model = model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
# load the dataset with streaming mode
dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True)
# define the evaluation metric
wer_metric = load("wer")
def inference(batch):
# 1. Pre-process the audio data to log-mel spectrogram inputs
audio = [sample["array"] for sample in batch["audio"]]
input_features = processor(audio, sampling_rate=batch["audio"][0]["sampling_rate"], return_tensors="pt").input_features
input_features = input_features.to(device, dtype=torch_dtype)
# 2. Auto-regressively generate the predicted token ids
pred_ids = model.generate(input_features, max_new_tokens=128)
# 3. Decode the token ids to the final transcription
batch["transcription"] = processor.batch_decode(pred_ids, skip_special_tokens=True)
batch["reference"] = batch["text"]
return batch
# batch size 16 inference
dataset = dataset.map(function=inference, batched=True, batch_size=16)
all_transcriptions = []
all_references = []
# iterate over the dataset and run inference
for result in tqdm(dataset, desc="Evaluating..."):
all_transcriptions.append(result["transcription"])
all_references.append(result["reference"])
# normalize predictions and references
all_transcriptions = [processor.tokenizer.normalize(transcription) for transcription in all_transcriptions]
all_references = [processor.tokenizer.normalize(reference) for reference in all_references]
# compute the WER metric
wer = 100 * wer_metric.compute(predictions=all_transcriptions, references=all_references)
print(wer)
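For reference, WER counts the word-level substitutions, deletions, and insertions needed to turn a prediction into its reference, divided by the number of reference words. A quick sanity check of the metric on toy strings (hypothetical example, not from the dataset):
from evaluate import load

wer_metric = load("wer")

# one deletion ("down") out of four reference words -> WER = 0.25
print(wer_metric.compute(predictions=["the cat sat"], references=["the cat sat down"]))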
Thank you for your response, but I'm looking for how confident the model is about the transcription. I probably didn't frame my question properly. I appreciate your help anyway.
Try a model like this one: https://github.com/thomasmol/cog-whisper-diarization
https://replicate.com/thomasmol/whisper-diarization
You have a confidence level for every word, so you can estimate how confident the model is about the transcription. Here is an example segment from its output:
{
  "end": "281.0",
  "text": "We shared it with researchers. Right.",
  "start": "279.76",
  "words": [
    {
      "end": 279.78,
      "word": "We",
      "start": 279.76,
      "probability": 0.92041015625
    },
    {
      "end": 280.02,
      "word": "shared",
      "start": 279.78,
      "probability": 0.99072265625
    },
    {
      "end": 280.14,
      "word": "it",
      "start": 280.02,
      "probability": 0.99951171875
    },
    {
      "end": 280.24,
      "word": "with",
      "start": 280.14,
      "probability": 1
    },
    {
      "end": 280.54,
      "word": "researchers.",
      "start": 280.24,
      "probability": 0.94384765625
    },
    {
      "end": 281,
      "word": "Right.",
      "start": 280.78,
      "probability": 0.73193359375
    }
  ],
  "speaker": "SPEAKER_00",
  "avg_logprob": -0.21625000337759653
}
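If you need a single number per segment rather than per word, one option (a minimal sketch, assuming a segment dict shaped like the JSON above) is to average the word probabilities, or to exponentiate avg_logprob to recover an average per-token probability:
import math

def segment_confidence(segment):
    # arithmetic mean of the per-word probabilities
    word_probs = [w["probability"] for w in segment["words"]]
    mean_word_prob = sum(word_probs) / len(word_probs)
    # geometric-mean-style confidence from the average token log-probability
    token_level = math.exp(segment["avg_logprob"])
    return mean_word_prob, token_level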