keess
/

whisper-model-internal

Automatic Speech Recognition

hf-asr-leaderboard

Inference Endpoints

Model card Files Files and versions Community

whisper-model-internal / handler.py

keess's picture

- add custom endpoint handler

e5983bf almost 2 years ago

history blame contribute delete

2.63 kB

	from typing import Dict, List, Any

	import torch as torch
	from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration

	import gradio as gr
	import subprocess
	import numpy as np
	import time

	import pandas as pd

	from datasets import Audio, Dataset



	class EndpointHandler():

	# model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
	# model='silero_vad', force_reload=False, onnx=True)

	# (get_speech_timestamps,
	# _, read_audio,
	# *_) = utils



	def __init__(self, path=""):
	device = 0 if torch.cuda.is_available() else "cpu"
	# self.pipe = pipeline(
	# task="automatic-speech-recognition",
	# model="openai/whisper-large",
	# # chunk_length_s=30,
	# device=device,
	# )
	self.processor = WhisperProcessor.from_pretrained("openai/whisper-large")
	self.model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
	self.model.config.forced_decoder_ids = self.processor.get_decoder_prompt_ids(language="nl", task="transcribe")
	# self.pipe.model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="nl", task="transcribe")

	def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
	"""
	data args:
	inputs (:obj: `str`)
	date (:obj: `str`)
	Return:
	A :obj:`list` \| `dict`: will be serialized and returned
	"""
	#print request
	print("request")
	print(data)
	print(data["inputs"])
	# audio_data = read(io.BytesIO(data))
	# get inputs, inputs in request body is possible equal to wav or mp3 file
	inputs = data.pop("inputs", data)
	print("here comes text")
	print(inputs)
	data = [inputs]
	ds = pd.DataFrame(data, columns=['audio'])
	ds = Dataset.from_pandas(ds)
	# load dummy dataset and read soundfiles
	ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
	input_speech = next(iter(ds))["audio"]["array"]
	input_features = self.processor(input_speech, return_tensors="pt").input_features
	predicted_ids = self.model.generate(input_features, forced_decoder_ids=self.model.config.forced_decoder_ids)
	transcription = self.processor.batch_decode(predicted_ids)
	print("this is the description")
	print(transcription)
	# print(self.pipe(inputs))
	# text = self.pipe(inputs)["text"]
	# text = self.transcribe(inputs)
	# print(text)
	return transcription