from typing import Any, Dict, List

import pandas as pd
import torch
from datasets import Audio, Dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor
class EndpointHandler:
    """Inference Endpoints handler that transcribes Dutch speech with Whisper."""
    def __init__(self, path: str = ""):
        # Run on the GPU when one is available, otherwise fall back to the CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = WhisperProcessor.from_pretrained("openai/whisper-large")
        self.model = WhisperForConditionalGeneration.from_pretrained(
            "openai/whisper-large"
        ).to(self.device)
        # Pin the decoder prompt so every request is transcribed as Dutch ("nl")
        # rather than whatever language Whisper auto-detects.
        self.model.config.forced_decoder_ids = self.processor.get_decoder_prompt_ids(
            language="nl", task="transcribe"
        )
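        # For reference: get_decoder_prompt_ids returns (position, token id) pairs,
        # roughly [(1, <nl token>), (2, <transcribe token>), (3, <notimestamps token>)]
        # (the exact ids are an assumption and vary with the processor version),
        # which generate() uses to fix the language/task tokens at decoding start.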
    def __call__(self, data: Dict[str, Any]) -> List[str]:
        """
        data args:
            inputs (:obj:`str`): the audio to transcribe, e.g. a wav or mp3 file
        Return:
            A :obj:`list` of transcriptions; will be serialized and returned
        """
        # Log the incoming request for debugging.
        print("request:", data)
        # The request body is expected to carry the audio under "inputs".
        inputs = data.pop("inputs", data)
        # Wrap the single audio reference in a datasets.Dataset so the Audio
        # feature can decode it and resample it to the 16 kHz Whisper expects.
        ds = Dataset.from_pandas(pd.DataFrame([inputs], columns=["audio"]))
        ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
        input_speech = next(iter(ds))["audio"]["array"]
        # Turn the raw waveform into log-mel input features on the model's device.
        input_features = self.processor(
            input_speech, sampling_rate=16_000, return_tensors="pt"
        ).input_features.to(self.device)
        predicted_ids = self.model.generate(
            input_features, forced_decoder_ids=self.model.config.forced_decoder_ids
        )
        # Decode the generated token ids back to text, dropping special tokens.
        transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
        print("transcription:", transcription)
        return transcription
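
# Minimal local smoke test: a sketch only, not part of the Inference Endpoints
# contract. It assumes a decodable file "sample.wav" sits next to this script;
# in production the endpoint runtime imports EndpointHandler and calls it with
# the parsed request body instead.
if __name__ == "__main__":
    handler = EndpointHandler()
    print(handler({"inputs": "sample.wav"}))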