kanyekuthi committed on
Commit 20960bc
1 Parent(s): bb3d9e1

Update app.py

Files changed (1)
  1. app.py +336 -27
app.py CHANGED
@@ -1,37 +1,346 @@
- import os
- # os.system("pip install git+https://github.com/openai/whisper.git")
  # import gradio as gr
- # import whisper
- # from huggingface_hub import from_pretrained_keras
- # from transformers import AutoTokenizer, AutoModelForSequenceClassification
- # from transformers import pipeline
- # from sklearn.preprocessing import StandardScaler
- # import logging
- # import librosa
- # import numpy as np
- # import pickle

  import gradio as gr

- # gr.Interface.load("models/kanyekuthi/AfriSpeech-whisper-tiny").launch()
- # gr.Interface.load("models/kanyekuthi/AfriSpeech-whisper-tiny")
- # gr.launch()

- distil_transcription = gr.components.Textbox(label="Distil-Whisper Transcription", show_copy_button=True)
- transcription = gr.components.Textbox(label="Whisper Transcription", show_copy_button=True)

- demo = gr.Interface(
-     # main_note,
-     gr.Audio(sources=["microphone"]),
-     # gr.Label(num_top_classes=4),
-     # examples=[
-     #     [os.path.join(os.path.dirname(__file__),"audio/recording1.wav")],
-     #     [os.path.join(os.path.dirname(__file__),"audio/cantina.wav")],
-     # ],
-     outputs=[distil_transcription, transcription]
  )

- demo.load("models/kanyekuthi/AfriSpeech-whisper-tiny")

  if __name__ == "__main__":
-     demo.launch()
+ # import os
+ # # os.system("pip install git+https://github.com/openai/whisper.git")
+ # # import gradio as gr
+ # # import whisper
+ # # from huggingface_hub import from_pretrained_keras
+ # # from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ # # from transformers import pipeline
+ # # from sklearn.preprocessing import StandardScaler
+ # # import logging
+ # # import librosa
+ # # import numpy as np
+ # # import pickle
+
  # import gradio as gr

+ # # gr.Interface.load("models/kanyekuthi/AfriSpeech-whisper-tiny").launch()
+ # # gr.Interface.load("models/kanyekuthi/AfriSpeech-whisper-tiny")
+ # # gr.launch()
+
+ # distil_transcription = gr.components.Textbox(label="Distil-Whisper Transcription", show_copy_button=True)
+ # transcription = gr.components.Textbox(label="Whisper Transcription", show_copy_button=True)
+
+ # demo = gr.Interface(
+ #     # main_note,
+ #     gr.Audio(sources=["microphone"]),
+ #     # gr.Label(num_top_classes=4),
+ #     # examples=[
+ #     #     [os.path.join(os.path.dirname(__file__),"audio/recording1.wav")],
+ #     #     [os.path.join(os.path.dirname(__file__),"audio/cantina.wav")],
+ #     # ],
+ #     outputs=[distil_transcription, transcription]
+ # )
+
+ # demo.load("models/kanyekuthi/AfriSpeech-whisper-tiny")
+
+ # if __name__ == "__main__":
+ #     demo.launch()
+
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+ from transformers.utils import is_flash_attn_2_available
+ from transformers.pipelines.audio_utils import ffmpeg_read
+ import torch
+ import gradio as gr
+ import time
+
+ BATCH_SIZE = 16
+ MAX_AUDIO_MINS = 30  # maximum audio input in minutes
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ use_flash_attention_2 = is_flash_attn_2_available()
+
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     "openai/whisper-small", torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, use_flash_attention_2=use_flash_attention_2
+ )
+ distilled_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     "distil-whisper/distil-small", torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, use_flash_attention_2=use_flash_attention_2
+ )
+
+ if not use_flash_attention_2:
+     # use flash attention from pytorch sdpa
+     model = model.to_bettertransformer()
+     distilled_model = distilled_model.to_bettertransformer()
+
+ processor = AutoProcessor.from_pretrained("openai/whisper-small")
+
+ model.to(device)
+ distilled_model.to(device)
+
+ pipe = pipeline(
+     "automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     max_new_tokens=128,
+     chunk_length_s=30,
+     torch_dtype=torch_dtype,
+     device=device,
+     generate_kwargs={"language": "en", "task": "transcribe"},
+     return_timestamps=True
+ )
+ pipe_forward = pipe._forward
+
+ distil_pipe = pipeline(
+     "automatic-speech-recognition",
+     model=distilled_model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     max_new_tokens=128,
+     chunk_length_s=15,
+     torch_dtype=torch_dtype,
+     device=device,
+     generate_kwargs={"language": "en", "task": "transcribe"},
+ )
+ distil_pipe_forward = distil_pipe._forward
+
+ def transcribe(inputs):
+     if inputs is None:
+         raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")
+
+     with open(inputs, "rb") as f:
+         inputs = f.read()
+
+     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
+     audio_length_mins = len(inputs) / pipe.feature_extractor.sampling_rate / 60
+
+     if audio_length_mins > MAX_AUDIO_MINS:
+         raise gr.Error(
+             f"To ensure fair usage of the Space, the maximum audio length permitted is {MAX_AUDIO_MINS} minutes."
+             f"Got an audio of length {round(audio_length_mins, 3)} minutes."
+         )
+
+     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+
+     def _forward_distil_time(*args, **kwargs):
+         global distil_runtime
+         start_time = time.time()
+         result = distil_pipe_forward(*args, **kwargs)
+         distil_runtime = time.time() - start_time
+         distil_runtime = round(distil_runtime, 2)
+         return result
+
+     distil_pipe._forward = _forward_distil_time
+     distil_text = distil_pipe(inputs.copy(), batch_size=BATCH_SIZE)["text"]
+     yield distil_text, distil_runtime, None, None, None
+
+     def _forward_time(*args, **kwargs):
+         global runtime
+         start_time = time.time()
+         result = pipe_forward(*args, **kwargs)
+         runtime = time.time() - start_time
+         runtime = round(runtime, 2)
+         return result
+
+     pipe._forward = _forward_time
+     text = pipe(inputs, batch_size=BATCH_SIZE)["text"]
+
+     yield distil_text, distil_runtime, text, runtime
+
+ if __name__ == "__main__":
+     with gr.Blocks() as demo:
+         gr.HTML(
+             """
+             <div style="text-align: center; max-width: 700px; margin: 0 auto;">
+               <div
+                 style="
+                   display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
+                 "
+               >
+                 <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
+                   Whisper vs Distil-Whisper: Speed Comparison
+                 </h1>
+               </div>
+             </div>
+             """
+         )
+         gr.HTML(
+             f"""
+             <p><a href="https://huggingface.co/distil-whisper/distil-small"> Distil-Whisper</a> is a distilled variant
+             of the <a href="https://huggingface.co/openai/whisper-small"> Whisper</a> model by OpenAI. Compared to Whisper,
+             Distil-Whisper runs 6x faster with 50% fewer parameters, while performing to within 1% word error rate (WER) on
+             out-of-distribution evaluation data.</p>
+
+             <p>In this demo, we perform a speed comparison between Whisper and Distil-Whisper in order to test this claim.
+             Both models use the <a href="https://huggingface.co/distil-whisper/distil-small#long-form-transcription"> chunked long-form transcription algorithm</a>
+             in 🤗 Transformers, as well as Flash Attention. To use Distil-Whisper yourself, check the code examples on the
+             <a href="https://github.com/huggingface/distil-whisper#1-usage"> Distil-Whisper repository</a>. To ensure fair
+             usage of the Space, we ask that audio file inputs are kept to < 30 mins.</p>
+             """
+         )
+         audio = gr.components.Audio(type="filepath", label="Audio input")
+         button = gr.Button("Transcribe")
+         with gr.Row():
+             distil_runtime = gr.components.Textbox(label="Distil-Whisper Transcription Time (s)")
+             runtime = gr.components.Textbox(label="Whisper Transcription Time (s)")
+         with gr.Row():
+             distil_transcription = gr.components.Textbox(label="Distil-Whisper Transcription", show_copy_button=True)
+             transcription = gr.components.Textbox(label="Whisper Transcription", show_copy_button=True)
+         button.click(
+             fn=transcribe,
+             inputs=audio,
+             outputs=[distil_transcription, distil_runtime, transcription, runtime],
+         )
+         gr.Markdown("## Examples")
+         gr.Examples(
+             [["./assets/example_1.wav"], ["./assets/example_2.wav"]],
+             audio,
+             outputs=[distil_transcription, distil_runtime, transcription, runtime],
+             fn=transcribe,
+             cache_examples=False,
+         )
+     demo.queue(max_size=10).launch()
(New file lines 193–346 of app.py repeat the block above — the same imports, pipeline setup, transcribe function, and Gradio demo — verbatim a second time.)
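For reference, here is a minimal standalone sketch (not part of the commit) of the chunked long-form transcription setup that the new app.py builds on, using the 🤗 Transformers ASR pipeline it imports. The model ID and generation settings mirror the Whisper baseline in the diff above; "sample.wav" is a placeholder audio path.

```python
# Minimal sketch, assuming transformers and torch are installed.
# "sample.wav" is a placeholder; swap in any audio file readable by ffmpeg.
import torch
from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",   # baseline checkpoint used in the diff above
    torch_dtype=torch_dtype,
    device=device,
    max_new_tokens=128,
    chunk_length_s=30,              # enables chunked long-form transcription
    generate_kwargs={"language": "en", "task": "transcribe"},
)

result = asr("sample.wav", batch_size=16)
print(result["text"])
```

In the Space itself, each pipeline's _forward method is temporarily wrapped with a timing function, so the transcribe generator can yield the Distil-Whisper transcription and runtime first and then the Whisper transcription and runtime once both passes have finished.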