thealphamerc committed
Commit 003983b
1 Parent(s): 2d6bfef

Added examples

Files changed (5)
  1. .gitattributes +1 -0
  2. .gitignore +3 -1
  3. app.py +53 -67
  4. input/example-1.wav +3 -0
  5. input/example-2.wav +3 -0
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.wav filter=lfs diff=lfs merge=lfs -text
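
This attribute routes every .wav file through Git LFS, so the repository stores small text pointers (the two input/*.wav entries at the bottom of this commit) rather than raw audio bytes. A minimal sketch, not part of this commit, for checking whether a checked-out file is still an LFS pointer:

import os

# Not part of this commit: a file tracked by Git LFS but not yet smudged is
# a short text pointer beginning with the LFS spec line; real audio is binary.
def is_lfs_pointer(path: str) -> bool:
    if os.path.getsize(path) > 1024:  # pointers are tiny
        return False
    with open(path, "rb") as f:
        return f.read(100).startswith(b"version https://git-lfs.github.com/spec/v1")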
.gitignore CHANGED
@@ -1 +1,3 @@
- output/
+ *.srt
+ *.mp4
+ *.raw
app.py CHANGED
@@ -1,13 +1,13 @@
  import os
- import logging
  os.system("pip install git+https://github.com/openai/whisper.git")
- import gradio as gr
- from subprocess import call
- import whisper
- from datetime import timedelta
- from pytube import YouTube
- import pandas as pd
  import pysrt
+ import pandas as pd
+ from pytube import YouTube
+ from datetime import timedelta
+ import whisper
+ from subprocess import call
+ import gradio as gr
+ import logging
  # from transformers.pipelines.audio_utils import ffmpeg_read


@@ -21,11 +21,7 @@ ch.setFormatter(formatter)
  logger.addHandler(ch)


- BATCH_SIZE = 16
- CHUNK_LENGTH_S = 30
- NUM_PROC = 8
  FILE_LIMIT_MB = 1000
- YT_ATTEMPT_LIMIT = 3


  def run_cmd(command):
@@ -44,7 +40,6 @@ def inference(text):


  baseModel = whisper.load_model("base")
- smallModel = whisper.load_model("small")


  df_init = pd.DataFrame(columns=['start', 'end', 'text'])
@@ -52,35 +47,45 @@ transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe",
      0, "dynamic"), max_rows=30, wrap=True, overflow_row_behaviour='paginate')


- inputs = gr.components.Audio(type="filepath", label="Add audio file")
+ inputs = [gr.components.Audio(type="filepath", label="Add audio file"),
+           gr.inputs.Audio(source="microphone", optional=True, type="filepath")]
  outputs = [gr.components.Textbox(), transcription_df]
  title = "Transcribe multi-lingual audio clips"
- description = "An example of using TTS to generate speech from text."
+ description = "An example of using OpenAI Whisper to generate transcriptions for audio clips."
  article = ""
- examples = [
-     [""]
+ audio_examples = [
+     ["input/example-1.wav"],
+     ["input/example-2.wav"],
  ]


- def transcribe(inputs):
-     print('Inputs: ', inputs)
-     # print('Text: ', text)
-     # progress(0, desc="Loading audio file...")
+ def transcribe(inputs, microphone):
+     if microphone is not None:
+         inputs = microphone
+
      if inputs is None:
          logger.warning("No audio file")
-         return "No audio file submitted! Please upload an audio file before submitting your request."
+         return ["No audio file submitted! Please upload an audio file before submitting your request.", df_init]
      file_size_mb = os.stat(inputs).st_size / (1024 * 1024)
+
+     # ---------------- Check the file size ----------------
      if file_size_mb > FILE_LIMIT_MB:
          logger.warning("Max file size exceeded")
-         return f"File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB."
+         return [f"File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB.", df_init]

-     # with open(inputs, "rb") as f:
-     #     inputs = f.read()
-
-     # load audio and pad/trim it to fit 30 seconds
-     result = smallModel.transcribe(audio=inputs, language='english',
-                                    verbose=False)
-     # ---------------------------------------------------
+     # ---------------- Transcribe the audio ----------------
+     result = baseModel.transcribe(audio=inputs, language='english',
+                                   verbose=False)
+     srtFilename = os.path.join("output/SrtFiles", inputs.split(
+         '/')[-1].split('.')[0]+'.srt')
+
+     # ---------------- Clear the file ----------------
+     with open(srtFilename, 'w', encoding='utf-8') as srtFile:
+         srtFile.seek(0)
+         srtFile.truncate()
+
+     # ---------------- Write the file ----------------
      segments = result['segments']
      for segment in segments:
          startTime = str(0)+str(timedelta(seconds=int(segment['start'])))+',000'
@@ -89,17 +94,11 @@ def transcribe(inputs):
          segmentId = segment['id']+1
          segment = f"{segmentId}\n{startTime} --> {endTime}\n{text[1:] if text[0] is ' ' else text}\n\n"

-         srtFilename = os.path.join("output/SrtFiles", inputs.split(
-             '/')[-1].split('.')[0]+'.srt')
          with open(srtFilename, 'a', encoding='utf-8') as srtFile:
              srtFile.write(segment)

-         rawFilename = os.path.join("output/SrtFiles", inputs.split(
-             '/')[-1].split('.')[0]+'.srt')
-         with open(rawFilename, 'a', encoding='utf-8') as srtFile:
-             srtFile.write(segment)
+     # -------- Read the file and prepare to display --------
      try:
-
          srt_path = srtFilename
          df = pd.DataFrame(columns=['start', 'end', 'text'])
          subs = pysrt.open(srt_path)
@@ -129,7 +128,7 @@ def transcribe(inputs):
          df = pd.DataFrame(objects, columns=['start', 'end', 'text'])
      except Exception as e:
          print('Error: ', e)
-         df = pd.DataFrame(columns=['start', 'end', 'text'])
+         df = df_init

      return [result["text"], df]

@@ -205,23 +204,24 @@ audio_chunked = gr.Interface(
      title=title,
      description=description,
      article=article,
+     examples=audio_examples,
  )

- microphone_chunked = gr.Interface(
-     fn=transcribe,
-     inputs=[
-         gr.inputs.Audio(source="microphone",
-                         optional=True, type="filepath"),
-     ],
-     outputs=[
-         gr.outputs.Textbox(label="Transcription").style(
-             show_copy_button=True),
-     ],
-     allow_flagging="never",
-     title=title,
-     description=description,
-     article=article,
- )
+ # microphone_chunked = gr.Interface(
+ #     fn=transcribe,
+ #     inputs=[
+ #         gr.inputs.Audio(source="microphone",
+ #                         optional=True, type="filepath"),
+ #     ],
+ #     outputs=[
+ #         gr.outputs.Textbox(label="Transcription").style(
+ #             show_copy_button=True),
+ #     ],
+ #     allow_flagging="never",
+ #     title=title,
+ #     description=description,
+ #     article=article,
+ # )
  youtube_chunked = gr.Interface(
      fn=youtube_transcript,
      inputs=[
@@ -248,21 +248,7 @@ youtube_chunked = gr.Interface(

  demo = gr.Blocks()
  with demo:
-     gr.TabbedInterface([audio_chunked, youtube_chunked, microphone_chunked], [
-                        "Audio File", "Youtube", "Microphone"])
+     gr.TabbedInterface([audio_chunked, youtube_chunked], [
+                        "Audio File", "Youtube"])
  demo.queue(concurrency_count=1, max_size=5)
  demo.launch(show_api=False)
-
-
- # gr.Interface(
- #     inference,
- #     inputs,
- #     outputs,
- #     verbose=True,
- #     title=title,
- #     description=description,
- #     article=article,
- #     examples=examples,
- #     enable_queue=True,
- # ).launch(share=True, debug=True)
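
A note on the unchanged timestamp lines kept above: str(0)+str(timedelta(seconds=int(segment['start'])))+',000' rounds every segment boundary down to a whole second and hardcodes the millisecond field. A minimal sketch of a formatter that keeps Whisper's fractional-second boundaries, shown for illustration only and not code from this commit:

# Format a float number of seconds, as Whisper reports segment start/end,
# as an SRT timestamp HH:MM:SS,mmm.
def srt_timestamp(seconds: float) -> str:
    millis = int(round(seconds * 1000))
    hours, rem = divmod(millis, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

# srt_timestamp(4.52) -> '00:00:04,520'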
 
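After writing the .srt file, the reworked transcribe reads it back with pysrt and flattens it into the (start, end, text) columns expected by the transcription dataframe. A condensed sketch of that read-back step under the same column layout (the commit's own loop builds its rows slightly differently, and error handling is elided):

import pandas as pd
import pysrt

# Load a generated .srt and expose one row per subtitle, with timestamps
# rendered as 'HH:MM:SS,mmm' strings.
def srt_to_dataframe(srt_path: str) -> pd.DataFrame:
    subs = pysrt.open(srt_path)
    rows = [(str(sub.start), str(sub.end), sub.text) for sub in subs]
    return pd.DataFrame(rows, columns=['start', 'end', 'text'])
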
input/example-1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:971b4163670445c415c6b0fb6813c38093409ecac2f6b4d429ae3574d24ad470
+ size 3249924
input/example-2.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c17c6659d9252782e9481764a6ce447bac29ff874cc5c67f9bbf703b7f13743
+ size 692524
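
Both example clips are therefore stored as the three-field LFS pointers shown above (spec version, sha256 oid, byte size), not as playable audio. A hypothetical parser for those fields, included only to document the pointer format:

# Hypothetical sketch: parse the 'key value' lines of a Git LFS pointer file.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            key, _, value = line.strip().partition(' ')
            fields[key] = value
    return fields

# parse_lfs_pointer('input/example-2.wav')
# -> {'version': 'https://git-lfs.github.com/spec/v1',
#     'oid': 'sha256:6c17...', 'size': '692524'}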