radames committed on
Commit 083f815
1 Parent(s): 9c69cd6

update gradio

Files changed (2)
  1. app.py +26 -18
  2. requirements.txt +1 -1
app.py CHANGED
@@ -63,7 +63,7 @@ async def speech_to_text(video_file_path):
     Using https://huggingface.co/tasks/automatic-speech-recognition pipeline
     """
     global total_inferences_since_reboot
-    if(video_file_path == None):
+    if (video_file_path == None):
        raise ValueError("Error no video input")
 
     video_path = Path(video_file_path)
@@ -84,6 +84,7 @@ async def speech_to_text(video_file_path):
         print(f'Transcribing from API attempt {tries}')
         try:
             inference_reponse = await query_api(audio_memory)
+            print(inference_reponse)
             transcription = inference_reponse["text"].lower()
             timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
                           for chunk in inference_reponse['chunks']]
@@ -92,7 +93,8 @@ async def speech_to_text(video_file_path):
             print("\n\ntotal_inferences_since_reboot: ",
                   total_inferences_since_reboot, "\n\n")
             return (transcription, transcription, timestamps)
-        except:
+        except Exception as e:
+            print(e)
             if 'error' in inference_reponse and 'estimated_time' in inference_reponse:
                 wait_time = inference_reponse['estimated_time']
                 print("Waiting for model to load....", wait_time)
@@ -134,7 +136,7 @@ async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
 
     video_path = Path(video_in)
     video_file_name = video_path.stem
-    if(video_in == None or text_in == None or transcription == None):
+    if (video_in == None or text_in == None or transcription == None):
        raise ValueError("Inputs undefined")
 
     d = Differ()
@@ -150,7 +152,7 @@ async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
     # groupping character timestamps so there are less cuts
     idx = 0
     grouped = {}
-    for(a, b) in zip(filtered, timestamps):
+    for (a, b) in zip(filtered, timestamps):
         if a[0] != '-':
             if idx in grouped:
                 grouped[idx].append(b)
@@ -203,7 +205,15 @@ async def query_api(audio_bytes: bytes):
     }).encode("utf-8")
     async with aiohttp.ClientSession() as session:
         async with session.post(API_URL, headers=headers, data=payload) as response:
-            return await response.json()
+            print("API Response: ", response.status)
+            if response.headers['Content-Type'] == 'application/json':
+                return await response.json()
+            elif response.headers['Content-Type'] == 'application/octet-stream':
+                return await response.read()
+            elif response.headers['Content-Type'] == 'text/plain':
+                return await response.text()
+            else:
+                raise RuntimeError("Error Fetching API")
 
 
 def ping(name):
@@ -222,28 +232,26 @@ video_in = gr.Video(label="Video file")
 text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
 video_out = gr.Video(label="Video Out")
 diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True)
-examples = gr.components.Dataset(
-    components=[video_in], samples=VIDEOS, type="index")
+examples = gr.Dataset(components=[video_in], samples=VIDEOS, type="index")
 
-demo = gr.Blocks(enable_queue=True, css='''
+css = """
 #cut_btn, #reset_btn { align-self:stretch; }
 #\\31 3 { max-width: 540px; }
 .output-markdown {max-width: 65ch !important;}
-''')
-demo.encrypt = False
-with demo:
+"""
+with gr.Blocks(css=css) as demo:
     transcription_var = gr.Variable()
     timestamps_var = gr.Variable()
     with gr.Row():
         with gr.Column():
-            gr.Markdown('''
+            gr.Markdown("""
             # Edit Video By Editing Text
             This project is a quick proof of concept of a simple video editor where the edits
             are made by editing the audio transcription.
             Using the [Huggingface Automatic Speech Recognition Pipeline](https://huggingface.co/tasks/automatic-speech-recognition)
             with a fine tuned [Wav2Vec2 model using Connectionist Temporal Classification (CTC)](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self)
             you can predict not only the text transcription but also the [character or word base timestamps](https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__.return_timestamps)
-            ''')
+            """)
 
     with gr.Row():
 
@@ -269,9 +277,9 @@ with demo:
                              text_in, transcription_var, timestamps_var])
 
     with gr.Row():
-        gr.Markdown('''
+        gr.Markdown("""
         ### Now edit as text
-        After running the video transcription, you can make cuts to the text below (only cuts, not additions!)''')
+        After running the video transcription, you can make cuts to the text below (only cuts, not additions!)""")
 
     with gr.Row():
         with gr.Column():
@@ -290,13 +298,13 @@ with demo:
             video_out.render()
             diff_out.render()
     with gr.Row():
-        gr.Markdown('''
+        gr.Markdown("""
         #### Video Credits
 
         1. [Cooking](https://vimeo.com/573792389)
         1. [Shia LaBeouf "Just Do It"](https://www.youtube.com/watch?v=n2lTxIk_Dr0)
         1. [Mark Zuckerberg & Yuval Noah Harari in Conversation](https://www.youtube.com/watch?v=Boj9eD0Wug8)
-        ''')
-
+        """)
+demo.queue()
 if __name__ == "__main__":
     demo.launch(debug=True)
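
The reworked `query_api` above no longer assumes a JSON body; it branches on the response's Content-Type. Below is a self-contained sketch of that dispatch pattern with aiohttp. The endpoint URL and auth header are placeholders rather than the app's real configuration, and the prefix check on Content-Type is an assumption added here so that values like `application/json; charset=utf-8` still match.

```python
import asyncio
import aiohttp

API_URL = "https://example.com/inference"        # placeholder endpoint
HEADERS = {"Authorization": "Bearer <token>"}    # placeholder token


async def query_api(payload: bytes):
    async with aiohttp.ClientSession() as session:
        async with session.post(API_URL, headers=HEADERS, data=payload) as response:
            print("API Response: ", response.status)
            # Dispatch on the server's Content-Type: JSON for normal results,
            # raw bytes for binary payloads, plain text otherwise.
            content_type = response.headers.get("Content-Type", "")
            if content_type.startswith("application/json"):
                return await response.json()
            elif content_type.startswith("application/octet-stream"):
                return await response.read()
            elif content_type.startswith("text/plain"):
                return await response.text()
            raise RuntimeError("Error Fetching API")


if __name__ == "__main__":
    # Example driver for the coroutine; the payload bytes are dummy data.
    asyncio.run(query_api(b"..."))
```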
requirements.txt CHANGED
@@ -1,6 +1,6 @@
 torch
 transformers
-gradio==3.0.9
+gradio==3.35.2
 datasets
 librosa
 ffmpeg-python
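
The gradio pin moves from 3.0.9 to 3.35.2, which goes hand in hand with the Blocks changes in app.py: CSS is passed via the `css=` constructor argument, `enable_queue` and `demo.encrypt` are dropped, and queuing is switched on with `demo.queue()` before launch. A minimal sketch of that 3.x pattern, with illustrative components rather than the full app:

```python
import gradio as gr

# In gradio 3.x, styling goes through the Blocks constructor; there is no
# enable_queue flag or demo.encrypt attribute anymore.
css = """
#cut_btn, #reset_btn { align-self: stretch; }
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("# Edit Video By Editing Text")
    with gr.Row():
        video_in = gr.Video(label="Video file")    # illustrative component
        video_out = gr.Video(label="Video Out")    # illustrative component

demo.queue()  # queuing is enabled on the Blocks instance, not in the constructor

if __name__ == "__main__":
    demo.launch(debug=True)
```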