Spaces:

raphael-gl
/

ai-days-subtitles-demo

Paused

App Files Files Community

Raphael committed on Jun 20, 2023

Commit

720b03b

•

1 Parent(s): 1538088

App v1

Browse files

Signed-off-by: Raphael <[email protected]>

Files changed (4) hide show

.gitignore +1 -0
app.py +252 -0
packages.txt +1 -0
requirements.txt +15 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

app.py ADDED Viewed

	@@ -0,0 +1,252 @@

+import logging
+import math
+import os
+import shutil
+import time
+from datasets import load_dataset
+import gradio as gr
+import moviepy.editor as mp
+import numpy as np
+import pysrt
+import torch
+from transformers import pipeline
+import yt_dlp
+# Request the hf_transfer backend for Hugging Face Hub downloads.
+# NOTE(review): the Hub client may read this flag when it is first imported,
+# and datasets/transformers are imported before this line runs - confirm the
+# setting still takes effect, or move it ahead of those imports.
+os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
+# force=True makes this configuration replace any handlers already installed on the root logger.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', force=True)
+LOG = logging.getLogger(__name__)
+# Length, in seconds, of each audio clip cut from the video.
+CLIP_SECONDS = 20
+# Number of subtitle slices each clip's text is split into.
+SLICES = 4
+# Seconds of screen time given to each subtitle slice.
+SLICE_DURATION = CLIP_SECONDS / SLICES
+# Upper bound on the number of audio clips that get transcribed.
+# NOTE(review): the original comment said "At most 6 mins", but 45 clips of
+# 20 s each is 15 minutes - confirm which of the two is intended.
+MAX_CHUNKS = 45
+# Working directory shared by every request: video, audio clips and srt file.
+BASEDIR = '/tmp/processed'
+os.makedirs(BASEDIR, exist_ok=True)
+# Keyword arguments for the three transformers pipelines built below.
+asr_kwargs = {
+    "task": "automatic-speech-recognition",
+    "model": "openai/whisper-medium.en"
+}
+translator_kwargs = {
+    "task": "translation_en_to_fr",
+    "model": "Helsinki-NLP/opus-mt-en-fr"
+}
+summarizer_kwargs = {
+    "task": "summarization",
+    "model": "facebook/bart-large-cnn"
+}
+# Place all three pipelines on the first CUDA device when one is available;
+# otherwise no device is passed and the pipeline default applies.
+if torch.cuda.is_available():
+    LOG.info("GPU available")
+    asr_kwargs['device'] = 'cuda:0'
+    translator_kwargs['device'] = 'cuda:0'
+    summarizer_kwargs['device'] = 'cuda:0'
+# All three models should fit together on a single T4 GPU
+# The pipelines are created once at import time and reused by every request.
+LOG.info("Fetching ASR model from the Hub if not already there")
+asr = pipeline(**asr_kwargs)
+LOG.info("Fetching translation model from the Hub if not already there")
+translator = pipeline(**translator_kwargs)
+LOG.info("Fetching summarization model from the Hub if not already there")
+summarizer = pipeline(**summarizer_kwargs)
+def demo(url: str, translate: bool):
+    basedir = BASEDIR
+    video_path, video = download(url, os.path.join(basedir, 'video.mp4'))
+    audio_clips(video, basedir)
+    srt_file, summary = process_video(basedir, video.duration, translate)
+    return summary, srt_file, [video_path, srt_file]
+def download(url, dst):
+    LOG.info("Downloading provided url %s", url)
+    opts = {
+        'skip_download': False,
+        'overwrites': True,
+        'format': 'mp4',
+        'outtmpl': {'default': dst}
+    }
+    with yt_dlp.YoutubeDL(opts) as dl:
+        dl.download([url])
+    return dst, mp.VideoFileClip(dst)
+def audiodir(basedir):
+    return os.path.join(basedir, 'audio')
+def audio_clips(video: mp.VideoFileClip, basedir: str):
+    LOG.info("Building audio clips")
+    clips_dir = audiodir(basedir)
+    shutil.rmtree(clips_dir, ignore_errors=True)
+    os.makedirs(clips_dir, exist_ok=True)
+    audio = video.audio
+    end = audio.duration
+    digits = int(math.log(end / CLIP_SECONDS, 10)) + 1
+    for idx, i in enumerate(range(0, int(end), CLIP_SECONDS)):
+        sub_end = min(i+CLIP_SECONDS, end)
+        # print(sub_end)
+        sub_clip = audio.subclip(t_start=i, t_end=sub_end)
+        audio_file = os.path.join(clips_dir, f"audio_{idx:0{digits}d}" + ".ogg")
+        # audio_file = os.path.join(AUDIO_CLIPS, "audio_" + str(idx))
+        sub_clip.write_audiofile(audio_file, fps=16000)
+def process_video(basedir: str, duration, translate: bool):
+    audio_dir = audiodir(basedir)
+    transcriptions = transcription(audio_dir, duration)
+    subs = translation(transcriptions, translate)
+    srt_file = build_srt_clips(subs, basedir)
+    summary = summarize(transcriptions, translate)
+    return srt_file, summary
+def transcription(audio_dir: str, duration):
+    LOG.info("Audio transcription")
+    # Not exact, nvm, doesn't need to be
+    chunks = int(duration / CLIP_SECONDS + 1)
+    chunks = min(chunks, MAX_CHUNKS)
+    LOG.debug("Loading audio clips dataset")
+    dataset = load_dataset("audiofolder", data_dir=audio_dir)
+    dataset = dataset['train']
+    dataset = dataset['audio'][0:chunks]
+    start = time.time()
+    transcriptions = []
+    for i, d in enumerate(np.array_split(dataset, 5)):
+        d = list(d)
+        LOG.info("ASR batch %d / 5, samples %d", i, len(d))
+        t = asr(d, max_new_tokens=10000)
+        transcriptions.extend(t)
+    transcriptions = [t['text'] for t in transcriptions]
+    elapsed = time.time() - start
+    LOG.info("Transcription done, elapsed %.2f seconds", elapsed)
+    return transcriptions
+def translation(transcriptions, translate):
+    if translate:
+        LOG.info("Performing translation")
+        start = time.time()
+        translations = translator(transcriptions)
+        translations = [t['translation_text'] for t in translations]
+        elapsed = time.time() - start
+        LOG.info("Translation done, elapsed %.2f seconds", elapsed)
+    else:
+        translations = transcriptions
+    return translations
+def summarize(transcriptions, translate):
+    LOG.info("Generating video summary")
+    whole_text = ' '.join(transcriptions).strip()
+    word_count = len(whole_text.split())
+    summary = summarizer(whole_text)
+    # min_length=word_count // 4 + 1,
+    # max_length=word_count // 2 + 1)
+    summary = translation([summary[0]['summary_text']], translate)[0]
+    return summary
+def subs_to_timed_segments(subtitles: list[str]):
+    LOG.info("Building srt segments")
+    all_chunks = []
+    for sub in subtitles:
+        chunks = np.array_split(sub.split(' '), SLICES)
+        all_chunks.extend(chunks)
+    subs = []
+    for c in all_chunks:
+        c = ' '.join(c)
+        subs.append(c)
+    segments = []
+    for i, c in enumerate(subs):
+        segments.append({
+            'text': c.strip(),
+            'start': i * SLICE_DURATION,
+            'end': (i + 1) * SLICE_DURATION
+        })
+    return segments
+def build_srt_clips(subs, basedir):
+    LOG.info("Generating subtitles")
+    segments = subs_to_timed_segments(subs)
+    LOG.info("Building srt clips")
+    max_text_len = 30
+    subtitles = pysrt.SubRipFile()
+    first = True
+    for segment in segments:
+        start = segment['start'] * 1000
+        if first:
+            start += 3000
+            first = False
+        end = segment['end'] * 1000
+        text = segment['text']
+        text = text.strip()
+        if len(text) < max_text_len:
+            o = pysrt.SubRipItem()
+            o.start = pysrt.SubRipTime(0, 0, 0, start)
+            o.end = pysrt.SubRipTime(0, 0, 0, end)
+            o.text = text
+            subtitles.append(o)
+        else:
+            # Just split in two, should be ok in most cases
+            words = text.split()
+            o = pysrt.SubRipItem()
+            o.text = ' '.join(words[0:len(words)//2])
+            o.start = pysrt.SubRipTime(0, 0, 0, start)
+            chkpt = (start + end) / 2
+            o.end = pysrt.SubRipTime(0, 0, 0, chkpt)
+            subtitles.append(o)
+            o = pysrt.SubRipItem()
+            o.text = ' '.join(words[len(words)//2:])
+            o.start = pysrt.SubRipTime(0, 0, 0, chkpt)
+            o.end = pysrt.SubRipTime(0, 0, 0, end)
+            subtitles.append(o)
+    srt_path = os.path.join(basedir, 'video.srt')
+    subtitles.save(srt_path, encoding='utf-8')
+    LOG.info("Subtitles saved in srt file %s", srt_path)
+    return srt_path
+iface = gr.Interface(
+    fn=demo,
+    inputs=[
+        gr.Text(value="https://youtu.be/tiZFewofSLM", label="English video url"),
+        gr.Checkbox(value=True, label='Translate to French')],
+    outputs=[
+        gr.Text(label="Video summary"),
+        gr.File(label="SRT file"),
+        gr.Video(label="Video with subtitles"),
+    ])
+iface.launch()

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ imagemagick

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+jupyter
+notebook
+numpy
+torch
+transformers
+hf_transfer
+moviepy
+yt-dlp
+datasets
+soundfile
+librosa
+sentencepiece
+pysrt
+gradio
+sacremoses