tts-hallo-talking-portrait

Running on A10G

App Files Files Community

fffiloni committed on Jun 26

Commit

446a654

•

1 Parent(s): e2b4c82

Update app.py

Browse files

Files changed (1) hide show

app.py +277 -51

app.py CHANGED Viewed

@@ -2,6 +2,10 @@ import os
 import shutil
 from huggingface_hub import snapshot_download
 import gradio as gr
 os.chdir(os.path.dirname(os.path.abspath(__file__)))
 from scripts.inference import inference_process
 import argparse
@@ -12,7 +16,118 @@ is_shared_ui = True if "fudan-generative-ai/hallo" in os.environ['SPACE_ID'] els
 if(not is_shared_ui):
     hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
-def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
     if is_shared_ui:
         raise gr.Error("This Space only works in duplicated instances")
@@ -33,8 +148,61 @@ def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=T
     inference_process(args)
     return f'output-{unique_id}.mp4'
 css = '''
 div#warning-ready {
     background-color: #ecfdf5;
     padding: 0 16px 16px;
@@ -72,54 +240,112 @@ div#warning-duplicate .actions a {
 '''
 with gr.Blocks(css=css) as demo:
-    if is_shared_ui:
-        top_description = gr.HTML(f'''
-            <div class="gr-prose">
-                <h2 class="custom-color"><svg xmlns="http://www.w3.org/2000/svg" width="18px" height="18px" style="margin-right: 0px;display: inline-block;"fill="none"><path fill="#fff" d="M7 13.2a6.3 6.3 0 0 0 4.4-10.7A6.3 6.3 0 0 0 .6 6.9 6.3 6.3 0 0 0 7 13.2Z"/><path fill="#fff" fill-rule="evenodd" d="M7 0a6.9 6.9 0 0 1 4.8 11.8A6.9 6.9 0 0 1 0 7 6.9 6.9 0 0 1 7 0Zm0 0v.7V0ZM0 7h.6H0Zm7 6.8v-.6.6ZM13.7 7h-.6.6ZM9.1 1.7c-.7-.3-1.4-.4-2.2-.4a5.6 5.6 0 0 0-4 1.6 5.6 5.6 0 0 0-1.6 4 5.6 5.6 0 0 0 1.6 4 5.6 5.6 0 0 0 4 1.7 5.6 5.6 0 0 0 4-1.7 5.6 5.6 0 0 0 1.7-4 5.6 5.6 0 0 0-1.7-4c-.5-.5-1.1-.9-1.8-1.2Z" clip-rule="evenodd"/><path fill="#000" fill-rule="evenodd" d="M7 2.9a.8.8 0 1 1 0 1.5A.8.8 0 0 1 7 3ZM5.8 5.7c0-.4.3-.6.6-.6h.7c.3 0 .6.2.6.6v3.7h.5a.6.6 0 0 1 0 1.3H6a.6.6 0 0 1 0-1.3h.4v-3a.6.6 0 0 1-.6-.7Z" clip-rule="evenodd"/></svg>
-                Attention: this Space need to be duplicated to work</h2>
-                <p class="main-message custom-color">
-                    To make it work, <strong>duplicate the Space</strong> and run it on your own profile using a <strong>private</strong> GPU.<br />
-                    An L4 costs <strong>US$0.80/h</strong>
-                </p>
-                <p class="actions custom-color">
-                    <a href="https://huggingface.co/spaces/{os.environ['SPACE_ID']}?duplicate=true">
-                        <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-lg-dark.svg" alt="Duplicate this Space" />
-                    </a>
-                    to start generate your talking head
-                </p>
-            </div>
-        ''', elem_id="warning-duplicate")
-    gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
-    gr.Markdown("Generate talking head avatars driven from audio. **5 seconds of audio takes >10 minutes to generate on an L4** - duplicate the space for private use or try for free on Google Colab")
-    gr.Markdown("""
-Hallo has a few simple requirements for input data:
-For the source image:
-1. It should be cropped into squares.
-2. The face should be the main focus, making up 50%-70% of the image.
-3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
-For the driving audio:
-1. It must be in WAV format.
-2. It must be in English since our training datasets are only in this language.
-3. Ensure the vocals are clear; background music is acceptable.
-We have provided some [samples](https://huggingface.co/datasets/fudan-generative-ai/hallo_inference_samples) for your reference.
-                """)
-    with gr.Row():
-        with gr.Column():
-            avatar_face = gr.Image(type="filepath", label="Face")
-            driving_audio = gr.Audio(type="filepath", label="Driving audio")
-            generate = gr.Button("Generate")
-        with gr.Column():
-            output_video = gr.Video(label="Your talking head")
-    generate.click(
-        fn=run_inference,
-        inputs=[avatar_face, driving_audio],
-        outputs=output_video
     )
-demo.launch(show_error=True)

 import shutil
 from huggingface_hub import snapshot_download
 import gradio as gr
+from gradio_client import Client, handle_file
+from mutagen.mp3 import MP3
+from pydub import AudioSegment
+from PIL import Image
 os.chdir(os.path.dirname(os.path.abspath(__file__)))
 from scripts.inference import inference_process
 import argparse
 if(not is_shared_ui):
     hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
+def is_mp3(file_path):
+    try:
+        audio = MP3(file_path)
+        return True
+    except Exception as e:
+        return False
+def convert_mp3_to_wav(mp3_file_path, wav_file_path):
+    # Load the MP3 file
+    audio = AudioSegment.from_mp3(mp3_file_path)
+    # Export as WAV file
+    audio.export(wav_file_path, format="wav")
+    return wav_file_path
+def trim_audio(file_path, output_path, max_duration=4000):
+    # Load the audio file
+    audio = AudioSegment.from_wav(file_path)
+    # Check the length of the audio in milliseconds
+    audio_length = len(audio)
+    # If the audio is longer than the maximum duration, trim it
+    if audio_length > max_duration:
+        trimmed_audio = audio[:max_duration]
+    else:
+        trimmed_audio = audio
+    # Export the trimmed audio to a new file
+    trimmed_audio.export(output_path, format="wav")
+    return output_path
+def add_silence_to_wav(wav_file_path, duration_s=1):
+    # Load the WAV file
+    audio = AudioSegment.from_wav(wav_file_path)
+    # Create 1 second of silence
+    silence = AudioSegment.silent(duration=duration_s * 1000)  # duration is in milliseconds
+    # Add silence to the end of the audio file
+    audio_with_silence = audio + silence
+    # Export the modified audio
+    audio_with_silence.export(wav_file_path, format="wav")
+    return wav_file_path
+def check_mp3(file_path):
+    if is_mp3(file_path):
+        wav_file_path = os.path.splitext(file_path)[0] + '.wav'
+        converted_audio = convert_mp3_to_wav(file_path, wav_file_path)
+        print(f"File converted to {wav_file_path}")
+        return converted_audio
+    else:
+        print("The file is not an MP3 file.")
+        return file_path
+def convert_webp_to_png(webp_file):
+    # Open the WebP image
+    webp_image = Image.open(webp_file)
+    # Convert and save as PNG
+    webp_image.save("png_converted_image.png", "PNG")
+    return "png_converted_image.png"
+def generate_portrait(prompt_image):
+    if prompt_image is None or prompt_image == "":
+        raise gr.Error("Can't generate a portrait without a prompt !")
+    client = Client("AP123/SDXL-Lightning")
+    result = client.predict(
+            prompt_image,
+            "4-Step",
+            api_name="/generate_image"
+    )
+    print(result)
+    return result
+def generate_voice(prompt_audio, voice_description):
+    if prompt_audio is None or prompt_audio == "" :
+        raise gr.Error("Can't generate a voice without text to synthetize !")
+    if voice_description is None or voice_description == "":
+        gr.Info(
+            "For better control, You may want to provide a voice character description next time.",
+            duration = 10,
+            visible = True
+        )
+    client = Client("parler-tts/parler_tts_mini")
+    result = client.predict(
+        text=prompt_audio,
+        description=voice_description,
+        api_name="/gen_tts"
+    )
+    print(result)
+    return result
+def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone):
+    client = Client("collabora/WhisperSpeech")
+    result = client.predict(
+        multilingual_text=prompt_audio_whisperspeech,
+        speaker_audio=handle_file(audio_to_clone),
+        speaker_url="",
+        cps=14,
+        api_name="/whisper_speech_demo"
+    )
+    print(result)
+    return result
+def run_hallo(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
     if is_shared_ui:
         raise gr.Error("This Space only works in duplicated instances")
     inference_process(args)
     return f'output-{unique_id}.mp4'
+def generate_talking_portrait(portrait, voice):
+    if portrait is None:
+        raise gr.Error("Please provide a portrait to animate.")
+    if voice is None:
+        raise gr.Error("Please provide audio (4 seconds max).")
+    # trim audio
+    input_file = voice
+    trimmed_output_file = "trimmed_audio.wav"
+    trimmed_output_file = trim_audio(input_file, trimmed_output_file)
+    voice = trimmed_output_file
+    ready_audio = add_silence_to_wav(voice)
+    print(f"1 second of silence added to {voice}")
+    # call hallo
+    talking_portrait_vid = run_hallo(portrait, ready_audio)
+    return talking_portrait_vid
 css = '''
+#col-container {
+    margin: 0 auto;
+}
+#main-group {
+    background-color: none;
+}
+.tabs {
+    background-color: unset;
+}
+#image-block {
+    flex: 1;
+}
+#video-block {
+    flex: 9;
+}
+#audio-block, #audio-clone-elm {
+    flex: 1;
+}
+#text-synth, #voice-desc, #text-synth-wsp{
+    height: 180px;
+}
+#audio-column, #result-column {
+    display: flex;
+}
+#gen-voice-btn {
+    flex: 1;
+}
+#parler-tab, #whisperspeech-tab {
+    padding: 0;
+}
+#main-submit{
+    flex: 1;
+}
 div#warning-ready {
     background-color: #ecfdf5;
     padding: 0 16px 16px;
 '''
 with gr.Blocks(css=css) as demo:  # UI layout and event wiring for the app
+    with gr.Column(elem_id="col-container"):
+        gr.Markdown("""
+        # Parler X Hallo
+        Generate talking portraits
+        """)
+        with gr.Group(elem_id="main-group"):
+            with gr.Row():
+                with gr.Column():  # first column: portrait input
+                    portrait = gr.Image(  # upload-only image, passed to callbacks as a file path
+                        sources=["upload"],
+                        type="filepath",
+                        format="png",
+                        elem_id="image-block"
+                    )
+                    prompt_image = gr.Textbox(  # text prompt consumed by generate_portrait
+                        label="Generate image",
+                        lines=3
+                    )
+                    gen_image_btn = gr.Button("Generate portrait (optional)")
+                with gr.Column(elem_id="audio-column"):  # second column: driving audio
+                    voice = gr.Audio(  # audio used by generate_talking_portrait; also the target of both voice generators
+                        type="filepath",
+                        max_length=4000,  # NOTE(review): trim_audio's 4000 is milliseconds and the error text says "4 seconds max"; Gradio documents max_length in seconds — confirm intended unit
+                        elem_id="audio-block"
+                    )
+                    with gr.Tab("Parler TTS", elem_id="parler-tab"):  # inputs for generate_voice
+                        prompt_audio = gr.Textbox(
+                            label="Text to synthetize",
+                            lines=4,
+                            max_lines=4,
+                            elem_id="text-synth"
+                        )
+                        voice_description = gr.Textbox(
+                            label="Voice description",
+                            lines=4,
+                            max_lines=4,
+                            elem_id="voice-desc"
+                        )
+                        gen_voice_btn = gr.Button("Generate voice (optional)")
+                    with gr.Tab("WhisperSpeech", elem_id="whisperspeech-tab"):  # inputs for get_whisperspeech
+                        prompt_audio_whisperspeech = gr.Textbox(
+                            label="Text to synthetize",
+                            lines=4,
+                            max_lines=4,
+                            elem_id="text-synth-wsp"
+                        )
+                        audio_to_clone = gr.Audio(
+                            label="Voice to clone",
+                            type="filepath",
+                            elem_id="audio-clone-elm"
+                        )
+                        gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)")
+                with gr.Column(elem_id="result-column"):  # third column: output video and submit
+                    result = gr.Video(
+                        elem_id="video-block"
+                    )
+                    submit_btn = gr.Button("Submit", elem_id="main-submit")
+    voice.upload(  # on upload, run check_mp3 and write its returned path back into the same component
+        fn = check_mp3,
+        inputs = [voice],
+        outputs = [voice],
+        queue = False,
+        show_api = False
     )
+    gen_image_btn.click(  # fill the portrait component from the text prompt
+        fn = generate_portrait,
+        inputs = [prompt_image],
+        outputs = [portrait],
+        queue=False,
+        show_api = False
+    )
+    gen_voice_btn.click(  # fill the voice component with Parler TTS output
+        fn = generate_voice,
+        inputs = [prompt_audio, voice_description],
+        outputs = [voice],
+        queue=False,
+        show_api = False
+    )
+    gen_wsp_voice_btn.click(  # fill the voice component with WhisperSpeech output
+        fn = get_whisperspeech,
+        inputs = [prompt_audio_whisperspeech, audio_to_clone],
+        outputs = [voice],
+        queue=False,
+        show_api = False
+    )
+    submit_btn.click(  # main action; the only handler here that does not pass queue=False
+        fn = generate_talking_portrait,
+        inputs = [portrait, voice],
+        outputs = [result],
+        show_api = False
+    )
+demo.queue(max_size=2).launch(show_error=True, show_api=False)  # queue configured with max_size=2, then the app is launched