zxl
committed on
Commit
•
07c6a04
1
Parent(s):
bd6e6ad
first commit
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- CONTRIBUTING.md +37 -0
- LICENSE +0 -0
- README.md +16 -8
- app.py +508 -0
- docs/dsp.md +25 -0
- docs/pab.md +121 -0
- eval/pab/commom_metrics/README.md +6 -0
- eval/pab/commom_metrics/__init__.py +0 -0
- eval/pab/commom_metrics/calculate_lpips.py +97 -0
- eval/pab/commom_metrics/calculate_psnr.py +90 -0
- eval/pab/commom_metrics/calculate_ssim.py +116 -0
- eval/pab/commom_metrics/eval.py +160 -0
- eval/pab/experiments/__init__.py +0 -0
- eval/pab/experiments/attention_ablation.py +60 -0
- eval/pab/experiments/components_ablation.py +46 -0
- eval/pab/experiments/latte.py +57 -0
- eval/pab/experiments/opensora.py +44 -0
- eval/pab/experiments/opensora_plan.py +57 -0
- eval/pab/experiments/utils.py +22 -0
- eval/pab/vbench/VBench_full_info.json +0 -0
- eval/pab/vbench/cal_vbench.py +154 -0
- eval/pab/vbench/run_vbench.py +52 -0
- examples/cogvideo/sample.py +14 -0
- examples/latte/sample.py +24 -0
- examples/open_sora/sample.py +24 -0
- examples/open_sora_plan/sample.py +24 -0
- requirements.txt +25 -0
- setup.py +55 -0
- tests/__init__.py +0 -0
- videosys/__init__.py +19 -0
- videosys/core/__init__.py +0 -0
- videosys/core/comm.py +420 -0
- videosys/core/engine.py +132 -0
- videosys/core/mp_utils.py +270 -0
- videosys/core/pab_mgr.py +364 -0
- videosys/core/parallel_mgr.py +119 -0
- videosys/core/pipeline.py +34 -0
- videosys/core/shardformer/__init__.py +0 -0
- videosys/core/shardformer/t5/__init__.py +0 -0
- videosys/core/shardformer/t5/modeling.py +39 -0
- videosys/core/shardformer/t5/policy.py +68 -0
- videosys/datasets/dataloader.py +94 -0
- videosys/datasets/image_transform.py +42 -0
- videosys/datasets/video_transform.py +441 -0
- videosys/diffusion/__init__.py +41 -0
- videosys/diffusion/diffusion_utils.py +79 -0
- videosys/diffusion/gaussian_diffusion.py +829 -0
- videosys/diffusion/respace.py +119 -0
- videosys/diffusion/timestep_sampler.py +143 -0
- videosys/models/__init__.py +0 -0
CONTRIBUTING.md
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Coding Standards
|
2 |
+
|
3 |
+
### Unit Tests
|
4 |
+
We use [PyTest](https://docs.pytest.org/en/latest/) to execute tests. You can install pytest by `pip install pytest`. As some of the tests require initialization of the distributed backend, GPUs are needed to execute these tests.
|
5 |
+
|
6 |
+
To set up the environment for unit testing, first change your current directory to the root directory of your local ColossalAI repository, then run
|
7 |
+
```bash
|
8 |
+
pip install -r requirements/requirements-test.txt
|
9 |
+
```
|
10 |
+
If you encounter an error saying "Could not find a version that satisfies the requirement fbgemm-gpu==0.2.0", please downgrade your Python version to 3.8 or 3.9 and try again.
|
11 |
+
|
12 |
+
If you only want to run CPU tests, you can run
|
13 |
+
|
14 |
+
```bash
|
15 |
+
pytest -m cpu tests/
|
16 |
+
```
|
17 |
+
|
18 |
+
If you have 8 GPUs on your machine, you can run the full test
|
19 |
+
|
20 |
+
```bash
|
21 |
+
pytest tests/
|
22 |
+
```
|
23 |
+
|
24 |
+
If you do not have 8 GPUs on your machine, do not worry. Unit testing will be automatically conducted when you put up a pull request to the main branch.
|
25 |
+
|
26 |
+
|
27 |
+
### Code Style
|
28 |
+
|
29 |
+
We have some static checks when you commit your code change, please make sure you can pass all the tests and make sure the coding style meets our requirements. We use pre-commit hook to make sure the code is aligned with the writing standard. To set up the code style checking, you need to follow the steps below.
|
30 |
+
|
31 |
+
```shell
|
32 |
+
# these commands are executed under the Colossal-AI directory
|
33 |
+
pip install pre-commit
|
34 |
+
pre-commit install
|
35 |
+
```
|
36 |
+
|
37 |
+
Code format checking will be automatically executed when you commit your changes.
|
LICENSE
ADDED
The diff for this file is too large to render.
See raw diff
|
|
README.md
CHANGED
@@ -1,12 +1,20 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.42.0
|
|
|
|
|
|
|
8 |
app_file: app.py
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: VideoSys-CogVideoX
|
3 |
+
emoji: 🎥
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: green
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.42.0
|
8 |
+
suggested_hardware: a10g-large
|
9 |
+
suggested_storage: large
|
10 |
+
app_port: 7860
|
11 |
app_file: app.py
|
12 |
+
models:
|
13 |
+
- THUDM/CogVideoX-2b
|
14 |
+
tags:
|
15 |
+
- cogvideox
|
16 |
+
- video-generation
|
17 |
+
- thudm
|
18 |
+
short_description: Text-to-Video
|
19 |
+
disable_embedding: false
|
20 |
+
---
|
app.py
ADDED
@@ -0,0 +1,508 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# # import gradio as gr
|
2 |
+
# # from videosys import CogVideoConfig, VideoSysEngine
|
3 |
+
# # import tempfile
|
4 |
+
# # import os
|
5 |
+
# # import logging
|
6 |
+
# # import uuid
|
7 |
+
|
8 |
+
# # logging.basicConfig(level=logging.INFO)
|
9 |
+
# # logger = logging.getLogger(__name__)
|
10 |
+
|
11 |
+
# # config = CogVideoConfig(world_size=1)
|
12 |
+
# # engine = VideoSysEngine(config)
|
13 |
+
|
14 |
+
# # def generate_video(prompt):
|
15 |
+
# # try:
|
16 |
+
# # video = engine.generate(prompt).video[0]
|
17 |
+
|
18 |
+
# # # 使用临时文件和唯一标识符
|
19 |
+
# # with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
|
20 |
+
# # temp_filename = temp_file.name
|
21 |
+
# # unique_filename = f"{uuid.uuid4().hex}.mp4"
|
22 |
+
# # output_path = os.path.join(tempfile.gettempdir(), unique_filename)
|
23 |
+
|
24 |
+
# # engine.save_video(video, output_path)
|
25 |
+
|
26 |
+
# # return output_path
|
27 |
+
# # except Exception as e:
|
28 |
+
# # logger.error(f"An error occurred: {str(e)}")
|
29 |
+
# # return None # 返回 None 而不是错误消息
|
30 |
+
|
31 |
+
# # iface = gr.Interface(
|
32 |
+
# # fn=generate_video,
|
33 |
+
# # inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
|
34 |
+
# # outputs=gr.Video(label="Generated Video"),
|
35 |
+
# # title="CogVideoX-2b: Text-to-Video Generation",
|
36 |
+
# # description="Enter a text prompt to generate a video using CogVideoX-2b."
|
37 |
+
# # )
|
38 |
+
|
39 |
+
# # iface.launch()
|
40 |
+
|
41 |
+
|
42 |
+
# from videosys import CogVideoConfig, VideoSysEngine
|
43 |
+
# from videosys.models.cogvideo.pipeline import CogVideoPABConfig
|
44 |
+
# import os
|
45 |
+
|
46 |
+
# import gradio as gr
|
47 |
+
# import numpy as np
|
48 |
+
# import torch
|
49 |
+
# from openai import OpenAI
|
50 |
+
# from time import time
|
51 |
+
# import tempfile
|
52 |
+
# import uuid
|
53 |
+
# import logging
|
54 |
+
|
55 |
+
# logging.basicConfig(level=logging.INFO)
|
56 |
+
# logger = logging.getLogger(__name__)
|
57 |
+
|
58 |
+
# dtype = torch.bfloat16
|
59 |
+
# sys_prompt = """You are part of a team of bots that creates videos. You work with an assistant bot that will draw anything you say in square brackets.
|
60 |
+
|
61 |
+
# For example , outputting " a beautiful morning in the woods with the sun peaking through the trees " will trigger your partner bot to output an video of a forest morning , as described. You will be prompted by people looking to create detailed , amazing videos. The way to accomplish this is to take their short prompts and make them extremely detailed and descriptive.
|
62 |
+
# There are a few rules to follow:
|
63 |
+
|
64 |
+
# You will only ever output a single video description per user request.
|
65 |
+
|
66 |
+
# When modifications are requested , you should not simply make the description longer . You should refactor the entire description to integrate the suggestions.
|
67 |
+
# Other times the user will not want modifications , but instead want a new image . In this case , you should ignore your previous conversation with the user.
|
68 |
+
|
69 |
+
# Video descriptions must have the same num of words as examples below. Extra words will be ignored.
|
70 |
+
# """
|
71 |
+
|
72 |
+
# def convert_prompt(prompt: str, retry_times: int = 3) -> str:
|
73 |
+
# if not os.environ.get("OPENAI_API_KEY"):
|
74 |
+
# return prompt
|
75 |
+
# client = OpenAI()
|
76 |
+
# text = prompt.strip()
|
77 |
+
|
78 |
+
# for i in range(retry_times):
|
79 |
+
# response = client.chat.completions.create(
|
80 |
+
# messages=[
|
81 |
+
# {"role": "system", "content": sys_prompt},
|
82 |
+
# {
|
83 |
+
# "role": "user",
|
84 |
+
# "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "a girl is on the beach"',
|
85 |
+
# },
|
86 |
+
# {
|
87 |
+
# "role": "assistant",
|
88 |
+
# "content": "A radiant woman stands on a deserted beach, arms outstretched, wearing a beige trench coat, white blouse, light blue jeans, and chic boots, against a backdrop of soft sky and sea. Moments later, she is seen mid-twirl, arms exuberant, with the lighting suggesting dawn or dusk. Then, she runs along the beach, her attire complemented by an off-white scarf and black ankle boots, the tranquil sea behind her. Finally, she holds a paper airplane, her pose reflecting joy and freedom, with the ocean's gentle waves and the sky's soft pastel hues enhancing the serene ambiance.",
|
89 |
+
# },
|
90 |
+
# {
|
91 |
+
# "role": "user",
|
92 |
+
# "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "A man jogging on a football field"',
|
93 |
+
# },
|
94 |
+
# {
|
95 |
+
# "role": "assistant",
|
96 |
+
# "content": "A determined man in athletic attire, including a blue long-sleeve shirt, black shorts, and blue socks, jogs around a snow-covered soccer field, showcasing his solitary exercise in a quiet, overcast setting. His long dreadlocks, focused expression, and the serene winter backdrop highlight his dedication to fitness. As he moves, his attire, consisting of a blue sports sweatshirt, black athletic pants, gloves, and sneakers, grips the snowy ground. He is seen running past a chain-link fence enclosing the playground area, with a basketball hoop and children's slide, suggesting a moment of solitary exercise amidst the empty field.",
|
97 |
+
# },
|
98 |
+
# {
|
99 |
+
# "role": "user",
|
100 |
+
# "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : " A woman is dancing, HD footage, close-up"',
|
101 |
+
# },
|
102 |
+
# {
|
103 |
+
# "role": "assistant",
|
104 |
+
# "content": "A young woman with her hair in an updo and wearing a teal hoodie stands against a light backdrop, initially looking over her shoulder with a contemplative expression. She then confidently makes a subtle dance move, suggesting rhythm and movement. Next, she appears poised and focused, looking directly at the camera. Her expression shifts to one of introspection as she gazes downward slightly. Finally, she dances with confidence, her left hand over her heart, symbolizing a poignant moment, all while dressed in the same teal hoodie against a plain, light-colored background.",
|
105 |
+
# },
|
106 |
+
# {
|
107 |
+
# "role": "user",
|
108 |
+
# "content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: "{text}"',
|
109 |
+
# },
|
110 |
+
# ],
|
111 |
+
# model="glm-4-0520",
|
112 |
+
# temperature=0.01,
|
113 |
+
# top_p=0.7,
|
114 |
+
# stream=False,
|
115 |
+
# max_tokens=250,
|
116 |
+
# )
|
117 |
+
# if response.choices:
|
118 |
+
# return response.choices[0].message.content
|
119 |
+
# return prompt
|
120 |
+
|
121 |
+
# def load_model(enable_video_sys=False, pab_threshold=[100, 850], pab_gap=2):
|
122 |
+
# pab_config = CogVideoPABConfig(full_threshold=pab_threshold, full_gap=pab_gap)
|
123 |
+
# config = CogVideoConfig(world_size=1, enable_pab=enable_video_sys, pab_config=pab_config)
|
124 |
+
# engine = VideoSysEngine(config)
|
125 |
+
# return engine
|
126 |
+
|
127 |
+
|
128 |
+
|
129 |
+
# def generate(engine, prompt, num_inference_steps=50, guidance_scale=6.0):
|
130 |
+
# try:
|
131 |
+
# video = engine.generate(prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale).video[0]
|
132 |
+
|
133 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
|
134 |
+
# temp_file.name
|
135 |
+
# unique_filename = f"{uuid.uuid4().hex}.mp4"
|
136 |
+
# output_path = os.path.join(tempfile.gettempdir(), unique_filename)
|
137 |
+
|
138 |
+
# engine.save_video(video, output_path)
|
139 |
+
# return output_path
|
140 |
+
# except Exception as e:
|
141 |
+
# logger.error(f"An error occurred: {str(e)}")
|
142 |
+
# return None
|
143 |
+
|
144 |
+
|
145 |
+
|
146 |
+
# with gr.Blocks() as demo:
|
147 |
+
# gr.Markdown("""
|
148 |
+
# <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
|
149 |
+
# VideoSys Huggingface Space🤗
|
150 |
+
# </div>
|
151 |
+
# <div style="text-align: center;">
|
152 |
+
# <a href="https://github.com/NUS-HPC-AI-Lab/VideoSys">🌐 Github</a>
|
153 |
+
# </div>
|
154 |
+
|
155 |
+
# <div style="text-align: center; font-size: 15px; font-weight: bold; color: red; margin-bottom: 20px;">
|
156 |
+
# ⚠️ This demo is for academic research and experiential use only.
|
157 |
+
# Users should strictly adhere to local laws and ethics.
|
158 |
+
# </div>
|
159 |
+
# <div style="text-align: center; font-size: 15px; font-weight: bold; color: magenta; margin-bottom: 20px;">
|
160 |
+
# 💡 This demo only demonstrates single-device inference. To experience the full power of VideoSys, please deploy it with multiple devices.
|
161 |
+
# </div>
|
162 |
+
# """)
|
163 |
+
# with gr.Row():
|
164 |
+
# with gr.Column():
|
165 |
+
# prompt = gr.Textbox(label="Prompt (Less than 200 Words)", value="a bear hunting for prey", lines=5)
|
166 |
+
# with gr.Row():
|
167 |
+
# gr.Markdown(
|
168 |
+
# "✨Upon pressing the enhanced prompt button, we will use [GLM-4 Model](https://github.com/THUDM/GLM-4) to polish the prompt and overwrite the original one."
|
169 |
+
# )
|
170 |
+
# enhance_button = gr.Button("✨ Enhance Prompt(Optional)")
|
171 |
+
|
172 |
+
# with gr.Column():
|
173 |
+
# gr.Markdown(
|
174 |
+
# "**Optional Parameters** (default values are recommended)<br>"
|
175 |
+
# "Turn Inference Steps larger if you want more detailed video, but it will be slower.<br>"
|
176 |
+
# "50 steps are recommended for most cases. will cause 120 seconds for inference.<br>"
|
177 |
+
# )
|
178 |
+
# with gr.Row():
|
179 |
+
# num_inference_steps = gr.Number(label="Inference Steps", value=50)
|
180 |
+
# guidance_scale = gr.Number(label="Guidance Scale", value=6.0)
|
181 |
+
# pab_gap = gr.Number(label="PAB Gap", value=2, precision=0)
|
182 |
+
# pab_threshold = gr.Textbox(label="PAB Threshold", value="100,850", lines=1)
|
183 |
+
# with gr.Row():
|
184 |
+
# generate_button = gr.Button("🎬 Generate Video")
|
185 |
+
# generate_button_vs = gr.Button("⚡️ Generate Video with VideoSys (Faster)")
|
186 |
+
|
187 |
+
# with gr.Column():
|
188 |
+
# with gr.Row():
|
189 |
+
# video_output = gr.Video(label="CogVideoX", width=720, height=480)
|
190 |
+
# with gr.Row():
|
191 |
+
# download_video_button = gr.File(label="📥 Download Video", visible=False)
|
192 |
+
# elapsed_time = gr.Textbox(label="Elapsed Time", value="0s", visible=False)
|
193 |
+
# with gr.Row():
|
194 |
+
# video_output_vs = gr.Video(label="CogVideoX with VideoSys", width=720, height=480)
|
195 |
+
# with gr.Row():
|
196 |
+
# download_video_button_vs = gr.File(label="📥 Download Video", visible=False)
|
197 |
+
# elapsed_time_vs = gr.Textbox(label="Elapsed Time", value="0s", visible=False)
|
198 |
+
|
199 |
+
# def generate_vanilla(prompt, num_inference_steps, guidance_scale, progress=gr.Progress(track_tqdm=True)):
|
200 |
+
# # tensor = infer(prompt, num_inference_steps, guidance_scale, progress=progress)
|
201 |
+
# engine = load_model()
|
202 |
+
# t = time()
|
203 |
+
# video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
|
204 |
+
# elapsed_time = time() - t
|
205 |
+
# video_update = gr.update(visible=True, value=video_path)
|
206 |
+
# elapsed_time = gr.update(visible=True, value=f"{elapsed_time:.2f}s")
|
207 |
+
|
208 |
+
# return video_path, video_update, elapsed_time
|
209 |
+
|
210 |
+
# def generate_vs(prompt, num_inference_steps, guidance_scale, threshold, gap, progress=gr.Progress(track_tqdm=True)):
|
211 |
+
# # tensor = infer(prompt, num_inference_steps, guidance_scale, progress=progress)
|
212 |
+
# threshold = [int(i) for i in threshold.split(",")]
|
213 |
+
# gap = int(gap)
|
214 |
+
# engine = load_model(enable_video_sys=True, pab_threshold=threshold, pab_gap=gap)
|
215 |
+
# t = time()
|
216 |
+
# video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
|
217 |
+
# elapsed_time = time() - t
|
218 |
+
# video_update = gr.update(visible=True, value=video_path)
|
219 |
+
# elapsed_time = gr.update(visible=True, value=f"{elapsed_time:.2f}s")
|
220 |
+
|
221 |
+
# return video_path, video_update, elapsed_time
|
222 |
+
|
223 |
+
|
224 |
+
# def enhance_prompt_func(prompt):
|
225 |
+
# return convert_prompt(prompt, retry_times=1)
|
226 |
+
|
227 |
+
# generate_button.click(
|
228 |
+
# generate_vanilla,
|
229 |
+
# inputs=[prompt, num_inference_steps, guidance_scale],
|
230 |
+
# outputs=[video_output, download_video_button, elapsed_time],
|
231 |
+
# )
|
232 |
+
|
233 |
+
# generate_button_vs.click(
|
234 |
+
# generate_vs,
|
235 |
+
# inputs=[prompt, num_inference_steps, guidance_scale, pab_threshold, pab_gap],
|
236 |
+
# outputs=[video_output_vs, download_video_button_vs, elapsed_time_vs],
|
237 |
+
# )
|
238 |
+
|
239 |
+
# enhance_button.click(enhance_prompt_func, inputs=[prompt], outputs=[prompt])
|
240 |
+
|
241 |
+
# if __name__ == "__main__":
|
242 |
+
# demo.launch()
|
243 |
+
|
244 |
+
|
245 |
+
|
246 |
+
import gradio as gr
|
247 |
+
from videosys import CogVideoConfig, VideoSysEngine
|
248 |
+
from videosys.models.cogvideo.pipeline import CogVideoPABConfig
|
249 |
+
import os
|
250 |
+
import numpy as np
|
251 |
+
import torch
|
252 |
+
from openai import OpenAI
|
253 |
+
from time import time
|
254 |
+
import tempfile
|
255 |
+
import uuid
|
256 |
+
import logging
|
257 |
+
|
258 |
+
logging.basicConfig(level=logging.INFO)
|
259 |
+
logger = logging.getLogger(__name__)
|
260 |
+
|
261 |
+
dtype = torch.bfloat16
|
262 |
+
sys_prompt = """You are part of a team of bots that creates videos. You work with an assistant bot that will draw anything you say in square brackets.
|
263 |
+
|
264 |
+
For example , outputting " a beautiful morning in the woods with the sun peaking through the trees " will trigger your partner bot to output an video of a forest morning , as described. You will be prompted by people looking to create detailed , amazing videos. The way to accomplish this is to take their short prompts and make them extremely detailed and descriptive.
|
265 |
+
There are a few rules to follow:
|
266 |
+
|
267 |
+
You will only ever output a single video description per user request.
|
268 |
+
|
269 |
+
When modifications are requested , you should not simply make the description longer . You should refactor the entire description to integrate the suggestions.
|
270 |
+
Other times the user will not want modifications , but instead want a new image . In this case , you should ignore your previous conversation with the user.
|
271 |
+
|
272 |
+
Video descriptions must have the same num of words as examples below. Extra words will be ignored.
|
273 |
+
"""
|
274 |
+
|
275 |
+
def convert_prompt(prompt: str, retry_times: int = 3) -> str:
    """Rewrite a short user prompt into a detailed video caption via the GLM-4 API.

    Degrades gracefully: returns *prompt* unchanged when no ``OPENAI_API_KEY``
    is configured or when every API attempt fails.

    Args:
        prompt: The raw user prompt.
        retry_times: Maximum number of API attempts before giving up.

    Returns:
        The enhanced caption, or the original prompt on failure.
    """
    # Without an API key the enhancement service is unavailable; return the
    # prompt as-is instead of raising inside the Gradio callback.
    if not os.environ.get("OPENAI_API_KEY"):
        return prompt
    client = OpenAI()
    text = prompt.strip()

    for _ in range(retry_times):
        # Fix: the original loop aborted on the first API exception, so
        # ``retry_times`` had no effect; transient failures are now retried.
        try:
            response = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": sys_prompt},
                    {
                        "role": "user",
                        "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "a girl is on the beach"',
                    },
                    {
                        "role": "assistant",
                        "content": "A radiant woman stands on a deserted beach, arms outstretched, wearing a beige trench coat, white blouse, light blue jeans, and chic boots, against a backdrop of soft sky and sea. Moments later, she is seen mid-twirl, arms exuberant, with the lighting suggesting dawn or dusk. Then, she runs along the beach, her attire complemented by an off-white scarf and black ankle boots, the tranquil sea behind her. Finally, she holds a paper airplane, her pose reflecting joy and freedom, with the ocean's gentle waves and the sky's soft pastel hues enhancing the serene ambiance.",
                    },
                    {
                        "role": "user",
                        "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "A man jogging on a football field"',
                    },
                    {
                        "role": "assistant",
                        "content": "A determined man in athletic attire, including a blue long-sleeve shirt, black shorts, and blue socks, jogs around a snow-covered soccer field, showcasing his solitary exercise in a quiet, overcast setting. His long dreadlocks, focused expression, and the serene winter backdrop highlight his dedication to fitness. As he moves, his attire, consisting of a blue sports sweatshirt, black athletic pants, gloves, and sneakers, grips the snowy ground. He is seen running past a chain-link fence enclosing the playground area, with a basketball hoop and children's slide, suggesting a moment of solitary exercise amidst the empty field.",
                    },
                    {
                        "role": "user",
                        "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : " A woman is dancing, HD footage, close-up"',
                    },
                    {
                        "role": "assistant",
                        "content": "A young woman with her hair in an updo and wearing a teal hoodie stands against a light backdrop, initially looking over her shoulder with a contemplative expression. She then confidently makes a subtle dance move, suggesting rhythm and movement. Next, she appears poised and focused, looking directly at the camera. Her expression shifts to one of introspection as she gazes downward slightly. Finally, she dances with confidence, her left hand over her heart, symbolizing a poignant moment, all while dressed in the same teal hoodie against a plain, light-colored background.",
                    },
                    {
                        "role": "user",
                        "content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: "{text}"',
                    },
                ],
                model="glm-4-0520",
                temperature=0.01,
                top_p=0.7,
                stream=False,
                max_tokens=250,
            )
        except Exception as e:  # network / API errors: log and retry
            logger.error(f"Prompt enhancement attempt failed: {e}")
            continue
        if response.choices:
            return response.choices[0].message.content
    # All attempts exhausted (or every response had no choices).
    return prompt
|
323 |
+
|
324 |
+
def load_model(enable_video_sys=False, pab_threshold=None, pab_gap=2):
    """Build a single-device VideoSysEngine for CogVideoX.

    Args:
        enable_video_sys: Whether to enable PAB (Pyramid Attention Broadcast)
            acceleration.
        pab_threshold: Two-element ``[low, high]`` timestep threshold passed to
            ``CogVideoPABConfig``; defaults to ``[100, 850]``.
        pab_gap: Broadcast gap (steps between full attention computations).

    Returns:
        A configured ``VideoSysEngine`` instance.
    """
    # Fix: the original used a mutable list default (pab_threshold=[100, 850]),
    # which is shared across calls and can be mutated by callers; use a None
    # sentinel instead. Behavior is unchanged.
    if pab_threshold is None:
        pab_threshold = [100, 850]
    pab_config = CogVideoPABConfig(full_threshold=pab_threshold, full_gap=pab_gap)
    config = CogVideoConfig(world_size=1, enable_pab=enable_video_sys, pab_config=pab_config)
    engine = VideoSysEngine(config)
    return engine
|
329 |
+
|
330 |
+
def generate(engine, prompt, num_inference_steps=50, guidance_scale=6.0):
    """Generate a video for *prompt* and save it to a unique temp-file path.

    Args:
        engine: An engine exposing ``generate(prompt, ...)`` (returning an
            object with a ``.video`` list) and ``save_video(video, path)``.
        prompt: Text prompt for the video.
        num_inference_steps: Diffusion sampling steps.
        guidance_scale: Classifier-free guidance strength.

    Returns:
        Path of the saved ``.mp4`` file, or ``None`` if generation failed.
    """
    try:
        video = engine.generate(prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale).video[0]

        # Fix: the original opened a NamedTemporaryFile(delete=False) whose
        # only use was the no-op statement ``temp_file.name`` — it leaked an
        # empty temp file on every call. A uuid-named path in the temp dir is
        # sufficient and collision-safe.
        output_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.mp4")
        engine.save_video(video, output_path)
        return output_path
    except Exception as e:
        # Broad catch is deliberate: any failure is surfaced to the UI as
        # "no video" (None) instead of crashing the Gradio worker.
        logger.error(f"An error occurred: {str(e)}")
        return None
|
344 |
+
|
345 |
+
css = """
|
346 |
+
body {
|
347 |
+
font-family: Arial, sans-serif;
|
348 |
+
line-height: 1.6;
|
349 |
+
color: #333;
|
350 |
+
max-width: 1200px;
|
351 |
+
margin: 0 auto;
|
352 |
+
padding: 20px;
|
353 |
+
}
|
354 |
+
|
355 |
+
.container {
|
356 |
+
display: flex;
|
357 |
+
flex-direction: column;
|
358 |
+
gap: 20px;
|
359 |
+
}
|
360 |
+
|
361 |
+
.row {
|
362 |
+
display: flex;
|
363 |
+
flex-wrap: wrap;
|
364 |
+
gap: 20px;
|
365 |
+
}
|
366 |
+
|
367 |
+
.column {
|
368 |
+
flex: 1;
|
369 |
+
min-width: 0;
|
370 |
+
}
|
371 |
+
|
372 |
+
.textbox, .number-input, button {
|
373 |
+
width: 100%;
|
374 |
+
padding: 10px;
|
375 |
+
margin-bottom: 10px;
|
376 |
+
border: 1px solid #ddd;
|
377 |
+
border-radius: 4px;
|
378 |
+
}
|
379 |
+
|
380 |
+
button {
|
381 |
+
background-color: #4CAF50;
|
382 |
+
color: white;
|
383 |
+
border: none;
|
384 |
+
cursor: pointer;
|
385 |
+
transition: background-color 0.3s;
|
386 |
+
}
|
387 |
+
|
388 |
+
button:hover {
|
389 |
+
background-color: #45a049;
|
390 |
+
}
|
391 |
+
|
392 |
+
.video-output {
|
393 |
+
width: 100%;
|
394 |
+
max-width: 720px;
|
395 |
+
height: auto;
|
396 |
+
margin: 0 auto;
|
397 |
+
}
|
398 |
+
|
399 |
+
@media (max-width: 768px) {
|
400 |
+
.row {
|
401 |
+
flex-direction: column;
|
402 |
+
}
|
403 |
+
|
404 |
+
.column {
|
405 |
+
width: 100%;
|
406 |
+
}
|
407 |
+
|
408 |
+
.video-output {
|
409 |
+
width: 100%;
|
410 |
+
height: auto;
|
411 |
+
}
|
412 |
+
}
|
413 |
+
"""
|
414 |
+
|
415 |
+
with gr.Blocks(css=css) as demo:
|
416 |
+
gr.HTML("""
|
417 |
+
<div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
|
418 |
+
VideoSys Huggingface Space🤗
|
419 |
+
</div>
|
420 |
+
<div style="text-align: center;">
|
421 |
+
<a href="https://github.com/NUS-HPC-AI-Lab/VideoSys">🌐 Github</a>
|
422 |
+
</div>
|
423 |
+
<div style="text-align: center; font-size: 15px; font-weight: bold; color: red; margin-bottom: 20px;">
|
424 |
+
⚠️ This demo is for academic research and experiential use only.
|
425 |
+
Users should strictly adhere to local laws and ethics.
|
426 |
+
</div>
|
427 |
+
<div style="text-align: center; font-size: 15px; font-weight: bold; color: magenta; margin-bottom: 20px;">
|
428 |
+
💡 This demo only demonstrates single-device inference. To experience the full power of VideoSys, please deploy it with multiple devices.
|
429 |
+
</div>
|
430 |
+
""")
|
431 |
+
|
432 |
+
with gr.Row():
|
433 |
+
with gr.Column():
|
434 |
+
prompt = gr.Textbox(label="Prompt (Less than 200 Words)", value="a bear hunting for prey", lines=5)
|
435 |
+
with gr.Row():
|
436 |
+
gr.Markdown(
|
437 |
+
"✨Upon pressing the enhanced prompt button, we will use [GLM-4 Model](https://github.com/THUDM/GLM-4) to polish the prompt and overwrite the original one."
|
438 |
+
)
|
439 |
+
enhance_button = gr.Button("✨ Enhance Prompt(Optional)")
|
440 |
+
|
441 |
+
with gr.Column():
|
442 |
+
gr.Markdown(
|
443 |
+
"**Optional Parameters** (default values are recommended)<br>"
|
444 |
+
"Turn Inference Steps larger if you want more detailed video, but it will be slower.<br>"
|
445 |
+
"50 steps are recommended for most cases. will cause 120 seconds for inference.<br>"
|
446 |
+
)
|
447 |
+
with gr.Row():
|
448 |
+
num_inference_steps = gr.Number(label="Inference Steps", value=50)
|
449 |
+
guidance_scale = gr.Number(label="Guidance Scale", value=6.0)
|
450 |
+
pab_gap = gr.Number(label="PAB Gap", value=2, precision=0)
|
451 |
+
pab_threshold = gr.Textbox(label="PAB Threshold", value="100,850", lines=1)
|
452 |
+
with gr.Row():
|
453 |
+
generate_button = gr.Button("🎬 Generate Video")
|
454 |
+
generate_button_vs = gr.Button("⚡️ Generate Video with VideoSys (Faster)")
|
455 |
+
|
456 |
+
with gr.Column():
|
457 |
+
with gr.Row():
|
458 |
+
video_output = gr.Video(label="CogVideoX", width=720, height=480)
|
459 |
+
with gr.Row():
|
460 |
+
download_video_button = gr.File(label="📥 Download Video", visible=False)
|
461 |
+
elapsed_time = gr.Textbox(label="Elapsed Time", value="0s", visible=False)
|
462 |
+
with gr.Row():
|
463 |
+
video_output_vs = gr.Video(label="CogVideoX with VideoSys", width=720, height=480)
|
464 |
+
with gr.Row():
|
465 |
+
download_video_button_vs = gr.File(label="📥 Download Video", visible=False)
|
466 |
+
elapsed_time_vs = gr.Textbox(label="Elapsed Time", value="0s", visible=False)
|
467 |
+
|
468 |
+
def generate_vanilla(prompt, num_inference_steps, guidance_scale, progress=gr.Progress(track_tqdm=True)):
|
469 |
+
engine = load_model()
|
470 |
+
t = time()
|
471 |
+
video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
|
472 |
+
elapsed_time = time() - t
|
473 |
+
video_update = gr.update(visible=True, value=video_path)
|
474 |
+
elapsed_time = gr.update(visible=True, value=f"{elapsed_time:.2f}s")
|
475 |
+
|
476 |
+
return video_path, video_update, elapsed_time
|
477 |
+
|
478 |
+
def generate_vs(prompt, num_inference_steps, guidance_scale, threshold, gap, progress=gr.Progress(track_tqdm=True)):
|
479 |
+
threshold = [int(i) for i in threshold.split(",")]
|
480 |
+
gap = int(gap)
|
481 |
+
engine = load_model(enable_video_sys=True, pab_threshold=threshold, pab_gap=gap)
|
482 |
+
t = time()
|
483 |
+
video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
|
484 |
+
elapsed_time = time() - t
|
485 |
+
video_update = gr.update(visible=True, value=video_path)
|
486 |
+
elapsed_time = gr.update(visible=True, value=f"{elapsed_time:.2f}s")
|
487 |
+
|
488 |
+
return video_path, video_update, elapsed_time
|
489 |
+
|
490 |
+
def enhance_prompt_func(prompt):
    # Thin UI hook around convert_prompt (defined elsewhere in this file).
    # A single retry keeps button latency low.
    return convert_prompt(prompt, retry_times=1)
492 |
+
|
493 |
+
generate_button.click(
|
494 |
+
generate_vanilla,
|
495 |
+
inputs=[prompt, num_inference_steps, guidance_scale],
|
496 |
+
outputs=[video_output, download_video_button, elapsed_time],
|
497 |
+
)
|
498 |
+
|
499 |
+
generate_button_vs.click(
|
500 |
+
generate_vs,
|
501 |
+
inputs=[prompt, num_inference_steps, guidance_scale, pab_threshold, pab_gap],
|
502 |
+
outputs=[video_output_vs, download_video_button_vs, elapsed_time_vs],
|
503 |
+
)
|
504 |
+
|
505 |
+
enhance_button.click(enhance_prompt_func, inputs=[prompt], outputs=[prompt])
|
506 |
+
|
507 |
+
# Launch the Gradio demo only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()
docs/dsp.md
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# DSP
|
2 |
+
|
3 |
+
paper: https://arxiv.org/abs/2403.10266
|
4 |
+
|
5 |
+
![dsp_overview](../assets/figures/dsp_overview.png)
|
6 |
+
|
7 |
+
|
8 |
+
DSP (Dynamic Sequence Parallelism) is a novel, elegant and super efficient sequence parallelism for [OpenSora](https://github.com/hpcaitech/Open-Sora), [Latte](https://github.com/Vchitect/Latte) and other multi-dimensional transformer architecture.
|
9 |
+
|
10 |
+
The key idea is to dynamically switch the parallelism dimension according to the current computation stage, leveraging the potential characteristics of multi-dimensional transformers. Compared with splitting head and sequence dimension as previous methods, it can reduce at least 75% of communication cost.
|
11 |
+
|
12 |
+
It achieves **3x** speed for training and **2x** speed for inference in OpenSora compared with the state-of-the-art sequence parallelism ([DeepSpeed Ulysses](https://arxiv.org/abs/2309.14509)). For a 10s (80-frame) 512x512 video, the inference latency of OpenSora is:
|
13 |
+
|
14 |
+
| Method | 1xH800 | 8xH800 (DS Ulysses) | 8xH800 (DSP) |
|
15 |
+
| ------ | ------ | ------ | ------ |
|
16 |
+
| Latency(s) | 106 | 45 | 22 |
|
17 |
+
|
18 |
+
The following is DSP's end-to-end throughput for training of OpenSora:
|
19 |
+
|
20 |
+
![dsp_overview](../assets/figures/dsp_exp.png)
|
21 |
+
|
22 |
+
|
23 |
+
### Usage
|
24 |
+
|
25 |
+
DSP is currently supported for: OpenSora, OpenSoraPlan and Latte. To enable DSP, you just need to launch with multiple GPUs.
|
docs/pab.md
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Pyramid Attention Broadcast(PAB)
|
2 |
+
|
3 |
+
[[paper](https://arxiv.org/abs/2408.12588)][[blog](https://oahzxl.github.io/PAB)]
|
4 |
+
|
5 |
+
Table of contents:
|
6 |
+
- [Pyramid Attention Broadcast(PAB)](#pyramid-attention-broadcastpab)
|
7 |
+
- [Insights](#insights)
|
8 |
+
- [Pyramid Attention Broadcast (PAB) Mechanism](#pyramid-attention-broadcast-pab-mechanism)
|
9 |
+
- [Experimental Results](#experimental-results)
|
10 |
+
- [Usage](#usage)
|
11 |
+
- [Supported Models](#supported-models)
|
12 |
+
- [Configuration for PAB](#configuration-for-pab)
|
13 |
+
- [Parameters](#parameters)
|
14 |
+
- [Example Configuration](#example-configuration)
|
15 |
+
|
16 |
+
|
17 |
+
We introduce Pyramid Attention Broadcast (PAB), the first approach that achieves real-time DiT-based video generation. By mitigating redundant attention computation, PAB achieves up to 21.6 FPS with 10.6x acceleration, without sacrificing quality across popular DiT-based video generation models including Open-Sora, Open-Sora-Plan, and Latte. Notably, as a training-free approach, PAB can empower any future DiT-based video generation models with real-time capabilities.
|
18 |
+
|
19 |
+
## Insights
|
20 |
+
|
21 |
+
![method](../assets/figures/pab_motivation.png)
|
22 |
+
|
23 |
+
Our study reveals two key insights of three **attention mechanisms** within video diffusion transformers:
|
24 |
+
- First, attention differences across time steps exhibit a U-shaped pattern, with significant variations occurring during the first and last 15% of steps, while the middle 70% of steps show very stable, minor differences.
|
25 |
+
- Second, within the stable middle segment, the variability differs among attention types:
|
26 |
+
- **Spatial attention** varies the most, involving high-frequency elements like edges and textures;
|
27 |
+
- **Temporal attention** exhibits mid-frequency variations related to movements and dynamics in videos;
|
28 |
+
- **Cross-modal attention** is the most stable, linking text with video content, analogous to low-frequency signals reflecting textual semantics.
|
29 |
+
|
30 |
+
## Pyramid Attention Broadcast (PAB) Mechanism
|
31 |
+
|
32 |
+
![method](../assets/figures/pab_method.png)
|
33 |
+
|
34 |
+
Building on these insights, we propose a **pyramid attention broadcast (PAB)** mechanism to minimize unnecessary computations and optimize the utility of each attention module, as shown in the figure below.
|
35 |
+
|
36 |
+
In the middle segment, we broadcast one step's attention outputs to its subsequent several steps, thereby significantly reducing the computational cost on attention modules.
|
37 |
+
|
38 |
+
For more efficient broadcast and minimum influence to effect, we set varied broadcast ranges for different attentions based on their stability and differences.
|
39 |
+
**The smaller the variation in attention, the broader the potential broadcast range.**
|
40 |
+
|
41 |
+
|
42 |
+
## Experimental Results
|
43 |
+
Here are the results of our experiments, more results are shown in https://oahzxl.github.io/PAB:
|
44 |
+
|
45 |
+
![pab_vis](../assets/figures/pab_vis.png)
|
46 |
+
|
47 |
+
|
48 |
+
## Usage
|
49 |
+
|
50 |
+
### Supported Models
|
51 |
+
|
52 |
+
PAB currently supports Open-Sora, Open-Sora-Plan, and Latte.
|
53 |
+
|
54 |
+
### Configuration for PAB
|
55 |
+
|
56 |
+
To efficiently use the Pyramid Attention Broadcast (PAB) mechanism, configure the following parameters to control the broadcasting for different attention types. This helps reduce computational costs by skipping certain steps based on attention stability.
|
57 |
+
|
58 |
+
#### Parameters
|
59 |
+
|
60 |
+
- **spatial_broadcast**: Enable or disable broadcasting for spatial attention.
|
61 |
+
- Type: `True` or `False`
|
62 |
+
|
63 |
+
- **spatial_threshold**: Set the range of diffusion steps within which spatial attention is applied.
|
64 |
+
- Format: `[min_value, max_value]`
|
65 |
+
|
66 |
+
- **spatial_gap**: Number of blocks in model to skip during broadcasting for spatial attention.
|
67 |
+
- Type: Integer
|
68 |
+
|
69 |
+
- **temporal_broadcast**: Enable or disable broadcasting for temporal attention.
|
70 |
+
- Type: `True` or `False`
|
71 |
+
|
72 |
+
- **temporal_threshold**: Set the range of diffusion steps within which temporal attention is applied.
|
73 |
+
- Format: `[min_value, max_value]`
|
74 |
+
|
75 |
+
- **temporal_gap**: Number of steps to skip during broadcasting for temporal attention.
|
76 |
+
- Type: Integer
|
77 |
+
|
78 |
+
- **cross_broadcast**: Enable or disable broadcasting for cross-modal attention.
|
79 |
+
- Type: `True` or `False`
|
80 |
+
|
81 |
+
- **cross_threshold**: Set the range of diffusion steps within which cross-modal attention is applied.
|
82 |
+
- Format: `[min_value, max_value]`
|
83 |
+
|
84 |
+
- **cross_gap**: Number of steps to skip during broadcasting for cross-modal attention.
|
85 |
+
- Type: Integer
|
86 |
+
|
87 |
+
#### Example Configuration
|
88 |
+
|
89 |
+
```yaml
|
90 |
+
spatial_broadcast: True
|
91 |
+
spatial_threshold: [100, 800]
|
92 |
+
spatial_gap: 2
|
93 |
+
|
94 |
+
temporal_broadcast: True
|
95 |
+
temporal_threshold: [100, 800]
|
96 |
+
temporal_gap: 3
|
97 |
+
|
98 |
+
cross_broadcast: True
|
99 |
+
cross_threshold: [100, 900]
|
100 |
+
cross_gap: 5
|
101 |
+
```
|
102 |
+
|
103 |
+
Explanation:
|
104 |
+
|
105 |
+
- **Spatial Attention**:
|
106 |
+
- Broadcasting enabled (`spatial_broadcast: True`)
|
107 |
+
- Applied within the threshold range of 100 to 800
|
108 |
+
- Skips every 2 steps (`spatial_gap: 2`)
|
109 |
+
- Can additionally be restricted to a block range via `spatial_block` (e.g. `[0, 28]` for the first 28 blocks; not shown in the example above)
|
110 |
+
|
111 |
+
- **Temporal Attention**:
|
112 |
+
- Broadcasting enabled (`temporal_broadcast: True`)
|
113 |
+
- Applied within the threshold range of 100 to 800
|
114 |
+
- Skips every 3 steps (`temporal_gap: 3`)
|
115 |
+
|
116 |
+
- **Cross-Modal Attention**:
|
117 |
+
- Broadcasting enabled (`cross_broadcast: True`)
|
118 |
+
- Applied within the threshold range of 100 to 900
|
119 |
+
- Skips every 5 steps (`cross_gap: 5`)
|
120 |
+
|
121 |
+
Adjust these settings based on your specific needs to optimize the performance of each attention mechanism.
|
eval/pab/commom_metrics/README.md
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Common metrics
|
2 |
+
|
3 |
+
Include LPIPS, PSNR and SSIM.
|
4 |
+
|
5 |
+
The code is adapted from [common_metrics_on_video_quality
|
6 |
+
](https://github.com/JunyaoHu/common_metrics_on_video_quality).
|
eval/pab/commom_metrics/__init__.py
ADDED
File without changes
|
eval/pab/commom_metrics/calculate_lpips.py
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import lpips
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
|
5 |
+
spatial = True # Return a spatial map of perceptual distance.
|
6 |
+
|
7 |
+
# Linearly calibrated models (LPIPS)
|
8 |
+
loss_fn = lpips.LPIPS(net="alex", spatial=spatial) # Can also set net = 'squeeze' or 'vgg'
|
9 |
+
# loss_fn = lpips.LPIPS(net='alex', spatial=spatial, lpips=False) # Can also set net = 'squeeze' or 'vgg'
|
10 |
+
|
11 |
+
|
12 |
+
def trans(x):
    """Prepare video tensors for LPIPS: grayscale -> RGB, [0, 1] -> [-1, 1].

    Args:
        x: tensor shaped [batch, time, channel, h, w] with values in [0, 1].
    """
    # LPIPS backbones expect 3-channel input; tile a single channel to RGB.
    if x.shape[-3] == 1:
        x = x.repeat(1, 1, 3, 1, 1)
    # Shift the value range from [0, 1] to the [-1, 1] LPIPS expects.
    return x * 2 - 1
|
22 |
+
|
23 |
+
def calculate_lpips(videos1, videos2, device):
    """Compute per-timestamp LPIPS statistics between two video batches.

    Args:
        videos1, videos2: tensors shaped [batch, time, channel, h, w] with
            RGB (or single-channel) values in [0, 1]; shapes must match.
        device: torch device the LPIPS network and frames are moved to.

    Returns:
        dict with per-timestamp mean ("value") and std ("value_std") of
        LPIPS across the batch, plus the single-video shape for reference.
    """
    # image should be RGB, IMPORTANT: normalized to [-1,1]
    assert videos1.shape == videos2.shape

    # support grayscale input, if grayscale -> channel*3
    # value range [0, 1] -> [-1, 1]
    videos1 = trans(videos1)
    videos2 = trans(videos2)

    # Move the module-level LPIPS network to the device once, not per frame.
    loss_fn.to(device)

    lpips_results = []
    for video_num in range(videos1.shape[0]):
        # video [timestamps, channel, h, w]
        video1 = videos1[video_num]
        video2 = videos2[video_num]

        lpips_results_of_a_video = []
        for clip_timestamp in range(len(video1)):
            # img [channel, h, w] -> add a batch dim for the network
            img1 = video1[clip_timestamp].unsqueeze(0).to(device)
            img2 = video2[clip_timestamp].unsqueeze(0).to(device)
            lpips_results_of_a_video.append(loss_fn.forward(img1, img2).mean().detach().cpu().tolist())
        lpips_results.append(lpips_results_of_a_video)

    lpips_results = np.array(lpips_results)

    # Per-timestamp statistics across the batch. Use names that do not
    # shadow the module-level `lpips` import.
    value = {}
    value_std = {}
    num_frames = videos1.shape[1]
    for clip_timestamp in range(num_frames):
        value[clip_timestamp] = np.mean(lpips_results[:, clip_timestamp])
        value_std[clip_timestamp] = np.std(lpips_results[:, clip_timestamp])

    result = {
        "value": value,
        "value_std": value_std,
        "video_setting": videos1[0].shape,
        "video_setting_name": "time, channel, heigth, width",
    }

    return result
+
|
76 |
+
|
77 |
+
# test code / using example
|
78 |
+
|
79 |
+
|
80 |
+
def main():
    """Smoke-test calculate_lpips on dummy all-zero vs. all-one batches."""
    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.ones(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    device = torch.device("cuda")
    # device = torch.device("cpu")

    import json

    result = calculate_lpips(videos1, videos2, device)
    # Int dict keys are stringified by json; video_setting is a torch.Size
    # (a tuple subclass) and serializes as a list.
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()
|
eval/pab/commom_metrics/calculate_psnr.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
|
6 |
+
|
7 |
+
def img_psnr(img1, img2):
    """PSNR between two images with values in [0, 1].

    Returns 100.0 as a finite cap when the images are numerically identical,
    since PSNR diverges to infinity as mse -> 0.  (Previously returned the
    int 100 here while every other path returned a float.)
    """
    # compute mse
    # mse = np.mean((img1-img2)**2)
    mse = np.mean((img1 / 1.0 - img2 / 1.0) ** 2)
    # compute psnr (peak value is 1.0 for [0, 1] images)
    if mse < 1e-10:
        return 100.0
    psnr = 20 * math.log10(1 / math.sqrt(mse))
    return psnr
|
18 |
+
|
19 |
+
def trans(x):
    # Identity hook kept for API symmetry with the LPIPS variant, which
    # rescales its inputs; PSNR consumes [0, 1] videos unchanged.
    return x
|
22 |
+
|
23 |
+
def calculate_psnr(videos1, videos2):
    """Compute per-timestamp PSNR statistics over a batch of videos.

    Args:
        videos1, videos2: tensors shaped [batch, time, channel, h, w] with
            values in [0, 1]; shapes must match.

    Returns:
        dict with per-timestamp mean ("value") and std ("value_std") of PSNR
        across the batch, plus the single-video shape for reference.
    """
    assert videos1.shape == videos2.shape

    videos1 = trans(videos1)
    videos2 = trans(videos2)

    # One PSNR score per (video, frame) pair.
    per_video_scores = [
        [img_psnr(frame_a.numpy(), frame_b.numpy()) for frame_a, frame_b in zip(vid_a, vid_b)]
        for vid_a, vid_b in zip(videos1, videos2)
    ]
    scores = np.array(per_video_scores)

    # Per-timestamp statistics across the batch.
    num_frames = videos1.shape[1]
    mean_per_t = {t: np.mean(scores[:, t]) for t in range(num_frames)}
    std_per_t = {t: np.std(scores[:, t]) for t in range(num_frames)}

    return {
        "value": mean_per_t,
        "value_std": std_per_t,
        "video_setting": videos1[0].shape,
        "video_setting_name": "time, channel, heigth, width",
    }
|
71 |
+
|
72 |
+
# test code / using example
|
73 |
+
|
74 |
+
|
75 |
+
def main():
    """Smoke-test calculate_psnr on two identical all-zero batches.

    mse == 0 for every frame, so every PSNR value hits the 100 cap.
    """
    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)

    import json

    result = calculate_psnr(videos1, videos2)
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()
|
eval/pab/commom_metrics/calculate_ssim.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import cv2
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
|
5 |
+
|
6 |
+
def ssim(img1, img2):
    """Single-channel SSIM between two (H, W) images with values in [0, 1]."""
    # Stability constants from the SSIM paper (K1=0.01, K2=0.03, L=1).
    C1 = 0.01**2
    C2 = 0.03**2
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)
    # 11x11 Gaussian window with sigma=1.5, as in the reference implementation.
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())
    mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5]  # valid
    mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2
    # Local (co)variances via E[x^2] - E[x]^2 under the same window.
    sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq
    sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
    sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
    return ssim_map.mean()
|
24 |
+
|
25 |
+
def calculate_ssim_function(img1, img2):
    """SSIM for one frame pair in [0, 1]: grayscale (h, w) or (c, h, w).

    Note: inputs whose ndim is neither 2 nor 3 fall through and return None,
    matching the original behavior.
    """
    # ssim is the only metric extremely sensitive to gray being compared to b/w
    if img1.shape != img2.shape:
        raise ValueError("Input images must have the same dimensions.")
    if img1.ndim == 2:
        # Plain grayscale image.
        return ssim(img1, img2)
    elif img1.ndim == 3:
        if img1.shape[0] == 3:
            # RGB: average the per-channel SSIM scores.
            return np.mean([ssim(img1[c], img2[c]) for c in range(3)])
        elif img1.shape[0] == 1:
            # Single channel carrying an explicit channel axis.
            return ssim(np.squeeze(img1), np.squeeze(img2))
        else:
            raise ValueError("Wrong input image dimensions.")
43 |
+
|
44 |
+
def trans(x):
    # Identity hook kept for API symmetry with the LPIPS variant, which
    # rescales its inputs; SSIM consumes [0, 1] videos unchanged.
    return x
|
47 |
+
|
48 |
+
def calculate_ssim(videos1, videos2):
    """Compute per-timestamp SSIM statistics over a batch of videos.

    Args:
        videos1, videos2: tensors shaped [batch, time, channel, h, w] with
            values in [0, 1]; shapes must match.

    Returns:
        dict with per-timestamp mean ("value") and std ("value_std") of SSIM
        across the batch, plus the single-video shape for reference.
    """
    assert videos1.shape == videos2.shape

    videos1 = trans(videos1)
    videos2 = trans(videos2)

    ssim_results = []
    for video_num in range(videos1.shape[0]):
        # video [timestamps, channel, h, w]
        video1 = videos1[video_num]
        video2 = videos2[video_num]

        ssim_results_of_a_video = []
        for clip_timestamp in range(len(video1)):
            # img [channel, h, w] numpy
            img1 = video1[clip_timestamp].numpy()
            img2 = video2[clip_timestamp].numpy()
            ssim_results_of_a_video.append(calculate_ssim_function(img1, img2))
        ssim_results.append(ssim_results_of_a_video)

    ssim_results = np.array(ssim_results)

    # Per-timestamp statistics across the batch. Use names that do not
    # shadow the module-level ssim() helper (the original bound a local
    # dict named `ssim` here).
    ssim_mean = {}
    ssim_std = {}
    num_frames = videos1.shape[1]
    for clip_timestamp in range(num_frames):
        ssim_mean[clip_timestamp] = np.mean(ssim_results[:, clip_timestamp])
        ssim_std[clip_timestamp] = np.std(ssim_results[:, clip_timestamp])

    result = {
        "value": ssim_mean,
        "value_std": ssim_std,
        "video_setting": videos1[0].shape,
        "video_setting_name": "time, channel, heigth, width",
    }

    return result
|
96 |
+
|
97 |
+
# test code / using example
|
98 |
+
|
99 |
+
|
100 |
+
def main():
    """Smoke-test calculate_ssim on two identical all-zero batches."""
    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    # NOTE(review): no-op leftover — the device object is never used; the
    # SSIM computation below runs entirely on CPU numpy arrays.
    torch.device("cuda")

    import json

    result = calculate_ssim(videos1, videos2)
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()
|
eval/pab/commom_metrics/eval.py
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import os
|
3 |
+
|
4 |
+
import imageio
|
5 |
+
import torch
|
6 |
+
import torchvision.transforms.functional as F
|
7 |
+
import tqdm
|
8 |
+
from calculate_lpips import calculate_lpips
|
9 |
+
from calculate_psnr import calculate_psnr
|
10 |
+
from calculate_ssim import calculate_ssim
|
11 |
+
|
12 |
+
|
13 |
+
def load_videos(directory, video_ids, file_extension):
    """Load every `<video_id>.<file_extension>` found under `directory`.

    Returns a list of video tensors (see load_video). Raises ValueError if
    any requested video file is missing, so partial batches never pass
    silently.
    """
    videos = []
    for video_id in video_ids:
        video_path = os.path.join(directory, f"{video_id}.{file_extension}")
        if os.path.exists(video_path):
            video = load_video(video_path)  # Define load_video based on how videos are stored
            videos.append(video)
        else:
            raise ValueError(f"Video {video_id}.{file_extension} not found in {directory}")
    return videos
+
|
24 |
+
|
25 |
+
def load_video(video_path):
    """
    Load a video from the given path and convert it to a PyTorch tensor.

    Returns a (T, C, H, W) tensor resident on the GPU — each frame is moved
    with .cuda(), so a CUDA device is required. Frames presumably arrive as
    uint8 RGB from the ffmpeg reader — TODO confirm against imageio docs.
    """
    # Read the video using imageio
    reader = imageio.get_reader(video_path, "ffmpeg")

    # Extract frames and convert to a list of tensors
    frames = []
    for frame in reader:
        # Convert the frame to a tensor and permute the dimensions to match (C, H, W)
        frame_tensor = torch.tensor(frame).cuda().permute(2, 0, 1)
        frames.append(frame_tensor)

    # Stack the list of tensors into a single tensor with shape (T, C, H, W)
    video_tensor = torch.stack(frames)

    return video_tensor
|
44 |
+
|
45 |
+
def resize_video(video, target_height, target_width):
    """Resize every frame of a (T, C, H, W) video to the target size."""
    return torch.stack([F.resize(frame, [target_height, target_width]) for frame in video])
|
52 |
+
|
53 |
+
def preprocess_eval_video(eval_video, generated_video_shape):
    """Match a ground-truth video to a generated video's shape.

    Args:
        eval_video: tensor [T, C, H, W] (ground truth).
        generated_video_shape: target (T, C, H, W).

    Returns:
        eval_video truncated to the target frame count and center-cropped
        spatially (after an aspect-preserving upscale when it is smaller
        than the target).

    Raises:
        ValueError: if the ground-truth clip has fewer frames than the target.
    """
    target_t, _, target_h, target_w = generated_video_shape
    src_t, _, src_h, src_w = eval_video.shape

    if src_t < target_t:
        raise ValueError(f"Eval video time steps ({src_t}) are less than generated video time steps ({target_t}).")

    if src_h < target_h or src_w < target_w:
        # Upscale while keeping the aspect ratio, then re-read dimensions.
        new_h = max(target_h, int(target_h * (src_h / src_w)))
        new_w = max(target_w, int(target_w * (src_w / src_h)))
        eval_video = resize_video(eval_video, new_h, new_w)
        src_t, _, src_h, src_w = eval_video.shape

    # Center crop spatially and truncate temporally to the target length.
    top = (src_h - target_h) // 2
    left = (src_w - target_w) // 2
    return eval_video[:target_t, :, top : top + target_h, left : left + target_w]
|
75 |
+
|
76 |
+
def main(args):
    """Compare generated videos against ground truth with LPIPS/PSNR/SSIM.

    Walks `generated_video_dir` for .mp4 files, loads matching ground-truth
    clips from `gt_video_dir` in batches of 16, accumulates the per-batch
    means of each metric, and writes the final averages to
    ./<basename(generated_video_dir)>.txt.
    """
    device = "cuda"
    gt_video_dir = args.gt_video_dir
    generated_video_dir = args.generated_video_dir

    video_ids = []
    file_extension = "mp4"
    for f in os.listdir(generated_video_dir):
        if f.endswith(f".{file_extension}"):
            video_ids.append(f.replace(f".{file_extension}", ""))
    if not video_ids:
        raise ValueError("No videos found in the generated video dataset. Exiting.")

    print(f"Find {len(video_ids)} videos")
    prompt_interval = 1
    batch_size = 16
    calculate_lpips_flag, calculate_psnr_flag, calculate_ssim_flag = True, True, True

    lpips_results = []
    psnr_results = []
    ssim_results = []

    # Number of batches (ceiling division).
    total_len = len(video_ids) // batch_size + (1 if len(video_ids) % batch_size != 0 else 0)

    # NOTE(review): `video_id` here is really the batch index and is
    # immediately shadowed by the real id inside the inner loop.
    for idx, video_id in enumerate(tqdm.tqdm(range(total_len))):
        gt_videos_tensor = []
        generated_videos_tensor = []
        for i in range(batch_size):
            video_idx = idx * batch_size + i
            if video_idx >= len(video_ids):
                break
            video_id = video_ids[video_idx]
            generated_video = load_video(os.path.join(generated_video_dir, f"{video_id}.{file_extension}"))
            generated_videos_tensor.append(generated_video)
            eval_video = load_video(os.path.join(gt_video_dir, f"{video_id}.{file_extension}"))
            gt_videos_tensor.append(eval_video)
        # Normalize frames to [0, 1]; the metrics run on CPU tensors.
        gt_videos_tensor = (torch.stack(gt_videos_tensor) / 255.0).cpu()
        generated_videos_tensor = (torch.stack(generated_videos_tensor) / 255.0).cpu()

        # Each metric returns per-timestamp means; average those into one
        # scalar per batch.
        if calculate_lpips_flag:
            result = calculate_lpips(gt_videos_tensor, generated_videos_tensor, device=device)
            result = result["value"].values()
            result = sum(result) / len(result)
            lpips_results.append(result)

        if calculate_psnr_flag:
            result = calculate_psnr(gt_videos_tensor, generated_videos_tensor)
            result = result["value"].values()
            result = sum(result) / len(result)
            psnr_results.append(result)

        if calculate_ssim_flag:
            result = calculate_ssim(gt_videos_tensor, generated_videos_tensor)
            result = result["value"].values()
            result = sum(result) / len(result)
            ssim_results.append(result)

        # Progress report every `prompt_interval` batches.
        if (idx + 1) % prompt_interval == 0:
            out_str = ""
            for results, name in zip([lpips_results, psnr_results, ssim_results], ["lpips", "psnr", "ssim"]):
                result = sum(results) / len(results)
                out_str += f"{name}: {result:.4f}, "
            print(f"Processed {idx + 1} videos. {out_str[:-2]}")

    # Final averages over all batches (trailing ", " stripped).
    out_str = ""
    for results, name in zip([lpips_results, psnr_results, ssim_results], ["lpips", "psnr", "ssim"]):
        result = sum(results) / len(results)
        out_str += f"{name}: {result:.4f}, "
    out_str = out_str[:-2]

    # save
    with open(f"./{os.path.basename(generated_video_dir)}.txt", "w+") as f:
        f.write(out_str)

    print(f"Processed all videos. {out_str}")
+
|
152 |
+
|
153 |
+
if __name__ == "__main__":
    # CLI entry point: both directories must contain same-named .mp4 files.
    parser = argparse.ArgumentParser()
    parser.add_argument("--gt_video_dir", type=str)
    parser.add_argument("--generated_video_dir", type=str)

    args = parser.parse_args()

    main(args)
eval/pab/experiments/__init__.py
ADDED
File without changes
|
eval/pab/experiments/attention_ablation.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from utils import generate_func, read_prompt_list
|
2 |
+
|
3 |
+
import videosys
|
4 |
+
from videosys import OpenSoraConfig, OpenSoraPipeline
|
5 |
+
from videosys.models.open_sora import OpenSoraPABConfig
|
6 |
+
|
7 |
+
|
8 |
+
def attention_ablation_func(pab_kwargs, prompt_list, output_dir):
    """Build an Open-Sora pipeline with the given PAB settings and sample all prompts into output_dir."""
    pab_config = OpenSoraPABConfig(**pab_kwargs)
    config = OpenSoraConfig(enable_pab=True, pab_config=pab_config)
    pipeline = OpenSoraPipeline(config)

    generate_func(pipeline, prompt_list, output_dir)
15 |
+
|
16 |
+
def main(prompt_list):
    """Sweep broadcast gap values for each attention type independently.

    For every run exactly one attention mechanism is broadcast (spatial,
    temporal, or cross) while the other two stay disabled; mlp skipping is
    always off.
    """
    # (attention name, candidate gap values)
    sweeps = [
        ("spatial", [2, 3, 4, 5]),
        ("temporal", [3, 4, 5, 6]),
        ("cross", [5, 6, 7, 8]),
    ]
    for attn_name, gap_list in sweeps:
        for gap in gap_list:
            pab_kwargs = {f"{attn}_broadcast": attn == attn_name for attn in ("spatial", "temporal", "cross")}
            pab_kwargs[f"{attn_name}_gap"] = gap
            pab_kwargs["mlp_skip"] = False
            output_dir = f"./samples/attention_ablation/{attn_name}_g{gap}"
            attention_ablation_func(pab_kwargs, prompt_list, output_dir)
56 |
+
|
57 |
+
if __name__ == "__main__":
    videosys.initialize(42)  # fixed seed for reproducible ablations
    prompt_list = read_prompt_list("vbench/VBench_full_info.json")
    main(prompt_list)
eval/pab/experiments/components_ablation.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from utils import generate_func, read_prompt_list
|
2 |
+
|
3 |
+
import videosys
|
4 |
+
from videosys import OpenSoraConfig, OpenSoraPipeline
|
5 |
+
from videosys.models.open_sora import OpenSoraPABConfig
|
6 |
+
|
7 |
+
|
8 |
+
def wo_spatial(prompt_list):
    """Ablation: PAB with spatial-attention broadcasting disabled."""
    pab_config = OpenSoraPABConfig(spatial_broadcast=False)
    config = OpenSoraConfig(enable_pab=True, pab_config=pab_config)
    pipeline = OpenSoraPipeline(config)

    generate_func(pipeline, prompt_list, "./samples/components_ablation/wo_spatial")
+
|
15 |
+
|
16 |
+
def wo_temporal(prompt_list):
    """Ablation: PAB with temporal-attention broadcasting disabled."""
    pab_config = OpenSoraPABConfig(temporal_broadcast=False)
    config = OpenSoraConfig(enable_pab=True, pab_config=pab_config)
    pipeline = OpenSoraPipeline(config)

    generate_func(pipeline, prompt_list, "./samples/components_ablation/wo_temporal")
|
23 |
+
|
24 |
+
def wo_cross(prompt_list):
    """Ablation: PAB with cross-attention broadcasting disabled."""
    pab_config = OpenSoraPABConfig(cross_broadcast=False)
    config = OpenSoraConfig(enable_pab=True, pab_config=pab_config)
    pipeline = OpenSoraPipeline(config)

    generate_func(pipeline, prompt_list, "./samples/components_ablation/wo_cross")
+
|
31 |
+
|
32 |
+
def wo_mlp(prompt_list):
    """Ablation run: PAB with MLP skipping disabled."""
    pipeline = OpenSoraPipeline(
        OpenSoraConfig(enable_pab=True, pab_config=OpenSoraPABConfig(mlp_skip=False))
    )
    generate_func(pipeline, prompt_list, "./samples/components_ablation/wo_mlp")
|
38 |
+
|
39 |
+
|
40 |
+
if __name__ == "__main__":
    videosys.initialize(42)
    prompts = read_prompt_list("./vbench/VBench_full_info.json")
    # Run every component ablation back to back on the same prompt set.
    for ablation in (wo_spatial, wo_temporal, wo_cross, wo_mlp):
        ablation(prompts)
|
eval/pab/experiments/latte.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from utils import generate_func, read_prompt_list
|
2 |
+
|
3 |
+
import videosys
|
4 |
+
from videosys import LatteConfig, LattePipeline
|
5 |
+
from videosys.models.latte import LattePABConfig
|
6 |
+
|
7 |
+
|
8 |
+
def eval_base(prompt_list):
    """Baseline Latte generation (no PAB acceleration)."""
    generate_func(LattePipeline(LatteConfig()), prompt_list, "./samples/latte_base", loop=5)
|
13 |
+
|
14 |
+
|
15 |
+
def eval_pab1(prompt_list):
    """PAB setting 1: smallest broadcast gaps (highest fidelity)."""
    cfg = LatteConfig(
        enable_pab=True,
        pab_config=LattePABConfig(spatial_gap=2, temporal_gap=3, cross_gap=6),
    )
    generate_func(LattePipeline(cfg), prompt_list, "./samples/latte_pab1", loop=5)
|
25 |
+
|
26 |
+
|
27 |
+
def eval_pab2(prompt_list):
    """PAB setting 2: medium broadcast gaps."""
    cfg = LatteConfig(
        enable_pab=True,
        pab_config=LattePABConfig(spatial_gap=3, temporal_gap=4, cross_gap=7),
    )
    generate_func(LattePipeline(cfg), prompt_list, "./samples/latte_pab2", loop=5)
|
37 |
+
|
38 |
+
|
39 |
+
def eval_pab3(prompt_list):
    """PAB setting 3: largest broadcast gaps (fastest)."""
    cfg = LatteConfig(
        enable_pab=True,
        pab_config=LattePABConfig(spatial_gap=4, temporal_gap=6, cross_gap=9),
    )
    generate_func(LattePipeline(cfg), prompt_list, "./samples/latte_pab3", loop=5)
|
49 |
+
|
50 |
+
|
51 |
+
if __name__ == "__main__":
    videosys.initialize(42)
    prompts = read_prompt_list("vbench/VBench_full_info.json")
    # Baseline first, then the three PAB settings, all on the same prompts.
    for evaluation in (eval_base, eval_pab1, eval_pab2, eval_pab3):
        evaluation(prompts)
|
eval/pab/experiments/opensora.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from utils import generate_func, read_prompt_list
|
2 |
+
|
3 |
+
import videosys
|
4 |
+
from videosys import OpenSoraConfig, OpenSoraPipeline
|
5 |
+
from videosys.models.open_sora import OpenSoraPABConfig
|
6 |
+
|
7 |
+
|
8 |
+
def eval_base(prompt_list):
    """Baseline Open-Sora generation (no PAB acceleration)."""
    generate_func(OpenSoraPipeline(OpenSoraConfig()), prompt_list, "./samples/opensora_base", loop=5)
|
13 |
+
|
14 |
+
|
15 |
+
def eval_pab1(prompt_list):
    """PAB setting 1: the default PAB configuration."""
    pipeline = OpenSoraPipeline(OpenSoraConfig(enable_pab=True))
    generate_func(pipeline, prompt_list, "./samples/opensora_pab1", loop=5)
|
20 |
+
|
21 |
+
|
22 |
+
def eval_pab2(prompt_list):
    """PAB setting 2: medium broadcast gaps."""
    cfg = OpenSoraConfig(
        enable_pab=True,
        pab_config=OpenSoraPABConfig(spatial_gap=3, temporal_gap=5, cross_gap=7),
    )
    generate_func(OpenSoraPipeline(cfg), prompt_list, "./samples/opensora_pab2", loop=5)
|
28 |
+
|
29 |
+
|
30 |
+
def eval_pab3(prompt_list):
    """PAB setting 3: largest broadcast gaps (fastest)."""
    cfg = OpenSoraConfig(
        enable_pab=True,
        pab_config=OpenSoraPABConfig(spatial_gap=5, temporal_gap=7, cross_gap=9),
    )
    generate_func(OpenSoraPipeline(cfg), prompt_list, "./samples/opensora_pab3", loop=5)
|
36 |
+
|
37 |
+
|
38 |
+
if __name__ == "__main__":
    videosys.initialize(42)
    prompts = read_prompt_list("vbench/VBench_full_info.json")
    for evaluation in (eval_base, eval_pab1, eval_pab2, eval_pab3):
        evaluation(prompts)
|
eval/pab/experiments/opensora_plan.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from utils import generate_func, read_prompt_list
|
2 |
+
|
3 |
+
import videosys
|
4 |
+
from videosys import OpenSoraPlanConfig, OpenSoraPlanPipeline
|
5 |
+
from videosys.models.open_sora_plan import OpenSoraPlanPABConfig
|
6 |
+
|
7 |
+
|
8 |
+
def eval_base(prompt_list):
    """Baseline Open-Sora-Plan generation (no PAB acceleration)."""
    generate_func(
        OpenSoraPlanPipeline(OpenSoraPlanConfig()), prompt_list, "./samples/opensoraplan_base", loop=5
    )
|
13 |
+
|
14 |
+
|
15 |
+
def eval_pab1(prompt_list):
    """PAB setting 1: smallest broadcast gaps (highest fidelity)."""
    cfg = OpenSoraPlanConfig(
        enable_pab=True,
        pab_config=OpenSoraPlanPABConfig(spatial_gap=2, temporal_gap=4, cross_gap=6),
    )
    generate_func(OpenSoraPlanPipeline(cfg), prompt_list, "./samples/opensoraplan_pab1", loop=5)
|
25 |
+
|
26 |
+
|
27 |
+
def eval_pab2(prompt_list):
    """PAB setting 2: medium broadcast gaps."""
    cfg = OpenSoraPlanConfig(
        enable_pab=True,
        pab_config=OpenSoraPlanPABConfig(spatial_gap=3, temporal_gap=5, cross_gap=7),
    )
    generate_func(OpenSoraPlanPipeline(cfg), prompt_list, "./samples/opensoraplan_pab2", loop=5)
|
37 |
+
|
38 |
+
|
39 |
+
def eval_pab3(prompt_list):
    """PAB setting 3: largest broadcast gaps (fastest)."""
    cfg = OpenSoraPlanConfig(
        enable_pab=True,
        pab_config=OpenSoraPlanPABConfig(spatial_gap=5, temporal_gap=7, cross_gap=9),
    )
    generate_func(OpenSoraPlanPipeline(cfg), prompt_list, "./samples/opensoraplan_pab3", loop=5)
|
49 |
+
|
50 |
+
|
51 |
+
if __name__ == "__main__":
    videosys.initialize(42)
    prompts = read_prompt_list("vbench/VBench_full_info.json")
    for evaluation in (eval_base, eval_pab1, eval_pab2, eval_pab3):
        evaluation(prompts)
|
eval/pab/experiments/utils.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
|
4 |
+
import tqdm
|
5 |
+
|
6 |
+
from videosys.utils.utils import set_seed
|
7 |
+
|
8 |
+
|
9 |
+
def generate_func(pipeline, prompt_list, output_dir, loop: int = 5, kwargs: dict = None):
    """Generate `loop` videos per prompt and save them under `output_dir`.

    Fix: the original signature used a mutable default (`kwargs: dict = {}`)
    and then mutated it (`kwargs["verbose"] = False`), so the same dict was
    shared across calls and caller-supplied dicts were modified in place.
    We now default to None and work on a copy.

    Args:
        pipeline: a videosys pipeline exposing `generate(...)` and `save_video(...)`.
        prompt_list: iterable of text prompts.
        output_dir: directory where "<prompt>-<seed>.mp4" files are written.
        loop: number of seeds (0..loop-1) generated per prompt.
        kwargs: extra keyword arguments forwarded to `pipeline.generate`;
            `verbose` is always forced to False.
    """
    kwargs = {} if kwargs is None else dict(kwargs)  # copy: never mutate caller state
    kwargs["verbose"] = False
    for prompt in tqdm.tqdm(prompt_list):
        for l in range(loop):
            set_seed(l)  # one deterministic seed per repetition
            video = pipeline.generate(prompt, **kwargs).video[0]
            pipeline.save_video(video, os.path.join(output_dir, f"{prompt}-{l}.mp4"))
|
16 |
+
|
17 |
+
|
18 |
+
def read_prompt_list(prompt_list_path):
    """Load a VBench info JSON file and return its English prompts.

    The file is expected to be a JSON array of objects each carrying a
    "prompt_en" key.
    """
    with open(prompt_list_path, "r") as fp:
        entries = json.load(fp)
    return [entry["prompt_en"] for entry in entries]
|
eval/pab/vbench/VBench_full_info.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval/pab/vbench/cal_vbench.py
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
|
5 |
+
# Weights used to combine the quality and semantic aggregates into the total
# VBench score (quality counts 4x as much as semantics).
SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4

# Dimensions that contribute to the aggregate "quality score".
QUALITY_LIST = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",
]

# Dimensions that contribute to the aggregate "semantic score".
SEMANTIC_LIST = [
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]

# Per-dimension Min/Max used to rescale raw scores to [0, 1].
# NOTE(review): these bounds look like VBench's published normalization
# constants — confirm against the upstream VBench leaderboard code.
NORMALIZE_DIC = {
    "subject consistency": {"Min": 0.1462, "Max": 1.0},
    "background consistency": {"Min": 0.2615, "Max": 1.0},
    "temporal flickering": {"Min": 0.6293, "Max": 1.0},
    "motion smoothness": {"Min": 0.706, "Max": 0.9975},
    "dynamic degree": {"Min": 0.0, "Max": 1.0},
    "aesthetic quality": {"Min": 0.0, "Max": 1.0},
    "imaging quality": {"Min": 0.0, "Max": 1.0},
    "object class": {"Min": 0.0, "Max": 1.0},
    "multiple objects": {"Min": 0.0, "Max": 1.0},
    "human action": {"Min": 0.0, "Max": 1.0},
    "color": {"Min": 0.0, "Max": 1.0},
    "spatial relationship": {"Min": 0.0, "Max": 1.0},
    "scene": {"Min": 0.0, "Max": 0.8222},
    "appearance style": {"Min": 0.0009, "Max": 0.2855},
    "temporal style": {"Min": 0.0, "Max": 0.364},
    "overall consistency": {"Min": 0.0, "Max": 0.364},
}

# Relative weight of each dimension inside its aggregate score
# ("dynamic degree" is down-weighted to 0.5).
DIM_WEIGHT = {
    "subject consistency": 1,
    "background consistency": 1,
    "temporal flickering": 1,
    "motion smoothness": 1,
    "aesthetic quality": 1,
    "imaging quality": 1,
    "dynamic degree": 0.5,
    "object class": 1,
    "multiple objects": 1,
    "human action": 1,
    "color": 1,
    "spatial relationship": 1,
    "scene": 1,
    "appearance style": 1,
    "temporal style": 1,
    "overall consistency": 1,
}

# Display order of the final scaled-results report.
ordered_scaled_res = [
    "total score",
    "quality score",
    "semantic score",
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "dynamic degree",
    "aesthetic quality",
    "imaging quality",
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]
|
89 |
+
|
90 |
+
|
91 |
+
def parse_args():
    """Parse CLI options: --score_dir is the directory holding VBench result files."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--score_dir", required=True, type=str)
    return parser.parse_args()
|
96 |
+
|
97 |
+
|
98 |
+
if __name__ == "__main__":
    args = parse_args()
    # VBench writes one pair of files per dimension:
    #   "<dim>_eval_results.json"  — the scores
    #   "<dim>_full_info.json"     — metadata incl. the evaluated videos
    res_postfix = "_eval_results.json"
    info_postfix = "_full_info.json"
    files = os.listdir(args.score_dir)
    res_files = [x for x in files if res_postfix in x]
    info_files = [x for x in files if info_postfix in x]
    assert len(res_files) == len(info_files), f"got {len(res_files)} res files, but {len(info_files)} info files"

    full_results = {}
    for res_file in res_files:
        # Sanity check: the matching info file must list at least one video.
        info_file = res_file.split(res_postfix)[0] + info_postfix
        with open(os.path.join(args.score_dir, info_file), "r", encoding="utf-8") as f:
            info = json.load(f)
        assert len(info[0]["video_list"]) > 0, f"Error: {info_file} has 0 video list"
        # Read the results; val[0] is assumed to be the aggregate score for
        # that dimension (the rest is per-video detail) — matches VBench output.
        with open(os.path.join(args.score_dir, res_file), "r", encoding="utf-8") as f:
            data = json.load(f)
        for key, val in data.items():
            full_results[key] = format(val[0], ".4f")

    # Rescale each raw score to [0, 1] with the published Min/Max, then apply
    # the per-dimension weight.
    scaled_results = {}
    dims = set()
    for key, val in full_results.items():
        dim = key.replace("_", " ") if "_" in key else key
        scaled_score = (float(val) - NORMALIZE_DIC[dim]["Min"]) / (
            NORMALIZE_DIC[dim]["Max"] - NORMALIZE_DIC[dim]["Min"]
        )
        scaled_score *= DIM_WEIGHT[dim]
        scaled_results[dim] = scaled_score
        dims.add(dim)

    # All 16 dimensions must be present before aggregation.
    assert len(dims) == len(NORMALIZE_DIC), f"{set(NORMALIZE_DIC.keys())-dims} not calculated yet"

    # Weighted averages within each group, then the 4:1 quality/semantic blend.
    quality_score = sum([scaled_results[i] for i in QUALITY_LIST]) / sum([DIM_WEIGHT[i] for i in QUALITY_LIST])
    semantic_score = sum([scaled_results[i] for i in SEMANTIC_LIST]) / sum([DIM_WEIGHT[i] for i in SEMANTIC_LIST])
    scaled_results["quality score"] = quality_score
    scaled_results["semantic score"] = semantic_score
    scaled_results["total score"] = (quality_score * QUALITY_WEIGHT + semantic_score * SEMANTIC_WEIGHT) / (
        QUALITY_WEIGHT + SEMANTIC_WEIGHT
    )

    # Format every scaled score as a percentage, in report order.
    formated_scaled_results = {"items": []}
    for key in ordered_scaled_res:
        formated_score = format(scaled_results[key] * 100, ".2f") + "%"
        formated_scaled_results["items"].append({key: formated_score})

    # Raw (unscaled) per-dimension scores.
    output_file_path = os.path.join(args.score_dir, "all_results.json")
    with open(output_file_path, "w") as outfile:
        json.dump(full_results, outfile, indent=4, sort_keys=True)
    print(f"results saved to: {output_file_path}")

    # Scaled + aggregated scores.
    scaled_file_path = os.path.join(args.score_dir, "scaled_results.json")
    with open(scaled_file_path, "w") as outfile:
        json.dump(formated_scaled_results, outfile, indent=4, sort_keys=True)
    print(f"results saved to: {scaled_file_path}")
|
eval/pab/vbench/run_vbench.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from vbench import VBench
|
5 |
+
|
6 |
+
# Path to VBench's prompt/metadata file, relative to the eval/pab directory.
full_info_path = "./vbench/VBench_full_info.json"

# All 16 VBench dimensions to evaluate, one pass each.
dimensions = [
    "subject_consistency",
    "imaging_quality",
    "background_consistency",
    "motion_smoothness",
    "overall_consistency",
    "human_action",
    "multiple_objects",
    "spatial_relationship",
    "object_class",
    "color",
    "aesthetic_quality",
    "appearance_style",
    "temporal_flickering",
    "scene",
    "temporal_style",
    "dynamic_degree",
]
|
26 |
+
|
27 |
+
|
28 |
+
def parse_args():
    """Parse CLI options: --video_path is the directory of generated videos to score."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--video_path", required=True, type=str)
    return parser.parse_args()
|
33 |
+
|
34 |
+
|
35 |
+
if __name__ == "__main__":
    args = parse_args()
    # Mirror the samples directory layout under vbench_out/ for the scores.
    save_path = args.video_path.replace("/samples/", "/vbench_out/")

    kwargs = {}
    kwargs["imaging_quality_preprocessing_mode"] = "longer"  # use VBench/evaluate.py default

    # One VBench evaluation pass per dimension; results land in save_path.
    # NOTE(review): a fresh VBench instance per dimension looks redundant but
    # was presumably chosen to isolate each pass — confirm before merging loops.
    for dimension in dimensions:
        my_VBench = VBench(torch.device("cuda"), full_info_path, save_path)
        my_VBench.evaluate(
            videos_path=args.video_path,
            name=dimension,
            local=False,
            read_frame=False,
            dimension_list=[dimension],
            mode="vbench_standard",
            **kwargs,
        )
|
examples/cogvideo/sample.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from videosys import CogVideoConfig, VideoSysEngine
|
2 |
+
|
3 |
+
|
4 |
+
def run_base():
    """Generate one sample clip with CogVideo on a single device."""
    engine = VideoSysEngine(CogVideoConfig(world_size=1))
    prompt = "Sunset over the sea."
    clip = engine.generate(prompt).video[0]
    engine.save_video(clip, f"./outputs/{prompt}.mp4")


if __name__ == "__main__":
    run_base()
|
examples/latte/sample.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from videosys import LatteConfig, VideoSysEngine
|
2 |
+
|
3 |
+
|
4 |
+
def run_base():
    """Generate one sample clip with Latte on a single device."""
    engine = VideoSysEngine(LatteConfig(world_size=1))
    prompt = "Sunset over the sea."
    clip = engine.generate(prompt).video[0]
    engine.save_video(clip, f"./outputs/{prompt}.mp4")
|
11 |
+
|
12 |
+
|
13 |
+
def run_pab():
    """Generate one sample clip with Latte with PAB acceleration enabled.

    Fix: the original body was identical to run_base() and never enabled PAB;
    `enable_pab=True` is the documented switch (see the Latte PAB experiments,
    which construct LatteConfig(enable_pab=True, ...)).
    """
    config = LatteConfig(world_size=1, enable_pab=True)
    engine = VideoSysEngine(config)

    prompt = "Sunset over the sea."
    video = engine.generate(prompt).video[0]
    engine.save_video(video, f"./outputs/{prompt}.mp4")
|
20 |
+
|
21 |
+
|
22 |
+
if __name__ == "__main__":
    run_base()
    # Uncomment to also run the PAB demo.
    # run_pab()
|
examples/open_sora/sample.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from videosys import OpenSoraConfig, VideoSysEngine
|
2 |
+
|
3 |
+
|
4 |
+
def run_base():
    """Generate one sample clip with Open-Sora on a single device."""
    engine = VideoSysEngine(OpenSoraConfig(world_size=1))
    prompt = "Sunset over the sea."
    clip = engine.generate(prompt).video[0]
    engine.save_video(clip, f"./outputs/{prompt}.mp4")
|
11 |
+
|
12 |
+
|
13 |
+
def run_pab():
    """Generate one sample clip with Open-Sora with PAB acceleration enabled."""
    engine = VideoSysEngine(OpenSoraConfig(world_size=1, enable_pab=True))
    prompt = "Sunset over the sea."
    clip = engine.generate(prompt).video[0]
    engine.save_video(clip, f"./outputs/{prompt}.mp4")
|
20 |
+
|
21 |
+
|
22 |
+
if __name__ == "__main__":
    # Run the plain demo, then the PAB-accelerated one.
    for demo in (run_base, run_pab):
        demo()
|
examples/open_sora_plan/sample.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from videosys import OpenSoraPlanConfig, VideoSysEngine
|
2 |
+
|
3 |
+
|
4 |
+
def run_base():
    """Generate one sample clip with Open-Sora-Plan on a single device."""
    engine = VideoSysEngine(OpenSoraPlanConfig(world_size=1))
    prompt = "Sunset over the sea."
    clip = engine.generate(prompt).video[0]
    engine.save_video(clip, f"./outputs/{prompt}.mp4")
|
11 |
+
|
12 |
+
|
13 |
+
def run_pab():
    """Generate one sample clip with Open-Sora-Plan with PAB enabled.

    Fix: the original body was identical to run_base() and never enabled PAB;
    `enable_pab=True` is the documented switch (see the Open-Sora-Plan PAB
    experiments, which construct OpenSoraPlanConfig(enable_pab=True, ...)).
    """
    config = OpenSoraPlanConfig(world_size=1, enable_pab=True)
    engine = VideoSysEngine(config)

    prompt = "Sunset over the sea."
    video = engine.generate(prompt).video[0]
    engine.save_video(video, f"./outputs/{prompt}.mp4")
|
20 |
+
|
21 |
+
|
22 |
+
if __name__ == "__main__":
    run_base()
    # Uncomment to also run the PAB demo.
    # run_pab()
|
requirements.txt
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
click
|
3 |
+
colossalai
|
4 |
+
contexttimer
|
5 |
+
diffusers==0.30.0
|
6 |
+
einops
|
7 |
+
fabric
|
8 |
+
ftfy
|
9 |
+
imageio
|
10 |
+
imageio-ffmpeg
|
11 |
+
matplotlib
|
12 |
+
ninja
|
13 |
+
numpy
|
14 |
+
omegaconf
|
15 |
+
packaging
|
16 |
+
psutil
|
17 |
+
pydantic
|
18 |
+
ray
|
19 |
+
rich
|
20 |
+
safetensors
|
21 |
+
timm
|
22 |
+
torch>=1.13
|
23 |
+
tqdm
|
24 |
+
transformers
|
25 |
+
openai
|
setup.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
|
3 |
+
from setuptools import find_packages, setup
|
4 |
+
|
5 |
+
|
6 |
+
def fetch_requirements(path) -> List[str]:
    """
    Read a pip requirements file for use as setuptools install_requires.

    Fix: the original returned every stripped line, including empty lines and
    comment lines, which are not valid requirement specifiers; they are now
    filtered out.

    Args:
        path (str): the path to the requirements file.

    Returns:
        The non-empty, non-comment requirement lines.
    """
    with open(path, "r") as fd:
        lines = [r.strip() for r in fd.readlines()]
    return [r for r in lines if r and not r.startswith("#")]
|
18 |
+
|
19 |
+
|
20 |
+
def fetch_readme() -> str:
    """
    Read README.md from the current working directory.

    Returns:
        The full README text (used as the package long description).
    """
    with open("README.md", encoding="utf-8") as readme:
        return readme.read()
|
29 |
+
|
30 |
+
|
31 |
+
# Package metadata. Runs at import of setup.py; install_requires comes
# straight from requirements.txt via fetch_requirements.
setup(
    name="videosys",
    version="2.0.0",
    packages=find_packages(
        # Non-code directories and build artifacts are excluded from the wheel.
        exclude=(
            "videos",
            "tests",
            "figure",
            "*.egg-info",
        )
    ),
    description="VideoSys",
    long_description=fetch_readme(),
    long_description_content_type="text/markdown",
    license="Apache Software License 2.0",
    install_requires=fetch_requirements("requirements.txt"),
    python_requires=">=3.6",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Environment :: GPU :: NVIDIA CUDA",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: System :: Distributed Computing",
    ],
)
|
tests/__init__.py
ADDED
File without changes
|
videosys/__init__.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .core.engine import VideoSysEngine
|
2 |
+
from .core.parallel_mgr import initialize
|
3 |
+
from .models.cogvideo.pipeline import CogVideoConfig, CogVideoPipeline
|
4 |
+
from .models.latte.pipeline import LatteConfig, LattePipeline
|
5 |
+
from .models.open_sora.pipeline import OpenSoraConfig, OpenSoraPipeline
|
6 |
+
from .models.open_sora_plan.pipeline import OpenSoraPlanConfig, OpenSoraPlanPipeline
|
7 |
+
|
8 |
+
# Public API of the videosys package: the engine, the process-group
# initializer, and the per-model Config/Pipeline pairs re-exported above.
__all__ = [
    "initialize",
    "VideoSysEngine",
    "LattePipeline",
    "LatteConfig",
    "OpenSoraPlanPipeline",
    "OpenSoraPlanConfig",
    "OpenSoraPipeline",
    "OpenSoraConfig",
    "CogVideoConfig",
    "CogVideoPipeline",
]
|
videosys/core/__init__.py
ADDED
File without changes
|
videosys/core/comm.py
ADDED
@@ -0,0 +1,420 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, Optional, Tuple
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import torch.distributed as dist
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from einops import rearrange
|
7 |
+
from torch import Tensor
|
8 |
+
from torch.distributed import ProcessGroup
|
9 |
+
|
10 |
+
from videosys.core.parallel_mgr import get_sequence_parallel_size
|
11 |
+
|
12 |
+
# ======================================================
|
13 |
+
# Model
|
14 |
+
# ======================================================
|
15 |
+
|
16 |
+
|
17 |
+
def model_sharding(model: torch.nn.Module):
    """Shard every parameter of `model` across the default process group.

    Each parameter is flattened, zero-padded so its length divides the world
    size, and replaced in place by this rank's equal slice. Requires an
    initialized torch.distributed process group.
    """
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    for _, param in model.named_parameters():
        flat = param.data.view(-1)
        pad = (world_size - flat.numel() % world_size) % world_size
        if pad > 0:
            flat = torch.nn.functional.pad(flat, [0, pad])
        # Keep only this rank's contiguous slice of the padded parameter.
        param.data = flat.split(flat.numel() // world_size)[rank]
|
29 |
+
|
30 |
+
|
31 |
+
# ======================================================
|
32 |
+
# AllGather & ReduceScatter
|
33 |
+
# ======================================================
|
34 |
+
|
35 |
+
|
36 |
+
class AsyncAllGatherForTwo(torch.autograd.Function):
    """Fused all-gather + linear projection for a 2-way sequence-parallel group.

    Forward gathers the peer rank's activations and applies the same linear
    layer to both local and remote halves, concatenating along the sequence
    dimension. Backward reduce-scatters the input gradients back to each rank.
    NOTE(review): the `1 - sp_rank` peer indexing hard-codes sp_size == 2;
    this function is not valid for larger groups.
    """

    @staticmethod
    def forward(
        ctx: Any,
        inputs: Tensor,
        weight: Tensor,
        bias: Tensor,
        sp_rank: int,
        sp_size: int,
        group: Optional[ProcessGroup] = None,
    ) -> Tuple[Tensor, Any]:
        """
        Returns:
            qkv: Tensor — the linear projection of the gathered inputs,
                concatenated along the sequence dimension in rank order.
        """
        from torch.distributed._functional_collectives import all_gather_tensor

        # Stash collective metadata for backward.
        ctx.group = group
        ctx.sp_rank = sp_rank
        ctx.sp_size = sp_size

        # all gather inputs from both ranks (stacked on a new leading dim)
        all_inputs = all_gather_tensor(inputs.unsqueeze(0), 0, group)
        # compute local qkv
        local_qkv = F.linear(inputs, weight, bias).unsqueeze(0)

        # remote compute: project the peer rank's activations locally
        # (1 - sp_rank is the peer index; only correct for sp_size == 2)
        remote_inputs = all_inputs[1 - sp_rank].view(list(local_qkv.shape[:-1]) + [-1])
        # compute remote qkv
        remote_qkv = F.linear(remote_inputs, weight, bias)

        # concat local and remote qkv, keeping sequence-parallel rank order
        if sp_rank == 0:
            qkv = torch.cat([local_qkv, remote_qkv], dim=0)
        else:
            qkv = torch.cat([remote_qkv, local_qkv], dim=0)
        qkv = rearrange(qkv, "sp b n c -> b (sp n) c")

        ctx.save_for_backward(inputs, weight, remote_inputs)
        return qkv

    @staticmethod
    def backward(ctx: Any, *grad_outputs) -> Tuple[Tensor, None, None]:
        """Compute input/weight/bias grads; input grads are reduce-scattered."""
        from torch.distributed._functional_collectives import reduce_scatter_tensor

        group = ctx.group
        sp_rank = ctx.sp_rank
        sp_size = ctx.sp_size
        inputs, weight, remote_inputs = ctx.saved_tensors

        # split qkv_grad back into the local and remote halves
        qkv_grad = grad_outputs[0]
        qkv_grad = rearrange(qkv_grad, "b (sp n) c -> sp b n c", sp=sp_size)
        qkv_grad = torch.chunk(qkv_grad, 2, dim=0)
        if sp_rank == 0:
            local_qkv_grad, remote_qkv_grad = qkv_grad
        else:
            remote_qkv_grad, local_qkv_grad = qkv_grad

        # compute remote grad (grads w.r.t. the peer's inputs and this layer)
        remote_inputs_grad = torch.matmul(remote_qkv_grad, weight).squeeze(0)
        weight_grad = torch.matmul(remote_qkv_grad.transpose(-1, -2), remote_inputs).squeeze(0).sum(0)
        bias_grad = remote_qkv_grad.squeeze(0).sum(0).sum(0)

        # launch async reduce scatter: pad our own slot with zeros so the sum
        # delivers each rank exactly the grads computed for it by its peer
        remote_inputs_grad_zero = torch.zeros_like(remote_inputs_grad)
        if sp_rank == 0:
            remote_inputs_grad = torch.cat([remote_inputs_grad_zero, remote_inputs_grad], dim=0)
        else:
            remote_inputs_grad = torch.cat([remote_inputs_grad, remote_inputs_grad_zero], dim=0)
        remote_inputs_grad = reduce_scatter_tensor(remote_inputs_grad, "sum", 0, group)

        # compute local grad and wait for reduce scatter
        local_input_grad = torch.matmul(local_qkv_grad, weight).squeeze(0)
        weight_grad += torch.matmul(local_qkv_grad.transpose(-1, -2), inputs).squeeze(0).sum(0)
        bias_grad += local_qkv_grad.squeeze(0).sum(0).sum(0)

        # sum remote and local grad
        inputs_grad = remote_inputs_grad + local_input_grad
        return inputs_grad, weight_grad, bias_grad, None, None, None
|
117 |
+
|
118 |
+
|
119 |
+
class AllGather(torch.autograd.Function):
    """Differentiable all-gather over a process group.

    Forward stacks every rank's tensor along a new leading dim of size
    world_size; backward is the matching ReduceScatter. Returns a
    (tensor, handle) pair; the handle is non-None only when overlap=True.
    """

    @staticmethod
    def forward(
        ctx: Any,
        inputs: Tensor,
        group: Optional[ProcessGroup] = None,
        overlap: bool = False,
    ) -> Tuple[Tensor, Any]:
        """
        Returns:
            outputs: Tensor
            handle: Optional[Work], if overlap is True
        """
        # overlap requires a ctx (called via .apply); forward(None, ...) is
        # only used internally from ReduceScatter.backward with overlap=False
        assert ctx is not None or not overlap

        if ctx is not None:
            ctx.comm_grp = group

        comm_size = dist.get_world_size(group)
        if comm_size == 1:
            # Single-rank group: just add the leading dim, no communication.
            return inputs.unsqueeze(0), None

        buffer_shape = (comm_size,) + inputs.shape
        outputs = torch.empty(buffer_shape, dtype=inputs.dtype, device=inputs.device)
        # Chunk views share storage with `outputs`, so the gather fills it in place.
        buffer_list = list(torch.chunk(outputs, comm_size, dim=0))
        if not overlap:
            dist.all_gather(buffer_list, inputs, group=group)
            return outputs, None
        else:
            handle = dist.all_gather(buffer_list, inputs, group=group, async_op=True)
            return outputs, handle

    @staticmethod
    def backward(ctx: Any, *grad_outputs) -> Tuple[Tensor, None, None]:
        # Gradient of all-gather is reduce-scatter of the output grads.
        return (
            ReduceScatter.forward(None, grad_outputs[0], ctx.comm_grp, False)[0],
            None,
            None,
        )
|
158 |
+
|
159 |
+
|
160 |
+
class ReduceScatter(torch.autograd.Function):
    """Differentiable reduce-scatter over a process group.

    Forward sums the per-rank chunks of the leading dim and leaves each rank
    with its own chunk; backward is the matching AllGather. Returns a
    (tensor, handle) pair; the handle is non-None only when overlap=True.
    """

    @staticmethod
    def forward(
        ctx: Any,
        inputs: Tensor,
        group: ProcessGroup,
        overlap: bool = False,
    ) -> Tuple[Tensor, Any]:
        """
        Returns:
            outputs: Tensor
            handle: Optional[Work], if overlap is True
        """
        # overlap requires a ctx (called via .apply); forward(None, ...) is
        # only used internally from AllGather.backward with overlap=False
        assert ctx is not None or not overlap

        if ctx is not None:
            ctx.comm_grp = group

        comm_size = dist.get_world_size(group)
        if comm_size == 1:
            # Single-rank group: drop the leading dim, no communication.
            return inputs.squeeze(0), None

        # NCCL reduce_scatter requires contiguous input chunks.
        if not inputs.is_contiguous():
            inputs = inputs.contiguous()

        output_shape = inputs.shape[1:]
        outputs = torch.empty(output_shape, dtype=inputs.dtype, device=inputs.device)
        buffer_list = list(torch.chunk(inputs, comm_size, dim=0))
        if not overlap:
            dist.reduce_scatter(outputs, buffer_list, group=group)
            return outputs, None
        else:
            handle = dist.reduce_scatter(outputs, buffer_list, group=group, async_op=True)
            return outputs, handle

    @staticmethod
    def backward(ctx: Any, *grad_outputs) -> Tuple[Tensor, None, None]:
        # TODO: support async backward
        # Gradient of reduce-scatter is all-gather of the output grads.
        return (
            AllGather.forward(None, grad_outputs[0], ctx.comm_grp, False)[0],
            None,
            None,
        )
|
203 |
+
|
204 |
+
|
205 |
+
# ======================================================
|
206 |
+
# AlltoAll
|
207 |
+
# ======================================================
|
208 |
+
|
209 |
+
|
210 |
+
def _all_to_all_func(input_, world_size, group, scatter_dim, gather_dim):
    """Synchronous all-to-all: scatter ``input_`` along ``scatter_dim`` and
    concatenate the received chunks along ``gather_dim``."""
    send_chunks = [chunk.contiguous() for chunk in torch.tensor_split(input_, world_size, scatter_dim)]
    # One receive buffer per peer; assumes all chunks share the first chunk's shape.
    recv_chunks = [torch.empty_like(send_chunks[0]) for _ in range(world_size)]
    dist.all_to_all(recv_chunks, send_chunks, group=group)
    gathered = torch.cat(recv_chunks, dim=gather_dim)
    return gathered.contiguous()
|
215 |
+
|
216 |
+
|
217 |
+
class _AllToAll(torch.autograd.Function):
    """All-to-all communication.

    Args:
        input_: input matrix
        process_group: communication group
        scatter_dim: scatter dimension
        gather_dim: gather dimension
    """

    @staticmethod
    def forward(ctx, input_, process_group, scatter_dim, gather_dim):
        # Stash the communication parameters so backward can run the inverse op.
        ctx.process_group = process_group
        ctx.scatter_dim = scatter_dim
        ctx.gather_dim = gather_dim
        world_size = dist.get_world_size(process_group)

        return _all_to_all_func(input_, world_size, process_group, scatter_dim, gather_dim)

    @staticmethod
    def backward(ctx, *grad_output):
        # The inverse of an all-to-all is another all-to-all with the
        # scatter and gather dimensions swapped.
        process_group = ctx.process_group
        scatter_dim = ctx.gather_dim
        gather_dim = ctx.scatter_dim
        return_grad = _AllToAll.apply(*grad_output, process_group, scatter_dim, gather_dim)
        # One gradient slot per forward() argument; only input_ is differentiable.
        return (return_grad, None, None, None)
|
243 |
+
|
244 |
+
|
245 |
+
def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1):
    """Differentiable all-to-all: scatter along ``scatter_dim``, gather along
    ``gather_dim`` (defaults swap dims 2 and 1)."""
    result = _AllToAll.apply(input_, process_group, scatter_dim, gather_dim)
    return result
|
247 |
+
|
248 |
+
|
249 |
+
# ======================================================
|
250 |
+
# Sequence Gather & Split
|
251 |
+
# ======================================================
|
252 |
+
|
253 |
+
|
254 |
+
def _split_sequence_func(input_, pg: dist.ProcessGroup, dim: int, pad: int):
    """Return this rank's shard of ``input_`` along ``dim``, zero-padding the
    dimension by ``pad`` elements first so it divides evenly across ranks."""
    world_size = dist.get_world_size(pg)
    rank = dist.get_rank(pg)
    # Single rank: nothing to split.
    if world_size == 1:
        return input_

    if pad > 0:
        padded_shape = list(input_.shape)
        padded_shape[dim] = pad
        zeros = torch.zeros(padded_shape, dtype=input_.dtype, device=input_.device)
        input_ = torch.cat([input_, zeros], dim=dim)

    dim_size = input_.size(dim)
    assert dim_size % world_size == 0, f"dim_size ({dim_size}) is not divisible by world_size ({world_size})"

    shards = torch.split(input_, dim_size // world_size, dim=dim)
    return shards[rank].contiguous()
|
272 |
+
|
273 |
+
|
274 |
+
def _gather_sequence_func(input_, pg: dist.ProcessGroup, dim: int, pad: int):
    """All-gather ``input_`` along ``dim`` from every rank in ``pg`` and trim
    the trailing ``pad`` elements introduced by the matching split.

    Fix: removed a leftover ``dist.get_rank(pg)`` call whose result was unused.
    """
    input_ = input_.contiguous()
    world_size = dist.get_world_size(pg)

    # skip if only one rank involved
    if world_size == 1:
        return input_

    # all gather
    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
    assert input_.device.type == "cuda"
    torch.distributed.all_gather(tensor_list, input_, group=pg)

    # concat
    output = torch.cat(tensor_list, dim=dim)

    # drop the zero padding appended before the split
    if pad > 0:
        output = output.narrow(dim, 0, output.size(dim) - pad)

    return output
|
295 |
+
|
296 |
+
|
297 |
+
class _GatherForwardSplitBackward(torch.autograd.Function):
    """
    Gather the input sequence.

    Args:
        input_: input matrix.
        process_group: process group.
        dim: dimension
    """

    @staticmethod
    def symbolic(graph, input_):
        # NOTE(review): this call is missing the (pg, dim, pad) arguments that
        # _gather_sequence_func requires and would raise TypeError if the
        # symbolic (ONNX export) path were ever exercised -- confirm intent.
        return _gather_sequence_func(input_)

    @staticmethod
    def forward(ctx, input_, process_group, dim, grad_scale, pad):
        # Save everything backward needs to split the gradient back to shards.
        ctx.process_group = process_group
        ctx.dim = dim
        ctx.grad_scale = grad_scale
        ctx.pad = pad
        return _gather_sequence_func(input_, process_group, dim, pad)

    @staticmethod
    def backward(ctx, grad_output):
        # Optional gradient rescaling by world size; the default grad_scale of
        # 1.0 matches neither "up" nor "down" and leaves gradients unscaled.
        if ctx.grad_scale == "up":
            grad_output = grad_output * dist.get_world_size(ctx.process_group)
        elif ctx.grad_scale == "down":
            grad_output = grad_output / dist.get_world_size(ctx.process_group)

        # Gradient of a gather is a split back to this rank's shard; None for
        # each non-tensor forward argument.
        return _split_sequence_func(grad_output, ctx.process_group, ctx.dim, ctx.pad), None, None, None, None
|
327 |
+
|
328 |
+
|
329 |
+
class _SplitForwardGatherBackward(torch.autograd.Function):
    """
    Split sequence.

    Args:
        input_: input matrix.
        process_group: parallel mode.
        dim: dimension
    """

    @staticmethod
    def symbolic(graph, input_):
        # NOTE(review): missing the (pg, dim, pad) arguments required by
        # _split_sequence_func; would raise TypeError if the symbolic path
        # were exercised -- confirm before relying on ONNX export.
        return _split_sequence_func(input_)

    @staticmethod
    def forward(ctx, input_, process_group, dim, grad_scale, pad):
        # Save everything backward needs to gather the gradient.
        ctx.process_group = process_group
        ctx.dim = dim
        ctx.grad_scale = grad_scale
        ctx.pad = pad
        return _split_sequence_func(input_, process_group, dim, pad)

    @staticmethod
    def backward(ctx, grad_output):
        # Optional gradient rescaling by world size ("up"/"down").
        if ctx.grad_scale == "up":
            grad_output = grad_output * dist.get_world_size(ctx.process_group)
        elif ctx.grad_scale == "down":
            grad_output = grad_output / dist.get_world_size(ctx.process_group)
        # Fix: pass ctx.dim in the dim slot. Previously ctx.pad was passed as
        # the gather dimension and the pad argument was dropped entirely, so
        # the gradient was gathered along the wrong dimension and padding was
        # never trimmed.
        return _gather_sequence_func(grad_output, ctx.process_group, ctx.dim, ctx.pad), None, None, None, None
|
358 |
+
|
359 |
+
|
360 |
+
def split_sequence(input_, process_group, dim, grad_scale=1.0, pad=0):
    """Differentiable sequence split: splits along ``dim`` in forward and
    gathers gradients in backward."""
    args = (input_, process_group, dim, grad_scale, pad)
    return _SplitForwardGatherBackward.apply(*args)
|
362 |
+
|
363 |
+
|
364 |
+
def gather_sequence(input_, process_group, dim, grad_scale=1.0, pad=0):
    """Differentiable sequence gather: gathers along ``dim`` in forward and
    splits gradients in backward."""
    args = (input_, process_group, dim, grad_scale, pad)
    return _GatherForwardSplitBackward.apply(*args)
|
366 |
+
|
367 |
+
|
368 |
+
# ==============================
|
369 |
+
# Pad
|
370 |
+
# ==============================
|
371 |
+
|
372 |
+
# NOTE(review): "SPTIAL" is a misspelling of "SPATIAL"; the name is kept
# because other modules may import it by this exact name.
SPTIAL_PAD = 0
TEMPORAL_PAD = 0


def set_spatial_pad(dim_size: int):
    """Record how much zero-padding makes the spatial ``dim_size`` divisible
    by the sequence-parallel world size."""
    global SPTIAL_PAD
    sp_size = get_sequence_parallel_size()
    # (-d) % s  ==  (s - d % s) % s  for s > 0
    SPTIAL_PAD = -dim_size % sp_size


def get_spatial_pad() -> int:
    """Return the spatial padding recorded by set_spatial_pad()."""
    return SPTIAL_PAD


def set_temporal_pad(dim_size: int):
    """Record how much zero-padding makes the temporal ``dim_size`` divisible
    by the sequence-parallel world size."""
    global TEMPORAL_PAD
    sp_size = get_sequence_parallel_size()
    TEMPORAL_PAD = -dim_size % sp_size


def get_temporal_pad() -> int:
    """Return the temporal padding recorded by set_temporal_pad()."""
    return TEMPORAL_PAD
|
396 |
+
|
397 |
+
|
398 |
+
def all_to_all_with_pad(
    input_: torch.Tensor,
    process_group: dist.ProcessGroup,
    scatter_dim: int = 2,
    gather_dim: int = 1,
    scatter_pad: int = 0,
    gather_pad: int = 0,
):
    """All-to-all that zero-pads ``scatter_dim`` by ``scatter_pad`` before the
    exchange and trims ``gather_pad`` trailing elements from ``gather_dim``
    afterwards."""
    if scatter_pad > 0:
        padded_shape = list(input_.shape)
        padded_shape[scatter_dim] = scatter_pad
        zeros = torch.zeros(padded_shape, device=input_.device, dtype=input_.dtype)
        input_ = torch.cat([input_, zeros], dim=scatter_dim)

    world_size = dist.get_world_size(process_group)
    assert (
        input_.shape[scatter_dim] % world_size == 0
    ), f"Dimension to scatter ({input_.shape[scatter_dim]}) is not divisible by world size ({world_size})"
    input_ = _AllToAll.apply(input_, process_group, scatter_dim, gather_dim)

    if gather_pad > 0:
        input_ = input_.narrow(gather_dim, 0, input_.size(gather_dim) - gather_pad)

    return input_
|
videosys/core/engine.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from functools import partial
|
3 |
+
from typing import Any, Optional
|
4 |
+
|
5 |
+
import imageio
|
6 |
+
import torch
|
7 |
+
|
8 |
+
import videosys
|
9 |
+
|
10 |
+
from .mp_utils import ProcessWorkerWrapper, ResultHandler, WorkerMonitor, get_distributed_init_method, get_open_port
|
11 |
+
|
12 |
+
|
13 |
+
class VideoSysEngine:
    """
    Single-node multi-GPU inference engine (this is partly inspired by vllm).

    Rank 0 runs the pipeline in this process (the "driver"); ranks
    1..world_size-1 run in spawned worker processes managed via
    ProcessWorkerWrapper / ResultHandler / WorkerMonitor.
    """

    def __init__(self, config):
        self.config = config
        self.parallel_worker_tasks = None
        self._init_worker(config.pipeline_cls)

    def _init_worker(self, pipeline_cls):
        """Spawn worker processes and build the driver pipeline."""
        world_size = self.config.world_size

        if "CUDA_VISIBLE_DEVICES" not in os.environ:
            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(world_size))

        # Disable torch async compiling which won't work with daemonic processes
        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"

        # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU
        # contention amongst the shards
        if "OMP_NUM_THREADS" not in os.environ:
            os.environ["OMP_NUM_THREADS"] = "1"

        # NOTE: The two following lines need adaption for multi-node
        assert world_size <= torch.cuda.device_count()

        # change addr for multi-node
        distributed_init_method = get_distributed_init_method("127.0.0.1", get_open_port())

        if world_size == 1:
            self.workers = []
            self.worker_monitor = None
        else:
            result_handler = ResultHandler()
            # One subprocess per non-driver rank.
            self.workers = [
                ProcessWorkerWrapper(
                    result_handler,
                    partial(
                        self._create_pipeline,
                        pipeline_cls=pipeline_cls,
                        rank=rank,
                        local_rank=rank,
                        distributed_init_method=distributed_init_method,
                    ),
                )
                for rank in range(1, world_size)
            ]

            self.worker_monitor = WorkerMonitor(self.workers, result_handler)
            result_handler.start()
            self.worker_monitor.start()

        # Rank 0 pipeline runs in this process.
        self.driver_worker = self._create_pipeline(
            pipeline_cls=pipeline_cls, distributed_init_method=distributed_init_method
        )

    # TODO: add more options here for pipeline, or wrap all options into config
    def _create_pipeline(self, pipeline_cls, rank=0, local_rank=0, distributed_init_method=None):
        """Initialize distributed state for this rank and build the pipeline."""
        videosys.initialize(rank=rank, world_size=self.config.world_size, init_method=distributed_init_method, seed=42)

        pipeline = pipeline_cls(self.config)
        return pipeline

    def _run_workers(
        self,
        method: str,
        *args,
        async_run_tensor_parallel_workers_only: bool = False,
        max_concurrent_workers: Optional[int] = None,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers.

        Returns the driver's result first, followed by each worker's result;
        or just the worker futures when
        ``async_run_tensor_parallel_workers_only`` is set.
        """

        # Start the workers first.
        worker_outputs = [worker.execute_method(method, *args, **kwargs) for worker in self.workers]

        if async_run_tensor_parallel_workers_only:
            # Just return futures
            return worker_outputs

        driver_worker_method = getattr(self.driver_worker, method)
        driver_worker_output = driver_worker_method(*args, **kwargs)

        # Get the results of the workers.
        return [driver_worker_output] + [output.get() for output in worker_outputs]

    def _driver_execute_model(self, *args, **kwargs):
        """Run generation on the driver pipeline only."""
        return self.driver_worker.generate(*args, **kwargs)

    def generate(self, *args, **kwargs):
        """Run ``generate`` on all ranks and return the driver's output."""
        return self._run_workers("generate", *args, **kwargs)[0]

    def stop_remote_worker_execution_loop(self) -> None:
        """Signal outstanding async worker tasks to finish and wait for them."""
        if self.parallel_worker_tasks is None:
            return

        parallel_worker_tasks = self.parallel_worker_tasks
        self.parallel_worker_tasks = None
        # Ensure that workers exit model loop cleanly
        # (this will raise otherwise)
        self._wait_for_tasks_completion(parallel_worker_tasks)

    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
        """Wait for futures returned from _run_workers() with
        async_run_remote_workers_only to complete."""
        for result in parallel_worker_tasks:
            result.get()

    def save_video(self, video, output_path):
        """Write ``video`` frames to ``output_path`` at 24 fps, creating
        parent directories as needed."""
        # Fix: os.makedirs("") raises when output_path has no directory part.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        imageio.mimwrite(output_path, video, fps=24)

    def shutdown(self):
        """Terminate workers and tear down the distributed process group."""
        if (worker_monitor := getattr(self, "worker_monitor", None)) is not None:
            worker_monitor.close()
        # Fix: destroy_process_group raises if the default group was never
        # initialized (e.g. failed __init__, repeated shutdown, interpreter
        # teardown via __del__).
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            torch.distributed.destroy_process_group()

    def __del__(self):
        self.shutdown()
|
videosys/core/mp_utils.py
ADDED
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# adapted from vllm
|
2 |
+
# https://github.com/vllm-project/vllm/blob/main/vllm/executor/multiproc_worker_utils.py
|
3 |
+
|
4 |
+
import asyncio
|
5 |
+
import multiprocessing
|
6 |
+
import os
|
7 |
+
import socket
|
8 |
+
import sys
|
9 |
+
import threading
|
10 |
+
import traceback
|
11 |
+
import uuid
|
12 |
+
from dataclasses import dataclass
|
13 |
+
from multiprocessing import Queue
|
14 |
+
from multiprocessing.connection import wait
|
15 |
+
from typing import Any, Callable, Dict, Generic, List, Optional, TextIO, TypeVar, Union
|
16 |
+
|
17 |
+
from videosys.utils.logging import create_logger
|
18 |
+
|
19 |
+
# Generic payload type for Result/ResultFuture.
T = TypeVar("T")
_TERMINATE = "TERMINATE"  # sentinel
# ANSI color codes
CYAN = "\033[1;36m"
RESET = "\033[0;0m"
# Seconds to wait when joining worker processes during shutdown.
JOIN_TIMEOUT_S = 2

mp_method = "spawn"  # fork can't work here -- presumably unsafe with the parent's CUDA/threads state; confirm
mp = multiprocessing.get_context(mp_method)

logger = create_logger()
|
30 |
+
|
31 |
+
|
32 |
+
def get_distributed_init_method(ip: str, port: int) -> str:
    """Build a ``tcp://`` init URL for torch.distributed.

    IPv6 literals must be bracketed; brackets are not permitted in IPv4
    addresses (see https://github.com/python/cpython/issues/103848).
    """
    host = f"[{ip}]" if ":" in ip else ip
    return f"tcp://{host}:{port}"
|
36 |
+
|
37 |
+
|
38 |
+
def get_open_port() -> int:
    """Return a currently-free TCP port, preferring IPv4 and falling back to
    IPv6 when IPv4 binding fails."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.bind(("", 0))
            _, port = sock.getsockname()
            return port
    except OSError:
        # IPv4 unavailable; let any IPv6 failure propagate to the caller.
        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as sock:
            sock.bind(("", 0))
            return sock.getsockname()[1]
|
49 |
+
|
50 |
+
|
51 |
+
@dataclass
class Result(Generic[T]):
    """Result of task dispatched to worker.

    ``exception`` is set when the worker raised; otherwise ``value`` holds
    the (possibly None) return value.
    """

    # Correlates this result with the future stored by ResultHandler.
    task_id: uuid.UUID
    value: Optional[T] = None
    exception: Optional[BaseException] = None
|
58 |
+
|
59 |
+
|
60 |
+
class ResultFuture(threading.Event, Generic[T]):
    """Synchronous future for the non-async case, backed by threading.Event."""

    def __init__(self):
        super().__init__()
        self.result: Optional[Result[T]] = None

    def set_result(self, result: Result[T]):
        """Record the outcome and wake any thread blocked in get()."""
        self.result = result
        self.set()

    def get(self) -> T:
        """Block until the result arrives; re-raise a worker exception."""
        self.wait()
        outcome = self.result
        assert outcome is not None
        if outcome.exception is not None:
            raise outcome.exception
        return outcome.value  # type: ignore[return-value]
|
77 |
+
|
78 |
+
|
79 |
+
def _set_future_result(future: Union[ResultFuture, asyncio.Future], result: Result):
    """Deliver *result* to either a threading- or asyncio-based future."""
    if isinstance(future, ResultFuture):
        # ResultFuture is thread-safe (threading.Event), so set it directly.
        future.set_result(result)
        return
    # asyncio futures must only be touched from their own event-loop thread;
    # marshal the call via call_soon_threadsafe. A closed loop is skipped,
    # dropping the result silently.
    loop = future.get_loop()
    if not loop.is_closed():
        if result.exception is not None:
            loop.call_soon_threadsafe(future.set_exception, result.exception)
        else:
            loop.call_soon_threadsafe(future.set_result, result.value)
|
89 |
+
|
90 |
+
|
91 |
+
class ResultHandler(threading.Thread):
    """Handle results from all workers (in background thread)"""

    def __init__(self) -> None:
        super().__init__(daemon=True)
        # Shared multiprocessing queue all workers write Results into.
        self.result_queue = mp.Queue()
        # Maps task_id -> future awaiting that task's Result.
        self.tasks: Dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {}

    def run(self):
        # Drain results until the _TERMINATE sentinel is enqueued by close().
        for result in iter(self.result_queue.get, _TERMINATE):
            future = self.tasks.pop(result.task_id)
            _set_future_result(future, result)
        # Ensure that all waiters will receive an exception
        for task_id, future in self.tasks.items():
            _set_future_result(future, Result(task_id=task_id, exception=ChildProcessError("worker died")))

    def close(self):
        # Unblocks run() via the sentinel; remaining futures get an error.
        self.result_queue.put(_TERMINATE)
|
109 |
+
|
110 |
+
|
111 |
+
class WorkerMonitor(threading.Thread):
    """Monitor worker status (in background thread)"""

    def __init__(self, workers: List["ProcessWorkerWrapper"], result_handler: ResultHandler):
        super().__init__(daemon=True)
        self.workers = workers
        self.result_handler = result_handler
        # Set once teardown has started (by run() or close()).
        self._close = False

    def run(self) -> None:
        # Blocks until any worker exits
        dead_sentinels = wait([w.process.sentinel for w in self.workers])
        if not self._close:
            self._close = True

            # Kill / cleanup all workers
            for worker in self.workers:
                process = worker.process
                if process.sentinel in dead_sentinels:
                    process.join(JOIN_TIMEOUT_S)
                if process.exitcode is not None and process.exitcode != 0:
                    logger.error("Worker %s pid %s died, exit code: %s", process.name, process.pid, process.exitcode)
            # Cleanup any remaining workers
            logger.info("Killing local worker processes")
            for worker in self.workers:
                worker.kill_worker()
            # Must be done after worker task queues are all closed
            self.result_handler.close()

        for worker in self.workers:
            worker.process.join(JOIN_TIMEOUT_S)

    def close(self):
        # Graceful shutdown path (vs. the crash path handled in run()).
        if self._close:
            return
        self._close = True
        logger.info("Terminating local worker processes")
        for worker in self.workers:
            worker.terminate_worker()
        # Must be done after worker task queues are all closed
        self.result_handler.close()
|
152 |
+
|
153 |
+
|
154 |
+
def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
    """Prepend each output line with process-specific prefix"""

    prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
    raw_write = file.write

    def prefixed_write(text: str):
        # Emit ``prefix`` at the start of every new line; ``file.start_new_line``
        # tracks partial lines across calls.
        if not text:
            return
        if file.start_new_line:  # type: ignore[attr-defined]
            raw_write(prefix)
        pos = 0
        while (newline_end := text.find("\n", pos)) != -1:
            newline_end += 1
            raw_write(text[pos:newline_end])
            if newline_end == len(text):
                file.start_new_line = True  # type: ignore[attr-defined]
                return
            raw_write(prefix)
            pos = newline_end
        raw_write(text[pos:])
        file.start_new_line = False  # type: ignore[attr-defined]

    file.start_new_line = True  # type: ignore[attr-defined]
    file.write = prefixed_write  # type: ignore[method-assign]
|
179 |
+
|
180 |
+
|
181 |
+
def _run_worker_process(
    worker_factory: Callable[[], Any],
    task_queue: Queue,
    result_queue: Queue,
) -> None:
    """Worker process event loop"""

    # Add process-specific prefix to stdout and stderr
    process_name = mp.current_process().name
    pid = os.getpid()
    _add_prefix(sys.stdout, process_name, pid)
    _add_prefix(sys.stderr, process_name, pid)

    # Initialize worker
    worker = worker_factory()
    # Drop the factory reference so anything it captured can be collected.
    del worker_factory

    # Accept tasks from the engine in task_queue
    # and return task output in result_queue
    logger.info("Worker ready; awaiting tasks")
    try:
        # Loop until the engine enqueues the _TERMINATE sentinel.
        for items in iter(task_queue.get, _TERMINATE):
            output = None
            exception = None
            task_id, method, args, kwargs = items
            try:
                # Dispatch by method name on the worker object.
                executor = getattr(worker, method)
                output = executor(*args, **kwargs)
            except BaseException as e:
                # Capture (don't propagate) so the engine side can re-raise it.
                tb = traceback.format_exc()
                logger.error("Exception in worker %s while processing method %s: %s, %s", process_name, method, e, tb)
                exception = e
            result_queue.put(Result(task_id=task_id, value=output, exception=exception))
    except KeyboardInterrupt:
        pass
    except Exception:
        logger.exception("Worker failed")

    logger.info("Worker exiting")
|
220 |
+
|
221 |
+
|
222 |
+
class ProcessWorkerWrapper:
    """Local process wrapper for handling single-node multi-GPU."""

    def __init__(self, result_handler: ResultHandler, worker_factory: Callable[[], Any]) -> None:
        # Per-worker task queue; results flow back through the shared
        # result_handler queue keyed by task_id.
        self._task_queue = mp.Queue()
        self.result_queue = result_handler.result_queue
        self.tasks = result_handler.tasks
        self.process = mp.Process(  # type: ignore[attr-defined]
            target=_run_worker_process,
            name="VideoSysWorkerProcess",
            kwargs=dict(
                worker_factory=worker_factory,
                task_queue=self._task_queue,
                result_queue=self.result_queue,
            ),
            daemon=True,
        )

        self.process.start()

    def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future], method: str, args, kwargs):
        # Register the future before enqueueing so the result can't race us.
        task_id = uuid.uuid4()
        self.tasks[task_id] = future
        try:
            self._task_queue.put((task_id, method, args, kwargs))
        except BaseException as e:
            # Queue is closed/broken: unregister and surface a uniform error.
            del self.tasks[task_id]
            raise ChildProcessError("worker died") from e

    def execute_method(self, method: str, *args, **kwargs):
        # Synchronous variant: caller blocks on future.get().
        future: ResultFuture = ResultFuture()
        self._enqueue_task(future, method, args, kwargs)
        return future

    async def execute_method_async(self, method: str, *args, **kwargs):
        # Async variant: result is delivered via the caller's event loop.
        future = asyncio.get_running_loop().create_future()
        self._enqueue_task(future, method, args, kwargs)
        return await future

    def terminate_worker(self):
        # Graceful stop: sentinel lets the worker drain its loop; if the
        # queue is already closed, fall back to killing the process.
        try:
            self._task_queue.put(_TERMINATE)
        except ValueError:
            self.process.kill()
        self._task_queue.close()

    def kill_worker(self):
        # Forceful stop: no sentinel, immediate kill.
        self._task_queue.close()
        self.process.kill()
|
videosys/core/pab_mgr.py
ADDED
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
|
6 |
+
from videosys.utils.logging import logger
|
7 |
+
|
8 |
+
PAB_MANAGER = None
|
9 |
+
|
10 |
+
|
11 |
+
class PABConfig:
    """Configuration container for Pyramid Attention Broadcast (PAB).

    Each ``*_broadcast`` flag enables reusing a cached attention output
    within the matching ``*_threshold`` timestep window, skipping
    recomputation except every ``*_gap``-th step.
    """

    def __init__(
        self,
        steps: int,
        cross_broadcast: bool,
        cross_threshold: list,
        cross_gap: int,
        spatial_broadcast: bool,
        spatial_threshold: list,
        spatial_gap: int,
        temporal_broadcast: bool,
        temporal_threshold: list,
        temporal_gap: int,
        diffusion_skip: bool,
        diffusion_timestep_respacing: list,
        diffusion_skip_timestep: list,
        mlp_skip: bool,
        mlp_spatial_skip_config: dict,
        mlp_temporal_skip_config: dict,
        full_broadcast: bool = False,
        full_threshold: list = None,
        full_gap: int = 1,
    ):
        # Total number of diffusion steps (used as the broadcast-counter modulus).
        self.steps = steps

        # Cross-attention broadcast settings.
        self.cross_broadcast = cross_broadcast
        self.cross_threshold = cross_threshold
        self.cross_gap = cross_gap

        # Spatial-attention broadcast settings.
        self.spatial_broadcast = spatial_broadcast
        self.spatial_threshold = spatial_threshold
        self.spatial_gap = spatial_gap

        # Temporal-attention broadcast settings.
        self.temporal_broadcast = temporal_broadcast
        self.temporal_threshold = temporal_threshold
        self.temporal_gap = temporal_gap

        # Diffusion-step skipping settings.
        self.diffusion_skip = diffusion_skip
        self.diffusion_timestep_respacing = diffusion_timestep_respacing
        self.diffusion_skip_timestep = diffusion_skip_timestep

        # MLP (feed-forward) skipping settings.
        self.mlp_skip = mlp_skip
        self.mlp_spatial_skip_config = mlp_spatial_skip_config
        self.mlp_temporal_skip_config = mlp_temporal_skip_config

        # Caches of feed-forward outputs keyed by (timestep, block_idx).
        self.temporal_mlp_outputs = {}
        self.spatial_mlp_outputs = {}

        # Full-attention broadcast settings.
        self.full_broadcast = full_broadcast
        self.full_threshold = full_threshold
        self.full_gap = full_gap
|
62 |
+
|
63 |
+
|
64 |
+
class PABManager:
|
65 |
+
def __init__(self, config: PABConfig):
|
66 |
+
self.config: PABConfig = config
|
67 |
+
|
68 |
+
init_prompt = f"Init PABManager. steps: {config.steps}."
|
69 |
+
init_prompt += f" spatial_broadcast: {config.spatial_broadcast}, spatial_threshold: {config.spatial_threshold}, spatial_gap: {config.spatial_gap}."
|
70 |
+
init_prompt += f" temporal_broadcast: {config.temporal_broadcast}, temporal_threshold: {config.temporal_threshold}, temporal_gap: {config.temporal_gap}."
|
71 |
+
init_prompt += f" cross_broadcast: {config.cross_broadcast}, cross_threshold: {config.cross_threshold}, cross_gap: {config.cross_gap}."
|
72 |
+
init_prompt += f" full_broadcast: {config.full_broadcast}, full_threshold: {config.full_threshold}, full_gap: {config.full_gap}."
|
73 |
+
logger.info(init_prompt)
|
74 |
+
|
75 |
+
def if_broadcast_cross(self, timestep: int, count: int):
|
76 |
+
if (
|
77 |
+
self.config.cross_broadcast
|
78 |
+
and (timestep is not None)
|
79 |
+
and (count % self.config.cross_gap != 0)
|
80 |
+
and (self.config.cross_threshold[0] < timestep < self.config.cross_threshold[1])
|
81 |
+
):
|
82 |
+
flag = True
|
83 |
+
else:
|
84 |
+
flag = False
|
85 |
+
count = (count + 1) % self.config.steps
|
86 |
+
return flag, count
|
87 |
+
|
88 |
+
def if_broadcast_temporal(self, timestep: int, count: int):
|
89 |
+
if (
|
90 |
+
self.config.temporal_broadcast
|
91 |
+
and (timestep is not None)
|
92 |
+
and (count % self.config.temporal_gap != 0)
|
93 |
+
and (self.config.temporal_threshold[0] < timestep < self.config.temporal_threshold[1])
|
94 |
+
):
|
95 |
+
flag = True
|
96 |
+
else:
|
97 |
+
flag = False
|
98 |
+
count = (count + 1) % self.config.steps
|
99 |
+
return flag, count
|
100 |
+
|
101 |
+
def if_broadcast_spatial(self, timestep: int, count: int, block_idx: int):
|
102 |
+
if (
|
103 |
+
self.config.spatial_broadcast
|
104 |
+
and (timestep is not None)
|
105 |
+
and (count % self.config.spatial_gap != 0)
|
106 |
+
and (self.config.spatial_threshold[0] < timestep < self.config.spatial_threshold[1])
|
107 |
+
):
|
108 |
+
flag = True
|
109 |
+
else:
|
110 |
+
flag = False
|
111 |
+
count = (count + 1) % self.config.steps
|
112 |
+
return flag, count
|
113 |
+
|
114 |
+
def if_broadcast_full(self, timestep: int, count: int, block_idx: int):
|
115 |
+
if (
|
116 |
+
self.config.full_broadcast
|
117 |
+
and (timestep is not None)
|
118 |
+
and (count % self.config.full_gap != 0)
|
119 |
+
and (self.config.full_threshold[0] < timestep < self.config.full_threshold[1])
|
120 |
+
):
|
121 |
+
flag = True
|
122 |
+
else:
|
123 |
+
flag = False
|
124 |
+
count = (count + 1) % self.config.steps
|
125 |
+
return flag, count
|
126 |
+
|
127 |
+
@staticmethod
|
128 |
+
def _is_t_in_skip_config(all_timesteps, timestep, config):
|
129 |
+
is_t_in_skip_config = False
|
130 |
+
for key in config:
|
131 |
+
if key not in all_timesteps:
|
132 |
+
continue
|
133 |
+
index = all_timesteps.index(key)
|
134 |
+
skip_range = all_timesteps[index : index + 1 + int(config[key]["skip_count"])]
|
135 |
+
if timestep in skip_range:
|
136 |
+
is_t_in_skip_config = True
|
137 |
+
skip_range = [all_timesteps[index], all_timesteps[index + int(config[key]["skip_count"])]]
|
138 |
+
break
|
139 |
+
return is_t_in_skip_config, skip_range
|
140 |
+
|
141 |
+
def if_skip_mlp(self, timestep: int, count: int, block_idx: int, all_timesteps, is_temporal=False):
    """Decide whether this block's feed-forward (MLP) output should be skipped.

    Returns:
        (flag, count, next_flag, skip_range):
            flag: True -> reuse a previously cached MLP output.
            count: updated skip counter (incremented when a window starts,
                reset to 0 while skipping inside a window).
            next_flag: True when `timestep` starts a skip window, i.e. the
                freshly computed output should be cached for later reuse.
            skip_range: window bounds from `_is_t_in_skip_config`; only
                meaningful when flag or next_flag is True.
    """
    if not self.config.mlp_skip:
        return False, None, False, None

    # Spatial and temporal blocks keep separate skip schedules.
    if is_temporal:
        cur_config = self.config.mlp_temporal_skip_config
    else:
        cur_config = self.config.mlp_spatial_skip_config

    is_t_in_skip_config, skip_range = self._is_t_in_skip_config(all_timesteps, timestep, cur_config)
    next_flag = False
    if (
        self.config.mlp_skip
        and (timestep is not None)
        and (timestep in cur_config)
        and (block_idx in cur_config[timestep]["block"])
    ):
        # Window start: compute this step normally, but cache the output.
        flag = False
        next_flag = True
        count = count + 1
    elif (
        self.config.mlp_skip
        and (timestep is not None)
        and (is_t_in_skip_config)
        and (block_idx in cur_config[skip_range[0]]["block"])
    ):
        # Inside an active window: skip computation and reuse the cache.
        flag = True
        count = 0
    else:
        flag = False

    return flag, count, next_flag, skip_range
|
173 |
+
|
174 |
+
def save_skip_output(self, timestep, block_idx, ff_output, is_temporal=False):
    """Cache a feed-forward output so later skipped steps can reuse it.

    Entries are keyed by (timestep, block_idx) in the temporal or spatial
    cache depending on `is_temporal`.
    """
    cache = self.config.temporal_mlp_outputs if is_temporal else self.config.spatial_mlp_outputs
    cache[(timestep, block_idx)] = ff_output
|
179 |
+
|
180 |
+
def get_mlp_output(self, skip_range, timestep, block_idx, is_temporal=False):
    """Fetch the cached feed-forward output for a skipped step.

    The cache entry is keyed by (window_start, block_idx). Once the last
    timestep of the skip window is reached, the entry is evicted to free
    memory. Raises ValueError when no cached output exists.
    """
    start_t = skip_range[0]
    cache = self.config.temporal_mlp_outputs if is_temporal else self.config.spatial_mlp_outputs
    skip_output = cache.get((start_t, block_idx), None) if cache is not None else None

    if skip_output is None:
        raise ValueError(
            f"No stored MLP output found | t {timestep} |[{skip_range[0]}, {skip_range[-1]}] | block {block_idx}"
        )

    # TODO: save memory
    if timestep == skip_range[-1]:
        # Last step of the window: the cached value is no longer needed.
        del cache[(start_t, block_idx)]

    return skip_output
|
208 |
+
|
209 |
+
def get_spatial_mlp_outputs(self):
    """Return the cache dict of spatial feed-forward outputs."""
    return self.config.spatial_mlp_outputs

def get_temporal_mlp_outputs(self):
    """Return the cache dict of temporal feed-forward outputs."""
    return self.config.temporal_mlp_outputs
|
214 |
+
|
215 |
+
|
216 |
+
def set_pab_manager(config: PABConfig):
    """Build a PABManager from `config` and install it as the module global."""
    global PAB_MANAGER
    PAB_MANAGER = PABManager(config)


def enable_pab():
    """Return True when a PAB manager exists and any broadcast mode is on."""
    manager = PAB_MANAGER
    if manager is None:
        return False
    cfg = manager.config
    return cfg.cross_broadcast or cfg.spatial_broadcast or cfg.temporal_broadcast


def update_steps(steps: int):
    """Record the total number of diffusion steps on the active manager."""
    if PAB_MANAGER is not None:
        PAB_MANAGER.config.steps = steps
|
234 |
+
|
235 |
+
|
236 |
+
def if_broadcast_cross(timestep: int, count: int):
    """Module-level guard: defer to the manager only when PAB is enabled."""
    if enable_pab():
        return PAB_MANAGER.if_broadcast_cross(timestep, count)
    return False, count


def if_broadcast_temporal(timestep: int, count: int):
    """Module-level guard for temporal-attention broadcasting."""
    if enable_pab():
        return PAB_MANAGER.if_broadcast_temporal(timestep, count)
    return False, count


def if_broadcast_spatial(timestep: int, count: int, block_idx: int):
    """Module-level guard for spatial-attention broadcasting."""
    if enable_pab():
        return PAB_MANAGER.if_broadcast_spatial(timestep, count, block_idx)
    return False, count


def if_broadcast_full(timestep: int, count: int, block_idx: int):
    """Module-level guard for full-block broadcasting."""
    if enable_pab():
        return PAB_MANAGER.if_broadcast_full(timestep, count, block_idx)
    return False, count
|
257 |
+
|
258 |
+
|
259 |
+
def if_broadcast_mlp(timestep: int, count: int, block_idx: int, all_timesteps, is_temporal=False):
    """Module-level guard around `PABManager.if_skip_mlp`.

    Fix: `if_skip_mlp` returns a 4-tuple (flag, count, next_flag, skip_range),
    but the disabled path used to return only (False, count), which broke any
    caller unpacking four values. The fallback now matches that arity.
    """
    if not enable_pab():
        return False, count, False, None
    return PAB_MANAGER.if_skip_mlp(timestep, count, block_idx, all_timesteps, is_temporal)
|
263 |
+
|
264 |
+
|
265 |
+
def save_mlp_output(timestep: int, block_idx: int, ff_output, is_temporal=False):
    """Store a feed-forward output on the global PAB manager's cache."""
    return PAB_MANAGER.save_skip_output(timestep, block_idx, ff_output, is_temporal)


def get_mlp_output(skip_range, timestep, block_idx: int, is_temporal=False):
    """Fetch a cached feed-forward output from the global PAB manager."""
    return PAB_MANAGER.get_mlp_output(skip_range, timestep, block_idx, is_temporal)


def get_diffusion_skip():
    """True when PAB is enabled and diffusion-step skipping is configured."""
    return enable_pab() and PAB_MANAGER.config.diffusion_skip


def get_diffusion_timestep_respacing():
    # NOTE(review): unlike its siblings, this accessor has no enable_pab()/None
    # guard — it raises AttributeError if no manager is installed; confirm
    # callers only use it after set_pab_manager().
    return PAB_MANAGER.config.diffusion_timestep_respacing


def get_diffusion_skip_timestep():
    """Per-bin skip markers when PAB is enabled, else False."""
    return enable_pab() and PAB_MANAGER.config.diffusion_skip_timestep
|
283 |
+
|
284 |
+
|
285 |
+
def space_timesteps(time_steps, time_bins):
    """Spread sample timesteps across equal-width bins.

    `time_steps` is divided into len(time_bins) equal ranges; bin i
    contributes `time_bins[i]` evenly spaced integer timesteps from its
    range (endpoint excluded).

    Returns:
        torch.Tensor: int32 tensor of the selected timesteps, descending.
    """
    bin_size = time_steps // len(time_bins)
    selected = []
    for i, bin_count in enumerate(time_bins):
        lo = i * bin_size
        hi = lo + bin_size
        selected.extend(np.linspace(lo, hi, bin_count, endpoint=False, dtype=int).tolist())

    steps_tensor = torch.tensor(selected, dtype=torch.int32)
    return torch.sort(steps_tensor, descending=True).values
|
302 |
+
|
303 |
+
|
304 |
+
def skip_diffusion_timestep(timesteps, diffusion_skip_timestep):
    """Drop timesteps per-bin according to `diffusion_skip_timestep` markers.

    `timesteps` is split into len(diffusion_skip_timestep) contiguous bins.
    Marker semantics per bin: 0 keeps every timestep; 1 drops one timestep
    (the first element of the bin; timesteps here are presumably in
    descending order, so that is the largest t — TODO confirm); for a plain
    tensor input, any other non-zero marker n randomly drops n timesteps.

    Accepts either a list of tensors or a single tensor; returns the same
    container kind on the same device.
    """
    if isinstance(timesteps, list):
        # If timesteps is a list, we assume each element is a tensor
        timesteps_np = [t.cpu().numpy() for t in timesteps]
        device = timesteps[0].device
    else:
        # If timesteps is a tensor
        timesteps_np = timesteps.cpu().numpy()
        device = timesteps.device

    num_bins = len(diffusion_skip_timestep)

    if isinstance(timesteps_np, list):
        bin_size = len(timesteps_np) // num_bins
        new_timesteps = []

        for i in range(num_bins):
            # The last bin absorbs any remainder from integer division.
            bin_start = i * bin_size
            bin_end = (i + 1) * bin_size if i != num_bins - 1 else len(timesteps_np)
            bin_timesteps = timesteps_np[bin_start:bin_end]

            if diffusion_skip_timestep[i] == 0:
                # Marker 0: keep all timesteps in this bin.
                new_timesteps.extend(bin_timesteps)
            elif diffusion_skip_timestep[i] == 1:
                # Marker 1: drop one timestep (the first element of the bin).
                # NOTE(review): the list branch has no handling for markers > 1,
                # unlike the tensor branch below — those bins are dropped entirely.
                new_timesteps.extend(bin_timesteps[1:])

        new_timesteps_tensor = [torch.tensor(t, device=device) for t in new_timesteps]
    else:
        bin_size = len(timesteps_np) // num_bins
        new_timesteps = []

        for i in range(num_bins):
            bin_start = i * bin_size
            bin_end = (i + 1) * bin_size if i != num_bins - 1 else len(timesteps_np)
            bin_timesteps = timesteps_np[bin_start:bin_end]

            if diffusion_skip_timestep[i] == 0:
                # Marker 0: keep all timesteps in this bin.
                new_timesteps.extend(bin_timesteps)
            elif diffusion_skip_timestep[i] == 1:
                # Marker 1: drop one timestep (the first element of the bin).
                new_timesteps.extend(bin_timesteps[1:])
            elif diffusion_skip_timestep[i] != 0:
                # Any other non-zero marker n: randomly drop n timesteps from the bin.
                if len(bin_timesteps) > diffusion_skip_timestep[i]:
                    indices_to_remove = set(random.sample(range(len(bin_timesteps)), diffusion_skip_timestep[i]))
                    timesteps_to_keep = [
                        timestep for idx, timestep in enumerate(bin_timesteps) if idx not in indices_to_remove
                    ]
                else:
                    # If the bin has at most n elements, keep them all.
                    timesteps_to_keep = bin_timesteps
                new_timesteps.extend(timesteps_to_keep)

        new_timesteps_tensor = torch.tensor(new_timesteps, device=device)

    # NOTE(review): both branches below return the same value; the tail
    # if/else is redundant but preserved as-is.
    if isinstance(timesteps, list):
        return new_timesteps_tensor
    else:
        return new_timesteps_tensor
|
videosys/core/parallel_mgr.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import torch.distributed as dist
|
5 |
+
from colossalai.cluster.process_group_mesh import ProcessGroupMesh
|
6 |
+
from torch.distributed import ProcessGroup
|
7 |
+
|
8 |
+
from videosys.utils.logging import init_dist_logger, logger
|
9 |
+
from videosys.utils.utils import set_seed
|
10 |
+
|
11 |
+
PARALLEL_MANAGER = None
|
12 |
+
|
13 |
+
|
14 |
+
class ParallelManager(ProcessGroupMesh):
    """Process-group bookkeeping for data / cfg / sequence parallelism.

    Builds a 3D process-group mesh with axes (dp, cp, sp) and caches the
    group handle and this process's rank along each axis.
    """

    def __init__(self, dp_size, cp_size, sp_size):
        # Mesh axes: 0 = data parallel, 1 = cfg (classifier-free guidance)
        # parallel, 2 = sequence parallel.
        super().__init__(dp_size, cp_size, sp_size)
        dp_axis, cp_axis, sp_axis = 0, 1, 2

        self.dp_size = dp_size
        self.dp_group: ProcessGroup = self.get_group_along_axis(dp_axis)
        self.dp_rank = dist.get_rank(self.dp_group)

        self.cp_size = cp_size
        self.cp_group: ProcessGroup = self.get_group_along_axis(cp_axis)
        self.cp_rank = dist.get_rank(self.cp_group)

        self.sp_size = sp_size
        self.sp_group: ProcessGroup = self.get_group_along_axis(sp_axis)
        self.sp_rank = dist.get_rank(self.sp_group)
        # Sequence parallelism is only active with more than one rank on the sp axis.
        self.enable_sp = sp_size > 1

        logger.info(f"Init parallel manager with dp_size: {dp_size}, cp_size: {cp_size}, sp_size: {sp_size}")
|
33 |
+
|
34 |
+
|
35 |
+
def set_parallel_manager(dp_size, cp_size, sp_size):
    """Build a ParallelManager and install it as the module global."""
    global PARALLEL_MANAGER
    PARALLEL_MANAGER = ParallelManager(dp_size, cp_size, sp_size)
|
38 |
+
|
39 |
+
|
40 |
+
def get_data_parallel_group():
    """Process group spanning the data-parallel axis."""
    return PARALLEL_MANAGER.dp_group


def get_data_parallel_size():
    """Number of ranks along the data-parallel axis."""
    return PARALLEL_MANAGER.dp_size


def get_data_parallel_rank():
    """This process's rank within the data-parallel group."""
    return PARALLEL_MANAGER.dp_rank


def get_sequence_parallel_group():
    """Process group spanning the sequence-parallel axis."""
    return PARALLEL_MANAGER.sp_group


def get_sequence_parallel_size():
    """Number of ranks along the sequence-parallel axis."""
    return PARALLEL_MANAGER.sp_size


def get_sequence_parallel_rank():
    """This process's rank within the sequence-parallel group."""
    return PARALLEL_MANAGER.sp_rank


def get_cfg_parallel_group():
    """Process group for classifier-free-guidance (cfg) parallelism."""
    return PARALLEL_MANAGER.cp_group


def get_cfg_parallel_size():
    """Number of ranks along the cfg-parallel axis."""
    return PARALLEL_MANAGER.cp_size


def enable_sequence_parallel():
    """True when a manager is installed and sequence parallelism is active."""
    if PARALLEL_MANAGER is None:
        return False
    return PARALLEL_MANAGER.enable_sp


def get_parallel_manager():
    """Return the installed ParallelManager, or None before initialization."""
    return PARALLEL_MANAGER
|
80 |
+
|
81 |
+
|
82 |
+
def initialize(
    rank=0,
    world_size=1,
    init_method=None,
    seed: Optional[int] = None,
    sp_size: Optional[int] = None,
    enable_cp: bool = True,
):
    """Initialize torch.distributed (NCCL) and the global ParallelManager.

    Args:
        rank: Global rank of this process (also used as the CUDA device index).
        world_size: Total number of processes.
        init_method: Passed through to `dist.init_process_group`.
        seed: When given, RNGs are seeded with `seed + data_parallel_rank`.
        sp_size: Sequence-parallel size. Defaults to the whole world (dp=1);
            otherwise world_size must be divisible by it.
        enable_cp: When True and the sp size is even, a cfg-parallel group of
            size 2 is carved out of the sequence-parallel group.
    """
    if not dist.is_initialized():
        # NOTE(review): destroying inside the "not initialized" branch is at
        # best a no-op safeguard — confirm the condition was not meant inverted.
        try:
            dist.destroy_process_group()
        except Exception:
            pass
        dist.init_process_group(backend="nccl", init_method=init_method, world_size=world_size, rank=rank)
        torch.cuda.set_device(rank)
        init_dist_logger()
        # Allow TF32 for faster matmul/conv on Ampere+ GPUs.
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    # init sequence parallel
    if sp_size is None:
        sp_size = dist.get_world_size()
        dp_size = 1
    else:
        assert dist.get_world_size() % sp_size == 0, f"world_size {dist.get_world_size()} must be divisible by sp_size"
        dp_size = dist.get_world_size() // sp_size

    # update cfg parallel: cfg parallelism takes a factor of 2 out of sp
    if enable_cp and sp_size % 2 == 0:
        sp_size = sp_size // 2
        cp_size = 2
    else:
        cp_size = 1

    set_parallel_manager(dp_size, cp_size, sp_size)

    if seed is not None:
        set_seed(seed + get_data_parallel_rank())
|
videosys/core/pipeline.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import abstractmethod
|
2 |
+
from dataclasses import dataclass
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
6 |
+
from diffusers.utils import BaseOutput
|
7 |
+
|
8 |
+
|
9 |
+
class VideoSysPipeline(DiffusionPipeline):
    """Base class for VideoSys pipelines; `generate` is the public entry point."""

    def __init__(self):
        super().__init__()

    @staticmethod
    def set_eval_and_device(device: torch.device, *modules):
        """Put every module in eval mode and move it to `device`."""
        for mod in modules:
            mod.eval()
            mod.to(device)

    @abstractmethod
    def generate(self, *args, **kwargs):
        pass

    def __call__(self, *args, **kwargs):
        """
        In diffusers, it is a convention to call the pipeline object.
        But in VideoSys, we will use the generate method for better prompt.
        This is a wrapper for the generate method to support the diffusers usage.
        """
        return self.generate(*args, **kwargs)
|
30 |
+
|
31 |
+
|
32 |
+
@dataclass
class VideoSysPipelineOutput(BaseOutput):
    """Output container returned by VideoSys pipelines."""

    # Generated video frames as a tensor.
    video: torch.Tensor
|
videosys/core/shardformer/__init__.py
ADDED
File without changes
|
videosys/core/shardformer/t5/__init__.py
ADDED
File without changes
|
videosys/core/shardformer/t5/modeling.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
|
4 |
+
|
5 |
+
class T5LayerNorm(nn.Module):
    """T5-style RMSNorm: scale-only layer norm with no bias and no mean subtraction."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # RMSNorm (https://arxiv.org/abs/1910.07467): the variance is computed
        # in fp32 without centering for numerical stability with half-precision
        # inputs, and there is no bias term.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        normed = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # Cast back to half precision when the weights are fp16/bf16.
        if self.weight.dtype in (torch.float16, torch.bfloat16):
            normed = normed.to(self.weight.dtype)

        return self.weight * normed

    @staticmethod
    def from_native_module(module, *args, **kwargs):
        """Rebuild a T5LayerNorm from apex's FusedRMSNorm, copying its weights."""
        assert module.__class__.__name__ == "FusedRMSNorm", (
            "Recovering T5LayerNorm requires the original layer to be apex's Fused RMS Norm."
            "Apex's fused norm is automatically used by Hugging Face Transformers https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L265C5-L265C48"
        )

        rebuilt = T5LayerNorm(module.normalized_shape, eps=module.eps)
        rebuilt.weight.data.copy_(module.weight.data)
        return rebuilt.to(module.weight.device)
|
videosys/core/shardformer/t5/policy.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from colossalai.shardformer.modeling.jit import get_jit_fused_dropout_add_func
|
2 |
+
from colossalai.shardformer.modeling.t5 import get_jit_fused_T5_layer_ff_forward, get_T5_layer_self_attention_forward
|
3 |
+
from colossalai.shardformer.policies.base_policy import Policy, SubModuleReplacementDescription
|
4 |
+
|
5 |
+
|
6 |
+
class T5EncoderPolicy(Policy):
    """Colossal-AI shardformer policy for the T5 encoder.

    When apex is installed, swaps HF's FusedRMSNorm layers for the local
    T5LayerNorm; when jit fusion is enabled in the shard config, replaces
    the forward/dropout_add methods with fused variants.
    """

    def config_sanity_check(self):
        # This policy supports neither tensor parallelism nor flash attention.
        assert not self.shard_config.enable_tensor_parallelism
        assert not self.shard_config.enable_flash_attention

    def preprocess(self):
        return self.model

    def module_policy(self):
        from transformers.models.t5.modeling_t5 import T5LayerFF, T5LayerSelfAttention, T5Stack

        policy = {}

        # check whether apex is installed
        try:
            from apex.normalization import FusedRMSNorm  # noqa

            from videosys.core.shardformer.t5.modeling import T5LayerNorm

            # recover hf from fused rms norm to T5 norm which is faster
            self.append_or_create_submodule_replacement(
                description=SubModuleReplacementDescription(
                    suffix="layer_norm",
                    target_module=T5LayerNorm,
                ),
                policy=policy,
                target_key=T5LayerFF,
            )
            self.append_or_create_submodule_replacement(
                description=SubModuleReplacementDescription(suffix="layer_norm", target_module=T5LayerNorm),
                policy=policy,
                target_key=T5LayerSelfAttention,
            )
            self.append_or_create_submodule_replacement(
                description=SubModuleReplacementDescription(suffix="final_layer_norm", target_module=T5LayerNorm),
                policy=policy,
                target_key=T5Stack,
            )
        except (ImportError, ModuleNotFoundError):
            # apex not available: keep the model's original norm layers
            pass

        # use jit operator
        if self.shard_config.enable_jit_fused:
            self.append_or_create_method_replacement(
                description={
                    "forward": get_jit_fused_T5_layer_ff_forward(),
                    "dropout_add": get_jit_fused_dropout_add_func(),
                },
                policy=policy,
                target_key=T5LayerFF,
            )
            self.append_or_create_method_replacement(
                description={
                    "forward": get_T5_layer_self_attention_forward(),
                    "dropout_add": get_jit_fused_dropout_add_func(),
                },
                policy=policy,
                target_key=T5LayerSelfAttention,
            )

        return policy

    def postprocess(self):
        return self.model
|
videosys/datasets/dataloader.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
from typing import Iterator, Optional
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
from torch.utils.data import DataLoader, Dataset, DistributedSampler
|
7 |
+
from torch.utils.data.distributed import DistributedSampler
|
8 |
+
|
9 |
+
from videosys.core.parallel_mgr import ParallelManager
|
10 |
+
|
11 |
+
|
12 |
+
class StatefulDistributedSampler(DistributedSampler):
    """DistributedSampler that can resume iteration from a saved offset."""

    def __init__(
        self,
        dataset: Dataset,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
    ) -> None:
        super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last)
        # Number of samples already consumed this epoch; skipped on resume.
        self.start_index: int = 0

    def __iter__(self) -> Iterator:
        # Drop the first `start_index` entries of the parent's ordering.
        remaining = list(super().__iter__())[self.start_index :]
        return iter(remaining)

    def __len__(self) -> int:
        return self.num_samples - self.start_index

    def set_start_index(self, start_index: int) -> None:
        """Record how many samples to skip on the next iteration pass."""
        self.start_index = start_index
36 |
+
|
37 |
+
|
38 |
+
def prepare_dataloader(
    dataset,
    batch_size,
    shuffle=False,
    seed=1024,
    drop_last=False,
    pin_memory=False,
    num_workers=0,
    pg_manager: Optional[ParallelManager] = None,
    **kwargs,
):
    r"""
    Prepare a dataloader for distributed training. The dataloader will be wrapped by
    `torch.utils.data.DataLoader` and `StatefulDistributedSampler`.


    Args:
        dataset (`torch.utils.data.Dataset`): The dataset to be loaded.
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
        seed (int, optional): Random worker seed for sampling, defaults to 1024.
        add_sampler: Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True.
        drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
            is not divisible by the batch size. If False and the size of dataset is not divisible by
            the batch size, then the last batch will be smaller, defaults to False.
        pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
        num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
        kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
            `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.

    Returns:
        :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
    """
    _kwargs = kwargs.copy()
    # NOTE(review): `pg_manager.dp_axis` is not an attribute of ParallelManager
    # as defined in this repo (dp_axis is a local in its __init__), and the
    # default pg_manager=None would fail here — confirm what callers pass.
    sampler = StatefulDistributedSampler(
        dataset,
        num_replicas=pg_manager.size(pg_manager.dp_axis),
        rank=pg_manager.coordinate(pg_manager.dp_axis),
        shuffle=shuffle,
    )

    # Deterministic dataloader: every worker gets the same fixed seed.
    def seed_worker(worker_id):
        worker_seed = seed
        np.random.seed(worker_seed)
        torch.manual_seed(worker_seed)
        random.seed(worker_seed)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        worker_init_fn=seed_worker,
        drop_last=drop_last,
        pin_memory=pin_memory,
        num_workers=num_workers,
        **_kwargs,
    )
|
videosys/datasets/image_transform.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Adapted from DiT
|
2 |
+
|
3 |
+
# This source code is licensed under the license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
# --------------------------------------------------------
|
6 |
+
# References:
|
7 |
+
# DiT: https://github.com/facebookresearch/DiT
|
8 |
+
# --------------------------------------------------------
|
9 |
+
|
10 |
+
|
11 |
+
import numpy as np
|
12 |
+
import torchvision.transforms as transforms
|
13 |
+
from PIL import Image
|
14 |
+
|
15 |
+
|
16 |
+
def center_crop_arr(pil_image, image_size):
    """
    Center cropping implementation from ADM.
    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
    """
    # Halve with a box filter until the short side is under twice the target.
    while min(*pil_image.size) >= 2 * image_size:
        half = tuple(x // 2 for x in pil_image.size)
        pil_image = pil_image.resize(half, resample=Image.BOX)

    # Bicubic resize so the short side equals image_size, then center crop.
    scale = image_size / min(*pil_image.size)
    new_size = tuple(round(x * scale) for x in pil_image.size)
    pil_image = pil_image.resize(new_size, resample=Image.BICUBIC)

    arr = np.array(pil_image)
    top = (arr.shape[0] - image_size) // 2
    left = (arr.shape[1] - image_size) // 2
    return Image.fromarray(arr[top : top + image_size, left : left + image_size])
|
31 |
+
|
32 |
+
|
33 |
+
def get_transforms_image(image_size=256):
    """Image transform: center crop, random horizontal flip, normalize to [-1, 1]."""
    return transforms.Compose(
        [
            transforms.Lambda(lambda img: center_crop_arr(img, image_size)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ]
    )
|
videosys/datasets/video_transform.py
ADDED
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Adapted from OpenSora and Latte
|
2 |
+
|
3 |
+
# This source code is licensed under the license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
# --------------------------------------------------------
|
6 |
+
# References:
|
7 |
+
# OpenSora: https://github.com/hpcaitech/Open-Sora
|
8 |
+
# Latte: https://github.com/Vchitect/Latte
|
9 |
+
# --------------------------------------------------------
|
10 |
+
|
11 |
+
import numbers
|
12 |
+
import random
|
13 |
+
|
14 |
+
import numpy as np
|
15 |
+
import torch
|
16 |
+
from PIL import Image
|
17 |
+
|
18 |
+
|
19 |
+
def _is_tensor_video_clip(clip):
|
20 |
+
if not torch.is_tensor(clip):
|
21 |
+
raise TypeError("clip should be Tensor. Got %s" % type(clip))
|
22 |
+
|
23 |
+
if not clip.ndimension() == 4:
|
24 |
+
raise ValueError("clip should be 4D. Got %dD" % clip.dim())
|
25 |
+
|
26 |
+
return True
|
27 |
+
|
28 |
+
|
29 |
+
def center_crop_arr(pil_image, image_size):
    """
    Center cropping implementation from ADM.
    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
    """
    # Repeated box-filter halving until the short side is under 2x the target.
    while min(*pil_image.size) >= 2 * image_size:
        pil_image = pil_image.resize(tuple(side // 2 for side in pil_image.size), resample=Image.BOX)

    ratio = image_size / min(*pil_image.size)
    pil_image = pil_image.resize(tuple(round(side * ratio) for side in pil_image.size), resample=Image.BICUBIC)

    pixels = np.array(pil_image)
    y0 = (pixels.shape[0] - image_size) // 2
    x0 = (pixels.shape[1] - image_size) // 2
    return Image.fromarray(pixels[y0 : y0 + image_size, x0 : x0 + image_size])
|
44 |
+
|
45 |
+
|
46 |
+
def crop(clip, i, j, h, w):
    """Spatially crop a (T, C, H, W) clip to an h x w window with top-left (i, j)."""
    if clip.dim() != 4:
        raise ValueError("clip should be a 4D tensor")
    return clip[..., i : i + h, j : j + w]
|
54 |
+
|
55 |
+
|
56 |
+
def resize(clip, target_size, interpolation_mode):
    """Resize a clip to exactly (height, width) = target_size."""
    if len(target_size) != 2:
        raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
    return torch.nn.functional.interpolate(
        clip, size=target_size, mode=interpolation_mode, align_corners=False
    )
|
60 |
+
|
61 |
+
|
62 |
+
def resize_scale(clip, target_size, interpolation_mode):
    """Uniformly scale a clip so its shorter side matches target_size[0]."""
    if len(target_size) != 2:
        raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
    height, width = clip.size(-2), clip.size(-1)
    factor = target_size[0] / min(height, width)
    return torch.nn.functional.interpolate(
        clip, scale_factor=factor, mode=interpolation_mode, align_corners=False
    )
|
68 |
+
|
69 |
+
|
70 |
+
def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
    """
    Do spatial cropping and resizing to the video clip
    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        i (int): i in (i,j) i.e coordinates of the upper left corner.
        j (int): j in (i,j) i.e coordinates of the upper left corner.
        h (int): Height of the cropped region.
        w (int): Width of the cropped region.
        size (tuple(int, int)): height and width of resized clip
    Returns:
        clip (torch.tensor): Resized and cropped clip. Size is (T, C, H, W)
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    cropped = crop(clip, i, j, h, w)
    return resize(cropped, size, interpolation_mode)
|
88 |
+
|
89 |
+
|
90 |
+
def center_crop(clip, crop_size):
    """Crop the spatial center of a (T, C, H, W) clip to crop_size = (th, tw)."""
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    th, tw = crop_size
    if h < th or w < tw:
        raise ValueError("height and width must be no smaller than crop_size")

    top = int(round((h - th) / 2.0))
    left = int(round((w - tw) / 2.0))
    return crop(clip, top, left, th, tw)
|
101 |
+
|
102 |
+
|
103 |
+
def center_crop_using_short_edge(clip):
    """Center-crop a clip to a square whose side is the shorter spatial edge."""
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    side = min(h, w)
    if h < w:
        top, left = 0, int(round((w - side) / 2.0))
    else:
        top, left = int(round((h - side) / 2.0)), 0
    return crop(clip, top, left, side, side)
|
116 |
+
|
117 |
+
|
118 |
+
def random_shift_crop(clip):
    """
    Slide along the long edge, with the short edge as crop size
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)

    # The square crop side is the shorter spatial edge.
    edge = min(h, w)
    top = torch.randint(0, h - edge + 1, size=(1,)).item()
    left = torch.randint(0, w - edge + 1, size=(1,)).item()
    return crop(clip, top, left, edge, edge)
|
136 |
+
|
137 |
+
|
138 |
+
def to_tensor(clip):
    """
    Convert a uint8 clip to float and rescale values into [0, 1].

    Args:
        clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
    Return:
        clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
    """
    _is_tensor_video_clip(clip)
    if clip.dtype != torch.uint8:
        raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
    return clip.float() / 255.0
|
152 |
+
|
153 |
+
|
154 |
+
def normalize(clip, mean, std, inplace=False):
    """
    Normalize a video clip: (clip - mean) / std, applied in place on a copy
    unless ``inplace`` is True.

    Args:
        clip (torch.tensor): 4D video clip to be normalized.
        mean (tuple): pixel RGB mean. Size is (3)
        std (tuple): pixel standard deviation. Size is (3)
    Returns:
        normalized clip (torch.tensor)

    NOTE(review): mean/std are broadcast over the *first* tensor dimension
    (``mean[:, None, None, None]``), which implies channel-first (C, T, H, W)
    input despite the (T, C, H, W) wording elsewhere — confirm with callers.
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    target = clip if inplace else clip.clone()
    mean_t = torch.as_tensor(mean, dtype=target.dtype, device=target.device)
    std_t = torch.as_tensor(std, dtype=target.dtype, device=target.device)
    target.sub_(mean_t[:, None, None, None]).div_(std_t[:, None, None, None])
    return target
|
172 |
+
|
173 |
+
|
174 |
+
def hflip(clip):
    """
    Args:
        clip (torch.tensor): 4D video clip. Size is (T, C, H, W)
    Returns:
        flipped clip (torch.tensor): clip mirrored along its last (width) axis.
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    return clip.flip(-1)
|
184 |
+
|
185 |
+
|
186 |
+
class RandomCropVideo:
    """Randomly crop a (T, C, H, W) clip to a fixed spatial size."""

    def __init__(self, size):
        # A bare number means a square crop.
        self.size = (int(size), int(size)) if isinstance(size, numbers.Number) else size

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: randomly cropped video clip of size (T, C, OH, OW).
        """
        top, left, h, w = self.get_params(clip)
        return crop(clip, top, left, h, w)

    def get_params(self, clip):
        """Sample a valid (i, j, th, tw) crop box for this clip."""
        h, w = clip.shape[-2:]
        th, tw = self.size

        if h < th or w < tw:
            raise ValueError(f"Required crop size {(th, tw)} is larger than input image size {(h, w)}")

        # Exact fit: no randomness needed.
        if w == tw and h == th:
            return 0, 0, h, w

        top = torch.randint(0, h - th + 1, size=(1,)).item()
        left = torch.randint(0, w - tw + 1, size=(1,)).item()
        return top, left, th, tw

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size})"
|
221 |
+
|
222 |
+
|
223 |
+
class CenterCropResizeVideo:
    """
    First use the short side for cropping length,
    center crop video, then resize to the specified size
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: scale resized / center cropped video clip.
                size is (T, C, crop_size, crop_size)
        """
        clip_center_crop = center_crop_using_short_edge(clip)
        clip_center_crop_resize = resize(
            clip_center_crop, target_size=self.size, interpolation_mode=self.interpolation_mode
        )
        return clip_center_crop_resize

    def __repr__(self) -> str:
        # BUG FIX: the repr f-string was missing its closing parenthesis.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"
|
259 |
+
|
260 |
+
|
261 |
+
class UCFCenterCropVideo:
    """
    First scale to the specified size in equal proportion to the short edge,
    then center cropping
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: scale resized / center cropped video clip.
                size is (T, C, crop_size, crop_size)
        """
        clip_resize = resize_scale(clip=clip, target_size=self.size, interpolation_mode=self.interpolation_mode)
        clip_center_crop = center_crop(clip_resize, self.size)
        return clip_center_crop

    def __repr__(self) -> str:
        # BUG FIX: the repr f-string was missing its closing parenthesis.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"
|
295 |
+
|
296 |
+
|
297 |
+
class KineticsRandomCropResizeVideo:
    """
    Slide along the long edge, with the short edge as crop size. And resize to the desired size.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        # Square-crop at a random offset along the long edge, then resize.
        cropped = random_shift_crop(clip)
        return resize(cropped, self.size, self.interpolation_mode)
|
320 |
+
|
321 |
+
|
322 |
+
class CenterCropVideo:
    """Center-crop a (T, C, H, W) clip to a fixed spatial size (no resizing)."""

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        # NOTE(review): interpolation_mode is stored but never used by
        # __call__ (center_crop does not resample) — kept for interface
        # compatibility with the sibling transforms.
        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: center cropped video clip.
                size is (T, C, crop_size, crop_size)
        """
        clip_center_crop = center_crop(clip, self.size)
        return clip_center_crop

    def __repr__(self) -> str:
        # BUG FIX: the repr f-string was missing its closing parenthesis.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"
|
350 |
+
|
351 |
+
|
352 |
+
class NormalizeVideo:
    """
    Normalize the video clip by mean subtraction and division by standard deviation
    Args:
        mean (3-tuple): pixel RGB mean
        std (3-tuple): pixel RGB standard deviation
        inplace (boolean): whether do in-place normalization
    """

    def __init__(self, mean, std, inplace=False):
        self.mean = mean
        self.std = std
        self.inplace = inplace

    def __call__(self, clip):
        """Apply channel-wise normalization to ``clip`` (C, T, H, W)."""
        return normalize(clip, self.mean, self.std, self.inplace)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
|
375 |
+
|
376 |
+
|
377 |
+
class ToTensorVideo:
    """Scale a uint8 (T, C, H, W) clip to float values in [0, 1] via ``to_tensor``."""

    def __init__(self):
        pass

    def __call__(self, clip):
        """Convert ``clip`` (torch.uint8, (T, C, H, W)) to torch.float in [0, 1]."""
        return to_tensor(clip)

    def __repr__(self) -> str:
        return self.__class__.__name__
|
397 |
+
|
398 |
+
|
399 |
+
class RandomHorizontalFlipVideo:
    """
    Flip the video clip along the horizontal direction with a given probability
    Args:
        p (float): probability of the clip being flipped. Default value is 0.5
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, clip):
        """Return ``clip`` (T, C, H, W) flipped along its last axis with probability ``p``."""
        flip_now = random.random() < self.p
        return hflip(clip) if flip_now else clip

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(p={self.p})"
|
422 |
+
|
423 |
+
|
424 |
+
# ------------------------------------------------------------
|
425 |
+
# --------------------- Sampling ---------------------------
|
426 |
+
# ------------------------------------------------------------
|
427 |
+
class TemporalRandomCrop(object):
    """Temporally crop the given frame indices at a random location.

    Args:
        size (int): Desired length of frames will be seen in the model.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, total_frames):
        """Return a (begin, end) frame-index pair of at most ``self.size`` frames."""
        last_valid_start = max(0, total_frames - self.size - 1)
        start = random.randint(0, last_valid_start)
        stop = min(start + self.size, total_frames)
        return start, stop
|
videosys/diffusion/__init__.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Modified from OpenAI's diffusion repos and Meta DiT
|
2 |
+
# DiT: https://github.com/facebookresearch/DiT/tree/main
|
3 |
+
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
|
4 |
+
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
|
5 |
+
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
|
6 |
+
|
7 |
+
from . import gaussian_diffusion as gd
|
8 |
+
from .respace import SpacedDiffusion, space_timesteps
|
9 |
+
|
10 |
+
|
11 |
+
def create_diffusion(
    timestep_respacing,
    noise_schedule="linear",
    use_kl=False,
    sigma_small=False,
    predict_xstart=False,
    learn_sigma=True,
    rescale_learned_sigmas=False,
    diffusion_steps=1000,
):
    """Build a SpacedDiffusion configured from the given training/sampling flags."""
    betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)

    # Loss selection: KL takes precedence over rescaled-MSE, else plain MSE.
    if use_kl:
        loss_type = gd.LossType.RESCALED_KL
    elif rescale_learned_sigmas:
        loss_type = gd.LossType.RESCALED_MSE
    else:
        loss_type = gd.LossType.MSE

    # None or "" means "use every diffusion step".
    if timestep_respacing is None or timestep_respacing == "":
        timestep_respacing = [diffusion_steps]

    mean_type = gd.ModelMeanType.START_X if predict_xstart else gd.ModelMeanType.EPSILON
    if learn_sigma:
        var_type = gd.ModelVarType.LEARNED_RANGE
    elif sigma_small:
        var_type = gd.ModelVarType.FIXED_SMALL
    else:
        var_type = gd.ModelVarType.FIXED_LARGE

    return SpacedDiffusion(
        use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
        betas=betas,
        model_mean_type=mean_type,
        model_var_type=var_type,
        loss_type=loss_type,
        # rescale_timesteps=rescale_timesteps,
    )
|
videosys/diffusion/diffusion_utils.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Modified from OpenAI's diffusion repos
|
2 |
+
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
|
3 |
+
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
|
4 |
+
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import torch as th
|
8 |
+
|
9 |
+
|
10 |
+
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    Compute the KL divergence between two gaussians.
    Shapes are automatically broadcasted, so batches can be compared to
    scalars, among other use cases.
    """
    # Find a reference Tensor so scalar arguments can be promoted onto the
    # right device/dtype.
    ref = next((a for a in (mean1, logvar1, mean2, logvar2) if isinstance(a, th.Tensor)), None)
    assert ref is not None, "at least one argument must be a Tensor"

    # Force variances to be Tensors. Broadcasting helps convert scalars to
    # Tensors, but it does not work for th.exp().
    logvar1, logvar2 = (x if isinstance(x, th.Tensor) else th.tensor(x).to(ref) for x in (logvar1, logvar2))

    return 0.5 * (-1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * th.exp(-logvar2))
|
28 |
+
|
29 |
+
|
30 |
+
def approx_standard_normal_cdf(x):
    """
    A fast tanh-based approximation of the cumulative distribution function
    of the standard normal.
    """
    inner = np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))
    return 0.5 * (1.0 + th.tanh(inner))
|
36 |
+
|
37 |
+
|
38 |
+
def continuous_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Compute the log-likelihood of a continuous Gaussian distribution.
    :param x: the targets
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    # Standardize the target, then score it under a unit normal.
    z = (x - means) * th.exp(-log_scales)
    std_normal = th.distributions.Normal(th.zeros_like(x), th.ones_like(x))
    return std_normal.log_prob(z)
|
51 |
+
|
52 |
+
|
53 |
+
def discretized_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Compute the log-likelihood of a Gaussian distribution discretizing to a
    given image.
    :param x: the target images. It is assumed that this was uint8 values,
              rescaled to the range [-1, 1].
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    assert x.shape == means.shape == log_scales.shape
    centered_x = x - means
    inv_stdv = th.exp(-log_scales)
    # Evaluate the CDF half a pixel (1/255 in [-1, 1] space) above and below x;
    # the mass of the discretized bin is the difference of the two.
    plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
    cdf_plus = approx_standard_normal_cdf(plus_in)
    min_in = inv_stdv * (centered_x - 1.0 / 255.0)
    cdf_min = approx_standard_normal_cdf(min_in)
    # Clamp before log so a numerically saturated CDF does not produce -inf.
    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
    cdf_delta = cdf_plus - cdf_min
    # Edge pixel values (below -0.999 / above 0.999) absorb the full tail mass.
    log_probs = th.where(
        x < -0.999,
        log_cdf_plus,
        th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
    )
    assert log_probs.shape == x.shape
    return log_probs
|
videosys/diffusion/gaussian_diffusion.py
ADDED
@@ -0,0 +1,829 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Modified from OpenAI's diffusion repos
|
2 |
+
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
|
3 |
+
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
|
4 |
+
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
|
5 |
+
|
6 |
+
|
7 |
+
import enum
|
8 |
+
import math
|
9 |
+
|
10 |
+
import numpy as np
|
11 |
+
import torch as th
|
12 |
+
|
13 |
+
from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
|
14 |
+
|
15 |
+
|
16 |
+
def mean_flat(tensor):
    """
    Take the mean over all non-batch dimensions.
    """
    non_batch_dims = list(range(1, tensor.dim()))
    return tensor.mean(dim=non_batch_dims)
|
21 |
+
|
22 |
+
|
23 |
+
class ModelMeanType(enum.Enum):
    """
    Which type of output the model predicts.

    Member order matters: values come from enum.auto(), so do not reorder.
    """

    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
    START_X = enum.auto()  # the model predicts x_0
    EPSILON = enum.auto()  # the model predicts epsilon
|
31 |
+
|
32 |
+
|
33 |
+
class ModelVarType(enum.Enum):
    """
    What is used as the model's output variance.
    The LEARNED_RANGE option has been added to allow the model to predict
    values between FIXED_SMALL and FIXED_LARGE, making its job easier.

    Member order matters: values come from enum.auto(), so do not reorder.
    """

    LEARNED = enum.auto()
    FIXED_SMALL = enum.auto()
    FIXED_LARGE = enum.auto()
    LEARNED_RANGE = enum.auto()  # model output interpolates between FIXED_SMALL and FIXED_LARGE (see docstring)
|
44 |
+
|
45 |
+
|
46 |
+
class LossType(enum.Enum):
    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)
    RESCALED_MSE = enum.auto()  # use raw MSE loss (with RESCALED_KL when learning variances)
    KL = enum.auto()  # use the variational lower-bound
    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB

    def is_vb(self):
        """Return True for the variational-bound losses (KL / RESCALED_KL)."""
        return self in (LossType.KL, LossType.RESCALED_KL)
|
54 |
+
|
55 |
+
|
56 |
+
def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
|
57 |
+
betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
|
58 |
+
warmup_time = int(num_diffusion_timesteps * warmup_frac)
|
59 |
+
betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
|
60 |
+
return betas
|
61 |
+
|
62 |
+
|
63 |
+
def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
    """
    This is the deprecated API for creating beta schedules.
    See get_named_beta_schedule() for the new library of schedules.
    """
    n = num_diffusion_timesteps
    if beta_schedule == "quad":
        betas = np.linspace(beta_start**0.5, beta_end**0.5, n, dtype=np.float64) ** 2
    elif beta_schedule == "linear":
        betas = np.linspace(beta_start, beta_end, n, dtype=np.float64)
    elif beta_schedule == "warmup10":
        betas = _warmup_beta(beta_start, beta_end, n, 0.1)
    elif beta_schedule == "warmup50":
        betas = _warmup_beta(beta_start, beta_end, n, 0.5)
    elif beta_schedule == "const":
        betas = np.full(n, beta_end, dtype=np.float64)
    elif beta_schedule == "jsd":  # 1/T, 1/(T-1), 1/(T-2), ..., 1
        betas = 1.0 / np.linspace(n, 1, n, dtype=np.float64)
    else:
        raise NotImplementedError(beta_schedule)
    assert betas.shape == (n,)
    return betas
|
92 |
+
|
93 |
+
|
94 |
+
def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Get a pre-defined beta schedule for the given name.
    The beta schedule library consists of beta schedules which remain similar
    in the limit of num_diffusion_timesteps.
    Beta schedules may be added, but should not be removed or changed once
    they are committed to maintain backwards compatibility.
    """
    if schedule_name == "linear":
        # Linear schedule from Ho et al, rescaled so it works for any
        # number of diffusion steps.
        scale = 1000 / num_diffusion_timesteps
        return get_beta_schedule(
            "linear",
            beta_start=scale * 0.0001,
            beta_end=scale * 0.02,
            num_diffusion_timesteps=num_diffusion_timesteps,
        )
    if schedule_name == "squaredcos_cap_v2":
        # Cosine schedule expressed through its alpha-bar function.
        return betas_for_alpha_bar(
            num_diffusion_timesteps,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )
    raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
|
119 |
+
|
120 |
+
|
121 |
+
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function,
    which defines the cumulative product of (1-beta) over time from t = [0,1].
    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    """
    n = num_diffusion_timesteps
    # beta_i = 1 - alpha_bar(t_{i+1}) / alpha_bar(t_i), capped at max_beta.
    betas = [min(1 - alpha_bar((i + 1) / n) / alpha_bar(i / n), max_beta) for i in range(n)]
    return np.array(betas)
|
138 |
+
|
139 |
+
|
140 |
+
class GaussianDiffusion:
|
141 |
+
"""
|
142 |
+
Utilities for training and sampling diffusion models.
|
143 |
+
Original ported from this codebase:
|
144 |
+
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
|
145 |
+
:param betas: a 1-D numpy array of betas for each diffusion timestep,
|
146 |
+
starting at T and going to 1.
|
147 |
+
"""
|
148 |
+
|
149 |
+
    def __init__(self, *, betas, model_mean_type, model_var_type, loss_type):
        """Precompute all per-timestep constants used by q/p computations.

        :param betas: 1-D array-like of per-timestep betas in (0, 1].
        :param model_mean_type: what quantity the model predicts (ModelMeanType).
        :param model_var_type: how the model's variance output is interpreted (ModelVarType).
        :param loss_type: which training loss to use (LossType).
        """
        self.model_mean_type = model_mean_type
        self.model_var_type = model_var_type
        self.loss_type = loss_type

        # Use float64 for accuracy.
        betas = np.array(betas, dtype=np.float64)
        self.betas = betas
        assert len(betas.shape) == 1, "betas must be 1-D"
        assert (betas > 0).all() and (betas <= 1).all()

        self.num_timesteps = int(betas.shape[0])

        alphas = 1.0 - betas
        self.alphas_cumprod = np.cumprod(alphas, axis=0)
        # Shifted cumulative products: prev is padded with 1.0 at t=0,
        # next is padded with 0.0 at t=T-1.
        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
        self.posterior_log_variance_clipped = (
            np.log(np.append(self.posterior_variance[1], self.posterior_variance[1:]))
            if len(self.posterior_variance) > 1
            else np.array([])
        )

        self.posterior_mean_coef1 = betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
|
186 |
+
|
187 |
+
    def q_mean_variance(self, x_start, t):
        """
        Get the distribution q(x_t | x_0).
        :param x_start: the [N x C x ...] tensor of noiseless inputs.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
        """
        # mean = sqrt(alpha_bar_t) * x_0; variance = 1 - alpha_bar_t.
        mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
        log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
        return mean, variance, log_variance
|
198 |
+
|
199 |
+
    def q_sample(self, x_start, t, noise=None):
        """
        Diffuse the data for a given number of diffusion steps.
        In other words, sample from q(x_t | x_0).
        :param x_start: the initial data batch.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :param noise: if specified, the split-out normal noise.
        :return: A noisy version of x_start.
        """
        if noise is None:
            # Default to fresh standard-normal noise matching x_start.
            noise = th.randn_like(x_start)
        assert noise.shape == x_start.shape
        # x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise
        return (
            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
        )
|
215 |
+
|
216 |
+
    def q_posterior_mean_variance(self, x_start, x_t, t):
        """
        Compute the mean and variance of the diffusion posterior:
            q(x_{t-1} | x_t, x_0)

        :param x_start: the (predicted or true) x_0 batch, same shape as x_t.
        :param x_t: the noisy batch at timestep t.
        :param t: a 1-D tensor of timesteps.
        :return: (posterior_mean, posterior_variance, posterior_log_variance_clipped).
        """
        assert x_start.shape == x_t.shape
        # Posterior mean is a fixed linear combination of x_0 and x_t with
        # the coefficients precomputed in __init__.
        posterior_mean = (
            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = _extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape)
        assert (
            posterior_mean.shape[0]
            == posterior_variance.shape[0]
            == posterior_log_variance_clipped.shape[0]
            == x_start.shape[0]
        )
        return posterior_mean, posterior_variance, posterior_log_variance_clipped
|
235 |
+
|
236 |
+
def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
    """
    Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
    the initial x, x_0.
    :param model: the model, which takes a signal and a batch of timesteps
                  as input.
    :param x: the [N x C x ...] tensor at time t.
    :param t: a 1-D Tensor of timesteps.
    :param clip_denoised: if True, clip the denoised signal into [-1, 1].
    :param denoised_fn: if not None, a function which applies to the
        x_start prediction before it is used to sample. Applies before
        clip_denoised.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
        pass to the model. This can be used for conditioning.
    :return: a dict with the following keys:
             - 'mean': the model mean output.
             - 'variance': the model variance output.
             - 'log_variance': the log of 'variance'.
             - 'pred_xstart': the prediction for x_0.
    """
    if model_kwargs is None:
        model_kwargs = {}

    B, C = x.shape[:2]
    assert t.shape == (B,)
    model_output = model(x, t, **model_kwargs)
    # Some models return (output, extra); unpack so the rest of the code
    # sees just the tensor and the extra payload is passed through.
    if isinstance(model_output, tuple):
        model_output, extra = model_output
    else:
        extra = None

    if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
        # Learned variance: the model emits 2*C channels — first C are the
        # mean-related output, last C parameterize the variance.
        assert model_output.shape == (B, C * 2, *x.shape[2:])
        model_output, model_var_values = th.split(model_output, C, dim=1)
        min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
        max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
        # The model_var_values is [-1, 1] for [min_var, max_var].
        frac = (model_var_values + 1) / 2
        model_log_variance = frac * max_log + (1 - frac) * min_log
        model_variance = th.exp(model_log_variance)
    else:
        # Fixed variance: look up the schedule arrays by variance type.
        model_variance, model_log_variance = {
            # for fixedlarge, we set the initial (log-)variance like so
            # to get a better decoder log likelihood.
            ModelVarType.FIXED_LARGE: (
                np.append(self.posterior_variance[1], self.betas[1:]),
                np.log(np.append(self.posterior_variance[1], self.betas[1:])),
            ),
            ModelVarType.FIXED_SMALL: (
                self.posterior_variance,
                self.posterior_log_variance_clipped,
            ),
        }[self.model_var_type]
        model_variance = _extract_into_tensor(model_variance, t, x.shape)
        model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)

    def process_xstart(x):
        # Optional user transform first, then optional clamping to [-1, 1].
        if denoised_fn is not None:
            x = denoised_fn(x)
        if clip_denoised:
            return x.clamp(-1, 1)
        return x

    if self.model_mean_type == ModelMeanType.START_X:
        pred_xstart = process_xstart(model_output)
    else:
        # Model predicts epsilon: convert it to an x_0 prediction first.
        pred_xstart = process_xstart(self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output))
    model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)

    assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
    return {
        "mean": model_mean,
        "variance": model_variance,
        "log_variance": model_log_variance,
        "pred_xstart": pred_xstart,
        "extra": extra,
    }
|
313 |
+
|
314 |
+
def _predict_xstart_from_eps(self, x_t, t, eps):
    """Recover the x_0 prediction implied by x_t and a predicted noise eps."""
    assert x_t.shape == eps.shape
    # Invert the forward process: x_0 = (1/sqrt(a_bar)) * x_t - sqrt(1/a_bar - 1) * eps.
    recip = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
    recipm1 = _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
    return recip * x_t - recipm1 * eps
|
320 |
+
|
321 |
+
def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
    """Recover the noise eps implied by x_t and a prediction of x_0."""
    # Algebraic inverse of _predict_xstart_from_eps.
    recip = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
    recipm1 = _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
    return (recip * x_t - pred_xstart) / recipm1
|
325 |
+
|
326 |
+
def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
    """
    Compute the mean for the previous step, given a function cond_fn that
    computes the gradient of a conditional log probability with respect to
    x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
    condition on y.

    This uses the conditioning strategy from Sohl-Dickstein et al. (2015).

    :param cond_fn: callable (x, t, **model_kwargs) -> gradient tensor.
    :param p_mean_var: dict with at least "mean" and "variance" tensors
        (as returned by p_mean_variance).
    :param x: the current sample tensor.
    :param t: a 1-D batch of timesteps.
    :param model_kwargs: optional extra keyword arguments for cond_fn.
    :return: the shifted mean tensor.
    """
    # Bug fix: the original unpacked **model_kwargs while it defaulted to
    # None, raising TypeError when the argument was omitted. Normalize to
    # an empty dict first (consistent with p_mean_variance).
    if model_kwargs is None:
        model_kwargs = {}
    gradient = cond_fn(x, t, **model_kwargs)
    # Shift the mean along the conditioning gradient, scaled by the variance.
    new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
    return new_mean
|
337 |
+
|
338 |
+
def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
    """
    Compute what the p_mean_variance output would have been, should the
    model's score function be conditioned by cond_fn.
    See condition_mean() for details on cond_fn.
    Unlike condition_mean(), this instead uses the conditioning strategy
    from Song et al (2020).

    :return: a copy of p_mean_var with "pred_xstart" and "mean" replaced
        by their conditioned versions.
    """
    # Bug fix: the original unpacked **model_kwargs while it defaulted to
    # None, raising TypeError when the argument was omitted.
    if model_kwargs is None:
        model_kwargs = {}
    alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)

    # Shift the implied noise prediction by the conditioning gradient.
    eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
    eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)

    # Re-derive pred_xstart and the posterior mean from the shifted eps.
    out = p_mean_var.copy()
    out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
    out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
    return out
|
355 |
+
|
356 |
+
def p_sample(
    self,
    model,
    x,
    t,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
):
    """
    Sample x_{t-1} from the model at the given timestep.
    :param model: the model to sample from.
    :param x: the current tensor at x_{t-1}.
    :param t: the value of t, starting at 0 for the first diffusion step.
    :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
    :param denoised_fn: if not None, a function which applies to the
        x_start prediction before it is used to sample.
    :param cond_fn: if not None, this is a gradient function that acts
                    similarly to the model.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
        pass to the model. This can be used for conditioning.
    :return: a dict containing the following keys:
             - 'sample': a random sample from the model.
             - 'pred_xstart': a prediction of x_0.
    """
    out = self.p_mean_variance(
        model,
        x,
        t,
        clip_denoised=clip_denoised,
        denoised_fn=denoised_fn,
        model_kwargs=model_kwargs,
    )
    noise = th.randn_like(x)
    # Mask of shape [N, 1, 1, ...] that zeroes the noise term for batch
    # elements at the final step (t == 0).
    nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))  # no noise when t == 0
    if cond_fn is not None:
        out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
    # Reparameterized Gaussian sample: mean + sigma * noise.
    sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
    return {"sample": sample, "pred_xstart": out["pred_xstart"]}
|
396 |
+
|
397 |
+
def p_sample_loop(
    self,
    model,
    shape,
    noise=None,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
    device=None,
    progress=False,
):
    """
    Generate samples from the model.
    :param model: the model module.
    :param shape: the shape of the samples, (N, C, H, W).
    :param noise: if specified, the noise from the encoder to sample.
                  Should be of the same shape as `shape`.
    :param clip_denoised: if True, clip x_start predictions to [-1, 1].
    :param denoised_fn: if not None, a function which applies to the
        x_start prediction before it is used to sample.
    :param cond_fn: if not None, this is a gradient function that acts
                    similarly to the model.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
        pass to the model. This can be used for conditioning.
    :param device: if specified, the device to create the samples on.
                   If not specified, use a model parameter's device.
    :param progress: if True, show a tqdm progress bar.
    :return: a non-differentiable batch of samples.
    """
    # Drain the progressive generator, keeping only the last step's output.
    # NOTE(review): if num_timesteps is 0 this raises on final["sample"].
    final = None
    for sample in self.p_sample_loop_progressive(
        model,
        shape,
        noise=noise,
        clip_denoised=clip_denoised,
        denoised_fn=denoised_fn,
        cond_fn=cond_fn,
        model_kwargs=model_kwargs,
        device=device,
        progress=progress,
    ):
        final = sample
    return final["sample"]
|
441 |
+
|
442 |
+
def p_sample_loop_progressive(
    self,
    model,
    shape,
    noise=None,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
    device=None,
    progress=False,
):
    """
    Generate samples from the model and yield intermediate samples from
    each timestep of diffusion.
    Arguments are the same as p_sample_loop().
    Returns a generator over dicts, where each dict is the return value of
    p_sample().
    """
    if device is None:
        device = next(model.parameters()).device
    assert isinstance(shape, (tuple, list))
    # Start from caller-provided noise (e.g. an encoder latent) or pure
    # Gaussian noise.
    if noise is not None:
        img = noise
    else:
        img = th.randn(*shape, device=device)
    # Iterate timesteps in reverse: T-1, ..., 1, 0.
    indices = list(range(self.num_timesteps))[::-1]

    if progress:
        # Lazy import so that we don't depend on tqdm.
        from tqdm.auto import tqdm

        indices = tqdm(indices)

    for i in indices:
        t = th.tensor([i] * shape[0], device=device)
        with th.no_grad():
            out = self.p_sample(
                model,
                img,
                t,
                clip_denoised=clip_denoised,
                denoised_fn=denoised_fn,
                cond_fn=cond_fn,
                model_kwargs=model_kwargs,
            )
            yield out
            img = out["sample"]
|
490 |
+
|
491 |
+
def ddim_sample(
    self,
    model,
    x,
    t,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
    eta=0.0,
):
    """
    Sample x_{t-1} from the model using DDIM.
    Same usage as p_sample().

    :param eta: DDIM stochasticity knob; 0.0 gives the deterministic path.
    """
    out = self.p_mean_variance(
        model,
        x,
        t,
        clip_denoised=clip_denoised,
        denoised_fn=denoised_fn,
        model_kwargs=model_kwargs,
    )
    if cond_fn is not None:
        out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)

    # Usually our model outputs epsilon, but we re-derive it
    # in case we used x_start or x_prev prediction.
    eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])

    alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
    alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
    # Sigma scales the stochastic component; eta == 0 makes it vanish.
    sigma = eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * th.sqrt(1 - alpha_bar / alpha_bar_prev)
    # Equation 12.
    noise = th.randn_like(x)
    mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev - sigma**2) * eps
    nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))  # no noise when t == 0
    sample = mean_pred + nonzero_mask * sigma * noise
    return {"sample": sample, "pred_xstart": out["pred_xstart"]}
|
530 |
+
|
531 |
+
def ddim_reverse_sample(
    self,
    model,
    x,
    t,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
    eta=0.0,
):
    """
    Sample x_{t+1} from the model using DDIM reverse ODE.

    Same usage as ddim_sample(), but walks the deterministic ODE forward.

    :param eta: must be 0.0 — the reverse ODE is only defined for the
        deterministic path.
    :return: a dict with "sample" (x_{t+1}) and "pred_xstart".
    """
    assert eta == 0.0, "Reverse ODE only for deterministic path"
    out = self.p_mean_variance(
        model,
        x,
        t,
        clip_denoised=clip_denoised,
        denoised_fn=denoised_fn,
        model_kwargs=model_kwargs,
    )
    if cond_fn is not None:
        out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
    # Usually our model outputs epsilon, but we re-derive it
    # in case we used x_start or x_prev prediction.
    # Consistency fix: the original inlined exactly the body of
    # _predict_eps_from_xstart here; call the helper instead (as
    # ddim_sample already does).
    eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
    alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)

    # Equation 12. reversed
    mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps

    return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
|
567 |
+
|
568 |
+
def ddim_sample_loop(
    self,
    model,
    shape,
    noise=None,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
    device=None,
    progress=False,
    eta=0.0,
):
    """
    Generate samples from the model using DDIM.
    Same usage as p_sample_loop().

    :param eta: DDIM stochasticity knob forwarded to ddim_sample().
    """
    # Drain the progressive generator, keeping only the last step's output.
    final = None
    for sample in self.ddim_sample_loop_progressive(
        model,
        shape,
        noise=noise,
        clip_denoised=clip_denoised,
        denoised_fn=denoised_fn,
        cond_fn=cond_fn,
        model_kwargs=model_kwargs,
        device=device,
        progress=progress,
        eta=eta,
    ):
        final = sample
    return final["sample"]
|
600 |
+
|
601 |
+
def ddim_sample_loop_progressive(
    self,
    model,
    shape,
    noise=None,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
    device=None,
    progress=False,
    eta=0.0,
):
    """
    Use DDIM to sample from the model and yield intermediate samples from
    each timestep of DDIM.
    Same usage as p_sample_loop_progressive().
    """
    if device is None:
        device = next(model.parameters()).device
    assert isinstance(shape, (tuple, list))
    # Start from caller-provided noise or pure Gaussian noise.
    if noise is not None:
        img = noise
    else:
        img = th.randn(*shape, device=device)
    # Iterate timesteps in reverse: T-1, ..., 1, 0.
    indices = list(range(self.num_timesteps))[::-1]

    if progress:
        # Lazy import so that we don't depend on tqdm.
        from tqdm.auto import tqdm

        indices = tqdm(indices)

    for i in indices:
        t = th.tensor([i] * shape[0], device=device)
        with th.no_grad():
            out = self.ddim_sample(
                model,
                img,
                t,
                clip_denoised=clip_denoised,
                denoised_fn=denoised_fn,
                cond_fn=cond_fn,
                model_kwargs=model_kwargs,
                eta=eta,
            )
            yield out
            img = out["sample"]
|
649 |
+
|
650 |
+
def _vb_terms_bpd(self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None):
    """
    Get a term for the variational lower-bound.
    The resulting units are bits (rather than nats, as one might expect).
    This allows for comparison to other papers.
    :return: a dict with the following keys:
             - 'output': a shape [N] tensor of NLLs or KLs.
             - 'pred_xstart': the x_0 predictions.
    """
    # KL between the true posterior q(x_{t-1}|x_t,x_0) and the model's
    # p(x_{t-1}|x_t); division by log(2) converts nats to bits.
    true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)
    out = self.p_mean_variance(model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs)
    kl = normal_kl(true_mean, true_log_variance_clipped, out["mean"], out["log_variance"])
    kl = mean_flat(kl) / np.log(2.0)

    decoder_nll = -discretized_gaussian_log_likelihood(
        x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
    )
    assert decoder_nll.shape == x_start.shape
    decoder_nll = mean_flat(decoder_nll) / np.log(2.0)

    # At the first timestep return the decoder NLL,
    # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
    output = th.where((t == 0), decoder_nll, kl)
    return {"output": output, "pred_xstart": out["pred_xstart"]}
|
674 |
+
|
675 |
+
def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
    """
    Compute training losses for a single timestep.
    :param model: the model to evaluate loss on.
    :param x_start: the [N x C x ...] tensor of inputs.
    :param t: a batch of timestep indices.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
        pass to the model. This can be used for conditioning.
    :param noise: if specified, the specific Gaussian noise to try to remove.
    :return: a dict with the key "loss" containing a tensor of shape [N].
             Some mean or variance settings may also have other keys.
    """
    if model_kwargs is None:
        model_kwargs = {}
    if noise is None:
        noise = th.randn_like(x_start)
    # Diffuse the clean batch to timestep t once; reused by every loss type.
    x_t = self.q_sample(x_start, t, noise=noise)

    terms = {}

    if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
        # Pure variational-bound loss.
        terms["loss"] = self._vb_terms_bpd(
            model=model,
            x_start=x_start,
            x_t=x_t,
            t=t,
            clip_denoised=False,
            model_kwargs=model_kwargs,
        )["output"]
        if self.loss_type == LossType.RESCALED_KL:
            terms["loss"] *= self.num_timesteps
    elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
        model_output = model(x_t, t, **model_kwargs)

        if self.model_var_type in [
            ModelVarType.LEARNED,
            ModelVarType.LEARNED_RANGE,
        ]:
            B, C = x_t.shape[:2]
            assert model_output.shape == (B, C * 2, *x_t.shape[2:])
            model_output, model_var_values = th.split(model_output, C, dim=1)
            # Learn the variance using the variational bound, but don't let
            # it affect our mean prediction.
            frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
            terms["vb"] = self._vb_terms_bpd(
                # The lambda ignores its inputs and replays the frozen
                # output, so _vb_terms_bpd does not re-run the model.
                model=lambda *args, r=frozen_out: r,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
            )["output"]
            if self.loss_type == LossType.RESCALED_MSE:
                # Divide by 1000 for equivalence with initial implementation.
                # Without a factor of 1/1000, the VB term hurts the MSE term.
                terms["vb"] *= self.num_timesteps / 1000.0

        # The regression target depends on what the model is trained to
        # predict: the posterior mean, x_0, or the noise.
        target = {
            ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0],
            ModelMeanType.START_X: x_start,
            ModelMeanType.EPSILON: noise,
        }[self.model_mean_type]
        assert model_output.shape == target.shape == x_start.shape
        terms["mse"] = mean_flat((target - model_output) ** 2)
        if "vb" in terms:
            terms["loss"] = terms["mse"] + terms["vb"]
        else:
            terms["loss"] = terms["mse"]
    else:
        raise NotImplementedError(self.loss_type)

    return terms
|
746 |
+
|
747 |
+
def _prior_bpd(self, x_start):
    """
    Get the prior KL term for the variational lower-bound, measured in
    bits-per-dim.
    This term can't be optimized, as it only depends on the encoder.
    :param x_start: the [N x C x ...] tensor of inputs.
    :return: a batch of [N] KL values (in bits), one per batch element.
    """
    batch_size = x_start.shape[0]
    # KL between q(x_T | x_0) and the standard-normal prior (mean 0, logvar 0),
    # converted from nats to bits via log(2).
    t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
    qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
    kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
    return mean_flat(kl_prior) / np.log(2.0)
|
760 |
+
|
761 |
+
def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
    """
    Compute the entire variational lower-bound, measured in bits-per-dim,
    as well as other related quantities.
    :param model: the model to evaluate loss on.
    :param x_start: the [N x C x ...] tensor of inputs.
    :param clip_denoised: if True, clip denoised samples.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
        pass to the model. This can be used for conditioning.
    :return: a dict containing the following keys:
             - total_bpd: the total variational lower-bound, per batch element.
             - prior_bpd: the prior term in the lower-bound.
             - vb: an [N x T] tensor of terms in the lower-bound.
             - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
             - mse: an [N x T] tensor of epsilon MSEs for each timestep.
    """
    device = x_start.device
    batch_size = x_start.shape[0]

    vb = []
    xstart_mse = []
    mse = []
    # Walk timesteps from T-1 down to 0, evaluating one VLB term per step.
    for t in list(range(self.num_timesteps))[::-1]:
        t_batch = th.tensor([t] * batch_size, device=device)
        noise = th.randn_like(x_start)
        x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
        # Calculate VLB term at the current timestep
        with th.no_grad():
            out = self._vb_terms_bpd(
                model,
                x_start=x_start,
                x_t=x_t,
                t=t_batch,
                clip_denoised=clip_denoised,
                model_kwargs=model_kwargs,
            )
        vb.append(out["output"])
        xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
        # Epsilon MSE: compare the noise implied by the model's x_0
        # prediction against the actual noise used in q_sample.
        eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
        mse.append(mean_flat((eps - noise) ** 2))

    # Stack per-timestep [N] vectors into [N x T] tensors.
    vb = th.stack(vb, dim=1)
    xstart_mse = th.stack(xstart_mse, dim=1)
    mse = th.stack(mse, dim=1)

    prior_bpd = self._prior_bpd(x_start)
    total_bpd = vb.sum(dim=1) + prior_bpd
    return {
        "total_bpd": total_bpd,
        "prior_bpd": prior_bpd,
        "vb": vb,
        "xstart_mse": xstart_mse,
        "mse": mse,
    }
|
815 |
+
|
816 |
+
|
817 |
+
def _extract_into_tensor(arr, timesteps, broadcast_shape):
|
818 |
+
"""
|
819 |
+
Extract values from a 1-D numpy array for a batch of indices.
|
820 |
+
:param arr: the 1-D numpy array.
|
821 |
+
:param timesteps: a tensor of indices into the array to extract.
|
822 |
+
:param broadcast_shape: a larger shape of K dimensions with the batch
|
823 |
+
dimension equal to the length of timesteps.
|
824 |
+
:return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
|
825 |
+
"""
|
826 |
+
res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
|
827 |
+
while len(res.shape) < len(broadcast_shape):
|
828 |
+
res = res[..., None]
|
829 |
+
return res + th.zeros(broadcast_shape, device=timesteps.device)
|
videosys/diffusion/respace.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Modified from OpenAI's diffusion repos
|
2 |
+
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
|
3 |
+
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
|
4 |
+
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import torch as th
|
8 |
+
|
9 |
+
from .gaussian_diffusion import GaussianDiffusion
|
10 |
+
|
11 |
+
|
12 |
+
def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.
    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.
    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.
    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim") :])
            # Find an integer stride that yields exactly desired_count steps.
            for i in range(1, num_timesteps):
                if len(range(0, num_timesteps, i)) == desired_count:
                    return set(range(0, num_timesteps, i))
            # Bug fix: the message previously reported num_timesteps instead
            # of the requested step count.
            raise ValueError(f"cannot create exactly {desired_count} steps with an integer stride")
        section_counts = [int(x) for x in section_counts.split(",")]
    # Split the original process into len(section_counts) sections; the first
    # `extra` sections absorb the remainder so every timestep is covered.
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(f"cannot divide section of {size} steps into {section_count}")
        if section_count <= 1:
            frac_stride = 1
        else:
            # Fractional stride so the first and last step of the section
            # are both taken.
            frac_stride = (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)
|
59 |
+
|
60 |
+
|
61 |
+
class SpacedDiffusion(GaussianDiffusion):
    """
    A diffusion process which can skip steps in a base diffusion process.
    :param use_timesteps: a collection (sequence or set) of timesteps from the
                          original diffusion process to retain.
    :param kwargs: the kwargs to create the base diffusion process.
    """

    def __init__(self, use_timesteps, **kwargs):
        self.use_timesteps = set(use_timesteps)
        # timestep_map[i] is the original-process timestep of spaced step i.
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])

        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        last_alpha_cumprod = 1.0
        new_betas = []
        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
            if i in self.use_timesteps:
                # Choose beta so the retained steps reproduce the base
                # process's cumulative alpha schedule at those timesteps.
                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
                last_alpha_cumprod = alpha_cumprod
                self.timestep_map.append(i)
        kwargs["betas"] = np.array(new_betas)
        super().__init__(**kwargs)

    def p_mean_variance(self, model, *args, **kwargs):  # pylint: disable=signature-differs
        # Wrap the model so spaced timesteps are remapped before it is called.
        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)

    def training_losses(self, model, *args, **kwargs):  # pylint: disable=signature-differs
        return super().training_losses(self._wrap_model(model), *args, **kwargs)

    def condition_mean(self, cond_fn, *args, **kwargs):
        return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)

    def condition_score(self, cond_fn, *args, **kwargs):
        return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)

    def _wrap_model(self, model):
        # Idempotent: an already-wrapped model is returned unchanged.
        if isinstance(model, _WrappedModel):
            return model
        return _WrappedModel(model, self.timestep_map, self.original_num_steps)

    def _scale_timesteps(self, t):
        # Scaling is done by the wrapped model.
        return t
|
105 |
+
|
106 |
+
|
107 |
+
class _WrappedModel:
|
108 |
+
def __init__(self, model, timestep_map, original_num_steps):
|
109 |
+
self.model = model
|
110 |
+
self.timestep_map = timestep_map
|
111 |
+
# self.rescale_timesteps = rescale_timesteps
|
112 |
+
self.original_num_steps = original_num_steps
|
113 |
+
|
114 |
+
def __call__(self, x, ts, **kwargs):
|
115 |
+
map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
|
116 |
+
new_ts = map_tensor[ts]
|
117 |
+
# if self.rescale_timesteps:
|
118 |
+
# new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
|
119 |
+
return self.model(x, new_ts, **kwargs)
|
videosys/diffusion/timestep_sampler.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Modified from OpenAI's diffusion repos
|
2 |
+
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
|
3 |
+
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
|
4 |
+
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
|
5 |
+
|
6 |
+
from abc import ABC, abstractmethod
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import torch as th
|
10 |
+
import torch.distributed as dist
|
11 |
+
|
12 |
+
|
13 |
+
def create_named_schedule_sampler(name, diffusion):
    """
    Create a ScheduleSampler from a library of pre-defined samplers.
    :param name: the name of the sampler.
    :param diffusion: the diffusion object to sample for.
    """
    # Guard-clause dispatch over the known sampler names.
    if name == "uniform":
        return UniformSampler(diffusion)
    if name == "loss-second-moment":
        return LossSecondMomentResampler(diffusion)
    raise NotImplementedError(f"unknown schedule sampler: {name}")
|
25 |
+
|
26 |
+
|
27 |
+
class ScheduleSampler(ABC):
    """
    A distribution over timesteps in the diffusion process, intended to reduce
    variance of the objective.

    By default, samplers perform unbiased importance sampling, in which the
    objective's mean is unchanged. However, subclasses may override sample()
    to change how the resampled terms are reweighted, allowing for actual
    changes in the objective.
    """

    @abstractmethod
    def weights(self):
        """
        Get a numpy array of weights, one per diffusion step.

        The weights needn't be normalized, but must be positive.
        """

    def sample(self, batch_size, device):
        """
        Importance-sample timesteps for a batch.

        :param batch_size: the number of timesteps.
        :param device: the torch device to save to.
        :return: a tuple (timesteps, weights):
                 - timesteps: a tensor of timestep indices.
                 - weights: a tensor of weights to scale the resulting losses.
        """
        raw = self.weights()
        probs = raw / np.sum(raw)
        drawn = np.random.choice(len(probs), size=(batch_size,), p=probs)
        # Importance weight 1 / (N * p_i) keeps the objective's mean unbiased.
        importance = 1 / (len(probs) * probs[drawn])
        timesteps = th.from_numpy(drawn).long().to(device)
        weights = th.from_numpy(importance).float().to(device)
        return timesteps, weights
|
60 |
+
|
61 |
+
|
62 |
+
class UniformSampler(ScheduleSampler):
    """Sampler drawing every diffusion timestep with equal probability."""

    def __init__(self, diffusion):
        self.diffusion = diffusion
        # A constant weight per timestep normalizes to a uniform distribution.
        self._weights = np.full(diffusion.num_timesteps, 1.0)

    def weights(self):
        return self._weights
|
69 |
+
|
70 |
+
|
71 |
+
class LossAwareSampler(ScheduleSampler):
    """Base for samplers whose weights adapt to observed per-timestep losses."""

    def update_with_local_losses(self, local_ts, local_losses):
        """
        Update the reweighting using losses from a model.

        Call this method from each rank with a batch of timesteps and the
        corresponding losses for each of those timesteps. This method will
        perform synchronization to make sure all of the ranks maintain the
        exact same reweighting.

        :param local_ts: an integer Tensor of timesteps.
        :param local_losses: a 1D Tensor of losses.
        """
        world_size = dist.get_world_size()
        # Exchange per-rank batch sizes so every rank can pad to one length.
        size_buffers = [th.tensor([0], dtype=th.int32, device=local_ts.device) for _ in range(world_size)]
        dist.all_gather(
            size_buffers,
            th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
        )

        sizes = [t.item() for t in size_buffers]
        pad_to = max(sizes)

        # Gather the (padded) timestep and loss batches from every rank.
        ts_buffers = [th.zeros(pad_to).to(local_ts) for _ in sizes]
        loss_buffers = [th.zeros(pad_to).to(local_losses) for _ in sizes]
        dist.all_gather(ts_buffers, local_ts)
        dist.all_gather(loss_buffers, local_losses)

        # Strip the padding back off using each rank's true batch size.
        all_ts = [t.item() for buf, bs in zip(ts_buffers, sizes) for t in buf[:bs]]
        all_losses = [v.item() for buf, bs in zip(loss_buffers, sizes) for v in buf[:bs]]
        self.update_with_all_losses(all_ts, all_losses)

    @abstractmethod
    def update_with_all_losses(self, ts, losses):
        """
        Update the reweighting using losses from a model.

        Sub-classes should override this method to update the reweighting
        using losses from the model. This method directly updates the
        reweighting without synchronizing between workers. It is called by
        update_with_local_losses from all ranks with identical arguments.
        Thus, it should have deterministic behavior to maintain state across
        workers.

        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """
|
113 |
+
|
114 |
+
|
115 |
+
class LossSecondMomentResampler(LossAwareSampler):
    """
    Importance-samples timesteps proportionally to the RMS (second moment) of
    the most recent losses at each timestep, mixed with a small uniform
    probability so every timestep stays reachable.
    """

    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        """
        :param diffusion: the diffusion object; only ``num_timesteps`` is read.
        :param history_per_term: number of recent losses kept per timestep.
        :param uniform_prob: probability mass reserved for uniform sampling.
        """
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros([diffusion.num_timesteps, history_per_term], dtype=np.float64)
        # BUGFIX: `np.int` was deprecated in NumPy 1.20 and removed in 1.24;
        # use an explicit fixed-width integer dtype instead.
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64)

    def weights(self):
        # Until every timestep has a full loss history, sample uniformly.
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        # RMS of the recorded losses per timestep, normalized, then mixed
        # with uniform_prob of uniform mass.
        weights = np.sqrt(np.mean(self._loss_history**2, axis=-1))
        weights /= np.sum(weights)
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # History is full: shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        # True once every timestep has history_per_term recorded losses.
        return (self._loss_counts == self.history_per_term).all()
|
videosys/models/__init__.py
ADDED
File without changes
|