hyoungwoncho committed
Commit: 2315bef
Parent(s): 22808d8

Update pipeline.py

pipeline.py CHANGED (+23 -48)
@@ -38,10 +38,8 @@ EXAMPLE_DOC_STRING = """
         ```py
         >>> import torch
         >>> from diffusers import StableDiffusionPipeline
-
         >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
         >>> pipe = pipe.to("cuda")
-
         >>> prompt = "a photo of an astronaut riding a horse on mars"
         >>> image = pipe(prompt).images[0]
         ```
@@ -64,8 +62,12 @@ class PAGIdentitySelfAttnProcessor:
         encoder_hidden_states: Optional[torch.FloatTensor] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         temb: Optional[torch.FloatTensor] = None,
-        scale: float = 1.0,
+        *args,
+        **kwargs,
     ) -> torch.FloatTensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
 
         residual = hidden_states
         if attn.spatial_norm is not None:
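Note on the change above: per the deprecation message, a LoRA `scale` should now reach the attention modules through the pipeline call rather than the processor signature. A minimal, illustrative sketch (the `pipe` object, prompt, and scale value are assumptions, not part of this commit):

```py
>>> # Hypothetical usage after this change: the LoRA scale travels through
>>> # `cross_attention_kwargs` on the pipeline call instead of the processor's
>>> # removed `scale` parameter.
>>> image = pipe(
...     "a photo of an astronaut riding a horse on mars",
...     cross_attention_kwargs={"scale": 0.7},
... ).images[0]
```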
@@ -91,11 +93,9 @@ class PAGIdentitySelfAttnProcessor:
         if attn.group_norm is not None:
             hidden_states_org = attn.group_norm(hidden_states_org.transpose(1, 2)).transpose(1, 2)
 
-        args = () if USE_PEFT_BACKEND else (scale,)
-
-        query = attn.to_q(hidden_states_org, *args)
-        key = attn.to_k(hidden_states_org, *args)
-        value = attn.to_v(hidden_states_org, *args)
+        query = attn.to_q(hidden_states_org)
+        key = attn.to_k(hidden_states_org)
+        value = attn.to_v(hidden_states_org)
 
         inner_dim = key.shape[-1]
         head_dim = inner_dim // attn.heads
@@ -115,7 +115,7 @@ class PAGIdentitySelfAttnProcessor:
         hidden_states_org = hidden_states_org.to(query.dtype)
 
         # linear proj
-        hidden_states_org = attn.to_out[0](hidden_states_org, *args)
+        hidden_states_org = attn.to_out[0](hidden_states_org)
         # dropout
         hidden_states_org = attn.to_out[1](hidden_states_org)
 
@@ -134,9 +134,7 @@ class PAGIdentitySelfAttnProcessor:
         if attn.group_norm is not None:
             hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2)
 
-        args = () if USE_PEFT_BACKEND else (scale,)
-
-        value = attn.to_v(hidden_states_ptb, *args)
+        value = attn.to_v(hidden_states_ptb)
 
         hidden_states_ptb = torch.zeros(value.shape).to(value.get_device())
         #hidden_states_ptb = value
@@ -144,7 +142,7 @@ class PAGIdentitySelfAttnProcessor:
         hidden_states_ptb = hidden_states_ptb.to(query.dtype)
 
         # linear proj
-        hidden_states_ptb = attn.to_out[0](hidden_states_ptb, *args)
+        hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
         # dropout
         hidden_states_ptb = attn.to_out[1](hidden_states_ptb)
 
@@ -178,8 +176,12 @@ class PAGCFGIdentitySelfAttnProcessor:
         encoder_hidden_states: Optional[torch.FloatTensor] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         temb: Optional[torch.FloatTensor] = None,
-        scale: float = 1.0,
+        *args,
+        **kwargs,
     ) -> torch.FloatTensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
 
         residual = hidden_states
         if attn.spatial_norm is not None:
@@ -205,12 +207,10 @@ class PAGCFGIdentitySelfAttnProcessor:
 
         if attn.group_norm is not None:
             hidden_states_org = attn.group_norm(hidden_states_org.transpose(1, 2)).transpose(1, 2)
-
-        args = () if USE_PEFT_BACKEND else (scale,)
 
-        query = attn.to_q(hidden_states_org, *args)
-        key = attn.to_k(hidden_states_org, *args)
-        value = attn.to_v(hidden_states_org, *args)
+        query = attn.to_q(hidden_states_org)
+        key = attn.to_k(hidden_states_org)
+        value = attn.to_v(hidden_states_org)
 
         inner_dim = key.shape[-1]
         head_dim = inner_dim // attn.heads
@@ -230,7 +230,7 @@ class PAGCFGIdentitySelfAttnProcessor:
         hidden_states_org = hidden_states_org.to(query.dtype)
 
         # linear proj
-        hidden_states_org = attn.to_out[0](hidden_states_org, *args)
+        hidden_states_org = attn.to_out[0](hidden_states_org)
         # dropout
         hidden_states_org = attn.to_out[1](hidden_states_org)
 
@@ -249,14 +249,12 @@ class PAGCFGIdentitySelfAttnProcessor:
         if attn.group_norm is not None:
             hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2)
 
-        args = () if USE_PEFT_BACKEND else (scale,)
-
-        value = attn.to_v(hidden_states_ptb, *args)
+        value = attn.to_v(hidden_states_ptb)
         hidden_states_ptb = value
         hidden_states_ptb = hidden_states_ptb.to(query.dtype)
 
         # linear proj
-        hidden_states_ptb = attn.to_out[0](hidden_states_ptb, *args)
+        hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
         # dropout
         hidden_states_ptb = attn.to_out[1](hidden_states_ptb)
 
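For reference, the two perturbed branches edited above differ only in how they replace the self-attention output. A minimal sketch of that distinction (the tensor shape is an illustrative assumption):

```py
>>> import torch
>>> # `value` stands in for attn.to_v(hidden_states_ptb): (batch, seq_len, inner_dim)
>>> value = torch.randn(2, 64, 320)
>>> # PAGIdentitySelfAttnProcessor zeroes the perturbed attention output
>>> # (its commented-out `#hidden_states_ptb = value` is the identity variant):
>>> hidden_states_ptb = torch.zeros_like(value)
>>> # PAGCFGIdentitySelfAttnProcessor keeps the value projection directly,
>>> # i.e. the attention map is replaced by the identity:
>>> hidden_states_ptb = value
```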
@@ -298,7 +296,6 @@ def retrieve_timesteps(
     """
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
     Args:
         scheduler (`SchedulerMixin`):
             The scheduler to get timesteps from.
@@ -311,7 +308,6 @@ def retrieve_timesteps(
             Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
             timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
             must be `None`.
-
     Returns:
         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
         second element is the number of inference steps.
@@ -332,22 +328,19 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class StableDiffusionPAGPipeline(
+class StableDiffusionPipeline(
     DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
 ):
     r"""
     Pipeline for text-to-image generation using Stable Diffusion.
-
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
         - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
         - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
         - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-
     Args:
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
@@ -540,7 +533,6 @@ class StableDiffusionPAGPipeline(
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
-
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 prompt to be encoded
@@ -885,12 +877,9 @@ class StableDiffusionPAGPipeline(
 
     def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
         r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
         The suffixes after the scaling factors represent the stages where they are being applied.
-
         Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
         that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
         Args:
             s1 (`float`):
                 Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
@@ -914,13 +903,9 @@ class StableDiffusionPAGPipeline(
         """
         Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
         key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
         <Tip warning={true}>
-
         This API is 🧪 experimental.
-
         </Tip>
-
         Args:
             unet (`bool`, defaults to `True`): To apply fusion on the UNet.
             vae (`bool`, defaults to `True`): To apply fusion on the VAE.
@@ -944,17 +929,12 @@ class StableDiffusionPAGPipeline(
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
     def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
         """Disable QKV projection fusion if enabled.
-
         <Tip warning={true}>
-
         This API is 🧪 experimental.
-
         </Tip>
-
         Args:
             unet (`bool`, defaults to `True`): To apply fusion on the UNet.
             vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
         """
         if unet:
             if not self.fusing_unet:
@@ -974,7 +954,6 @@ class StableDiffusionPAGPipeline(
     def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
         """
         See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
-
         Args:
             timesteps (`torch.Tensor`):
                 generate embedding vectors at these timesteps
@@ -982,7 +961,6 @@ class StableDiffusionPAGPipeline(
                 dimension of the embeddings to generate
             dtype:
                 data type of the generated embeddings
-
         Returns:
             `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
         """
@@ -1128,7 +1106,6 @@ class StableDiffusionPAGPipeline(
     ):
         r"""
         The call function to the pipeline for generation.
-
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
@@ -1195,9 +1172,7 @@ class StableDiffusionPAGPipeline(
             The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
             will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
             `._callback_tensor_inputs` attribute of your pipeline class.
-
         Examples:
-
         Returns:
             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
                 If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,