hyoungwoncho committed • Commit 30c94b4 • Parent(s): 52b8353

Update pipeline.py

pipeline.py CHANGED (+107 -127)
@@ -12,8 +12,11 @@ from diffusers.configuration_utils import FrozenDict
 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from diffusers.models.attention_processor import FusedAttnProcessor2_0
+from diffusers.models.attention_processor import Attention, AttnProcessor2_0, FusedAttnProcessor2_0
 from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import (
     USE_PEFT_BACKEND,
@@ -24,11 +27,6 @@ from diffusers.utils import (
     unscale_lora_layers,
 )
 from diffusers.utils.torch_utils import randn_tensor
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
-from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-
-from diffusers.models.attention_processor import Attention, AttnProcessor2_0
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -68,7 +66,7 @@ class PAGIdentitySelfAttnProcessor:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
-        
+
         residual = hidden_states
         if attn.spatial_norm is not None:
             hidden_states = attn.spatial_norm(hidden_states, temb)
@@ -77,10 +75,10 @@ class PAGIdentitySelfAttnProcessor:
         if input_ndim == 4:
             batch_size, channel, height, width = hidden_states.shape
             hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-        
+
         # chunk
         hidden_states_org, hidden_states_ptb = hidden_states.chunk(2)
-        
+
         # original path
         batch_size, sequence_length, _ = hidden_states_org.shape
 
@@ -113,7 +111,7 @@ class PAGIdentitySelfAttnProcessor:
 
         hidden_states_org = hidden_states_org.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states_org = hidden_states_org.to(query.dtype)
-        
+
         # linear proj
         hidden_states_org = attn.to_out[0](hidden_states_org)
         # dropout
@@ -135,12 +133,12 @@ class PAGIdentitySelfAttnProcessor:
             hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2)
 
         value = attn.to_v(hidden_states_ptb)
-        
+
         hidden_states_ptb = torch.zeros(value.shape).to(value.get_device())
-        #hidden_states_ptb = value
-        
+        # hidden_states_ptb = value
+
         hidden_states_ptb = hidden_states_ptb.to(query.dtype)
-        
+
         # linear proj
         hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
         # dropout
@@ -182,7 +180,7 @@ class PAGCFGIdentitySelfAttnProcessor:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
-        
+
         residual = hidden_states
         if attn.spatial_norm is not None:
             hidden_states = attn.spatial_norm(hidden_states, temb)
@@ -191,11 +189,11 @@ class PAGCFGIdentitySelfAttnProcessor:
         if input_ndim == 4:
             batch_size, channel, height, width = hidden_states.shape
             hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-        
+
         # chunk
         hidden_states_uncond, hidden_states_org, hidden_states_ptb = hidden_states.chunk(3)
         hidden_states_org = torch.cat([hidden_states_uncond, hidden_states_org])
-        
+
         # original path
         batch_size, sequence_length, _ = hidden_states_org.shape
 
@@ -207,7 +205,7 @@ class PAGCFGIdentitySelfAttnProcessor:
 
         if attn.group_norm is not None:
             hidden_states_org = attn.group_norm(hidden_states_org.transpose(1, 2)).transpose(1, 2)
-        
+
         query = attn.to_q(hidden_states_org)
         key = attn.to_k(hidden_states_org)
         value = attn.to_v(hidden_states_org)
@@ -228,7 +226,7 @@ class PAGCFGIdentitySelfAttnProcessor:
 
         hidden_states_org = hidden_states_org.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states_org = hidden_states_org.to(query.dtype)
-        
+
         # linear proj
         hidden_states_org = attn.to_out[0](hidden_states_org)
         # dropout
@@ -252,7 +250,7 @@ class PAGCFGIdentitySelfAttnProcessor:
         value = attn.to_v(hidden_states_ptb)
         hidden_states_ptb = value
         hidden_states_ptb = hidden_states_ptb.to(query.dtype)
-        
+
         # linear proj
         hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
         # dropout
@@ -328,7 +326,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class StableDiffusionPipeline(
+class StableDiffusionPAGPipeline(
    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
 ):
     r"""
@@ -976,7 +974,7 @@ class StableDiffusionPipeline(
             emb = torch.nn.functional.pad(emb, (0, 1))
         assert emb.shape == (w.shape[0], embedding_dim)
         return emb
-    
+
     def pred_z0(self, sample, model_output, timestep):
         alpha_prod_t = self.scheduler.alphas_cumprod[timestep].to(sample.device)
 
@@ -996,19 +994,14 @@ class StableDiffusionPipeline(
         )
 
         return pred_original_sample
-    
+
     def pred_x0(self, latents, noise_pred, t, generator, device, prompt_embeds, output_type):
-
         pred_z0 = self.pred_z0(latents, noise_pred, t)
-        pred_x0 = self.vae.decode(
-            pred_z0 / self.vae.config.scaling_factor,
-            return_dict=False,
-            generator=generator
-        )[0]
+        pred_x0 = self.vae.decode(pred_z0 / self.vae.config.scaling_factor, return_dict=False, generator=generator)[0]
         pred_x0, ____ = self.run_safety_checker(pred_x0, device, prompt_embeds.dtype)
         do_denormalize = [True] * pred_x0.shape[0]
         pred_x0 = self.image_processor.postprocess(pred_x0, output_type=output_type, do_denormalize=do_denormalize)
-        
+
         return pred_x0
 
     @property
@@ -1041,36 +1034,27 @@ class StableDiffusionPipeline(
     @property
     def interrupt(self):
         return self._interrupt
-    
+
     @property
     def pag_scale(self):
         return self._pag_scale
-    
+
     @property
-    def do_adversarial_guidance(self):
+    def do_perturbed_attention_guidance(self):
         return self._pag_scale > 0
-    
+
     @property
     def pag_adaptive_scaling(self):
         return self._pag_adaptive_scaling
-    
+
     @property
     def do_pag_adaptive_scaling(self):
         return self._pag_adaptive_scaling > 0
-    
-    @property
-    def pag_drop_rate(self):
-        return self._pag_drop_rate
-    
-    @property
-    def pag_applied_layers(self):
-        return self._pag_applied_layers
-    
+
     @property
     def pag_applied_layers_index(self):
         return self._pag_applied_layers_index
 
-
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
@@ -1083,9 +1067,7 @@ class StableDiffusionPipeline(
         guidance_scale: float = 7.5,
         pag_scale: float = 0.0,
         pag_adaptive_scaling: float = 0.0,
-        pag_drop_rate: float = 0.5,
-        pag_applied_layers: List[str] = ['down'], #['down', 'mid', 'up']
-        pag_applied_layers_index: List[str] = ['d4'], #['d4', 'd5', 'm0']
+        pag_applied_layers_index: List[str] = ["d4"],  # ['d4', 'd5', 'm0']
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
@@ -1221,11 +1203,9 @@ class StableDiffusionPipeline(
         self._clip_skip = clip_skip
         self._cross_attention_kwargs = cross_attention_kwargs
         self._interrupt = False
-        
+
         self._pag_scale = pag_scale
         self._pag_adaptive_scaling = pag_adaptive_scaling
-        self._pag_drop_rate = pag_drop_rate
-        self._pag_applied_layers = pag_applied_layers
         self._pag_applied_layers_index = pag_applied_layers_index
 
         # 2. Define call parameters
@@ -1258,15 +1238,15 @@ class StableDiffusionPipeline(
         # For classifier free guidance, we need to do two forward passes.
         # Here we concatenate the unconditional and text embeddings into a single batch
         # to avoid doing two forward passes
-        
-        #cfg
-        if self.do_classifier_free_guidance and not self.do_adversarial_guidance:
+
+        # cfg
+        if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance:
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-        #pag
-        elif not self.do_classifier_free_guidance and self.do_adversarial_guidance:
+        # pag
+        elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
             prompt_embeds = torch.cat([prompt_embeds, prompt_embeds])
-        #both
-        elif self.do_classifier_free_guidance and self.do_adversarial_guidance:
+        # both
+        elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds, prompt_embeds])
 
         if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
@@ -1309,21 +1289,44 @@ class StableDiffusionPipeline(
             ).to(device=device, dtype=latents.dtype)
 
         # 7. Denoising loop
-        if self.do_adversarial_guidance:
+        if self.do_perturbed_attention_guidance:
             down_layers = []
             mid_layers = []
             up_layers = []
             for name, module in self.unet.named_modules():
-                if 'attn1' in name and 'to' not in name:
-                    layer_type = name.split('.')[0].split('_')[0]
-                    if layer_type == 'down':
+                if "attn1" in name and "to" not in name:
+                    layer_type = name.split(".")[0].split("_")[0]
+                    if layer_type == "down":
                         down_layers.append(module)
-                    elif layer_type == 'mid':
+                    elif layer_type == "mid":
                         mid_layers.append(module)
-                    elif layer_type == 'up':
+                    elif layer_type == "up":
                         up_layers.append(module)
                     else:
                         raise ValueError(f"Invalid layer type: {layer_type}")
+
+        # change attention layer in UNet if use PAG
+        if self.do_perturbed_attention_guidance:
+            if self.do_classifier_free_guidance:
+                replace_processor = PAGCFGIdentitySelfAttnProcessor()
+            else:
+                replace_processor = PAGIdentitySelfAttnProcessor()
+
+            drop_layers = self.pag_applied_layers_index
+            for drop_layer in drop_layers:
+                try:
+                    if drop_layer[0] == "d":
+                        down_layers[int(drop_layer[1])].processor = replace_processor
+                    elif drop_layer[0] == "m":
+                        mid_layers[int(drop_layer[1])].processor = replace_processor
+                    elif drop_layer[0] == "u":
+                        up_layers[int(drop_layer[1])].processor = replace_processor
+                    else:
+                        raise ValueError(f"Invalid layer type: {drop_layer[0]}")
+                except IndexError:
+                    raise ValueError(
+                        f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
+                    )
 
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         self._num_timesteps = len(timesteps)
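A note on the indexing this hunk introduces (as implemented above, now once before the loop instead of every step): each entry of `pag_applied_layers_index` is a letter plus a single digit. The letter selects one of the lists built from `self.unet.named_modules()` ('d' for down_layers, 'm' for mid_layers, 'u' for up_layers, collecting modules whose names contain "attn1" but not "to"), and the digit indexes into that list, so "d4" swaps the processor on down_layers[4]. Because the index is read as `int(drop_layer[1])`, a single character, layers beyond index 9 cannot be addressed with this scheme.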
@@ -1331,46 +1334,22 @@ class StableDiffusionPipeline(
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
-                
-                #cfg
-                if self.do_classifier_free_guidance and not self.do_adversarial_guidance:
+
+                # cfg
+                if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance:
                     latent_model_input = torch.cat([latents] * 2)
-                #pag
-                elif not self.do_classifier_free_guidance and self.do_adversarial_guidance:
+                # pag
+                elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
                     latent_model_input = torch.cat([latents] * 2)
-                #both
-                elif self.do_classifier_free_guidance and self.do_adversarial_guidance:
+                # both
+                elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
                     latent_model_input = torch.cat([latents] * 3)
-                #no
+                # no
                 else:
                     latent_model_input = latents
-                
-                # change attention layer in UNet if use PAG
-                if self.do_adversarial_guidance:
-
-                    if self.do_classifier_free_guidance:
-                        replace_processor = PAGCFGIdentitySelfAttnProcessor()
-                    else:
-                        replace_processor = PAGIdentitySelfAttnProcessor()
-
-                    drop_layers = self.pag_applied_layers_index
-                    for drop_layer in drop_layers:
-                        try:
-                            if drop_layer[0] == 'd':
-                                down_layers[int(drop_layer[1])].processor = replace_processor
-                            elif drop_layer[0] == 'm':
-                                mid_layers[int(drop_layer[1])].processor = replace_processor
-                            elif drop_layer[0] == 'u':
-                                up_layers[int(drop_layer[1])].processor = replace_processor
-                            else:
-                                raise ValueError(f"Invalid layer type: {drop_layer[0]}")
-                        except IndexError:
-                            raise ValueError(
-                                f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
-                            )
-                
+
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-                
+
                 # predict the noise residual
                 noise_pred = self.unet(
                     latent_model_input,
@@ -1381,43 +1360,44 @@ class StableDiffusionPipeline(
                     added_cond_kwargs=added_cond_kwargs,
                     return_dict=False,
                 )[0]
-                
+
                 # perform guidance
-                
+
                 # cfg
-                if self.do_classifier_free_guidance and not self.do_adversarial_guidance:
-                
+                if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                
+
                     delta = noise_pred_text - noise_pred_uncond
                     noise_pred = noise_pred_uncond + self.guidance_scale * delta
-                
+
                 # pag
-                elif not self.do_classifier_free_guidance and self.do_adversarial_guidance:
-                
+                elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
                     noise_pred_original, noise_pred_perturb = noise_pred.chunk(2)
-                
+
                     signal_scale = self.pag_scale
                     if self.do_pag_adaptive_scaling:
-                        signal_scale = self.pag_scale - self.pag_adaptive_scaling * (1000-t)
-                        if signal_scale<0:
+                        signal_scale = self.pag_scale - self.pag_adaptive_scaling * (1000 - t)
+                        if signal_scale < 0:
                             signal_scale = 0
-                
+
                     noise_pred = noise_pred_original + signal_scale * (noise_pred_original - noise_pred_perturb)
-                
+
                 # both
-                elif self.do_classifier_free_guidance and self.do_adversarial_guidance:
-                
+                elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
                     noise_pred_uncond, noise_pred_text, noise_pred_text_perturb = noise_pred.chunk(3)
-                
+
                     signal_scale = self.pag_scale
                     if self.do_pag_adaptive_scaling:
-                        signal_scale = self.pag_scale - self.pag_adaptive_scaling * (1000-t)
-                        if signal_scale<0:
+                        signal_scale = self.pag_scale - self.pag_adaptive_scaling * (1000 - t)
+                        if signal_scale < 0:
                             signal_scale = 0
-                
-                    noise_pred = noise_pred_text + (self.guidance_scale - 1.0) * (noise_pred_text - noise_pred_uncond) + signal_scale * (noise_pred_text - noise_pred_text_perturb)
-                
+
+                    noise_pred = (
+                        noise_pred_text
+                        + (self.guidance_scale - 1.0) * (noise_pred_text - noise_pred_uncond)
+                        + signal_scale * (noise_pred_text - noise_pred_text_perturb)
+                    )
+
                 if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
                     # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                     noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
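Written out, the "both" branch above is a single update around the conditional prediction: noise_pred = noise_pred_text + (guidance_scale - 1) * (noise_pred_text - noise_pred_uncond) + signal_scale * (noise_pred_text - noise_pred_text_perturb). The first two terms are algebraically identical to standard CFG, noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond), and the PAG term adds an independent delta that pushes the prediction away from the perturbed identity self-attention path.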
@@ -1460,20 +1440,17 @@ class StableDiffusionPipeline(
 
         # Offload all models
         self.maybe_free_model_hooks()
-        
-        if not return_dict:
-            return (image, has_nsfw_concept)
-        
+
         # change attention layer in UNet if use PAG
-        if self.do_adversarial_guidance:
+        if self.do_perturbed_attention_guidance:
             drop_layers = self.pag_applied_layers_index
             for drop_layer in drop_layers:
                 try:
-                    if drop_layer[0] == 'd':
+                    if drop_layer[0] == "d":
                         down_layers[int(drop_layer[1])].processor = AttnProcessor2_0()
-                    elif drop_layer[0] == 'm':
+                    elif drop_layer[0] == "m":
                         mid_layers[int(drop_layer[1])].processor = AttnProcessor2_0()
-                    elif drop_layer[0] == 'u':
+                    elif drop_layer[0] == "u":
                         up_layers[int(drop_layer[1])].processor = AttnProcessor2_0()
                     else:
                         raise ValueError(f"Invalid layer type: {drop_layer[0]}")
@@ -1482,4 +1459,7 @@ class StableDiffusionPipeline(
                         f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
                     )
 
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
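For reference, a minimal usage sketch of the pipeline after this commit. It is not part of the diff: the base checkpoint and custom-pipeline repo id below are placeholders, and the prompt and scale values are illustrative; only the PAG keyword arguments mirror the new __call__ signature above.

import torch
from diffusers import DiffusionPipeline

# Placeholder ids: substitute the actual base model and the Hub repo hosting this pipeline.py.
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    custom_pipeline="<repo-with-this-pipeline.py>",
    torch_dtype=torch.float16,
).to("cuda")

output = pipe(
    "a photo of a corgi",              # illustrative prompt
    num_inference_steps=50,
    guidance_scale=0.0,                # CFG off: exercises the PAG-only branch
    pag_scale=5.0,                     # pag_scale > 0 turns do_perturbed_attention_guidance on
    pag_applied_layers_index=["d4"],   # 'd'/'m'/'u' plus an index into the collected self-attention layers
)
image = output.images[0]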