Spaces:

editing-images
/

ledits

Running on A10G

App Files Files Community

KatharinaK committed on Sep 25, 2023

Commit

2a869f2

•

1 Parent(s): d223295

Added attention masking and intersect masking; fix truncation of prompts

Browse files

Files changed (1) hide show

modified_pipeline_semantic_stable_diffusion.py +351 -33

modified_pipeline_semantic_stable_diffusion.py CHANGED Viewed

@@ -9,16 +9,180 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import KarrasDiffusionSchedulers
-from diffusers.utils import logging, randn_tensor
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 # from . import SemanticStableDiffusionPipelineOutput
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 class SemanticStableDiffusionPipeline(DiffusionPipeline):
     r"""
@@ -207,6 +371,29 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
         latents = latents * self.scheduler.init_noise_sigma
         return latents
     @torch.no_grad()
     def __call__(
         self,
@@ -235,7 +422,13 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
         edit_mom_beta: Optional[float] = 0.4,
         edit_weights: Optional[List[float]] = None,
         sem_guidance: Optional[List[torch.Tensor]] = None,
         # DDPM additions
         use_ddpm: bool = False,
         wts: Optional[List[torch.Tensor]] = None,
@@ -334,6 +527,12 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
             second element is a list of `bool`s denoting whether the corresponding generated image likely represents
             "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
         """
         # 0. Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
         width = width or self.unet.config.sample_size * self.vae_scale_factor
@@ -348,12 +547,12 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
             enable_edit_guidance = True
             if isinstance(editing_prompt, str):
                 editing_prompt = [editing_prompt]
-            enabled_editing_prompts = len(editing_prompt)
         elif editing_prompt_embeddings is not None:
             enable_edit_guidance = True
-            enabled_editing_prompts = editing_prompt_embeddings.shape[0]
         else:
-            enabled_editing_prompts = 0
             enable_edit_guidance = False
         # get prompt text embeddings
@@ -361,17 +560,23 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
             prompt,
             padding="max_length",
             max_length=self.tokenizer.model_max_length,
             return_tensors="pt",
         )
         text_input_ids = text_inputs.input_ids
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
             logger.warning(
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
         text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
         # duplicate text embeddings for each generation per prompt, using mps friendly method
@@ -382,24 +587,37 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
         if enable_edit_guidance:
             # get safety text embeddings
             if editing_prompt_embeddings is None:
                 edit_concepts_input = self.tokenizer(
                     [x for item in editing_prompt for x in repeat(item, batch_size)],
                     padding="max_length",
                     max_length=self.tokenizer.model_max_length,
                     return_tensors="pt",
                 )
                 edit_concepts_input_ids = edit_concepts_input.input_ids
-                if edit_concepts_input_ids.shape[-1] > self.tokenizer.model_max_length:
                     removed_text = self.tokenizer.batch_decode(
-                        edit_concepts_input_ids[:, self.tokenizer.model_max_length :]
                     )
                     logger.warning(
                         "The following part of your input was truncated because CLIP can only handle sequences up to"
                         f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                     )
-                    edit_concepts_input_ids = edit_concepts_input_ids[:, : self.tokenizer.model_max_length]
                 edit_concepts = self.text_encoder(edit_concepts_input_ids.to(self.device))[0]
             else:
                 edit_concepts = editing_prompt_embeddings.to(self.device).repeat(batch_size, 1, 1)
@@ -453,8 +671,11 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
             # to avoid doing two forward passes
             if enable_edit_guidance:
                 text_embeddings = torch.cat([uncond_embeddings, text_embeddings, edit_concepts])
             else:
                 text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
         # get the initial random noise unless the user supplied it
@@ -466,6 +687,9 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
           t_to_idx = {int(v):k for k,v in enumerate(timesteps[-zs.shape[0]:])}
           timesteps = timesteps[-zs.shape[0]:]
         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
         latents = self.prepare_latents(
@@ -493,7 +717,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
         for i, t in enumerate(self.progress_bar(timesteps)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = (
-                torch.cat([latents] * (2 + enabled_editing_prompts)) if do_classifier_free_guidance else latents
             )
             latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
@@ -502,7 +726,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
             # perform guidance
             if do_classifier_free_guidance:
-                noise_pred_out = noise_pred.chunk(2 + enabled_editing_prompts)  # [b,4, 64, 64]
                 noise_pred_uncond, noise_pred_text = noise_pred_out[0], noise_pred_out[1]
                 noise_pred_edit_concepts = noise_pred_out[2:]
@@ -589,27 +813,115 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
                         noise_guidance_edit_tmp = noise_guidance_edit_tmp * edit_guidance_scale_c
-                        # torch.quantile function expects float32
-                        if noise_guidance_edit_tmp.dtype == torch.float32:
-                            tmp = torch.quantile(
-                                torch.abs(noise_guidance_edit_tmp).flatten(start_dim=2),
-                                edit_threshold_c,
-                                dim=2,
-                                keepdim=False,
                             )
-                        else:
-                            tmp = torch.quantile(
-                                torch.abs(noise_guidance_edit_tmp).flatten(start_dim=2).to(torch.float32),
-                                edit_threshold_c,
-                                dim=2,
-                                keepdim=False,
-                            ).to(noise_guidance_edit_tmp.dtype)
-                        noise_guidance_edit_tmp = torch.where(
-                            torch.abs(noise_guidance_edit_tmp) >= tmp[:, :, None, None],
-                            noise_guidance_edit_tmp,
-                            torch.zeros_like(noise_guidance_edit_tmp),
-                        )
                         noise_guidance_edit[c, :, :, :, :] = noise_guidance_edit_tmp
                         # noise_guidance_edit = noise_guidance_edit + noise_guidance_edit_tmp
@@ -712,6 +1024,12 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
             else: #if not use_ddpm:
               latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
             # call the callback, if provided
             if callback is not None and i % callback_steps == 0:
                 callback(i, t, latents)

 from diffusers.image_processor import VaeImageProcessor
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.attention_processor import AttnProcessor, Attention
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import logging
+from diffusers.utils.torch_utils import randn_tensor
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 # from . import SemanticStableDiffusionPipelineOutput
+import torch.nn.functional as F
+import math
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
class AttentionStore:
    """Buffer and accumulate attention maps reported by attention processors.

    Maps received during the current denoising step are kept in
    ``step_store``, a dict keyed ``"{down|mid|up}_{cross|self}"`` whose values
    are lists of tensors in arrival order. ``between_steps`` closes a step and
    either folds that buffer into ``attention_store`` or discards it.

    With ``average=True`` ``attention_store`` is a running per-layer sum that
    ``get_attention`` divides by the number of stored steps. With
    ``average=False`` it is a list holding one ``step_store`` dict per stored
    step.
    """

    def __init__(self, average: bool):
        self.step_store = self.get_empty_store()
        # Empty list until the first step is stored; becomes a dict of summed
        # maps in averaging mode, or a list of per-step dicts otherwise.
        self.attention_store = []
        # Number of steps that were actually stored (not steps seen).
        self.cur_step = 0
        self.average = average

    @staticmethod
    def get_empty_store():
        """Return a fresh buffer with one empty list per UNet location/kind."""
        return {"down_cross": [], "mid_cross": [], "up_cross": [],
                "down_self": [], "mid_self": [], "up_self": []}

    def __call__(self, attn, is_cross: bool, place_in_unet: str, editing_prompts, PnP):
        """Record one attention map, dropping the leading batch portions.

        Args:
            attn: Attention tensor; its first axis is
                ``batch_size * head_size`` followed by query and key lengths.
            is_cross: Whether the map comes from a cross-attention layer.
            place_in_unet: ``"down"``, ``"mid"`` or ``"up"``.
            editing_prompts: Number of editing prompts in the batch.
            PnP: Whether the batch carries one additional leading portion.
        """
        # The batch is laid out as 2 portions (+1 with PnP) + one per edit.
        bs = 2 + int(PnP) + editing_prompts
        source_batch_size = int(attn.shape[0] // bs)
        skip = 2 if PnP else 1  # skip PnP & unconditional
        self.forward(attn[skip * source_batch_size:], is_cross, place_in_unet)

    def forward(self, attn, is_cross: bool, place_in_unet: str):
        """Append ``attn`` to the current step buffer unless it is too large."""
        key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
        if attn.shape[1] <= 32 ** 2:  # avoid memory overhead
            self.step_store[key].append(attn)

    def between_steps(self, store_step=True):
        """Close the current step and reset the step buffer.

        Args:
            store_step: When true, the buffered maps are kept (summed into the
                running total in averaging mode, appended otherwise) and the
                stored-step counter is incremented. When false they are
                discarded.
        """
        if store_step:
            if self.average:
                if not self.attention_store:
                    # First stored step: adopt the buffer as the running sum.
                    self.attention_store = self.step_store
                else:
                    for key in self.attention_store:
                        accumulated = self.attention_store[key]
                        for i in range(len(accumulated)):
                            accumulated[i] += self.step_store[key][i]
            else:
                if not self.attention_store:
                    self.attention_store = [self.step_store]
                else:
                    self.attention_store.append(self.step_store)
            self.cur_step += 1
        self.step_store = self.get_empty_store()

    def get_attention(self, step: Optional[int] = None):
        """Return the stored attention maps.

        Args:
            step: Ignored in averaging mode. Otherwise the index into the list
                of stored steps (storage order, not the sampler step number).

        Returns:
            In averaging mode, a dict of per-layer maps divided by the number
            of stored steps (an empty dict if nothing was stored yet).
            Otherwise the dict stored at position ``step``.

        Raises:
            ValueError: If ``step`` is ``None`` in non-averaging mode.
        """
        if self.average:
            return {
                key: [item / self.cur_step for item in self.attention_store[key]]
                for key in self.attention_store
            }
        if step is None:
            raise ValueError("`step` is required when the store does not average over steps.")
        return self.attention_store[step]

    def aggregate_attention(self, attention_maps, prompts, res: int,
        from_where: List[str], is_cross: bool, select: int
    ):
        """Average all maps of resolution ``res`` for one selected prompt.

        Args:
            attention_maps: Dict shaped like ``step_store``.
            prompts: Sequence whose length gives the number of batch entries
                each map is split into.
            res: Spatial resolution; only maps with ``res ** 2`` query
                positions are used.
            from_where: UNet locations to read (``"down"``, ``"mid"``, ``"up"``).
            is_cross: Read cross-attention (true) or self-attention maps.
            select: Index of the batch entry to extract.

        Returns:
            Tensor of shape ``(res, res, key_len)``: the mean over the leading
            axis of all selected maps concatenated across layers.

        Raises:
            ValueError: If no stored map matches ``res`` at the given locations.
        """
        kind = "cross" if is_cross else "self"
        num_pixels = res ** 2
        out = []
        for location in from_where:
            for item in attention_maps[f"{location}_{kind}"]:
                if item.shape[1] == num_pixels:
                    cross_maps = item.reshape(len(prompts), -1, res, res, item.shape[-1])[select]
                    out.append(cross_maps)
        if not out:
            # torch.cat would otherwise fail with an unrelated-looking message.
            raise ValueError(
                f"No {kind}-attention maps with {num_pixels} query positions "
                f"(res={res}) found in {list(from_where)}."
            )
        out = torch.cat(out, dim=0)
        # average over heads
        out = out.sum(0) / out.shape[0]
        return out
class CrossAttnProcessor:
    """Cross-attention processor that reports attention probabilities to a store.

    It performs the attention computation for a cross-attention layer and,
    before applying the probabilities to the values, hands them to
    ``attention_store`` together with the layer's location in the UNet.
    """

    def __init__(self, attention_store, place_in_unet, PnP, editing_prompts):
        """
        Args:
            attention_store: Callable invoked as
                ``attention_store(probs, is_cross=..., place_in_unet=...,
                editing_prompts=..., PnP=...)`` on every forward pass.
            place_in_unet: Location label forwarded to the store.
            PnP: Flag forwarded to the store.
            editing_prompts: Number of editing prompts, forwarded to the store.
        """
        self.attnstore = attention_store
        self.place_in_unet = place_in_unet
        self.editing_prompts = editing_prompts
        self.PnP = PnP

    def __call__(
        self,
        attn: "Attention",
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Run cross-attention and record the attention probabilities.

        Args:
            attn: The attention module supplying projections and helpers.
            hidden_states: Query-side input; must not be 4-dimensional.
            encoder_hidden_states: Key/value-side input; required.
            attention_mask: Optional mask passed to ``attn.prepare_attention_mask``.
            temb: Accepted for signature compatibility; unused.

        Returns:
            The projected attention output divided by
            ``attn.rescale_output_factor``.

        Raises:
            ValueError: If the layer uses a residual connection, spatial norm
                or group norm, if ``hidden_states`` is 4-dimensional, or if
                ``encoder_hidden_states`` is missing.
        """
        # Explicit raises instead of asserts: asserts vanish under `python -O`,
        # which would let an unsupported layer run and produce wrong output.
        if attn.residual_connection:
            raise ValueError("CrossAttnProcessor does not support residual connections.")
        if attn.spatial_norm is not None:
            raise ValueError("CrossAttnProcessor does not support spatial_norm.")
        if attn.group_norm is not None:
            raise ValueError("CrossAttnProcessor does not support group_norm.")
        if hidden_states.ndim == 4:
            raise ValueError("CrossAttnProcessor does not support 4-dimensional hidden_states.")
        if encoder_hidden_states is None:
            raise ValueError("CrossAttnProcessor requires encoder_hidden_states (cross-attention only).")

        batch_size, sequence_length, _ = encoder_hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        query = attn.to_q(hidden_states)

        if attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        # Report the probabilities before they are consumed below.
        self.attnstore(attention_probs,
                       is_cross=True,
                       place_in_unet=self.place_in_unet,
                       editing_prompts=self.editing_prompts,
                       PnP=self.PnP)

        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        hidden_states = hidden_states / attn.rescale_output_factor
        return hidden_states
+# Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionAttendAndExcitePipeline.GaussianSmoothing
class GaussianSmoothing:
    """Fixed 3x3 smoothing filter for single-channel 2D maps.

    The kernel is built once at construction time and stored in ``weight``
    with shape ``(1, 1, 3, 3)``; calling the instance convolves the input
    with it.
    """

    def __init__(self, device):
        """Build the normalized kernel and place it on ``device``."""
        kernel_size = [3, 3]
        sigma = [0.5, 0.5]

        # The gaussian kernel is the product of the gaussian function of each dimension.
        kernel = 1
        # indexing="ij" is the historical default; passing it explicitly
        # avoids torch's deprecation warning without changing the result.
        meshgrids = torch.meshgrid(
            *[torch.arange(size, dtype=torch.float32) for size in kernel_size],
            indexing="ij",
        )
        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
            mean = (size - 1) / 2
            # NOTE(review): the exponent divides by (2 * std) before squaring,
            # which differs from the textbook Gaussian; kept unchanged so the
            # produced masks stay numerically identical.
            kernel = kernel * (
                1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
            )

        # Make sure sum of values in gaussian kernel equals 1.
        kernel = kernel / torch.sum(kernel)

        # Reshape to a single-channel convolutional weight: (out=1, in=1, 3, 3).
        kernel = kernel.view(1, 1, *kernel.size())

        self.weight = kernel.to(device)

    def __call__(self, input):
        """
        Apply gaussian filter to input.

        Arguments:
            input (torch.Tensor): Input to apply gaussian filter on. It is
                passed to ``F.conv2d`` with a ``(1, 1, 3, 3)`` weight, so it
                needs a single channel; no padding is applied here, so each
                spatial dimension shrinks by 2.
        Returns:
            filtered (torch.Tensor): Filtered output.
        """
        return F.conv2d(input, weight=self.weight.to(input.dtype))
 class SemanticStableDiffusionPipeline(DiffusionPipeline):
     r"""
         latents = latents * self.scheduler.init_noise_sigma
         return latents
+    def prepare_unet(self, attention_store, PnP: bool):
+        attn_procs = {}
+        for name in self.unet.attn_processors.keys():
+            if name.startswith("mid_block"):
+                place_in_unet = "mid"
+            elif name.startswith("up_blocks"):
+                place_in_unet = "up"
+            elif name.startswith("down_blocks"):
+                place_in_unet = "down"
+            else:
+                continue
+            if "attn2" in name:
+                attn_procs[name] = CrossAttnProcessor(
+                    attention_store=attention_store,
+                    place_in_unet=place_in_unet,
+                    PnP=PnP,
+                    editing_prompts=self.enabled_editing_prompts)
+            else:
+                attn_procs[name] = AttnProcessor()
+        self.unet.set_attn_processor(attn_procs)
     @torch.no_grad()
     def __call__(
         self,
         edit_mom_beta: Optional[float] = 0.4,
         edit_weights: Optional[List[float]] = None,
         sem_guidance: Optional[List[torch.Tensor]] = None,
+        # masking
+        use_cross_attn_mask: bool = False,
+        use_intersect_mask: bool = True,
+        edit_tokens_for_attn_map: List[str] = None,
+        # Attention store (just for visualization purposes)
+        attn_store_steps: Optional[List[int]] = [],
+        store_averaged_over_steps: bool = True,
         # DDPM additions
         use_ddpm: bool = False,
         wts: Optional[List[torch.Tensor]] = None,
             second element is a list of `bool`s denoting whether the corresponding generated image likely represents
             "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
         """
+        if use_intersect_mask:
+            use_cross_attn_mask = True
+        if use_cross_attn_mask:
+            self.smoothing = GaussianSmoothing(self.device)
         # 0. Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
         width = width or self.unet.config.sample_size * self.vae_scale_factor
             enable_edit_guidance = True
             if isinstance(editing_prompt, str):
                 editing_prompt = [editing_prompt]
+            self.enabled_editing_prompts = len(editing_prompt)
         elif editing_prompt_embeddings is not None:
             enable_edit_guidance = True
+            self.enabled_editing_prompts = editing_prompt_embeddings.shape[0]
         else:
+            self.enabled_editing_prompts = 0
             enable_edit_guidance = False
         # get prompt text embeddings
             prompt,
             padding="max_length",
             max_length=self.tokenizer.model_max_length,
+            truncation=True,
             return_tensors="pt",
         )
         text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+            text_input_ids, untruncated_ids
+        ):
+            removed_text = self.tokenizer.batch_decode(
+                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+            )
             logger.warning(
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
         text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         if enable_edit_guidance:
             # get safety text embeddings
             if editing_prompt_embeddings is None:
+                if edit_tokens_for_attn_map is not None:
+                    edit_tokens = [[word.replace("</w>", "") for word in self.tokenizer.tokenize(item)] for item in editing_prompt]
+                    #print(f"edit_tokens: {edit_tokens}")
                 edit_concepts_input = self.tokenizer(
                     [x for item in editing_prompt for x in repeat(item, batch_size)],
                     padding="max_length",
                     max_length=self.tokenizer.model_max_length,
+                    truncation=True,
                     return_tensors="pt",
+                    return_length=True
                 )
+                num_edit_tokens = edit_concepts_input.length -2 # not counting startoftext and endoftext
                 edit_concepts_input_ids = edit_concepts_input.input_ids
+                untruncated_ids = self.tokenizer(
+                    [x for item in editing_prompt for x in repeat(item, batch_size)],
+                    padding="longest",
+                    return_tensors="pt").input_ids
+                if untruncated_ids.shape[-1] >= edit_concepts_input_ids.shape[-1] and not torch.equal(
+                    edit_concepts_input_ids, untruncated_ids
+                ):
                     removed_text = self.tokenizer.batch_decode(
+                        untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                     )
                     logger.warning(
                         "The following part of your input was truncated because CLIP can only handle sequences up to"
                         f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                     )
                 edit_concepts = self.text_encoder(edit_concepts_input_ids.to(self.device))[0]
             else:
                 edit_concepts = editing_prompt_embeddings.to(self.device).repeat(batch_size, 1, 1)
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
             # to avoid doing two forward passes
+            self.text_cross_attention_maps = [prompt] if isinstance(prompt, str) else prompt
             if enable_edit_guidance:
                 text_embeddings = torch.cat([uncond_embeddings, text_embeddings, edit_concepts])
+                self.text_cross_attention_maps += \
+                    ([editing_prompt] if isinstance(editing_prompt, str) else editing_prompt)
             else:
                 text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
         # get the initial random noise unless the user supplied it
           t_to_idx = {int(v):k for k,v in enumerate(timesteps[-zs.shape[0]:])}
           timesteps = timesteps[-zs.shape[0]:]
+        self.attention_store = AttentionStore(average=store_averaged_over_steps)
+        self.prepare_unet(self.attention_store, False)
         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
         latents = self.prepare_latents(
         for i, t in enumerate(self.progress_bar(timesteps)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = (
+                torch.cat([latents] * (2 + self.enabled_editing_prompts)) if do_classifier_free_guidance else latents
             )
             latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
             # perform guidance
             if do_classifier_free_guidance:
+                noise_pred_out = noise_pred.chunk(2 + self.enabled_editing_prompts)  # [b,4, 64, 64]
                 noise_pred_uncond, noise_pred_text = noise_pred_out[0], noise_pred_out[1]
                 noise_pred_edit_concepts = noise_pred_out[2:]
                         noise_guidance_edit_tmp = noise_guidance_edit_tmp * edit_guidance_scale_c
+                        if use_cross_attn_mask:
+                            out = self.attention_store.aggregate_attention(
+                                attention_maps=self.attention_store.step_store,
+                                prompts=self.text_cross_attention_maps,
+                                res=16,
+                                from_where=["up","down"],
+                                is_cross=True,
+                                select=self.text_cross_attention_maps.index(editing_prompt[c]),
                             )
+                            attn_map = out[:, :, 1:] # 0 -> startoftext
+                            attn_map *= 100
+                            attn_map = torch.nn.functional.softmax(attn_map, dim=-1)
+                            attn_map = attn_map[:,:,:num_edit_tokens[c]] # -1 -> endoftext
+                            assert(attn_map.shape[2]==num_edit_tokens[c])
+                            if edit_tokens_for_attn_map is not None:
+                                # select attn_map for specified tokens
+                                token_idx = [edit_tokens[c].index(item) for item in edit_tokens_for_attn_map[c]]
+                                attn_map = attn_map[:,:,token_idx]
+                                assert(attn_map.shape[2] == len(edit_tokens_for_attn_map[c]))
+                            # average over tokens
+                            attn_map = torch.sum(attn_map, dim=2)
+                            # gaussian_smoothing
+                            attn_map = F.pad(attn_map.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode="reflect")
+                            attn_map = self.smoothing(attn_map).squeeze(0).squeeze(0)
+                            # torch.quantile function expects float32
+                            if attn_map.dtype == torch.float32:
+                                tmp = torch.quantile(
+                                    attn_map.flatten(),
+                                    edit_threshold_c
+                                )
+                            else:
+                                tmp = torch.quantile(
+                                    attn_map.flatten().to(torch.float32),
+                                    edit_threshold_c
+                                ).to(attn_map.dtype)
+                            attn_mask = torch.where(attn_map >= tmp, 1.0, 0.0)
+                            # resolution must match latent space dimension
+                            attn_mask = F.interpolate(
+                                attn_mask.unsqueeze(0).unsqueeze(0),
+                                noise_guidance_edit_tmp.shape[-2:] # 64,64
+                            )[0,0,:,:]
+                            if not use_intersect_mask:
+                                noise_guidance_edit_tmp = noise_guidance_edit_tmp * attn_mask
+                        if use_intersect_mask:
+                            noise_guidance_edit_tmp_quantile = torch.abs(noise_guidance_edit_tmp)
+                            noise_guidance_edit_tmp_quantile = torch.sum(noise_guidance_edit_tmp_quantile, dim=1, keepdim=True)
+                            noise_guidance_edit_tmp_quantile = noise_guidance_edit_tmp_quantile.repeat(1,4,1,1)
+                            if noise_guidance_edit_tmp_quantile.dtype == torch.float32:
+                                tmp = torch.quantile(
+                                    noise_guidance_edit_tmp_quantile.flatten(start_dim=2),
+                                    edit_threshold_c,
+                                    dim=2,
+                                    keepdim=False,
+                                )
+                            else:
+                                tmp = torch.quantile(
+                                    noise_guidance_edit_tmp_quantile.flatten(start_dim=2).to(torch.float32),
+                                    edit_threshold_c,
+                                    dim=2,
+                                    keepdim=False,
+                                ).to(noise_guidance_edit_tmp_quantile.dtype)
+                            sega_mask = torch.where(
+                                noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None],
+                                torch.ones_like(noise_guidance_edit_tmp),
+                                torch.zeros_like(noise_guidance_edit_tmp),
+                            )
+                            intersect_mask = sega_mask * attn_mask
+                            noise_guidance_edit_tmp = noise_guidance_edit_tmp * intersect_mask
+                        elif not use_cross_attn_mask:
+                            # calculate quantile
+                            noise_guidance_edit_tmp_quantile = torch.abs(noise_guidance_edit_tmp)
+                            noise_guidance_edit_tmp_quantile = torch.sum(noise_guidance_edit_tmp_quantile, dim=1, keepdim=True)
+                            noise_guidance_edit_tmp_quantile = noise_guidance_edit_tmp_quantile.repeat(1,4,1,1)
+                            # torch.quantile function expects float32
+                            if noise_guidance_edit_tmp_quantile.dtype == torch.float32:
+                                tmp = torch.quantile(
+                                    noise_guidance_edit_tmp_quantile.flatten(start_dim=2),
+                                    edit_threshold_c,
+                                    dim=2,
+                                    keepdim=False,
+                                )
+                            else:
+                                tmp = torch.quantile(
+                                    noise_guidance_edit_tmp_quantile.flatten(start_dim=2).to(torch.float32),
+                                    edit_threshold_c,
+                                    dim=2,
+                                    keepdim=False,
+                                ).to(noise_guidance_edit_tmp_quantile.dtype)
+                            noise_guidance_edit_tmp = torch.where(
+                                noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None],
+                                noise_guidance_edit_tmp,
+                                torch.zeros_like(noise_guidance_edit_tmp),
+                            )
                         noise_guidance_edit[c, :, :, :, :] = noise_guidance_edit_tmp
                         # noise_guidance_edit = noise_guidance_edit + noise_guidance_edit_tmp
             else: #if not use_ddpm:
               latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+            # step callback
+            store_step = i in attn_store_steps
+            if store_step:
+                print("storing attention")
+            self.attention_store.between_steps(store_step)
             # call the callback, if provided
             if callback is not None and i % callback_steps == 0:
                 callback(i, t, latents)