Sanster committed on
Commit
7153d36
1 Parent(s): 0b78eba

Upload 4 files

Browse files
demo.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ from PIL import Image, ImageFilter, ImageOps
5
+ from pipeline_PowerPaint import StableDiffusionInpaintPipeline as Pipeline
6
+ from power_paint_tokenizer import PowerPaintTokenizer
7
+ from diffusers.utils import load_image
8
+
9
+
10
def add_task_to_prompt(prompt, negative_prompt, task):
    """Append the PowerPaint task tokens to the prompts for ``task``.

    Args:
        prompt: Positive text prompt. ``None`` is treated as empty text.
        negative_prompt: Negative text prompt. ``None`` is treated as empty
            text when a token has to be appended; otherwise it is returned
            unchanged.
        task: Task name. ``"object-removal"`` and ``"image-outpainting"``
            use ``P_ctxt`` for the prompts and ``P_obj`` for the negative
            prompts; ``"shape-guided"`` uses ``P_shape`` / ``P_ctxt`` for
            prompt A / B; any other value uses ``P_obj`` for both prompts.

    Returns:
        Tuple ``(promptA, promptB, negative_promptA, negative_promptB)``.
    """

    def with_token(text, token):
        # A missing prompt must not crash the concatenation.
        return ("" if text is None else text) + " " + token

    if task in ("object-removal", "image-outpainting"):
        positive = with_token(prompt, "P_ctxt")
        negative = with_token(negative_prompt, "P_obj")
        return positive, positive, negative, negative

    if task == "shape-guided":
        return (
            with_token(prompt, "P_shape"),
            with_token(prompt, "P_ctxt"),
            negative_prompt,
            negative_prompt,
        )

    # Default: plain text-guided inpainting.
    positive = with_token(prompt, "P_obj")
    return positive, positive, negative_prompt, negative_prompt
33
+
34
+
35
@torch.inference_mode()
def predict(
    pipe,
    input_image,
    prompt,
    fitting_degree,
    ddim_steps,
    scale,
    negative_prompt,
    task,
):
    """Run one PowerPaint pass and build the preview images.

    ``input_image`` is a dict with PIL images under ``"image"`` and
    ``"mask"``. Both entries are overwritten with their resized versions, so
    the caller's dict is modified.

    ``fitting_degree`` is forwarded as both ``tradoff`` and ``tradoff_nag``;
    ``scale`` and ``ddim_steps`` become ``guidance_scale`` and
    ``num_inference_steps``.

    Returns:
        ``(dict_out, dict_res)`` where ``dict_out`` is
        ``[resized RGB input, result blended into the input]`` and
        ``dict_res`` is ``[resized RGB mask, result with a red mask overlay]``.
    """
    # Scale so the shorter side becomes 640 px, keeping the aspect ratio.
    source_rgb = input_image["image"].convert("RGB")
    width, height = source_rgb.size
    if width < height:
        target_size = (640, int(height / width * 640))
    else:
        target_size = (int(width / height * 640), 640)
    input_image["image"] = source_rgb.resize(target_size)

    task_prompts = add_task_to_prompt(prompt, negative_prompt, task)
    promptA, promptB, negative_promptA, negative_promptB = task_prompts
    print(promptA, promptB, negative_promptA, negative_promptB)

    # Round both sides down to a multiple of 8
    # (presumably required by the pipeline's latent resolution -- confirm).
    cols, rows = input_image["image"].size
    out_w = int(cols - cols % 8)
    out_h = int(rows - rows % 8)
    final_size = (out_w, out_h)
    input_image["image"] = input_image["image"].resize(final_size)
    input_image["mask"] = input_image["mask"].resize(final_size)

    image_rgb = input_image["image"].convert("RGB")
    mask_rgb = input_image["mask"].convert("RGB")

    result = pipe(
        promptA=promptA,
        promptB=promptB,
        tradoff=fitting_degree,
        tradoff_nag=fitting_degree,
        negative_promptA=negative_promptA,
        negative_promptB=negative_promptB,
        image=image_rgb,
        mask_image=mask_rgb,
        width=out_w,
        height=out_h,
        guidance_scale=scale,
        num_inference_steps=ddim_steps,
    ).images[0]

    # Preview 1: tint the masked region red on top of the generated image.
    result_arr = np.array(result)
    mask_weight = np.array(mask_rgb).astype("float") / 512.0
    red = result_arr.astype("float") * 1
    red[:, :, :3] = (180.0, 0.0, 0.0)
    overlay = result_arr.astype("float") * (1 - mask_weight) + mask_weight * red
    result_m = Image.fromarray(overlay.astype("uint8"))

    # Preview 2: paste the generated pixels into the input through a
    # Gaussian-blurred mask.
    blurred_mask = mask_rgb.filter(ImageFilter.GaussianBlur(radius=3))
    soft_mask = np.asarray(blurred_mask) / 255.0
    original_np = np.asarray(image_rgb) / 255.0
    generated_np = np.asarray(result) / 255.0
    blended = generated_np * soft_mask + (1 - soft_mask) * original_np
    result_paste = Image.fromarray(np.uint8(blended * 255))

    dict_res = [input_image["mask"].convert("RGB"), result_m]
    dict_out = [input_image["image"].convert("RGB"), result_paste]
    return dict_out, dict_res
107
+
108
+
109
# Load the PowerPaint inpainting pipeline in half precision, wrap its
# tokenizer with PowerPaintTokenizer and move it to the Apple "mps" device.
pipe = Pipeline.from_pretrained(
    "Sanster/PowerPaint-V1-stable-diffusion-inpainting",
    torch_dtype=torch.float16,
    safety_checker=None,
    variant="fp16",
)
pipe.tokenizer = PowerPaintTokenizer(pipe.tokenizer)
pipe = pipe.to("mps")


# Example image and mask from the latent-diffusion repository.
img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
image = load_image(img_url).convert("RGB")
mask = load_image(mask_url).convert("RGB")

# predict() resizes the entries of this dict in place, so later tasks see the
# already-resized image and mask.
input_image = {"image": image, "mask": mask}
prompt = "Face of a fox sitting on a bench"
negative_prompt = "out of frame, lowres, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, disfigured, gross proportions, malformed limbs, watermark, signature"
fitting_degree = 1
ddim_steps = 30
tasks = [
    {
        "task": "object-removal",
        "guidance_scale": 12,
        "prompt": "",
        "negative_prompt": "",
    },
    {
        "task": "shape-guided",
        "guidance_scale": 7.5,
        "prompt": prompt,
        "negative_prompt": negative_prompt,
    },
    {
        "task": "inpaint",
        "guidance_scale": 7.5,
        "prompt": prompt,
        "negative_prompt": negative_prompt,
    },
    {
        "task": "image-outpainting",
        "guidance_scale": 7.5,
        "prompt": "A dog sitting on a bench",
        "negative_prompt": negative_prompt,
    },
]

for task in tasks:
    if task["task"] == "image-outpainting":
        # Pad the image with a grey border and build a mask that is white on
        # the border (area to generate) and black over the existing image.
        margin = 128
        input_image["image"] = ImageOps.expand(
            input_image["image"],
            border=(margin, margin, margin, margin),
            fill=(127, 127, 127),
        )
        outpaint_mask = np.zeros_like(np.asarray(input_image["mask"]))
        input_image["mask"] = Image.fromarray(
            cv2.copyMakeBorder(
                outpaint_mask,
                margin,
                margin,
                margin,
                margin,
                cv2.BORDER_CONSTANT,
                value=(255, 255, 255),
            )
        )

    dict_out, dict_res = predict(
        pipe,
        input_image,
        task["prompt"],
        fitting_degree,
        ddim_steps,
        task["guidance_scale"],
        task["negative_prompt"],
        # Pass the task *name*: add_task_to_prompt compares it with strings,
        # so passing the whole dict always selected the default tokens.
        task["task"],
    )

    result_image = dict_out[1]
    result_image.save(f"{task['task']}_result.png")
pipeline_PowerPaint.py ADDED
@@ -0,0 +1,1243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import numpy as np
19
+ import PIL
20
+ import torch
21
+ from packaging import version
22
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
23
+ from diffusers.configuration_utils import FrozenDict
24
+ from diffusers.image_processor import VaeImageProcessor
25
+ from diffusers.loaders import (
26
+ FromSingleFileMixin,
27
+ LoraLoaderMixin,
28
+ TextualInversionLoaderMixin,
29
+ )
30
+ from diffusers.models import (
31
+ AsymmetricAutoencoderKL,
32
+ AutoencoderKL,
33
+ UNet2DConditionModel,
34
+ )
35
+ from diffusers.schedulers import KarrasDiffusionSchedulers
36
+ from diffusers.utils import (
37
+ deprecate,
38
+ is_accelerate_available,
39
+ is_accelerate_version,
40
+ logging,
41
+ )
42
+ from diffusers.utils.torch_utils import randn_tensor
43
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
44
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
45
+ from diffusers.pipelines.stable_diffusion.safety_checker import (
46
+ StableDiffusionSafetyChecker,
47
+ )
48
+
49
+
50
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
51
+
52
+
53
def prepare_mask_and_masked_image(
    image, mask, height, width, return_image: bool = False
):
    """
    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
    ``image`` and ``1`` for the ``mask``.

    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
    binarized (values ``>= 0.5`` become ``1``, everything else ``0``). A ``torch.Tensor`` mask passed by the caller
    is not modified; the binarization is done on a copy.

    Args:
        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
        height (int): Target height; only used to resize ``PIL.Image`` inputs.
        width (int): Target width; only used to resize ``PIL.Image`` inputs.
        return_image (bool): When ``True`` the preprocessed image is returned as a third element.

    Raises:
        ValueError: ``image`` or ``mask`` is ``None``; ``torch.Tensor`` images should be in the ``[-1, 1]`` range;
            ``torch.Tensor`` mask should be in the ``[0, 1]`` range.
        AssertionError: tensor ``mask`` and ``image`` do not have matching batch size / spatial dimensions.
        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
            (or the other way around).

    Returns:
        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
            dimensions: ``batch x channels x height x width``. With ``return_image=True`` the triple
            (mask, masked_image, image).
    """

    if image is None:
        raise ValueError("`image` input cannot be undefined.")

    if mask is None:
        raise ValueError("`mask_image` input cannot be undefined.")

    if isinstance(image, torch.Tensor):
        if not isinstance(mask, torch.Tensor):
            raise TypeError(
                f"`image` is a torch.Tensor but `mask` (type: {type(mask)}) is not"
            )

        # Batch single image
        if image.ndim == 3:
            assert (
                image.shape[0] == 3
            ), "Image outside a batch should be of shape (3, H, W)"
            image = image.unsqueeze(0)

        # Batch and add channel dim for single mask
        if mask.ndim == 2:
            mask = mask.unsqueeze(0).unsqueeze(0)

        # Batch single mask or add channel dim
        if mask.ndim == 3:
            # Single batched mask, no channel dim or single mask not batched but channel dim
            if mask.shape[0] == 1:
                mask = mask.unsqueeze(0)

            # Batched masks no channel dim
            else:
                mask = mask.unsqueeze(1)

        # NOTE: asserts are kept (rather than raising ValueError) so the
        # exception type seen by existing callers does not change.
        assert (
            image.ndim == 4 and mask.ndim == 4
        ), "Image and Mask must have 4 dimensions"
        assert (
            image.shape[-2:] == mask.shape[-2:]
        ), "Image and Mask must have the same spatial dimensions"
        assert (
            image.shape[0] == mask.shape[0]
        ), "Image and Mask must have the same batch size"

        # Check image is in [-1, 1]
        if image.min() < -1 or image.max() > 1:
            raise ValueError("Image should be in [-1, 1] range")

        # Check mask is in [0, 1]
        if mask.min() < 0 or mask.max() > 1:
            raise ValueError("Mask should be in [0, 1] range")

        # Binarize on a copy: `mask` may still share storage with the
        # caller's tensor (unsqueeze returns a view), and it must not be
        # overwritten as a side effect.
        mask = mask.clone()
        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1

        # Image as float32
        image = image.to(dtype=torch.float32)
    elif isinstance(mask, torch.Tensor):
        raise TypeError(
            f"`mask` is a torch.Tensor but `image` (type: {type(image)}) is not"
        )
    else:
        # preprocess image
        if isinstance(image, (PIL.Image.Image, np.ndarray)):
            image = [image]
        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
            # resize all images w.r.t. the passed height and width
            image = [
                i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image
            ]
            image = [np.array(i.convert("RGB"))[None, :] for i in image]
            image = np.concatenate(image, axis=0)
        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
            image = np.concatenate([i[None, :] for i in image], axis=0)

        # NHWC -> NCHW, then map [0, 255] to [-1, 1]
        image = image.transpose(0, 3, 1, 2)
        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0

        # preprocess mask
        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
            mask = [mask]

        if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
            mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
            mask = np.concatenate(
                [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0
            )
            mask = mask.astype(np.float32) / 255.0
        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
            # np.concatenate allocates a new array, so the in-place
            # binarization below does not touch the caller's arrays.
            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)

        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1
        mask = torch.from_numpy(mask)

    # Zero out the pixels that are going to be inpainted.
    masked_image = image * (mask < 0.5)

    # n.b. ensure backwards compatibility as old function does not return image
    if return_image:
        return mask, masked_image, image

    return mask, masked_image
186
+
187
+
188
+ class StableDiffusionInpaintPipeline(
189
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
190
+ ):
191
+ r"""
192
+ Pipeline for text-guided image inpainting using Stable Diffusion.
193
+
194
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
195
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
196
+
197
+ The pipeline also inherits the following loading methods:
198
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
199
+ - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
200
+ - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
201
+
202
+ Args:
203
+ vae ([`AutoencoderKL`, `AsymmetricAutoencoderKL`]):
204
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
205
+ text_encoder ([`CLIPTextModel`]):
206
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
207
+ tokenizer ([`~transformers.CLIPTokenizer`]):
208
+ A `CLIPTokenizer` to tokenize text.
209
+ unet ([`UNet2DConditionModel`]):
210
+ A `UNet2DConditionModel` to denoise the encoded image latents.
211
+ scheduler ([`SchedulerMixin`]):
212
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
213
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
214
+ safety_checker ([`StableDiffusionSafetyChecker`]):
215
+ Classification module that estimates whether generated images could be considered offensive or harmful.
216
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
217
+ about a model's potential harms.
218
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
219
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
220
+ """
221
+ _optional_components = ["safety_checker", "feature_extractor"]
222
+
223
def __init__(
    self,
    vae: Union[AutoencoderKL, AsymmetricAutoencoderKL],
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    scheduler: KarrasDiffusionSchedulers,
    safety_checker: StableDiffusionSafetyChecker,
    feature_extractor: CLIPImageProcessor,
    requires_safety_checker: bool = True,
):
    """Register the pipeline components and patch outdated configs.

    Outdated scheduler settings (``steps_offset != 1``,
    ``skip_prk_steps`` set to ``False``) and an old UNet ``sample_size``
    below 64 are reported through ``deprecate`` and overridden on the
    component's internal config.

    Raises:
        ValueError: if a ``safety_checker`` is given without a
            ``feature_extractor``.
    """
    super().__init__()

    if (
        hasattr(scheduler.config, "steps_offset")
        and scheduler.config.steps_offset != 1
    ):
        deprecation_message = (
            f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
            f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
            "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
            " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
            " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
            " file"
        )
        deprecate(
            "steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False
        )
        new_config = dict(scheduler.config)
        new_config["steps_offset"] = 1
        scheduler._internal_dict = FrozenDict(new_config)

    if (
        hasattr(scheduler.config, "skip_prk_steps")
        and scheduler.config.skip_prk_steps is False
    ):
        deprecation_message = (
            f"The configuration file of this scheduler: {scheduler} has not set the configuration"
            " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
            " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
            " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
            " Hub, it would be very nice if you could open a Pull request for the"
            " `scheduler/scheduler_config.json` file"
        )
        deprecate(
            "skip_prk_steps not set",
            "1.0.0",
            deprecation_message,
            standard_warn=False,
        )
        new_config = dict(scheduler.config)
        new_config["skip_prk_steps"] = True
        scheduler._internal_dict = FrozenDict(new_config)

    if safety_checker is None and requires_safety_checker:
        logger.warning(
            f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
            " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
            " results in services or applications open to the public. Both the diffusers team and Hugging Face"
            " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
            " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
            " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
        )

    if safety_checker is not None and feature_extractor is None:
        # The first literal must be an f-string, otherwise the class
        # placeholder is printed verbatim.
        raise ValueError(
            f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
            " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
        )

    # Configs written by diffusers < 0.9.0 may carry a too-small sample size.
    is_unet_version_less_0_9_0 = hasattr(
        unet.config, "_diffusers_version"
    ) and version.parse(
        version.parse(unet.config._diffusers_version).base_version
    ) < version.parse("0.9.0.dev0")
    is_unet_sample_size_less_64 = (
        hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
    )
    if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
        deprecation_message = (
            "The configuration file of the unet has set the default `sample_size` to smaller than"
            " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
            " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
            " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
            " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
            " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
            " in the config might lead to incorrect results in future versions. If you have downloaded this"
            " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
            " the `unet/config.json` file"
        )
        deprecate(
            "sample_size<64", "1.0.0", deprecation_message, standard_warn=False
        )
        new_config = dict(unet.config)
        new_config["sample_size"] = 64
        unet._internal_dict = FrozenDict(new_config)

    # Check shapes, assume num_channels_latents == 4, num_channels_mask == 1, num_channels_masked == 4
    if unet.config.in_channels != 9:
        logger.info(
            f"You have loaded a UNet with {unet.config.in_channels} input channels,"
            " whereas an inpainting UNet is assumed to have 9"
            " (4 latent + 1 mask + 4 masked-image latent channels)."
        )

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
    )
    # Each VAE block after the first halves the spatial resolution.
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    self.register_to_config(requires_safety_checker=requires_safety_checker)
340
+
341
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
342
def enable_model_cpu_offload(self, gpu_id=0):
    r"""
    Keep the sub-models on the CPU and move each one to ``cuda:{gpu_id}`` only while it is needed.

    The text encoder, UNet and VAE (and the safety checker, when present) are chained with accelerate's
    `cpu_offload_with_hook`. The hook of the last model in the chain is stored in `self.final_offload_hook` so it
    can be offloaded manually.

    Raises:
        ImportError: if `accelerate` is missing or older than 0.17.0.
    """
    accelerate_ok = is_accelerate_available() and is_accelerate_version(
        ">=", "0.17.0.dev0"
    )
    if not accelerate_ok:
        raise ImportError(
            "`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher."
        )
    from accelerate import cpu_offload_with_hook

    target_device = torch.device(f"cuda:{gpu_id}")

    if self.device.type != "cpu":
        self.to("cpu", silence_dtype_warnings=True)
        # otherwise we don't see the memory savings (but they probably exist)
        torch.cuda.empty_cache()

    # Hook the models in execution order; each hook receives the previous
    # one through `prev_module_hook`.
    last_hook = None
    for model in (self.text_encoder, self.unet, self.vae):
        _, last_hook = cpu_offload_with_hook(
            model, target_device, prev_module_hook=last_hook
        )

    checker = self.safety_checker
    if checker is not None:
        _, last_hook = cpu_offload_with_hook(
            checker, target_device, prev_module_hook=last_hook
        )

    # We'll offload the last model manually.
    self.final_offload_hook = last_hook
375
+
376
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
377
+ def _encode_prompt(
378
+ self,
379
+ promptA,
380
+ promptB,
381
+ t,
382
+ device,
383
+ num_images_per_prompt,
384
+ do_classifier_free_guidance,
385
+ negative_promptA=None,
386
+ negative_promptB=None,
387
+ t_nag=None,
388
+ prompt_embeds: Optional[torch.FloatTensor] = None,
389
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
390
+ lora_scale: Optional[float] = None,
391
+ ):
392
+ r"""
393
+ Encodes the prompt into text encoder hidden states.
394
+
395
+ Args:
396
+ prompt (`str` or `List[str]`, *optional*):
397
+ prompt to be encoded
398
+ device: (`torch.device`):
399
+ torch device
400
+ num_images_per_prompt (`int`):
401
+ number of images that should be generated per prompt
402
+ do_classifier_free_guidance (`bool`):
403
+ whether to use classifier free guidance or not
404
+ negative_prompt (`str` or `List[str]`, *optional*):
405
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
406
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
407
+ less than `1`).
408
+ prompt_embeds (`torch.FloatTensor`, *optional*):
409
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
410
+ provided, text embeddings will be generated from `prompt` input argument.
411
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
412
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
413
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
414
+ argument.
415
+ lora_scale (`float`, *optional*):
416
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
417
+ """
418
+ # set lora scale so that monkey patched LoRA
419
+ # function of text encoder can correctly access it
420
+ if lora_scale is not None and isinstance(self, LoraLoaderMixin):
421
+ self._lora_scale = lora_scale
422
+
423
+ prompt = promptA
424
+ negative_prompt = negative_promptA
425
+
426
+ if promptA is not None and isinstance(promptA, str):
427
+ batch_size = 1
428
+ elif promptA is not None and isinstance(promptA, list):
429
+ batch_size = len(promptA)
430
+ else:
431
+ batch_size = prompt_embeds.shape[0]
432
+
433
+ if prompt_embeds is None:
434
+ # textual inversion: procecss multi-vector tokens if necessary
435
+ if isinstance(self, TextualInversionLoaderMixin):
436
+ promptA = self.maybe_convert_prompt(promptA, self.tokenizer)
437
+
438
+ text_inputsA = self.tokenizer(
439
+ promptA,
440
+ padding="max_length",
441
+ max_length=self.tokenizer.model_max_length,
442
+ truncation=True,
443
+ return_tensors="pt",
444
+ )
445
+ text_inputsB = self.tokenizer(
446
+ promptB,
447
+ padding="max_length",
448
+ max_length=self.tokenizer.model_max_length,
449
+ truncation=True,
450
+ return_tensors="pt",
451
+ )
452
+ text_input_idsA = text_inputsA.input_ids
453
+ text_input_idsB = text_inputsB.input_ids
454
+ untruncated_ids = self.tokenizer(
455
+ promptA, padding="longest", return_tensors="pt"
456
+ ).input_ids
457
+
458
+ if untruncated_ids.shape[-1] >= text_input_idsA.shape[
459
+ -1
460
+ ] and not torch.equal(text_input_idsA, untruncated_ids):
461
+ removed_text = self.tokenizer.batch_decode(
462
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
463
+ )
464
+ logger.warning(
465
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
466
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
467
+ )
468
+
469
+ if (
470
+ hasattr(self.text_encoder.config, "use_attention_mask")
471
+ and self.text_encoder.config.use_attention_mask
472
+ ):
473
+ attention_mask = text_inputsA.attention_mask.to(device)
474
+ else:
475
+ attention_mask = None
476
+
477
+ # print("text_input_idsA: ",text_input_idsA)
478
+ # print("text_input_idsB: ",text_input_idsB)
479
+ # print('t: ',t)
480
+
481
+ prompt_embedsA = self.text_encoder(
482
+ text_input_idsA.to(device),
483
+ attention_mask=attention_mask,
484
+ )
485
+ prompt_embedsA = prompt_embedsA[0]
486
+
487
+ prompt_embedsB = self.text_encoder(
488
+ text_input_idsB.to(device),
489
+ attention_mask=attention_mask,
490
+ )
491
+ prompt_embedsB = prompt_embedsB[0]
492
+ prompt_embeds = prompt_embedsA * (t) + (1 - t) * prompt_embedsB
493
+ # print("prompt_embeds: ",prompt_embeds)
494
+
495
+ if self.text_encoder is not None:
496
+ prompt_embeds_dtype = self.text_encoder.dtype
497
+ elif self.unet is not None:
498
+ prompt_embeds_dtype = self.unet.dtype
499
+ else:
500
+ prompt_embeds_dtype = prompt_embeds.dtype
501
+
502
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
503
+
504
+ bs_embed, seq_len, _ = prompt_embeds.shape
505
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
506
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
507
+ prompt_embeds = prompt_embeds.view(
508
+ bs_embed * num_images_per_prompt, seq_len, -1
509
+ )
510
+
511
+ # get unconditional embeddings for classifier free guidance
512
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
513
+ uncond_tokensA: List[str]
514
+ uncond_tokensB: List[str]
515
+ if negative_prompt is None:
516
+ uncond_tokensA = [""] * batch_size
517
+ uncond_tokensB = [""] * batch_size
518
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
519
+ raise TypeError(
520
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
521
+ f" {type(prompt)}."
522
+ )
523
+ elif isinstance(negative_prompt, str):
524
+ uncond_tokensA = [negative_promptA]
525
+ uncond_tokensB = [negative_promptB]
526
+ elif batch_size != len(negative_prompt):
527
+ raise ValueError(
528
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
529
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
530
+ " the batch size of `prompt`."
531
+ )
532
+ else:
533
+ uncond_tokensA = negative_promptA
534
+ uncond_tokensB = negative_promptB
535
+
536
+ # textual inversion: procecss multi-vector tokens if necessary
537
+ if isinstance(self, TextualInversionLoaderMixin):
538
+ uncond_tokensA = self.maybe_convert_prompt(
539
+ uncond_tokensA, self.tokenizer
540
+ )
541
+ uncond_tokensB = self.maybe_convert_prompt(
542
+ uncond_tokensB, self.tokenizer
543
+ )
544
+
545
+ max_length = prompt_embeds.shape[1]
546
+ uncond_inputA = self.tokenizer(
547
+ uncond_tokensA,
548
+ padding="max_length",
549
+ max_length=max_length,
550
+ truncation=True,
551
+ return_tensors="pt",
552
+ )
553
+ uncond_inputB = self.tokenizer(
554
+ uncond_tokensB,
555
+ padding="max_length",
556
+ max_length=max_length,
557
+ truncation=True,
558
+ return_tensors="pt",
559
+ )
560
+
561
+ if (
562
+ hasattr(self.text_encoder.config, "use_attention_mask")
563
+ and self.text_encoder.config.use_attention_mask
564
+ ):
565
+ attention_mask = uncond_inputA.attention_mask.to(device)
566
+ else:
567
+ attention_mask = None
568
+
569
+ negative_prompt_embedsA = self.text_encoder(
570
+ uncond_inputA.input_ids.to(device),
571
+ attention_mask=attention_mask,
572
+ )
573
+ negative_prompt_embedsB = self.text_encoder(
574
+ uncond_inputB.input_ids.to(device),
575
+ attention_mask=attention_mask,
576
+ )
577
+ negative_prompt_embeds = (
578
+ negative_prompt_embedsA[0] * (t_nag)
579
+ + (1 - t_nag) * negative_prompt_embedsB[0]
580
+ )
581
+
582
+ # negative_prompt_embeds = negative_prompt_embeds[0]
583
+
584
+ if do_classifier_free_guidance:
585
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
586
+ seq_len = negative_prompt_embeds.shape[1]
587
+
588
+ negative_prompt_embeds = negative_prompt_embeds.to(
589
+ dtype=prompt_embeds_dtype, device=device
590
+ )
591
+
592
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
593
+ 1, num_images_per_prompt, 1
594
+ )
595
+ negative_prompt_embeds = negative_prompt_embeds.view(
596
+ batch_size * num_images_per_prompt, seq_len, -1
597
+ )
598
+
599
+ # For classifier free guidance, we need to do two forward passes.
600
+ # Here we concatenate the unconditional and text embeddings into a single batch
601
+ # to avoid doing two forward passes
602
+ # print("prompt_embeds: ",prompt_embeds)
603
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
604
+
605
+ return prompt_embeds
606
+
607
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
608
def run_safety_checker(self, image, device, dtype):
    """Run the optional safety checker over the decoded images.

    Args:
        image: Decoded images, either a torch tensor or an array that
            ``self.image_processor.numpy_to_pil`` accepts.
        device: Device the extracted features are moved to.
        dtype: Dtype the checker's ``clip_input`` is cast to.

    Returns:
        ``(image, has_nsfw_concept)``. When ``self.safety_checker`` is ``None``
        the image is returned untouched and the flag is ``None``.
    """
    checker = self.safety_checker
    if checker is None:
        # No checker configured: nothing to screen.
        return image, None

    # The feature extractor is fed PIL images, so convert from whichever
    # representation we were handed.
    processor = self.image_processor
    if torch.is_tensor(image):
        pil_images = processor.postprocess(image, output_type="pil")
    else:
        pil_images = processor.numpy_to_pil(image)

    clip_features = self.feature_extractor(pil_images, return_tensors="pt").to(device)
    image, has_nsfw_concept = checker(
        images=image, clip_input=clip_features.pixel_values.to(dtype)
    )
    return image, has_nsfw_concept
625
+
626
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
627
def prepare_extra_step_kwargs(self, generator, eta):
    """Build the optional keyword arguments for ``self.scheduler.step``.

    Not all schedulers share the same ``step`` signature, so each optional
    argument is only included when the scheduler actually declares it.
    ``eta`` (η) is only used with the DDIMScheduler and is ignored by other
    schedulers; it corresponds to η in the DDIM paper
    (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

    Returns:
        A dict containing ``"eta"`` and/or ``"generator"`` for the parameters
        that ``self.scheduler.step`` accepts (possibly empty).
    """
    accepted = inspect.signature(self.scheduler.step).parameters
    optional_args = (("eta", eta), ("generator", generator))
    return {name: value for name, value in optional_args if name in accepted}
647
+
648
def check_inputs(
    self,
    prompt,
    height,
    width,
    strength,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
):
    """Validate the arguments of a pipeline call before any work is done.

    Raises:
        ValueError: if ``strength`` is outside [0, 1]; if ``height`` or
            ``width`` is not divisible by 8; if ``callback_steps`` is not a
            positive integer; if both or neither of ``prompt`` and
            ``prompt_embeds`` are given; if ``prompt`` is neither ``str`` nor
            ``list``; if both ``negative_prompt`` and
            ``negative_prompt_embeds`` are given; or if ``prompt_embeds`` and
            ``negative_prompt_embeds`` differ in shape.
    """
    if strength < 0 or strength > 1:
        raise ValueError(
            f"The value of strength should in [0.0, 1.0] but is {strength}"
        )

    if not (height % 8 == 0 and width % 8 == 0):
        raise ValueError(
            f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
        )

    # `None`, non-integers and non-positive integers are all rejected.
    steps_ok = isinstance(callback_steps, int) and callback_steps > 0
    if not steps_ok:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    # Exactly one of `prompt` / `prompt_embeds` must be supplied.
    has_prompt = prompt is not None
    has_embeds = prompt_embeds is not None
    if has_prompt and has_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not has_prompt and not has_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if has_prompt and not isinstance(prompt, (str, list)):
        raise ValueError(
            f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
        )

    has_negative_embeds = negative_prompt_embeds is not None
    if negative_prompt is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if has_embeds and has_negative_embeds:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )
707
+
708
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
    image=None,
    timestep=None,
    is_strength_max=True,
    return_noise=False,
    return_image_latents=False,
):
    """Create the initial latents for the denoising loop.

    When ``latents`` is supplied it is treated as the noise and scaled by the
    scheduler's ``init_noise_sigma``. Otherwise fresh noise is drawn with
    ``randn_tensor``; at full strength that noise (scaled by
    ``init_noise_sigma``) is the starting point, else the VAE-encoded
    ``image`` is noised to ``timestep`` via ``scheduler.add_noise``.

    Returns:
        A tuple starting with the latents, followed by the noise when
        ``return_noise`` is set and by the encoded image latents when
        ``return_image_latents`` is set (in that order).

    Raises:
        ValueError: if ``generator`` is a list whose length differs from
            ``batch_size``, or if strength is below 1 and ``image`` or
            ``timestep`` is missing.
    """
    latent_shape = (
        batch_size,
        num_channels_latents,
        height // self.vae_scale_factor,
        width // self.vae_scale_factor,
    )
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if not is_strength_max and (image is None or timestep is None):
        raise ValueError(
            "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
            "However, either the image or the noise timestep has not been provided."
        )

    # The image is encoded before any noise is drawn so the generator is
    # consumed in the same order regardless of which branch follows.
    needs_image_latents = return_image_latents or (
        latents is None and not is_strength_max
    )
    if needs_image_latents:
        image = image.to(device=device, dtype=dtype)
        image_latents = self._encode_vae_image(image=image, generator=generator)

    if latents is not None:
        # Caller-provided latents act as the noise.
        noise = latents.to(device)
        latents = noise * self.scheduler.init_noise_sigma
    elif is_strength_max:
        # Pure noise, scaled by the scheduler's initial sigma.
        noise = randn_tensor(
            latent_shape, generator=generator, device=device, dtype=dtype
        )
        latents = noise * self.scheduler.init_noise_sigma
    else:
        # strength < 1: start from image + noise.
        noise = randn_tensor(
            latent_shape, generator=generator, device=device, dtype=dtype
        )
        latents = self.scheduler.add_noise(image_latents, noise, timestep)

    collected = [latents]
    if return_noise:
        collected.append(noise)
    if return_image_latents:
        collected.append(image_latents)
    return tuple(collected)
773
+
774
def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
    """Encode ``image`` with the VAE and return scaled, sampled latents.

    When ``generator`` is a list, each batch item is encoded on its own and
    sampled with the generator at the same index; otherwise the whole batch
    is encoded and sampled in one call. The sample is multiplied by
    ``self.vae.config.scaling_factor`` before being returned.
    """
    if not isinstance(generator, list):
        sampled = self.vae.encode(image).latent_dist.sample(generator=generator)
    else:
        per_item = []
        for idx in range(image.shape[0]):
            posterior = self.vae.encode(image[idx : idx + 1]).latent_dist
            per_item.append(posterior.sample(generator=generator[idx]))
        sampled = torch.cat(per_item, dim=0)

    return self.vae.config.scaling_factor * sampled
791
+
792
def prepare_mask_latents(
    self,
    mask,
    masked_image,
    batch_size,
    height,
    width,
    dtype,
    device,
    generator,
    do_classifier_free_guidance,
):
    """Bring the mask and the masked image into latent space for the UNet.

    The mask is resized to ``(height, width) // self.vae_scale_factor`` and
    the masked image is encoded with ``self._encode_vae_image``. Both are then
    repeated up to ``batch_size`` and, under classifier-free guidance,
    concatenated with themselves along the batch dimension.

    Returns:
        ``(mask, masked_image_latents)``.

    Raises:
        ValueError: if ``batch_size`` is not a multiple of the number of masks
            or of encoded images that need repeating.
    """
    # Resize before casting: the mask is concatenated to the latents, and
    # resizing first avoids breaking with cpu_offload plus half precision.
    latent_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
    mask = torch.nn.functional.interpolate(mask, size=latent_size)
    mask = mask.to(device=device, dtype=dtype)

    masked_image = masked_image.to(device=device, dtype=dtype)
    masked_image_latents = self._encode_vae_image(masked_image, generator=generator)

    # Repeat mask and latents for each generation per prompt (mps friendly).
    if mask.shape[0] < batch_size:
        if batch_size % mask.shape[0] != 0:
            raise ValueError(
                "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                " of masks that you pass is divisible by the total requested batch size."
            )
        mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)

    if masked_image_latents.shape[0] < batch_size:
        if batch_size % masked_image_latents.shape[0] != 0:
            raise ValueError(
                "The passed images and the required batch size don't match. Images are supposed to be duplicated"
                f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
                " Make sure the number of images that you pass is divisible by the total requested batch size."
            )
        masked_image_latents = masked_image_latents.repeat(
            batch_size // masked_image_latents.shape[0], 1, 1, 1
        )

    if do_classifier_free_guidance:
        # Guidance runs an unconditional and a conditional half in one batch.
        mask = torch.cat([mask, mask])
        masked_image_latents = torch.cat([masked_image_latents, masked_image_latents])

    # Align device/dtype so the later concat with the latent model input works.
    masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
    return mask, masked_image_latents
845
+
846
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
847
def get_timesteps(self, num_inference_steps, strength, device):
    """Select the tail of the scheduler's timesteps that ``strength`` asks for.

    Returns:
        ``(timesteps, steps_remaining)``: the slice of
        ``self.scheduler.timesteps`` starting at the computed offset, and the
        number of inference steps left after skipping the start.
        ``device`` is accepted but not used here.
    """
    # Number of steps to actually run, capped at the requested total.
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
    first_step = max(num_inference_steps - steps_to_run, 0)

    # The slice offset is scaled by the scheduler's `order`.
    offset = first_step * self.scheduler.order
    return self.scheduler.timesteps[offset:], num_inference_steps - first_step
855
+
856
@torch.no_grad()
def __call__(
    self,
    promptA: Union[str, List[str]] = None,
    promptB: Union[str, List[str]] = None,
    image: Union[torch.FloatTensor, PIL.Image.Image] = None,
    mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    strength: float = 1.0,
    tradoff: float = 1.0,
    tradoff_nag: float = 1.0,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    negative_promptA: Optional[Union[str, List[str]]] = None,
    negative_promptB: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    task_class: Union[torch.Tensor, float, int] = None,
):
    r"""
    The call function to the pipeline for generation.

    Args:
        promptA (`str` or `List[str]`, *optional*):
            First prompt variant passed to `_encode_prompt`. It is also the prompt that `check_inputs`
            validates and from which the batch size is derived. If not defined, you need to pass
            `prompt_embeds`.
        promptB (`str` or `List[str]`, *optional*):
            Second prompt variant passed to `_encode_prompt` together with `promptA`. It is not validated by
            `check_inputs`.
        image (`PIL.Image.Image`):
            `Image` or tensor representing an image batch to be inpainted (which parts of the image to be masked
            out with `mask_image` and repainted according to the prompts).
        mask_image (`PIL.Image.Image`):
            `Image` or tensor representing an image batch to mask `image`. White pixels in the mask are repainted
            while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel
            (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the
            expected shape would be `(B, H, W, 1)`.
        height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The height in pixels of the generated image.
        width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The width in pixels of the generated image.
        strength (`float`, *optional*, defaults to 1.0):
            Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
            starting point and more noise is added the higher the `strength`. The number of denoising steps depends
            on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
            process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
            essentially ignores `image`.
        tradoff (`float`, *optional*, defaults to 1.0):
            Forwarded to `_encode_prompt` alongside `promptA`/`promptB`. Presumably the weight given to the
            `promptA` embeddings when they are blended with the `promptB` embeddings; confirm against
            `_encode_prompt`.
        tradoff_nag (`float`, *optional*, defaults to 1.0):
            Forwarded to `_encode_prompt` alongside `negative_promptA`/`negative_promptB`. Presumably the
            equivalent blend weight for the negative prompt embeddings; confirm against `_encode_prompt`.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference. This parameter is modulated by `strength`.
        guidance_scale (`float`, *optional*, defaults to 7.5):
            A higher guidance scale value encourages the model to generate images closely linked to the text
            prompts at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
        negative_promptA (`str` or `List[str]`, *optional*):
            First negative prompt variant, guiding what to not include in image generation. It is the negative
            prompt that `check_inputs` validates. If not defined, you need to pass `negative_prompt_embeds`
            instead. Ignored when not using guidance (`guidance_scale <= 1`).
        negative_promptB (`str` or `List[str]`, *optional*):
            Second negative prompt variant passed to `_encode_prompt` together with `negative_promptA`.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
            to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
            generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor is generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
            provided, text embeddings are generated from the prompt input arguments.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
            not provided, `negative_prompt_embeds` are generated from the negative prompt input arguments.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between `PIL.Image` or `np.array`. With `"latent"`
            the VAE decode and the safety checker are skipped and the latents are post-processed directly.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
            plain tuple.
        callback (`Callable`, *optional*):
            A function that calls every `callback_steps` steps during inference. The function is called with the
            following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function is called. If not specified, the callback is called at
            every step.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
            [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            Its `"scale"` entry, if present, is also passed to `_encode_prompt` as `lora_scale`.
        task_class (`torch.Tensor`, `float` or `int`, *optional*):
            When not `None`, forwarded to the UNet as the `task_class` keyword argument on every denoising step.

    Examples:

    NOTE(review): this example is carried over from the upstream diffusers inpainting pipeline. The import
    path and checkpoint name below are presumably not the ones used with this pipeline; confirm before
    copying. The call itself uses this method's `promptA`/`promptB` keywords.

    ```py
    >>> import PIL
    >>> import requests
    >>> import torch
    >>> from io import BytesIO

    >>> from diffusers import StableDiffusionInpaintPipeline


    >>> def download_image(url):
    ...     response = requests.get(url)
    ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")


    >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
    >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"

    >>> init_image = download_image(img_url).resize((512, 512))
    >>> mask_image = download_image(mask_url).resize((512, 512))

    >>> pipe = StableDiffusionInpaintPipeline.from_pretrained(
    ...     "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
    ... )
    >>> pipe = pipe.to("cuda")

    >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
    >>> image = pipe(promptA=prompt, promptB=prompt, image=init_image, mask_image=mask_image).images[0]
    ```

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
            otherwise a `tuple` is returned where the first element is a list with the generated images and the
            second element is a list of `bool`s indicating whether the corresponding generated image contains
            "not-safe-for-work" (nsfw) content.
    """
    # 0. Default height and width to unet
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor
    # Only the "A" variants are validated and used to derive the batch size below.
    prompt = promptA
    negative_prompt = negative_promptA
    # 1. Check inputs
    self.check_inputs(
        prompt,
        height,
        width,
        strength,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
    )

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        cross_attention_kwargs.get("scale", None)
        if cross_attention_kwargs is not None
        else None
    )
    prompt_embeds = self._encode_prompt(
        promptA,
        promptB,
        tradoff,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_promptA,
        negative_promptB,
        tradoff_nag,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )

    # 4. set timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps, num_inference_steps = self.get_timesteps(
        num_inference_steps=num_inference_steps, strength=strength, device=device
    )
    # check that number of inference steps is not < 1 - as this doesn't make sense
    if num_inference_steps < 1:
        raise ValueError(
            f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
            f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
        )
    # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
    latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
    # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
    is_strength_max = strength == 1.0

    # 5. Preprocess mask and image
    mask, masked_image, init_image = prepare_mask_and_masked_image(
        image, mask_image, height, width, return_image=True
    )
    # Keep an untouched copy of the pixel-space mask; `mask` is overwritten with its latent-size
    # version in step 7, while this copy is only used for AsymmetricAutoencoderKL decoding.
    mask_condition = mask.clone()

    # 6. Prepare latent variables
    num_channels_latents = self.vae.config.latent_channels
    num_channels_unet = self.unet.config.in_channels
    # A 4-channel UNet does not take the mask as input, so the image latents are needed to
    # re-impose the unmasked region after every step in the loop below.
    return_image_latents = num_channels_unet == 4

    latents_outputs = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
        image=init_image,
        timestep=latent_timestep,
        is_strength_max=is_strength_max,
        return_noise=True,
        return_image_latents=return_image_latents,
    )

    if return_image_latents:
        latents, noise, image_latents = latents_outputs
    else:
        latents, noise = latents_outputs

    # 7. Prepare mask latent variables
    mask, masked_image_latents = self.prepare_mask_latents(
        mask,
        masked_image,
        batch_size * num_images_per_prompt,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        do_classifier_free_guidance,
    )

    # 8. Check that sizes of mask, masked image and latents match
    if num_channels_unet == 9:
        # default case for runwayml/stable-diffusion-inpainting
        num_channels_mask = mask.shape[1]
        num_channels_masked_image = masked_image_latents.shape[1]
        if (
            num_channels_latents + num_channels_mask + num_channels_masked_image
            != self.unet.config.in_channels
        ):
            raise ValueError(
                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
                " `pipeline.unet` or your `mask_image` or `image` input."
            )
    elif num_channels_unet != 4:
        raise ValueError(
            f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
        )

    # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 10. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = (
                torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            )

            # scale the model input for the current timestep; the channel-wise concat with
            # mask and masked_image_latents happens just below for 9-channel UNets
            latent_model_input = self.scheduler.scale_model_input(
                latent_model_input, t
            )

            if num_channels_unet == 9:
                latent_model_input = torch.cat(
                    [latent_model_input, mask, masked_image_latents], dim=1
                )

            # predict the noise residual
            if task_class is not None:
                noise_pred = self.unet(
                    sample=latent_model_input,
                    timestep=t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    return_dict=False,
                    task_class=task_class,
                )[0]
            else:
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    return_dict=False,
                )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond
                )

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(
                noise_pred, t, latents, **extra_step_kwargs, return_dict=False
            )[0]

            if num_channels_unet == 4:
                # NOTE(review): only the first image latent and the first mask are used here, so
                # every batch item is blended against item 0 — confirm this is intended for
                # batches with differing images or masks.
                init_latents_proper = image_latents[:1]
                init_mask = mask[:1]

                if i < len(timesteps) - 1:
                    # re-noise the original image latents to the next timestep's noise level
                    noise_timestep = timesteps[i + 1]
                    init_latents_proper = self.scheduler.add_noise(
                        init_latents_proper, noise, torch.tensor([noise_timestep])
                    )

                # keep the original content where the mask is 0, the denoised latents where it is 1
                latents = (
                    1 - init_mask
                ) * init_latents_proper + init_mask * latents

            # call the callback, if provided
            if i == len(timesteps) - 1 or (
                (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
            ):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    if not output_type == "latent":
        condition_kwargs = {}
        if isinstance(self.vae, AsymmetricAutoencoderKL):
            # this VAE's decoder is additionally conditioned on the pixel-space image and mask
            init_image = init_image.to(
                device=device, dtype=masked_image_latents.dtype
            )
            init_image_condition = init_image.clone()
            init_image = self._encode_vae_image(init_image, generator=generator)
            mask_condition = mask_condition.to(
                device=device, dtype=masked_image_latents.dtype
            )
            condition_kwargs = {
                "image": init_image_condition,
                "mask": mask_condition,
            }
        image = self.vae.decode(
            latents / self.vae.config.scaling_factor,
            return_dict=False,
            **condition_kwargs,
        )[0]
        image, has_nsfw_concept = self.run_safety_checker(
            image, device, prompt_embeds.dtype
        )
    else:
        image = latents
        has_nsfw_concept = None

    # images flagged by the safety checker are not denormalized
    if has_nsfw_concept is None:
        do_denormalize = [True] * image.shape[0]
    else:
        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

    image = self.image_processor.postprocess(
        image, output_type=output_type, do_denormalize=do_denormalize
    )

    # Offload last model to CPU
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.final_offload_hook.offload()

    if not return_dict:
        return (image, has_nsfw_concept)

    return StableDiffusionPipelineOutput(
        images=image, nsfw_content_detected=has_nsfw_concept
    )
pipeline_PowerPaint_ControlNet.py ADDED
@@ -0,0 +1,1775 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # This model implementation is heavily inspired by https://github.com/haofanwang/ControlNet-for-Diffusers/
16
+
17
+ import inspect
18
+ import warnings
19
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import PIL.Image
23
+ import torch
24
+ import torch.nn.functional as F
25
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
26
+
27
+ from diffusers.image_processor import VaeImageProcessor
28
+ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
29
+ from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
30
+ from diffusers.schedulers import KarrasDiffusionSchedulers
31
+ from diffusers.utils import (
32
+ is_accelerate_available,
33
+ is_accelerate_version,
34
+ logging,
35
+ replace_example_docstring,
36
+ )
37
+ from diffusers.utils.torch_utils import randn_tensor,is_compiled_module
38
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
39
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
40
+ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
41
+ from diffusers.pipelines.controlnet import MultiControlNetModel
42
+
43
+
44
+
45
+
46
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
47
+
48
+
49
# Usage example that is attached to the pipeline's public docstring.
# The size assertion in `make_inpaint_condition` compares both spatial axes
# (height and width), matching its error message.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> # !pip install transformers accelerate
        >>> from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, DDIMScheduler
        >>> from diffusers.utils import load_image
        >>> import numpy as np
        >>> import torch

        >>> init_image = load_image(
        ...     "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy.png"
        ... )
        >>> init_image = init_image.resize((512, 512))

        >>> generator = torch.Generator(device="cpu").manual_seed(1)

        >>> mask_image = load_image(
        ...     "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy_mask.png"
        ... )
        >>> mask_image = mask_image.resize((512, 512))


        >>> def make_inpaint_condition(image, image_mask):
        ...     image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
        ...     image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0

        ...     assert image.shape[0:2] == image_mask.shape[0:2], "image and image_mask must have the same image size"
        ...     image[image_mask > 0.5] = -1.0  # set as masked pixel
        ...     image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
        ...     image = torch.from_numpy(image)
        ...     return image


        >>> control_image = make_inpaint_condition(init_image, mask_image)

        >>> controlnet = ControlNetModel.from_pretrained(
        ...     "lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16
        ... )
        >>> pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
        ...     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
        ... )

        >>> pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        >>> pipe.enable_model_cpu_offload()

        >>> # generate image
        >>> image = pipe(
        ...     "a handsome man with ray-ban sunglasses",
        ...     num_inference_steps=20,
        ...     generator=generator,
        ...     eta=1.0,
        ...     image=init_image,
        ...     mask_image=mask_image,
        ...     control_image=control_image,
        ... ).images[0]
        ```
"""
106
+
107
+
108
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.prepare_mask_and_masked_image
109
def prepare_mask_and_masked_image(image, mask, height, width, return_image=False):
    """
    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
    ``image`` and ``1`` for the ``mask``.

    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
    binarized (``mask >= 0.5`` becomes ``1``, everything else ``0``). Tensor inputs are never modified in place: the
    binarized mask is returned as a new tensor.

    Args:
        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
        mask (Union[np.array, PIL.Image, torch.Tensor]): The mask to apply to the image, i.e. regions to inpaint.
            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
        height (int): Target height. Only used to resize ``PIL.Image`` inputs.
        width (int): Target width. Only used to resize ``PIL.Image`` inputs.
        return_image (bool): When ``True`` the preprocessed image is returned as a third element.

    Raises:
        ValueError: ``image`` or ``mask`` is ``None``.
        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range.
        ValueError: ``torch.Tensor`` mask should be in the ``[0, 1]`` range.
        AssertionError: ``mask`` and ``image`` should have the same spatial dimensions and batch size.
        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not (or the other way around).

    Returns:
        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
            dimensions: ``batch x channels x height x width``. With ``return_image=True`` the triple
            (mask, masked_image, image) is returned instead.
    """

    if image is None:
        raise ValueError("`image` input cannot be undefined.")

    if mask is None:
        raise ValueError("`mask_image` input cannot be undefined.")

    if isinstance(image, torch.Tensor):
        if not isinstance(mask, torch.Tensor):
            raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)}) is not")

        # Batch single image
        if image.ndim == 3:
            assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
            image = image.unsqueeze(0)

        # Batch and add channel dim for single mask
        if mask.ndim == 2:
            mask = mask.unsqueeze(0).unsqueeze(0)

        # Batch single mask or add channel dim
        if mask.ndim == 3:
            # Single batched mask, no channel dim or single mask not batched but channel dim
            if mask.shape[0] == 1:
                mask = mask.unsqueeze(0)

            # Batched masks no channel dim
            else:
                mask = mask.unsqueeze(1)

        # Kept as assertions so the exception type seen by existing callers does not change.
        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
        assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"

        # Check image is in [-1, 1]
        if image.min() < -1 or image.max() > 1:
            raise ValueError("Image should be in [-1, 1] range")

        # Check mask is in [0, 1]
        if mask.min() < 0 or mask.max() > 1:
            raise ValueError("Mask should be in [0, 1] range")

        # Binarize mask out of place: `unsqueeze` returns views, so an in-place
        # assignment here would overwrite the tensor owned by the caller.
        mask = (mask >= 0.5).to(dtype=mask.dtype)

        # Image as float32
        image = image.to(dtype=torch.float32)
    elif isinstance(mask, torch.Tensor):
        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)}) is not")
    else:
        # preprocess image
        if isinstance(image, (PIL.Image.Image, np.ndarray)):
            image = [image]
        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
            # resize all images w.r.t. the passed height and width
            image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
            image = [np.array(i.convert("RGB"))[None, :] for i in image]
            image = np.concatenate(image, axis=0)
        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
            image = np.concatenate([i[None, :] for i in image], axis=0)

        # NHWC -> NCHW, then map [0, 255] to [-1, 1]
        image = image.transpose(0, 3, 1, 2)
        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0

        # preprocess mask
        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
            mask = [mask]

        if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
            mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
            mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
            mask = mask.astype(np.float32) / 255.0
        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)

        # `np.concatenate` produced a fresh array above, so in-place binarization is safe here.
        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1
        mask = torch.from_numpy(mask)

    # Zero out the region to be inpainted (mask == 1) and keep the rest of the image.
    masked_image = image * (mask < 0.5)

    # n.b. ensure backwards compatibility as old function does not return image
    if return_image:
        return mask, masked_image, image

    return mask, masked_image
224
+
225
+
226
+ class StableDiffusionControlNetInpaintPipeline(
227
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
228
+ ):
229
+ r"""
230
+ Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
231
+
232
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
233
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
234
+
235
+ In addition the pipeline inherits the following loading methods:
236
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
237
+
238
+ <Tip>
239
+
240
+ This pipeline can be used both with checkpoints that have been specifically fine-tuned for inpainting, such as
241
+ [runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting)
242
+ as well as default text-to-image stable diffusion checkpoints, such as
243
+ [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5).
244
+ Default text-to-image stable diffusion checkpoints might be preferable for controlnets that have been fine-tuned on
245
+ those, such as [lllyasviel/control_v11p_sd15_inpaint](https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint).
246
+
247
+ </Tip>
248
+
249
+ Args:
250
+ vae ([`AutoencoderKL`]):
251
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
252
+ text_encoder ([`CLIPTextModel`]):
253
+ Frozen text-encoder. Stable Diffusion uses the text portion of
254
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
255
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
256
+ tokenizer (`CLIPTokenizer`):
257
+ Tokenizer of class
258
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
259
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
260
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
261
+ Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
262
+ as a list, the outputs from each ControlNet are added together to create one combined additional
263
+ conditioning.
264
+ scheduler ([`SchedulerMixin`]):
265
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
266
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
267
+ safety_checker ([`StableDiffusionSafetyChecker`]):
268
+ Classification module that estimates whether generated images could be considered offensive or harmful.
269
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
270
+ feature_extractor ([`CLIPImageProcessor`]):
271
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
272
+ """
273
+ _optional_components = ["safety_checker", "feature_extractor"]
274
+
275
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
    scheduler: KarrasDiffusionSchedulers,
    safety_checker: StableDiffusionSafetyChecker,
    feature_extractor: CLIPImageProcessor,
    requires_safety_checker: bool = True,
):
    """
    Register the pipeline components and build the image processors.

    A list or tuple of ControlNets is wrapped into a single `MultiControlNetModel`.

    Raises:
        ValueError: a `safety_checker` is given without a `feature_extractor`.
    """
    super().__init__()

    if safety_checker is None and requires_safety_checker:
        logger.warning(
            f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
            " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
            " results in services or applications open to the public. Both the diffusers team and Hugging Face"
            " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
            " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
            " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
        )

    if safety_checker is not None and feature_extractor is None:
        # The message must be an f-string, otherwise `{self.__class__}` is printed literally.
        raise ValueError(
            f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
            " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
        )

    if isinstance(controlnet, (list, tuple)):
        controlnet = MultiControlNetModel(controlnet)

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        controlnet=controlnet,
        scheduler=scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
    )
    # One factor of two per VAE block after the first.
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    # Control images are converted to RGB but not normalized.
    self.control_image_processor = VaeImageProcessor(
        vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
    )
    self.register_to_config(requires_safety_checker=requires_safety_checker)
324
+
325
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
326
def enable_vae_slicing(self):
    r"""
    Turn on sliced decoding in the VAE.

    Delegates to `self.vae.enable_slicing()`; with slicing enabled the VAE decodes its input in several steps
    instead of one, which lowers peak memory and allows larger batch sizes.
    """
    vae = self.vae
    vae.enable_slicing()
332
+
333
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
334
def disable_vae_slicing(self):
    r"""
    Turn off sliced decoding in the VAE.

    Delegates to `self.vae.disable_slicing()`, undoing a previous `enable_vae_slicing` so that decoding is
    computed in a single step again.
    """
    vae = self.vae
    vae.disable_slicing()
340
+
341
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
342
def enable_vae_tiling(self):
    r"""
    Turn on tiled processing in the VAE.

    Delegates to `self.vae.enable_tiling()`; with tiling enabled the VAE splits its input into tiles and
    encodes/decodes them in several steps, which saves a large amount of memory and allows larger images.
    """
    vae = self.vae
    vae.enable_tiling()
349
+
350
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
351
def disable_vae_tiling(self):
    r"""
    Turn off tiled processing in the VAE.

    Delegates to `self.vae.disable_tiling()`, undoing a previous `enable_vae_tiling` so that decoding is
    computed in a single step again.
    """
    vae = self.vae
    vae.disable_tiling()
357
+
358
def enable_model_cpu_offload(self, gpu_id=0):
    r"""
    Offload the pipeline's models to CPU with accelerate hooks.

    Each model is moved to `cuda:{gpu_id}` as a whole when its `forward` is called and stays there until the
    next model in the chain runs. Compared to `enable_sequential_cpu_offload` the memory savings are lower, but
    performance is much better due to the iterative execution of the `unet`.

    Raises:
        ImportError: accelerate is missing or older than 0.17.0.
    """
    if not (is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0")):
        raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")

    from accelerate import cpu_offload_with_hook

    target = torch.device(f"cuda:{gpu_id}")

    # Models that run one after another; each hook offloads its predecessor.
    chained_models = [self.text_encoder, self.unet, self.vae]
    if self.safety_checker is not None:
        # the safety checker can offload the vae again
        chained_models.append(self.safety_checker)

    last_hook = None
    for model in chained_models:
        _, last_hook = cpu_offload_with_hook(model, target, prev_module_hook=last_hook)

    # The controlnet alternates with the unet, so it gets a standalone hook outside the chain.
    cpu_offload_with_hook(self.controlnet, target)

    # We'll offload the last model manually.
    self.final_offload_hook = last_hook
385
+
386
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
387
def _encode_prompt(
    self,
    promptA,
    promptB,
    t,
    device,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_promptA=None,
    negative_promptB=None,
    t_nag=None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    lora_scale: Optional[float] = None,
):
    r"""
    Encodes a pair of prompts into text encoder hidden states and blends them.

    The positive embedding is `t * encode(promptA) + (1 - t) * encode(promptB)`; the negative embedding is
    blended the same way from `negative_promptA` / `negative_promptB` with weight `t_nag`.

    Args:
        promptA (`str` or `List[str]`, *optional*):
            first prompt to be encoded; it also determines the batch size
        promptB (`str` or `List[str]`, *optional*):
            second prompt to be encoded
        t (`float`):
            blending weight given to `promptA`; `promptB` receives `1 - t`
        device: (`torch.device`):
            torch device
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_promptA (`str` or `List[str]`, *optional*):
            The first prompt or prompts not to guide the image generation. If not defined, empty prompts are
            used unless `negative_prompt_embeds` is passed. Ignored when not using guidance.
        negative_promptB (`str` or `List[str]`, *optional*):
            The second negative prompt, blended with `negative_promptA`.
        t_nag (`float`, *optional*):
            blending weight given to `negative_promptA`; must be set whenever the negative prompts are encoded
            here (classifier free guidance without `negative_prompt_embeds`).
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from the `promptA` / `promptB` input arguments.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from the negative prompt input
            arguments.
        lora_scale (`float`, *optional*):
            A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.

    Returns:
        `torch.FloatTensor`: the prompt embeddings; with classifier free guidance the negative embeddings are
        concatenated in front of the positive ones along the batch dimension.
    """

    def _attention_mask_for(text_inputs):
        # Every tokenized prompt gets its OWN attention mask when the text encoder asks for one.
        if getattr(self.text_encoder.config, "use_attention_mask", False):
            return text_inputs.attention_mask.to(device)
        return None

    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, LoraLoaderMixin):
        self._lora_scale = lora_scale

    prompt = promptA
    negative_prompt = negative_promptA

    if promptA is not None and isinstance(promptA, str):
        batch_size = 1
    elif promptA is not None and isinstance(promptA, list):
        batch_size = len(promptA)
    else:
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        # textual inversion: process multi-vector tokens if necessary (for both prompts)
        if isinstance(self, TextualInversionLoaderMixin):
            promptA = self.maybe_convert_prompt(promptA, self.tokenizer)
            promptB = self.maybe_convert_prompt(promptB, self.tokenizer)

        text_inputsA = self.tokenizer(
            promptA,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_inputsB = self.tokenizer(
            promptB,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_input_idsA = text_inputsA.input_ids
        text_input_idsB = text_inputsB.input_ids

        # Warn about truncated text for either prompt.
        for raw_prompt, input_ids in ((promptA, text_input_idsA), (promptB, text_input_idsB)):
            untruncated_ids = self.tokenizer(raw_prompt, padding="longest", return_tensors="pt").input_ids
            if untruncated_ids.shape[-1] >= input_ids.shape[-1] and not torch.equal(input_ids, untruncated_ids):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

        prompt_embedsA = self.text_encoder(
            text_input_idsA.to(device),
            attention_mask=_attention_mask_for(text_inputsA),
        )[0]
        prompt_embedsB = self.text_encoder(
            text_input_idsB.to(device),
            attention_mask=_attention_mask_for(text_inputsB),
        )[0]

        # Linear blend of the two prompt embeddings.
        prompt_embeds = prompt_embedsA * (t) + (1 - t) * prompt_embedsB

    if self.text_encoder is not None:
        prompt_embeds_dtype = self.text_encoder.dtype
    elif self.unet is not None:
        prompt_embeds_dtype = self.unet.dtype
    else:
        prompt_embeds_dtype = prompt_embeds.dtype

    prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokensA: List[str]
        uncond_tokensB: List[str]
        if negative_prompt is None:
            uncond_tokensA = [""] * batch_size
            uncond_tokensB = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokensA = [negative_promptA]
            uncond_tokensB = [negative_promptB]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokensA = negative_promptA
            uncond_tokensB = negative_promptB

        # textual inversion: process multi-vector tokens if necessary
        if isinstance(self, TextualInversionLoaderMixin):
            uncond_tokensA = self.maybe_convert_prompt(uncond_tokensA, self.tokenizer)
            uncond_tokensB = self.maybe_convert_prompt(uncond_tokensB, self.tokenizer)

        max_length = prompt_embeds.shape[1]
        uncond_inputA = self.tokenizer(
            uncond_tokensA,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        uncond_inputB = self.tokenizer(
            uncond_tokensB,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )

        negative_prompt_embedsA = self.text_encoder(
            uncond_inputA.input_ids.to(device),
            attention_mask=_attention_mask_for(uncond_inputA),
        )
        negative_prompt_embedsB = self.text_encoder(
            uncond_inputB.input_ids.to(device),
            attention_mask=_attention_mask_for(uncond_inputB),
        )
        # Linear blend of the two negative embeddings, weighted by `t_nag`.
        negative_prompt_embeds = negative_prompt_embedsA[0] * (t_nag) + (1 - t_nag) * negative_prompt_embedsB[0]

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
        seq_len = negative_prompt_embeds.shape[1]

        negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

    return prompt_embeds
593
+
594
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
595
def run_safety_checker(self, image, device, dtype):
    """
    Run the optional safety checker on `image`.

    Returns the pair `(image, has_nsfw_concept)`. Without a safety checker the image is passed through
    untouched and the second element is `None`.
    """
    if self.safety_checker is None:
        return image, None

    # The feature extractor receives PIL images, whatever the input representation is.
    if torch.is_tensor(image):
        pil_images = self.image_processor.postprocess(image, output_type="pil")
    else:
        pil_images = self.image_processor.numpy_to_pil(image)

    clip_features = self.feature_extractor(pil_images, return_tensors="pt").to(device)
    checked_image, nsfw_flags = self.safety_checker(
        images=image, clip_input=clip_features.pixel_values.to(dtype)
    )
    return checked_image, nsfw_flags
608
+
609
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
610
def decode_latents(self, latents):
    """
    Decode latents into a float32 numpy image batch (deprecated).

    Emits a `FutureWarning`, rescales the latents by the inverse VAE scaling factor, decodes them, maps the
    result from `[-1, 1]` to `[0, 1]` and returns it channels-last on the CPU.
    """
    deprecation_message = (
        "The decode_latents method is deprecated and will be removed in a future version. Please"
        " use VaeImageProcessor instead"
    )
    warnings.warn(deprecation_message, FutureWarning)

    unscaled = 1 / self.vae.config.scaling_factor * latents
    decoded = self.vae.decode(unscaled, return_dict=False)[0]
    decoded = (decoded / 2 + 0.5).clamp(0, 1)
    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
    return decoded.cpu().permute(0, 2, 3, 1).float().numpy()
622
+
623
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
624
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Build the optional keyword arguments for `self.scheduler.step`.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are only forwarded when the
    scheduler's `step` accepts a parameter of that name. eta (η) corresponds to η in the DDIM paper
    (https://arxiv.org/abs/2010.02502) and should be between [0, 1].
    """
    accepted = set(inspect.signature(self.scheduler.step).parameters.keys())
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in accepted}
640
+
641
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
642
def get_timesteps(self, num_inference_steps, strength, device):
    """
    Select the tail of the scheduler's timesteps that corresponds to `strength`.

    Returns the pair `(timesteps, remaining_steps)`, where `remaining_steps` is the number of inference steps
    left after skipping the start of the schedule.
    """
    # Number of steps that are actually run, capped at the full schedule.
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)

    first_step = max(num_inference_steps - steps_to_run, 0)
    # Schedulers of higher order hold `order` entries per inference step.
    offset = first_step * self.scheduler.order
    selected = self.scheduler.timesteps[offset:]

    return selected, num_inference_steps - first_step
650
+
651
+ def check_inputs(
652
+ self,
653
+ prompt,
654
+ image,
655
+ height,
656
+ width,
657
+ callback_steps,
658
+ negative_prompt=None,
659
+ prompt_embeds=None,
660
+ negative_prompt_embeds=None,
661
+ controlnet_conditioning_scale=1.0,
662
+ control_guidance_start=0.0,
663
+ control_guidance_end=1.0,
664
+ ):
665
+ if height % 8 != 0 or width % 8 != 0:
666
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
667
+
668
+ if (callback_steps is None) or (
669
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
670
+ ):
671
+ raise ValueError(
672
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
673
+ f" {type(callback_steps)}."
674
+ )
675
+
676
+ if prompt is not None and prompt_embeds is not None:
677
+ raise ValueError(
678
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
679
+ " only forward one of the two."
680
+ )
681
+ elif prompt is None and prompt_embeds is None:
682
+ raise ValueError(
683
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
684
+ )
685
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
686
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
687
+
688
+ if negative_prompt is not None and negative_prompt_embeds is not None:
689
+ raise ValueError(
690
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
691
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
692
+ )
693
+
694
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
695
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
696
+ raise ValueError(
697
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
698
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
699
+ f" {negative_prompt_embeds.shape}."
700
+ )
701
+
702
+ # `prompt` needs more sophisticated handling when there are multiple
703
+ # conditionings.
704
+ if isinstance(self.controlnet, MultiControlNetModel):
705
+ if isinstance(prompt, list):
706
+ logger.warning(
707
+ f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
708
+ " prompts. The conditionings will be fixed across the prompts."
709
+ )
710
+
711
+ # Check `image`
712
+ is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
713
+ self.controlnet, torch._dynamo.eval_frame.OptimizedModule
714
+ )
715
+
716
+ if (
717
+ isinstance(self.controlnet, ControlNetModel)
718
+ or is_compiled
719
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
720
+ ):
721
+ self.check_image(image, prompt, prompt_embeds)
722
+ elif (
723
+ isinstance(self.controlnet, MultiControlNetModel)
724
+ or is_compiled
725
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
726
+ ):
727
+ if not isinstance(image, list):
728
+ raise TypeError("For multiple controlnets: `image` must be type `list`")
729
+
730
+ # When `image` is a nested list:
731
+ # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
732
+ elif any(isinstance(i, list) for i in image):
733
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
734
+ elif len(image) != len(self.controlnet.nets):
735
+ raise ValueError(
736
+ f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
737
+ )
738
+
739
+ for image_ in image:
740
+ self.check_image(image_, prompt, prompt_embeds)
741
+ else:
742
+ assert False
743
+
744
+ # Check `controlnet_conditioning_scale`
745
+ if (
746
+ isinstance(self.controlnet, ControlNetModel)
747
+ or is_compiled
748
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
749
+ ):
750
+ if not isinstance(controlnet_conditioning_scale, float):
751
+ raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
752
+ elif (
753
+ isinstance(self.controlnet, MultiControlNetModel)
754
+ or is_compiled
755
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
756
+ ):
757
+ if isinstance(controlnet_conditioning_scale, list):
758
+ if any(isinstance(i, list) for i in controlnet_conditioning_scale):
759
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
760
+ elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
761
+ self.controlnet.nets
762
+ ):
763
+ raise ValueError(
764
+ "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
765
+ " the same length as the number of controlnets"
766
+ )
767
+ else:
768
+ assert False
769
+
770
+ if len(control_guidance_start) != len(control_guidance_end):
771
+ raise ValueError(
772
+ f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
773
+ )
774
+
775
+ if isinstance(self.controlnet, MultiControlNetModel):
776
+ if len(control_guidance_start) != len(self.controlnet.nets):
777
+ raise ValueError(
778
+ f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
779
+ )
780
+
781
+ for start, end in zip(control_guidance_start, control_guidance_end):
782
+ if start >= end:
783
+ raise ValueError(
784
+ f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
785
+ )
786
+ if start < 0.0:
787
+ raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
788
+ if end > 1.0:
789
+ raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
790
+
791
# Adapted from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
def check_image(self, image, prompt, prompt_embeds):
    """Validate the type and batch size of a conditioning ``image``.

    Args:
        image: A ``PIL.Image.Image``, ``torch.Tensor`` or ``np.ndarray``, or a
            non-empty list whose first element is one of those types (only the
            first element of a list is inspected).
        prompt: A ``str``, a list, or ``None``. A string counts as a prompt
            batch of 1, a list as a batch of ``len(prompt)``.
        prompt_embeds: Used only when ``prompt`` is neither a string nor a list;
            its ``shape[0]`` is then taken as the prompt batch size.

    Raises:
        TypeError: If ``image`` is not one of the supported types. An empty
            list is rejected here as well, because there is no first element
            whose type could be checked.
        ValueError: If the image batch size is not 1 and either differs from
            the prompt batch size or no prompt batch size can be determined.
    """
    supported_types = (PIL.Image.Image, torch.Tensor, np.ndarray)
    image_is_pil = isinstance(image, PIL.Image.Image)

    if isinstance(image, list):
        # Guard the `image[0]` lookup so that an empty list yields the TypeError
        # below instead of an IndexError.
        image_is_supported = len(image) > 0 and isinstance(image[0], supported_types)
    else:
        image_is_supported = isinstance(image, supported_types)

    if not image_is_supported:
        raise TypeError(
            f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
        )

    # A single PIL image is a batch of one; everything else reports its own length.
    image_batch_size = 1 if image_is_pil else len(image)

    if isinstance(prompt, str):
        prompt_batch_size = 1
    elif isinstance(prompt, list):
        prompt_batch_size = len(prompt)
    elif prompt_embeds is not None:
        prompt_batch_size = prompt_embeds.shape[0]
    else:
        # Neither a prompt nor embeddings: only acceptable for an image batch of 1.
        prompt_batch_size = None

    if image_batch_size == 1:
        return

    if prompt_batch_size is None:
        raise ValueError(
            "Cannot validate the image batch size because neither `prompt` nor `prompt_embeds` was provided."
            f" image batch size: {image_batch_size}"
        )

    if image_batch_size != prompt_batch_size:
        raise ValueError(
            f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
        )
828
+
829
+ # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
830
def prepare_control_image(
    self,
    image,
    width,
    height,
    batch_size,
    num_images_per_prompt,
    device,
    dtype,
    do_classifier_free_guidance=False,
    guess_mode=False,
):
    """Preprocess a control image and expand it to the effective batch size.

    The image is run through ``self.control_image_processor.preprocess`` with
    the requested ``height``/``width``, cast to float32, repeated along the
    batch dimension, and finally moved to ``device``/``dtype``.

    Args:
        image: Input accepted by ``self.control_image_processor.preprocess``.
        width: Target width forwarded to the preprocessor.
        height: Target height forwarded to the preprocessor.
        batch_size: Repeat count used when the preprocessed batch has one entry.
        num_images_per_prompt: Repeat count used for larger preprocessed batches.
        device: Device of the returned tensor.
        dtype: Dtype of the returned tensor.
        do_classifier_free_guidance: When True (and ``guess_mode`` is False) the
            result is concatenated with itself along the batch dimension.
        guess_mode: Disables the duplication described above.

    Returns:
        The prepared control image tensor.
    """
    control = self.control_image_processor.preprocess(image, height=height, width=width)
    control = control.to(dtype=torch.float32)

    # One image is shared by the whole batch; otherwise the image batch is
    # taken to match the prompt batch and is repeated once per requested sample.
    copies = batch_size if control.shape[0] == 1 else num_images_per_prompt
    control = control.repeat_interleave(copies, dim=0)
    control = control.to(device=device, dtype=dtype)

    if guess_mode or not do_classifier_free_guidance:
        return control

    return torch.cat([control, control])
859
+
860
# Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_latents
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
    image=None,
    timestep=None,
    is_strength_max=True,
    return_noise=False,
    return_image_latents=False,
):
    """Build the initial latents for the denoising loop.

    Args:
        batch_size: Effective batch size of the latents.
        num_channels_latents: Number of latent channels.
        height: Image height in pixels; divided by ``self.vae_scale_factor``.
        width: Image width in pixels; divided by ``self.vae_scale_factor``.
        dtype: Dtype of the sampled noise and of the encoded image.
        device: Device of the returned tensors.
        generator: A generator, a list with one generator per batch entry, or None.
        latents: Optional pre-made latents. When given they are used as the
            noise and scaled by ``self.scheduler.init_noise_sigma``.
        image: Image tensor to encode; required whenever image latents are
            needed (``return_image_latents`` or ``is_strength_max=False``).
        timestep: Timestep passed to ``self.scheduler.add_noise`` when
            ``is_strength_max`` is False.
        is_strength_max: When True the latents start as pure noise, otherwise
            as the noised image latents.
        return_noise: Append the noise tensor to the result.
        return_image_latents: Append the encoded image latents to the result.

    Returns:
        A tuple ``(latents,)`` followed by ``noise`` and/or ``image_latents``
        (in that order) when requested.

    Raises:
        ValueError: If the generator list length differs from ``batch_size``,
            or if an input required for the requested mode is missing.
    """
    shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if (image is None or timestep is None) and not is_strength_max:
        raise ValueError(
            "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
            " However, either the image or the noise timestep has not been provided."
        )

    if return_image_latents or (latents is None and not is_strength_max):
        if image is None:
            # Fail with a clear message instead of an AttributeError on `None.to(...)`.
            raise ValueError("`image` must be provided when `return_image_latents` is True.")
        image = image.to(device=device, dtype=dtype)
        image_latents = self._encode_vae_image(image=image, generator=generator)

    if latents is None:
        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        if is_strength_max:
            # Pure noise start: scale by the scheduler's initial sigma.
            latents = noise * self.scheduler.init_noise_sigma
        else:
            # Partial strength: start from the image latents with noise added.
            latents = self.scheduler.add_noise(image_latents, noise, timestep)
    else:
        noise = latents.to(device)
        latents = noise * self.scheduler.init_noise_sigma

    outputs = (latents,)

    if return_noise:
        outputs += (noise,)

    if return_image_latents:
        outputs += (image_latents,)

    return outputs
913
+
914
def _default_height_width(self, height, width, image):
    """Resolve the output size, falling back to the size of ``image``.

    Missing dimensions are read from ``image`` (the last two dimensions of a
    tensor, or ``height``/``width`` of a PIL image). Both values are then
    rounded down to the nearest multiple of 8.

    Args:
        height: Requested height in pixels, or None to infer it from ``image``.
        width: Requested width in pixels, or None to infer it from ``image``.
        image: A ``torch.Tensor``, a ``PIL.Image.Image``, or a (possibly nested)
            list of them. Only needed when a dimension is None.

    Returns:
        ``(height, width)`` as multiples of 8.

    Raises:
        ValueError: If a dimension is None and cannot be inferred from ``image``.
    """
    # NOTE: images in a list may have different sizes, so looking only at the
    # first one is not exactly correct, but it is simple. The `and image` guard
    # keeps an empty list from raising IndexError.
    while isinstance(image, list) and image:
        image = image[0]

    if height is None or width is None:
        if isinstance(image, torch.Tensor) and image.ndim >= 2:
            # Indexing from the end handles batched and unbatched tensors alike.
            image_height, image_width = image.shape[-2], image.shape[-1]
        elif isinstance(image, PIL.Image.Image):
            image_height, image_width = image.height, image.width
        else:
            raise ValueError(
                "`height` and `width` must be passed explicitly when they cannot be inferred from `image`"
                f" of type {type(image)}."
            )

        if height is None:
            height = image_height
        if width is None:
            width = image_width

    height = (height // 8) * 8  # round down to nearest multiple of 8
    width = (width // 8) * 8  # round down to nearest multiple of 8

    return height, width
938
+
939
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents
940
def prepare_mask_latents(
    self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
):
    """Resize the mask to latent resolution and encode the masked image.

    Args:
        mask: Mask tensor at image resolution.
        masked_image: Image tensor with the masked region removed.
        batch_size: Effective batch size both outputs are expanded to.
        height: Image height in pixels; divided by ``self.vae_scale_factor``.
        width: Image width in pixels; divided by ``self.vae_scale_factor``.
        dtype: Dtype of the returned tensors.
        device: Device of the returned tensors.
        generator: Forwarded to ``self._encode_vae_image``.
        do_classifier_free_guidance: When True both outputs are concatenated
            with themselves along the batch dimension.

    Returns:
        ``(mask, masked_image_latents)``.

    Raises:
        ValueError: If ``batch_size`` is not a multiple of the number of masks
            or of masked images passed.
    """
    latent_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)

    # Resize first and cast afterwards: interpolating in half precision can
    # break when cpu_offload is in use.
    mask = torch.nn.functional.interpolate(mask, size=latent_size)
    mask = mask.to(device=device, dtype=dtype)

    masked_image = masked_image.to(device=device, dtype=dtype)
    masked_image_latents = self._encode_vae_image(masked_image, generator=generator)

    # Tile mask and latents up to the requested batch size (mps-friendly repeat).
    if mask.shape[0] < batch_size:
        if batch_size % mask.shape[0] != 0:
            raise ValueError(
                "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                " of masks that you pass is divisible by the total requested batch size."
            )
        mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)

    if masked_image_latents.shape[0] < batch_size:
        if batch_size % masked_image_latents.shape[0] != 0:
            raise ValueError(
                "The passed images and the required batch size don't match. Images are supposed to be duplicated"
                f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
                " Make sure the number of images that you pass is divisible by the total requested batch size."
            )
        masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)

    if do_classifier_free_guidance:
        mask = torch.cat([mask, mask])
        masked_image_latents = torch.cat([masked_image_latents, masked_image_latents])

    # Align device/dtype so the later concat with the latent model input cannot fail.
    masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
    return mask, masked_image_latents
980
+
981
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline._encode_vae_image
982
def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
    """Encode ``image`` with the VAE and return scaled, sampled latents.

    When ``generator`` is a list, every batch entry is encoded on its own and
    sampled with the generator at the same index; otherwise the whole batch is
    encoded and sampled in one call. The result is multiplied by
    ``self.vae.config.scaling_factor``.
    """
    if not isinstance(generator, list):
        image_latents = self.vae.encode(image).latent_dist.sample(generator=generator)
    else:
        per_sample_latents = []
        for idx in range(image.shape[0]):
            posterior = self.vae.encode(image[idx : idx + 1]).latent_dist
            per_sample_latents.append(posterior.sample(generator=generator[idx]))
        image_latents = torch.cat(per_sample_latents, dim=0)

    return self.vae.config.scaling_factor * image_latents
995
+
996
def _check_inputs_woControl(
    self,
    prompt,
    height,
    width,
    strength,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
):
    """Validate the arguments of `predict_woControl` (no ControlNet state involved)."""
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    if callback_steps is None or not isinstance(callback_steps, int) or callback_steps <= 0:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if prompt is not None and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )


@torch.no_grad()
def predict_woControl(
    self,
    promptA: Union[str, List[str]] = None,
    promptB: Union[str, List[str]] = None,
    image: Union[torch.FloatTensor, PIL.Image.Image] = None,
    mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    strength: float = 1.0,
    tradoff: float = 1.0,
    tradoff_nag: float = 1.0,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    negative_promptA: Optional[Union[str, List[str]]] = None,
    negative_promptB: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    task_class: Union[torch.Tensor, float, int] = None,
):
    r"""
    Run inpainting without using the ControlNet.

    Args:
        promptA (`str` or `List[str]`, *optional*):
            First prompt. Also determines the batch size (1 for a string, `len(promptA)` for a list). If not
            defined, `prompt_embeds` must be passed.
        promptB (`str` or `List[str]`, *optional*):
            Second prompt, forwarded to `_encode_prompt` together with `promptA`.
        image (`PIL.Image.Image` or `torch.FloatTensor`):
            Image (batch) to be inpainted.
        mask_image (`PIL.Image.Image` or `torch.FloatTensor`):
            Mask for `image`, forwarded to `prepare_mask_and_masked_image`.
        height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The height in pixels of the generated image. Must be divisible by 8.
        width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The width in pixels of the generated image. Must be divisible by 8.
        strength (`float`, *optional*, defaults to 1.0):
            Must be between 0 and 1. Modulates the number of denoising steps; when it equals 1 the latents
            start from pure noise, otherwise from the noised image latents.
        tradoff (`float`, *optional*, defaults to 1.0):
            Forwarded to `_encode_prompt` with `promptA`/`promptB` (presumably the blend weight between the
            two prompts; confirm against `_encode_prompt`).
        tradoff_nag (`float`, *optional*, defaults to 1.0):
            Forwarded to `_encode_prompt` with the negative prompts (presumably the analogous weight; confirm).
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps before the adjustment by `strength`.
        guidance_scale (`float`, *optional*, defaults to 7.5):
            Classifier-free guidance weight. Guidance is enabled when `guidance_scale > 1`.
        negative_promptA (`str` or `List[str]`, *optional*):
            First negative prompt.
        negative_promptB (`str` or `List[str]`, *optional*):
            Second negative prompt.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Forwarded to `prepare_extra_step_kwargs`.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            Generator(s) used to make generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents. If not provided, they are sampled with `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings, used instead of `promptA`.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings.
        output_type (`str`, *optional*, defaults to `"pil"`):
            Output format passed to the image processor. `"latent"` skips VAE decoding and the safety checker.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether to return a `StableDiffusionPipelineOutput` instead of a plain tuple.
        callback (`Callable`, *optional*):
            Called as `callback(step, timestep, latents)` on completed steps whose index is a multiple of
            `callback_steps`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which `callback` is called. Must be a positive integer.
        cross_attention_kwargs (`dict`, *optional*):
            Forwarded to the UNet. Its `"scale"` entry, if any, is also used as the text-encoder LoRA scale.
        task_class (`torch.Tensor`, `float` or `int`, *optional*):
            When not `None`, forwarded to the UNet as the `task_class` keyword argument.

    Returns:
        `StableDiffusionPipelineOutput` or `tuple`:
            If `return_dict` is `True`, a `StableDiffusionPipelineOutput`; otherwise a tuple
            `(images, has_nsfw_concept)`, where `has_nsfw_concept` is `None` for `output_type="latent"`.

    Raises:
        ValueError: If an argument fails validation, if `strength` leaves fewer than one denoising step, or if
            the UNet has neither 4 nor 9 input channels.
    """
    # 0. Default height and width to unet
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor
    prompt = promptA
    negative_prompt = negative_promptA

    # 1. Check inputs
    # NOTE(review): the `check_inputs` visible in this class has `callback_steps` directly after `width`,
    # reads an `image` argument and validates ControlNet settings, so it cannot be called positionally with
    # `(prompt, height, width, strength, callback_steps, ...)`. A dedicated validator is used instead;
    # confirm against the full `check_inputs` signature.
    self._check_inputs_woControl(
        prompt,
        height,
        width,
        strength,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
    )

    # 2. Define call parameters
    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
    )
    prompt_embeds = self._encode_prompt(
        promptA,
        promptB,
        tradoff,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_promptA,
        negative_promptB,
        tradoff_nag,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )

    # 4. set timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps, num_inference_steps = self.get_timesteps(
        num_inference_steps=num_inference_steps, strength=strength, device=device
    )
    # check that number of inference steps is not < 1 - as this doesn't make sense
    if num_inference_steps < 1:
        raise ValueError(
            f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
            f" steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
        )
    # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
    latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
    # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
    is_strength_max = strength == 1.0

    # 5. Preprocess mask and image
    mask, masked_image, init_image = prepare_mask_and_masked_image(
        image, mask_image, height, width, return_image=True
    )
    mask_condition = mask.clone()

    # 6. Prepare latent variables
    num_channels_latents = self.vae.config.latent_channels
    num_channels_unet = self.unet.config.in_channels
    return_image_latents = num_channels_unet == 4

    latents_outputs = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
        image=init_image,
        timestep=latent_timestep,
        is_strength_max=is_strength_max,
        return_noise=True,
        return_image_latents=return_image_latents,
    )

    if return_image_latents:
        latents, noise, image_latents = latents_outputs
    else:
        latents, noise = latents_outputs

    # 7. Prepare mask latent variables
    mask, masked_image_latents = self.prepare_mask_latents(
        mask,
        masked_image,
        batch_size * num_images_per_prompt,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        do_classifier_free_guidance,
    )

    # 8. Check that sizes of mask, masked image and latents match
    if num_channels_unet == 9:
        # default case for runwayml/stable-diffusion-inpainting
        num_channels_mask = mask.shape[1]
        num_channels_masked_image = masked_image_latents.shape[1]
        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
            raise ValueError(
                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
                " `pipeline.unet` or your `mask_image` or `image` input."
            )
    elif num_channels_unet != 4:
        raise ValueError(
            f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
        )

    # For a 4-channel UNet the known region is re-imposed after every step. Keep one mask and one clean
    # latent per sample (instead of only the first) so batched inputs are blended with their own data.
    blend_mask = None
    blend_image_latents = None
    if num_channels_unet == 4:
        # `mask` was duplicated for classifier-free guidance; the first half is the per-sample mask.
        blend_mask = mask.chunk(2)[0] if do_classifier_free_guidance else mask
        blend_image_latents = image_latents
        num_samples = latents.shape[0]
        num_encoded = blend_image_latents.shape[0]
        if num_encoded < num_samples and num_samples % num_encoded == 0:
            # Same tiling order as `prepare_mask_latents` uses for the mask.
            blend_image_latents = blend_image_latents.repeat(num_samples // num_encoded, 1, 1, 1)

    # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 10. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # concat latents, mask, masked_image_latents in the channel dimension
            if num_channels_unet == 9:
                latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)

            # predict the noise residual
            if task_class is not None:
                noise_pred = self.unet(
                    sample=latent_model_input,
                    timestep=t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    return_dict=False,
                    task_class=task_class,
                )[0]
            else:
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    return_dict=False,
                )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            if num_channels_unet == 4:
                init_latents_proper = blend_image_latents

                if i < len(timesteps) - 1:
                    # Noise the clean latents to the level of the next step before blending.
                    noise_timestep = timesteps[i + 1]
                    init_latents_proper = self.scheduler.add_noise(
                        init_latents_proper, noise, torch.tensor([noise_timestep])
                    )

                latents = (1 - blend_mask) * init_latents_proper + blend_mask * latents

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    if output_type != "latent":
        condition_kwargs = {}
        if isinstance(self.vae, AsymmetricAutoencoderKL):
            init_image = init_image.to(device=device, dtype=masked_image_latents.dtype)
            init_image_condition = init_image.clone()
            init_image = self._encode_vae_image(init_image, generator=generator)
            mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype)
            condition_kwargs = {"image": init_image_condition, "mask": mask_condition}
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, **condition_kwargs)[0]
        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
    else:
        image = latents
        has_nsfw_concept = None

    if has_nsfw_concept is None:
        do_denormalize = [True] * image.shape[0]
    else:
        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

    image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

    # Offload last model to CPU
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.final_offload_hook.offload()

    if not return_dict:
        return (image, has_nsfw_concept)

    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
1348
+
1349
+
1350
+ @torch.no_grad()
1351
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
1352
+ def __call__(
1353
+ self,
1354
+ promptA: Union[str, List[str]] = None,
1355
+ promptB: Union[str, List[str]] = None,
1356
+ image: Union[torch.Tensor, PIL.Image.Image] = None,
1357
+ mask_image: Union[torch.Tensor, PIL.Image.Image] = None,
1358
+ control_image: Union[
1359
+ torch.FloatTensor,
1360
+ PIL.Image.Image,
1361
+ np.ndarray,
1362
+ List[torch.FloatTensor],
1363
+ List[PIL.Image.Image],
1364
+ List[np.ndarray],
1365
+ ] = None,
1366
+ height: Optional[int] = None,
1367
+ width: Optional[int] = None,
1368
+ strength: float = 1.0,
1369
+ tradoff: float = 1.0,
1370
+ tradoff_nag: float = 1.0,
1371
+ num_inference_steps: int = 50,
1372
+ guidance_scale: float = 7.5,
1373
+ negative_promptA: Optional[Union[str, List[str]]] = None,
1374
+ negative_promptB: Optional[Union[str, List[str]]] = None,
1375
+ num_images_per_prompt: Optional[int] = 1,
1376
+ eta: float = 0.0,
1377
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
1378
+ latents: Optional[torch.FloatTensor] = None,
1379
+ prompt_embeds: Optional[torch.FloatTensor] = None,
1380
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
1381
+ output_type: Optional[str] = "pil",
1382
+ return_dict: bool = True,
1383
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
1384
+ callback_steps: int = 1,
1385
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
1386
+ controlnet_conditioning_scale: Union[float, List[float]] = 0.5,
1387
+ guess_mode: bool = False,
1388
+ control_guidance_start: Union[float, List[float]] = 0.0,
1389
+ control_guidance_end: Union[float, List[float]] = 1.0,
1390
+ ):
1391
+ r"""
1392
+ Function invoked when calling the pipeline for generation.
1393
+
1394
+ Args:
1395
+ prompt (`str` or `List[str]`, *optional*):
1396
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
1397
+ instead.
1398
+ image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`,
1399
+ `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`):
1400
+ The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
1401
+ the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
1402
+ also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If
1403
+ height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
1404
+ specified in init, images must be passed as a list such that each element of the list can be correctly
1405
+ batched for input to a single controlnet.
1406
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
1407
+ The height in pixels of the generated image.
1408
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
1409
+ The width in pixels of the generated image.
1410
+ strength (`float`, *optional*, defaults to 1.):
1411
+ Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
1412
+ between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
1413
+ `strength`. The number of denoising steps depends on the amount of noise initially added. When
1414
+ `strength` is 1, added noise will be maximum and the denoising process will run for the full number of
1415
+ iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked
1416
+ portion of the reference `image`.
1417
+ num_inference_steps (`int`, *optional*, defaults to 50):
1418
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
1419
+ expense of slower inference.
1420
+ guidance_scale (`float`, *optional*, defaults to 7.5):
1421
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
1422
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
1423
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1424
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
1425
+ usually at the expense of lower image quality.
1426
+ negative_prompt (`str` or `List[str]`, *optional*):
1427
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
1428
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
1429
+ less than `1`).
1430
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
1431
+ The number of images to generate per prompt.
1432
+ eta (`float`, *optional*, defaults to 0.0):
1433
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
1434
+ [`schedulers.DDIMScheduler`], will be ignored for others.
1435
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
1436
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
1437
+ to make generation deterministic.
1438
+ latents (`torch.FloatTensor`, *optional*):
1439
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
1440
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
1441
+ tensor will ge generated by sampling using the supplied random `generator`.
1442
+ prompt_embeds (`torch.FloatTensor`, *optional*):
1443
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
1444
+ provided, text embeddings will be generated from `prompt` input argument.
1445
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
1446
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
1447
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
1448
+ argument.
1449
+ output_type (`str`, *optional*, defaults to `"pil"`):
1450
+ The output format of the generate image. Choose between
1451
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
1452
+ return_dict (`bool`, *optional*, defaults to `True`):
1453
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
1454
+ plain tuple.
1455
+ callback (`Callable`, *optional*):
1456
+ A function that will be called every `callback_steps` steps during inference. The function will be
1457
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
1458
+ callback_steps (`int`, *optional*, defaults to 1):
1459
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
1460
+ called at every step.
1461
+ cross_attention_kwargs (`dict`, *optional*):
1462
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
1463
+ `self.processor` in
1464
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
1465
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 0.5):
1466
+ The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
1467
+ to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
1468
+ corresponding scale as a list. Note that by default, we use a smaller conditioning scale for inpainting
1469
+ than for [`~StableDiffusionControlNetPipeline.__call__`].
1470
+ guess_mode (`bool`, *optional*, defaults to `False`):
1471
+ In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
1472
+ you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
1473
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
1474
+ The percentage of total steps at which the controlnet starts applying.
1475
+ control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
1476
+ The percentage of total steps at which the controlnet stops applying.
1477
+
1478
+ Examples:
1479
+
1480
+ Returns:
1481
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
1482
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
1483
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
1484
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
1485
+ (nsfw) content, according to the `safety_checker`.
1486
+ """
1487
+ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
1488
+
1489
+ # 0. Default height and width to unet
1490
+ height, width = self._default_height_width(height, width, image)
1491
+
1492
+ prompt = promptA
1493
+ negative_prompt = negative_promptA
1494
+
1495
+ # align format for control guidance
1496
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
1497
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
1498
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
1499
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
1500
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
1501
+ mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
1502
+ control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [
1503
+ control_guidance_end
1504
+ ]
1505
+
1506
+ # 1. Check inputs. Raise error if not correct
1507
+ self.check_inputs(
1508
+ prompt,
1509
+ control_image,
1510
+ height,
1511
+ width,
1512
+ callback_steps,
1513
+ negative_prompt,
1514
+ prompt_embeds,
1515
+ negative_prompt_embeds,
1516
+ controlnet_conditioning_scale,
1517
+ control_guidance_start,
1518
+ control_guidance_end,
1519
+ )
1520
+
1521
+ # 2. Define call parameters
1522
+ if prompt is not None and isinstance(prompt, str):
1523
+ batch_size = 1
1524
+ elif prompt is not None and isinstance(prompt, list):
1525
+ batch_size = len(prompt)
1526
+ else:
1527
+ batch_size = prompt_embeds.shape[0]
1528
+
1529
+ device = self._execution_device
1530
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
1531
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
1532
+ # corresponds to doing no classifier free guidance.
1533
+ do_classifier_free_guidance = guidance_scale > 1.0
1534
+
1535
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
1536
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
1537
+
1538
+ global_pool_conditions = (
1539
+ controlnet.config.global_pool_conditions
1540
+ if isinstance(controlnet, ControlNetModel)
1541
+ else controlnet.nets[0].config.global_pool_conditions
1542
+ )
1543
+ guess_mode = guess_mode or global_pool_conditions
1544
+
1545
+ # 3. Encode input prompt
1546
+ text_encoder_lora_scale = (
1547
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
1548
+ )
1549
+ prompt_embeds = self._encode_prompt(
1550
+ promptA,
1551
+ promptB,
1552
+ tradoff,
1553
+ device,
1554
+ num_images_per_prompt,
1555
+ do_classifier_free_guidance,
1556
+ negative_promptA,
1557
+ negative_promptB,
1558
+ tradoff_nag,
1559
+ prompt_embeds=prompt_embeds,
1560
+ negative_prompt_embeds=negative_prompt_embeds,
1561
+ lora_scale=text_encoder_lora_scale,
1562
+ )
1563
+
1564
+ # 4. Prepare image
1565
+ if isinstance(controlnet, ControlNetModel):
1566
+ control_image = self.prepare_control_image(
1567
+ image=control_image,
1568
+ width=width,
1569
+ height=height,
1570
+ batch_size=batch_size * num_images_per_prompt,
1571
+ num_images_per_prompt=num_images_per_prompt,
1572
+ device=device,
1573
+ dtype=controlnet.dtype,
1574
+ do_classifier_free_guidance=do_classifier_free_guidance,
1575
+ guess_mode=guess_mode,
1576
+ )
1577
+ elif isinstance(controlnet, MultiControlNetModel):
1578
+ control_images = []
1579
+
1580
+ for control_image_ in control_image:
1581
+ control_image_ = self.prepare_control_image(
1582
+ image=control_image_,
1583
+ width=width,
1584
+ height=height,
1585
+ batch_size=batch_size * num_images_per_prompt,
1586
+ num_images_per_prompt=num_images_per_prompt,
1587
+ device=device,
1588
+ dtype=controlnet.dtype,
1589
+ do_classifier_free_guidance=do_classifier_free_guidance,
1590
+ guess_mode=guess_mode,
1591
+ )
1592
+
1593
+ control_images.append(control_image_)
1594
+
1595
+ control_image = control_images
1596
+ else:
1597
+ assert False
1598
+
1599
+ # 4. Preprocess mask and image - resizes image and mask w.r.t height and width
1600
+ mask, masked_image, init_image = prepare_mask_and_masked_image(
1601
+ image, mask_image, height, width, return_image=True
1602
+ )
1603
+
1604
+ # 5. Prepare timesteps
1605
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
1606
+ timesteps, num_inference_steps = self.get_timesteps(
1607
+ num_inference_steps=num_inference_steps, strength=strength, device=device
1608
+ )
1609
+ # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
1610
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
1611
+ # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
1612
+ is_strength_max = strength == 1.0
1613
+
1614
+ # 6. Prepare latent variables
1615
+ num_channels_latents = self.vae.config.latent_channels
1616
+ num_channels_unet = self.unet.config.in_channels
1617
+ return_image_latents = num_channels_unet == 4
1618
+ latents_outputs = self.prepare_latents(
1619
+ batch_size * num_images_per_prompt,
1620
+ num_channels_latents,
1621
+ height,
1622
+ width,
1623
+ prompt_embeds.dtype,
1624
+ device,
1625
+ generator,
1626
+ latents,
1627
+ image=init_image,
1628
+ timestep=latent_timestep,
1629
+ is_strength_max=is_strength_max,
1630
+ return_noise=True,
1631
+ return_image_latents=return_image_latents,
1632
+ )
1633
+
1634
+ if return_image_latents:
1635
+ latents, noise, image_latents = latents_outputs
1636
+ else:
1637
+ latents, noise = latents_outputs
1638
+
1639
+ # 7. Prepare mask latent variables
1640
+ mask, masked_image_latents = self.prepare_mask_latents(
1641
+ mask,
1642
+ masked_image,
1643
+ batch_size * num_images_per_prompt,
1644
+ height,
1645
+ width,
1646
+ prompt_embeds.dtype,
1647
+ device,
1648
+ generator,
1649
+ do_classifier_free_guidance,
1650
+ )
1651
+
1652
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1653
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1654
+
1655
+ # 7.1 Create tensor stating which controlnets to keep
1656
+ controlnet_keep = []
1657
+ for i in range(len(timesteps)):
1658
+ keeps = [
1659
+ 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
1660
+ for s, e in zip(control_guidance_start, control_guidance_end)
1661
+ ]
1662
+ controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
1663
+
1664
+ # 8. Denoising loop
1665
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1666
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1667
+ for i, t in enumerate(timesteps):
1668
+ # expand the latents if we are doing classifier free guidance
1669
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
1670
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1671
+
1672
+ # controlnet(s) inference
1673
+ if guess_mode and do_classifier_free_guidance:
1674
+ # Infer ControlNet only for the conditional batch.
1675
+ control_model_input = latents
1676
+ control_model_input = self.scheduler.scale_model_input(control_model_input, t)
1677
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
1678
+ else:
1679
+ control_model_input = latent_model_input
1680
+ controlnet_prompt_embeds = prompt_embeds
1681
+
1682
+ if isinstance(controlnet_keep[i], list):
1683
+ cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
1684
+ else:
1685
+ controlnet_cond_scale = controlnet_conditioning_scale
1686
+ if isinstance(controlnet_cond_scale, list):
1687
+ controlnet_cond_scale = controlnet_cond_scale[0]
1688
+ cond_scale = controlnet_cond_scale * controlnet_keep[i]
1689
+
1690
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
1691
+ control_model_input,
1692
+ t,
1693
+ encoder_hidden_states=controlnet_prompt_embeds,
1694
+ controlnet_cond=control_image,
1695
+ conditioning_scale=cond_scale,
1696
+ guess_mode=guess_mode,
1697
+ return_dict=False,
1698
+ )
1699
+
1700
+ if guess_mode and do_classifier_free_guidance:
1701
+ # Infered ControlNet only for the conditional batch.
1702
+ # To apply the output of ControlNet to both the unconditional and conditional batches,
1703
+ # add 0 to the unconditional batch to keep it unchanged.
1704
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
1705
+ mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
1706
+
1707
+ # predict the noise residual
1708
+ if num_channels_unet == 9:
1709
+ latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
1710
+
1711
+ noise_pred = self.unet(
1712
+ latent_model_input,
1713
+ t,
1714
+ encoder_hidden_states=prompt_embeds,
1715
+ cross_attention_kwargs=cross_attention_kwargs,
1716
+ down_block_additional_residuals=down_block_res_samples,
1717
+ mid_block_additional_residual=mid_block_res_sample,
1718
+ return_dict=False,
1719
+ )[0]
1720
+
1721
+ # perform guidance
1722
+ if do_classifier_free_guidance:
1723
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1724
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
1725
+
1726
+ # compute the previous noisy sample x_t -> x_t-1
1727
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1728
+
1729
+ if num_channels_unet == 4:
1730
+ init_latents_proper = image_latents[:1]
1731
+ init_mask = mask[:1]
1732
+
1733
+ if i < len(timesteps) - 1:
1734
+ noise_timestep = timesteps[i + 1]
1735
+ init_latents_proper = self.scheduler.add_noise(
1736
+ init_latents_proper, noise, torch.tensor([noise_timestep])
1737
+ )
1738
+
1739
+ latents = (1 - init_mask) * init_latents_proper + init_mask * latents
1740
+
1741
+ # call the callback, if provided
1742
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1743
+ progress_bar.update()
1744
+ if callback is not None and i % callback_steps == 0:
1745
+ callback(i, t, latents)
1746
+
1747
+ # If we do sequential model offloading, let's offload unet and controlnet
1748
+ # manually for max memory savings
1749
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
1750
+ self.unet.to("cpu")
1751
+ self.controlnet.to("cpu")
1752
+ torch.cuda.empty_cache()
1753
+
1754
+ if not output_type == "latent":
1755
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
1756
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
1757
+ else:
1758
+ image = latents
1759
+ has_nsfw_concept = None
1760
+
1761
+ if has_nsfw_concept is None:
1762
+ do_denormalize = [True] * image.shape[0]
1763
+ else:
1764
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
1765
+
1766
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
1767
+
1768
+ # Offload last model to CPU
1769
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
1770
+ self.final_offload_hook.offload()
1771
+
1772
+ if not return_dict:
1773
+ return (image, has_nsfw_concept)
1774
+
1775
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
power_paint_tokenizer.py ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import copy
4
+ import random
5
+ from typing import Any, List, Optional, Union
6
+ from transformers import CLIPTokenizer
7
+
8
+
9
class PowerPaintTokenizer:
    """Tokenizer wrapper that expands PowerPaint task keywords before tokenizing.

    Each keyword in ``token_map`` (``P_ctxt``, ``P_shape`` and ``P_obj`` by
    default) stands for several sub-tokens named ``<keyword>_<i>``. Text is
    rewritten so that every keyword is replaced by its space-joined sub-tokens
    and is then handed to the wrapped tokenizer. Any attribute not defined
    here is looked up on the wrapped tokenizer.

    Args:
        tokenizer (CLIPTokenizer): The tokenizer to wrap. The built-in
            keywords are only registered in ``token_map`` here; nothing is
            added to ``tokenizer`` itself, so it presumably already knows the
            sub-tokens (TODO confirm against the checkpoint being loaded).
    """

    def __init__(self, tokenizer: "CLIPTokenizer"):
        self.wrapped = tokenizer
        self.token_map = {}
        placeholder_tokens = ["P_ctxt", "P_shape", "P_obj"]
        num_vec_per_token = 10
        for placeholder_token in placeholder_tokens:
            self.token_map[placeholder_token] = [
                f"{placeholder_token}_{i}" for i in range(num_vec_per_token)
            ]

    def __getattr__(self, name: str) -> Any:
        """Fall back to the wrapped tokenizer for attributes missing here.

        Raises:
            AttributeError: If neither this wrapper nor the wrapped tokenizer
                provides ``name``.
        """
        if name == "wrapped":
            # `__getattr__` only runs when normal lookup failed, so reaching
            # this point means `wrapped` was never set (e.g. an instance made
            # via `__new__`). Touching `self.wrapped` below would recurse
            # forever, so fail right away.
            raise AttributeError(name)

        try:
            return getattr(self.wrapped, name)
        except AttributeError:
            raise AttributeError(
                f"'{name}' cannot be found in both "
                f"'{self.__class__.__name__}' and "
                f"'{self.__class__.__name__}.wrapped'."
            ) from None

    def try_adding_tokens(self, tokens: Union[str, List[str]], *args, **kwargs):
        """Attempt to add tokens to the wrapped tokenizer.

        Args:
            tokens (Union[str, List[str]]): The tokens to be added.
            *args, **kwargs: The arguments for `self.wrapped.add_tokens`.

        Raises:
            AssertionError: If `self.wrapped.add_tokens` reports that no
                token was added.
        """
        num_added_tokens = self.wrapped.add_tokens(tokens, *args, **kwargs)
        assert num_added_tokens != 0, (
            f"The tokenizer already contains the token {tokens}. Please pass "
            "a different `placeholder_token` that is not already in the "
            "tokenizer."
        )

    def get_token_info(self, token: str) -> dict:
        """Get the information of a token, including its start and end index in
        the current tokenizer.

        Args:
            token (str): The token to be queried.

        Returns:
            dict: ``{"name", "start", "end"}`` where ``start`` is the second
                token id and ``end`` is the second-to-last token id plus one.
        """
        token_ids = self(token).input_ids
        # NOTE(review): skipping the first and last id assumes the wrapped
        # tokenizer surrounds the text with one special token on each side.
        start, end = token_ids[1], token_ids[-2] + 1
        return {"name": token, "start": start, "end": end}

    def add_placeholder_token(
        self, placeholder_token: str, *args, num_vec_per_token: int = 1, **kwargs
    ):
        """Add placeholder tokens to the tokenizer.

        Args:
            placeholder_token (str): The placeholder token to be added.
            num_vec_per_token (int, optional): The number of vectors of
                the added placeholder token. Defaults to 1.
            *args, **kwargs: The arguments for `self.wrapped.add_tokens`.

        Raises:
            ValueError: If an already registered placeholder is a substring
                of ``placeholder_token``.
            AssertionError: If the wrapped tokenizer already contains one of
                the tokens to add.
        """
        # Validate before touching the wrapped tokenizer so that a rejected
        # placeholder does not leave freshly added tokens behind.
        for token in self.token_map:
            if token in placeholder_token:
                raise ValueError(
                    f"The tokenizer already has placeholder token {token} "
                    f"that can get confused with {placeholder_token} "
                    "keep placeholder tokens independent"
                )

        if num_vec_per_token == 1:
            output = [placeholder_token]
        else:
            output = [f"{placeholder_token}_{i}" for i in range(num_vec_per_token)]

        for token in output:
            self.try_adding_tokens(token, *args, **kwargs)

        self.token_map[placeholder_token] = output

    def replace_placeholder_tokens_in_text(
        self,
        text: Union[str, List[str]],
        vector_shuffle: bool = False,
        prop_tokens_to_load: float = 1.0,
    ) -> Union[str, List[str]]:
        """Replace the keywords in text with placeholder tokens. This function
        will be called in `self.__call__` and `self.encode`.

        Args:
            text (Union[str, List[str]]): The text to be processed.
            vector_shuffle (bool, optional): Whether to shuffle the vectors.
                Defaults to False.
            prop_tokens_to_load (float, optional): The proportion of tokens to
                be loaded. If 1.0, all tokens will be loaded. Defaults to 1.0.

        Returns:
            Union[str, List[str]]: The processed text.
        """
        if isinstance(text, list):
            # Forward every option so list input behaves like string input.
            return [
                self.replace_placeholder_tokens_in_text(
                    item,
                    vector_shuffle=vector_shuffle,
                    prop_tokens_to_load=prop_tokens_to_load,
                )
                for item in text
            ]

        for placeholder_token, all_tokens in self.token_map.items():
            if placeholder_token in text:
                # The leading `1 +` keeps at least one sub-token even when
                # the proportion rounds down to zero.
                tokens = all_tokens[: 1 + int(len(all_tokens) * prop_tokens_to_load)]
                if vector_shuffle:
                    tokens = copy.copy(tokens)
                    random.shuffle(tokens)
                text = text.replace(placeholder_token, " ".join(tokens))
        return text

    def replace_text_with_placeholder_tokens(
        self, text: Union[str, List[str]]
    ) -> Union[str, List[str]]:
        """Replace the placeholder tokens in text with the original keywords.
        This function will be called in `self.decode`.

        Only a complete, in-order, space-joined run of a keyword's sub-tokens
        is collapsed back into the keyword.

        Args:
            text (Union[str, List[str]]): The text to be processed.

        Returns:
            Union[str, List[str]]: The processed text.
        """
        if isinstance(text, list):
            return [self.replace_text_with_placeholder_tokens(item) for item in text]

        for placeholder_token, tokens in self.token_map.items():
            merged_tokens = " ".join(tokens)
            if merged_tokens in text:
                text = text.replace(merged_tokens, placeholder_token)
        return text

    def __call__(
        self,
        text: Union[str, List[str]],
        *args,
        vector_shuffle: bool = False,
        prop_tokens_to_load: float = 1.0,
        **kwargs,
    ):
        """The call function of the wrapper.

        Args:
            text (Union[str, List[str]]): The text to be tokenized.
            vector_shuffle (bool, optional): Whether to shuffle the vectors.
                Defaults to False.
            prop_tokens_to_load (float, optional): The proportion of tokens to
                be loaded. If 1.0, all tokens will be loaded. Defaults to 1.0
            *args, **kwargs: The arguments for `self.wrapped.__call__`.

        Returns:
            Whatever `self.wrapped.__call__` returns for the expanded text.
        """
        replaced_text = self.replace_placeholder_tokens_in_text(
            text, vector_shuffle=vector_shuffle, prop_tokens_to_load=prop_tokens_to_load
        )

        return self.wrapped.__call__(replaced_text, *args, **kwargs)

    def encode(self, text: Union[str, List[str]], *args, **kwargs):
        """Expand the keywords in ``text`` and tokenize the result.

        Note that this forwards to `self.wrapped.__call__`, not to
        `self.wrapped.encode`, so the return value is whatever calling the
        wrapped tokenizer returns.

        Args:
            text (Union[str, List[str]]): The text to be encoded.
            *args, **kwargs: The arguments for `self.wrapped.__call__`.
        """
        replaced_text = self.replace_placeholder_tokens_in_text(text)
        return self.wrapped(replaced_text, *args, **kwargs)

    def decode(
        self, token_ids, return_raw: bool = False, *args, **kwargs
    ) -> Union[str, List[str]]:
        """Decode the token index to text.

        Args:
            token_ids: The token index to be decoded.
            return_raw: Whether keep the placeholder token in the text.
                Defaults to False.
            *args, **kwargs: The arguments for `self.wrapped.decode`.

        Returns:
            Union[str, List[str]]: The decoded text.
        """
        text = self.wrapped.decode(token_ids, *args, **kwargs)
        if return_raw:
            return text
        return self.replace_text_with_placeholder_tokens(text)
213
+
214
+
215
+ class EmbeddingLayerWithFixes(nn.Module):
216
+ """The revised embedding layer to support external embeddings. This design
217
+ of this class is inspired by https://github.com/AUTOMATIC1111/stable-
218
+ diffusion-webui/blob/22bcc7be428c94e9408f589966c2040187245d81/modules/sd_hi
219
+ jack.py#L224 # noqa.
220
+
221
+ Args:
222
+ wrapped (nn.Emebdding): The embedding layer to be wrapped.
223
+ external_embeddings (Union[dict, List[dict]], optional): The external
224
+ embeddings added to this layer. Defaults to None.
225
+ """
226
+
227
+ def __init__(
228
+ self,
229
+ wrapped: nn.Embedding,
230
+ external_embeddings: Optional[Union[dict, List[dict]]] = None,
231
+ ):
232
+ super().__init__()
233
+ self.wrapped = wrapped
234
+ self.num_embeddings = wrapped.weight.shape[0]
235
+
236
+ self.external_embeddings = []
237
+ if external_embeddings:
238
+ self.add_embeddings(external_embeddings)
239
+
240
+ self.trainable_embeddings = nn.ParameterDict()
241
+
242
@property
def weight(self):
    """Expose the weight of the wrapped embedding layer unchanged."""
    inner_layer = self.wrapped
    return inner_layer.weight
246
+
247
def check_duplicate_names(self, embeddings: List[dict]):
    """Verify that every entry in ``embeddings`` has a distinct ``'name'``.

    Args:
        embeddings (List[dict]): The external embeddings to inspect; each
            dict is read through its ``'name'`` key only.

    Raises:
        AssertionError: If two entries share the same name.
    """
    names = []
    for emb in embeddings:
        names.append(emb["name"])
    distinct_names = set(names)
    assert len(distinct_names) == len(names), (
        f"Found duplicated names in 'external_embeddings'. Name list: '{names}'"
    )
258
+
259
def check_ids_overlap(self, embeddings):
    """Verify that the id ranges of ``embeddings`` do not overlap.

    Each entry contributes the range from ``'start'`` to ``'end'``. After
    sorting, every range must end no later than the next one starts, so
    ``'end'`` is effectively exclusive.

    Args:
        embeddings (List[dict]): The external embeddings to inspect; each
            dict is read through its ``'start'``, ``'end'`` and ``'name'``
            keys.

    Raises:
        AssertionError: If two neighbouring ranges overlap.
    """
    # Lists compare element-wise, so this orders by start, then end, then name.
    ordered = sorted([emb["start"], emb["end"], emb["name"]] for emb in embeddings)
    for current, following in zip(ordered, ordered[1:]):
        name1, name2 = current[-1], following[-1]
        assert current[1] <= following[0], (
            f"Found ids overlapping between embeddings '{name1}' and '{name2}'."
        )
273
+
274
def add_embeddings(self, embeddings: Optional[Union[dict, List[dict]]]):
    """Add external embeddings to this layer.

    The new entries are validated together with the ones already registered
    before anything is stored, so a rejected call leaves the layer unchanged.

    Use case (illustrative sketch, not executed here):

    >>> # 1. Register a multi-vector placeholder in the tokenizer wrapper.
    >>> tokenizer.add_placeholder_token('ngapi', num_vec_per_token=4)
    >>>
    >>> # 2. Describe the external embedding and add it to the layer.
    >>> info = tokenizer.get_token_info('ngapi')
    >>> new_embedding = {
    >>>     'name': 'ngapi',
    >>>     'embedding': torch.ones(4, 15),
    >>>     'start': info['start'],
    >>>     'end': info['end'],
    >>>     'trainable': False  # if True, registered as a parameter
    >>> }
    >>> embedding_layer_wrapper = EmbeddingLayerWithFixes(embedding_layer)
    >>> embedding_layer_wrapper.add_embeddings(new_embedding)

    Args:
        embeddings (Union[dict, list[dict]]): The external embeddings to
            be added. Each dict must contain the following 4 fields: 'name'
            (the name of this embedding), 'embedding' (the embedding
            tensor), 'start' (the start token id of this embedding), 'end'
            (the end token id of this embedding). An optional 'trainable'
            flag (default False) makes the tensor a registered parameter.
            For example:
            `{name: NAME, start: START, end: END, embedding: torch.Tensor}`
            ``None`` is accepted and adds nothing.

    Raises:
        AssertionError: If a name is duplicated or two id ranges overlap
            (raised by `check_duplicate_names` / `check_ids_overlap`).
    """
    if embeddings is None:
        return
    if isinstance(embeddings, dict):
        embeddings = [embeddings]
    else:
        embeddings = list(embeddings)

    # Validate on a scratch list first; only commit once both checks pass.
    candidate = [*self.external_embeddings, *embeddings]
    self.check_duplicate_names(candidate)
    self.check_ids_overlap(candidate)
    self.external_embeddings.extend(embeddings)

    # set for trainable
    added_trainable_emb_info = []
    for embedding in embeddings:
        if embedding.get("trainable", False):
            name = embedding["name"]
            # The dict and the parameter dict share the same Parameter object.
            embedding["embedding"] = torch.nn.Parameter(embedding["embedding"])
            self.trainable_embeddings[name] = embedding["embedding"]
            added_trainable_emb_info.append(name)

    added_emb_info = ", ".join(emb["name"] for emb in embeddings)
    print(f"Successfully add external embeddings: {added_emb_info}.")

    if added_trainable_emb_info:
        added_trainable_emb_info = ", ".join(added_trainable_emb_info)
        print(
            "Successfully add trainable external embeddings: "
            f"{added_trainable_emb_info}"
        )
343
+
344
def replace_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
    """Return a copy of ``input_ids`` with out-of-range ids set to 0.

    Every id greater than or equal to ``self.num_embeddings`` is replaced by
    0 in the copy; the tensor passed in is not modified.

    Args:
        input_ids (torch.Tensor): The input ids to be replaced.

    Returns:
        torch.Tensor: The replaced input ids.
    """
    external_mask = input_ids >= self.num_embeddings
    sanitized_ids = input_ids.clone()
    sanitized_ids[external_mask] = 0
    return sanitized_ids
356
+
357
def replace_embeddings(
    self, input_ids: torch.Tensor, embedding: torch.Tensor, external_embedding: dict
) -> torch.Tensor:
    """Replace external embedding to the embedding layer. Noted that, in
    this function we use `torch.cat` to avoid inplace modification.

    Every occurrence of the id run ``start, start + 1, ..., end - 1`` in
    ``input_ids`` has its rows in ``embedding`` swapped for
    ``external_embedding['embedding']``. Occurrences that directly follow
    one another are all replaced.

    Args:
        input_ids (torch.Tensor): The original token ids. Shape like
            [LENGTH, ].
        embedding (torch.Tensor): The embedding of token ids after
            `replace_input_ids` function.
        external_embedding (dict): The external embedding to be replaced.
            The keys 'name', 'start', 'end' and 'embedding' are read.

    Returns:
        torch.Tensor: The replaced embedding. When ``start`` does not occur
            in ``input_ids`` the ``embedding`` argument itself is returned.

    Raises:
        AssertionError: If an occurrence of ``start`` is not followed by the
            complete run of ids up to ``end - 1``.
    """
    name = external_embedding["name"]
    start = external_embedding["start"]
    end = external_embedding["end"]
    ext_emb = external_embedding["embedding"]
    span = end - start
    target_ids_to_replace = list(range(start, end))

    # Convert once to Python ints so the scan below does not perform a
    # tensor comparison for every position.
    ids = [int(i) for i in input_ids]

    # do not need to replace
    if start not in ids:
        return embedding

    pieces = []
    seg_start = 0  # first row not yet copied to `pieces`
    pos = 0
    while pos < len(ids):
        if ids[pos] != start:
            pos += 1
            continue

        # keep the untouched rows that precede this placeholder
        if pos > seg_start:
            pieces.append(embedding[seg_start:pos])

        # check if the next embedding need to replace is valid
        actually_ids_to_replace = ids[pos : pos + span]
        assert actually_ids_to_replace == target_ids_to_replace, (
            f"Invalid 'input_ids' in position: {pos} to {pos + span}. "
            f"Expect '{target_ids_to_replace}' for embedding "
            f"'{name}' but found '{actually_ids_to_replace}'."
        )

        pieces.append(ext_emb)

        seg_start = pos + span
        # Resume scanning right after the replaced span so that a
        # placeholder starting immediately afterwards is detected too.
        # `max` guarantees progress for a degenerate empty span.
        pos = max(seg_start, pos + 1)

    # trailing rows after the last placeholder
    if seg_start < len(ids):
        pieces.append(embedding[seg_start : len(ids)])

    return torch.cat(pieces, dim=0)
414
+
415
def forward(
    self, input_ids: torch.Tensor, external_embeddings: Optional[List[dict]] = None
):
    """Embed token ids and substitute external embeddings where they occur.

    Args:
        input_ids (torch.Tensor): The token ids shape like [bz, LENGTH] or
            [LENGTH, ]. A 1-D input is handled as a batch of one.
        external_embeddings (Optional[List[dict]]): Additional external
            embeddings, applied after `self.external_embeddings`. A single
            dict is accepted as well. If not passed, only
            `self.external_embeddings` will be used. Defaults to None.

    Returns:
        torch.Tensor: The output of `self.wrapped` on the (batched) ids
            when there is no external embedding at all; otherwise the
            per-sample embeddings after replacement, stacked along a new
            leading dimension.
    """
    assert input_ids.ndim in [1, 2]
    batch_ids = input_ids.unsqueeze(0) if input_ids.ndim == 1 else input_ids

    if external_embeddings is None:
        # Nothing registered and nothing passed in: plain lookup is enough.
        if not self.external_embeddings:
            return self.wrapped(batch_ids)
        extra = []
    elif isinstance(external_embeddings, dict):
        extra = [external_embeddings]
    else:
        extra = external_embeddings

    # Look up embeddings on ids whose external entries were replaced, then
    # patch the external embeddings back in sample by sample.
    token_embeds = self.wrapped(self.replace_input_ids(batch_ids))
    all_externals = self.external_embeddings + extra

    patched = []
    for sample_ids, sample_embeds in zip(batch_ids, token_embeds):
        for ext in all_externals:
            sample_embeds = self.replace_embeddings(sample_ids, sample_embeds, ext)
        patched.append(sample_embeds)

    return torch.stack(patched)
456
+
457
+
458
def add_tokens(
    tokenizer,
    text_encoder,
    placeholder_tokens: list,
    initialize_tokens: Optional[list] = None,
    num_vectors_per_token: int = 1,
):
    """Add token for training.

    Each placeholder is registered on ``tokenizer``, the token embedding
    layer of ``text_encoder`` is wrapped in ``EmbeddingLayerWithFixes``, and
    one trainable embedding with ``num_vectors_per_token`` rows per
    placeholder is added to that wrapper.

    Args:
        tokenizer: Tokenizer exposing ``add_placeholder_token`` and
            ``get_token_info``; it is also called directly to tokenize the
            initializer tokens.
        text_encoder: Text encoder whose
            ``text_model.embeddings.token_embedding`` is replaced in place
            by the wrapping layer.
        placeholder_tokens (list): The placeholder tokens to add.
        initialize_tokens (Optional[list]): One token per placeholder. The
            embedding row of its first token id after the leading special
            token is repeated to initialize the placeholder. When None,
            each placeholder is initialized with uniform random values in
            [-0.25, 0.25). Defaults to None.
        num_vectors_per_token (int): Number of embedding rows per
            placeholder. Defaults to 1.

    # TODO: support add tokens as dict, then we can load pretrained tokens.
    """
    if initialize_tokens is not None:
        assert len(initialize_tokens) == len(
            placeholder_tokens
        ), "placeholder_token should be the same length as initialize_token"
    for placeholder in placeholder_tokens:
        tokenizer.add_placeholder_token(
            placeholder, num_vec_per_token=num_vectors_per_token
        )

    # Wrap the existing token embedding so external embeddings can be added.
    embedding_layer = text_encoder.text_model.embeddings.token_embedding
    text_encoder.text_model.embeddings.token_embedding = EmbeddingLayerWithFixes(
        embedding_layer
    )
    embedding_layer = text_encoder.text_model.embeddings.token_embedding

    assert embedding_layer is not None, (
        "Do not support get embedding layer for current text encoder. "
        "Please check your configuration."
    )

    if initialize_tokens is not None:
        initialize_embedding = []
        for init_token in initialize_tokens:
            # index 1 skips the first id produced by the tokenizer
            init_id = tokenizer(init_token).input_ids[1]
            init_row = embedding_layer.weight[init_id]
            initialize_embedding.append(
                init_row[None, ...].repeat(num_vectors_per_token, 1)
            )
    else:
        # The embedding width is the same for every placeholder, so read it
        # once from the weight matrix instead of tokenizing a dummy string
        # on each iteration.
        len_emb = embedding_layer.weight.shape[1]
        initialize_embedding = [
            (torch.rand(num_vectors_per_token, len_emb) - 0.5) / 2.0
            for _ in placeholder_tokens
        ]

    token_info_all = []
    for placeholder, init_embedding in zip(placeholder_tokens, initialize_embedding):
        token_info = tokenizer.get_token_info(placeholder)
        token_info["embedding"] = init_embedding
        token_info["trainable"] = True
        token_info_all.append(token_info)
    embedding_layer.add_embeddings(token_info_all)