fantos committed
Commit a39c53b • 1 Parent(s): 95ed3f6

Upload app (14).py

Files changed (1)
app (14).py +351 -0
app (14).py ADDED
@@ -0,0 +1,351 @@
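# Kolors ControlNet demo: a Gradio app that runs Kwai-Kolors image-to-image
# generation guided by Canny, Depth, or Pose ControlNets.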
import spaces
import random
import torch
import cv2
import gradio as gr
import numpy as np
from huggingface_hub import snapshot_download
from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
from diffusers.utils import load_image
from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import StableDiffusionXLControlNetImg2ImgPipeline
from kolors.models.modeling_chatglm import ChatGLMModel
from kolors.models.tokenization_chatglm import ChatGLMTokenizer
from kolors.models.controlnet import ControlNetModel
from diffusers import AutoencoderKL
from kolors.models.unet_2d_condition import UNet2DConditionModel
from diffusers import EulerDiscreteScheduler
from PIL import Image
from annotator.midas import MidasDetector
from annotator.dwpose import DWposeDetector
from annotator.util import resize_image, HWC3

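# Download the Kolors base model and the three ControlNet checkpoints from the Hub,
# then load every component in fp16 on the GPU.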
device = "cuda"
ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
ckpt_dir_depth = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Depth")
ckpt_dir_canny = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Canny")
ckpt_dir_pose = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Pose")

text_encoder = ChatGLMModel.from_pretrained(f'{ckpt_dir}/text_encoder', torch_dtype=torch.float16).half().to(device)
tokenizer = ChatGLMTokenizer.from_pretrained(f'{ckpt_dir}/text_encoder')
vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None).half().to(device)
scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
controlnet_depth = ControlNetModel.from_pretrained(f"{ckpt_dir_depth}", revision=None).half().to(device)
controlnet_canny = ControlNetModel.from_pretrained(f"{ckpt_dir_canny}", revision=None).half().to(device)
controlnet_pose = ControlNetModel.from_pretrained(f"{ckpt_dir_pose}", revision=None).half().to(device)

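# The three pipelines share the same VAE, text encoder, tokenizer, UNet and scheduler;
# only the ControlNet differs between them.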
pipe_depth = StableDiffusionXLControlNetImg2ImgPipeline(
    vae=vae,
    controlnet=controlnet_depth,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=scheduler,
    force_zeros_for_empty_prompt=False
)

pipe_canny = StableDiffusionXLControlNetImg2ImgPipeline(
    vae=vae,
    controlnet=controlnet_canny,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=scheduler,
    force_zeros_for_empty_prompt=False
)

pipe_pose = StableDiffusionXLControlNetImg2ImgPipeline(
    vae=vae,
    controlnet=controlnet_pose,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=scheduler,
    force_zeros_for_empty_prompt=False
)

@spaces.GPU
def process_canny_condition(image, canny_thresholds=[100, 200]):
    # Canny edge map, replicated to 3 channels for the ControlNet input.
    np_image = image.copy()
    np_image = cv2.Canny(np_image, canny_thresholds[0], canny_thresholds[1])
    np_image = np_image[:, :, None]
    np_image = np.concatenate([np_image, np_image, np_image], axis=2)
    np_image = HWC3(np_image)
    return Image.fromarray(np_image)

model_midas = MidasDetector()

@spaces.GPU
def process_depth_condition_midas(img, res=1024):
    # MiDaS depth map, resized back to the original resolution.
    h, w, _ = img.shape
    img = resize_image(HWC3(img), res)
    result = HWC3(model_midas(img))
    result = cv2.resize(result, (w, h))
    return Image.fromarray(result)

model_dwpose = DWposeDetector()

@spaces.GPU
def process_dwpose_condition(image, res=1024):
    # DWPose skeleton map, resized back to the original resolution.
    h, w, _ = image.shape
    img = resize_image(HWC3(image), res)
    out_res, out_img = model_dwpose(image)
    result = HWC3(out_img)
    result = cv2.resize(result, (w, h))
    return Image.fromarray(result)

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024

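# The three inference entry points below are identical except for the ControlNet
# pipeline and the condition-image preprocessor they use.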
@spaces.GPU
def infer_depth(prompt,
                image=None,
                negative_prompt="nsfw, face shadows, low resolution, jpeg artifacts, blurry, poor quality, dark face, neon lights",
                seed=397886929,
                randomize_seed=False,
                guidance_scale=6.0,
                num_inference_steps=50,
                controlnet_conditioning_scale=0.7,
                control_guidance_end=0.9,
                strength=1.0
                ):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator().manual_seed(seed)
    init_image = resize_image(image, MAX_IMAGE_SIZE)
    pipe = pipe_depth.to("cuda")
    condi_img = process_depth_condition_midas(np.array(init_image), MAX_IMAGE_SIZE)
    image = pipe(
        prompt=prompt,
        image=init_image,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        control_guidance_end=control_guidance_end,
        strength=strength,
        control_image=condi_img,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        num_images_per_prompt=1,
        generator=generator,
    ).images[0]
    return [condi_img, image], seed

@spaces.GPU
def infer_canny(prompt,
                image=None,
                negative_prompt="nsfw, face shadows, low resolution, jpeg artifacts, blurry, poor quality, dark face, neon lights",
                seed=397886929,
                randomize_seed=False,
                guidance_scale=6.0,
                num_inference_steps=50,
                controlnet_conditioning_scale=0.7,
                control_guidance_end=0.9,
                strength=1.0
                ):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator().manual_seed(seed)
    init_image = resize_image(image, MAX_IMAGE_SIZE)
    pipe = pipe_canny.to("cuda")
    condi_img = process_canny_condition(np.array(init_image))
    image = pipe(
        prompt=prompt,
        image=init_image,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        control_guidance_end=control_guidance_end,
        strength=strength,
        control_image=condi_img,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        num_images_per_prompt=1,
        generator=generator,
    ).images[0]
    return [condi_img, image], seed

@spaces.GPU
def infer_pose(prompt,
               image=None,
               negative_prompt="nsfw, face shadows, low resolution, jpeg artifacts, blurry, poor quality, dark face, neon lights",
               seed=66,
               randomize_seed=False,
               guidance_scale=6.0,
               num_inference_steps=50,
               controlnet_conditioning_scale=0.7,
               control_guidance_end=0.9,
               strength=1.0
               ):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator().manual_seed(seed)
    init_image = resize_image(image, MAX_IMAGE_SIZE)
    pipe = pipe_pose.to("cuda")
    condi_img = process_dwpose_condition(np.array(init_image), MAX_IMAGE_SIZE)
    image = pipe(
        prompt=prompt,
        image=init_image,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        control_guidance_end=control_guidance_end,
        strength=strength,
        control_image=condi_img,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        num_images_per_prompt=1,
        generator=generator,
    ).images[0]
    return [condi_img, image], seed

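# Example rows for gr.Examples: each entry is [prompt, path to an input image].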
canny_examples = [
    ["A beautiful girl, high quality, ultra sharp, vivid colors, ultra high resolution, best quality, 8k, HD, 4K",
     "image/woman_1.png"],
    ["Panorama, a cute white puppy sitting inside a cup, looking at the camera, animation style, 3D rendering, Octane render",
     "image/dog.png"]
]

depth_examples = [
    ["Makoto Shinkai style, rich colors, a woman in a green shirt standing in a field, beautiful scenery, clear and bright, dappled light and shadow, best quality, ultra detailed, 8K quality",
     "image/woman_2.png"],
    ["A small bird with brilliant colors, high quality, ultra sharp, vivid colors, ultra high resolution, best quality, 8k, HD, 4K",
     "image/bird.png"]
]

pose_examples = [
    ["A girl wearing a purple puff-sleeve dress, a crown and white lace gloves, covering her face with both hands, high quality, ultra sharp, vivid colors, ultra high resolution, best quality, 8k, HD, 4K",
     "image/woman_3.png"],
    ["A woman wearing a black sports jacket and a white top with a necklace, standing on a street with a red building and green trees in the background, high quality, ultra sharp, vivid colors, ultra high resolution, best quality, 8k, HD, 4K",
     "image/woman_4.png"]
]

css = """
footer {
    visibility: hidden;
}
"""


def load_description(fp):
    # Helper to read an HTML/Markdown snippet from disk; not referenced elsewhere in this file.
    with open(fp, 'r', encoding='utf-8') as f:
        content = f.read()
    return content

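# Gradio UI: prompt/image inputs and advanced settings on the left, the condition
# image and generated result on the right.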
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as Kolors:
    with gr.Row():
        with gr.Column(elem_id="col-left"):
            with gr.Row():
                prompt = gr.Textbox(
                    label="Prompt",
                    placeholder="Enter your prompt",
                    lines=2
                )
            with gr.Row():
                image = gr.Image(label="Image", type="pil")
            with gr.Accordion("Advanced Settings", open=False):
                negative_prompt = gr.Textbox(
                    label="Negative prompt",
                    placeholder="Enter a negative prompt",
                    visible=True,
                    value="nsfw, face shadows, low resolution, jpeg artifacts, blurry, poor quality, dark face, neon lights"
                )
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=1,
                    value=0,
                )
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                with gr.Row():
                    guidance_scale = gr.Slider(
                        label="Guidance scale",
                        minimum=0.0,
                        maximum=10.0,
                        step=0.1,
                        value=6.0,
                    )
                    num_inference_steps = gr.Slider(
                        label="Inference steps",
                        minimum=10,
                        maximum=50,
                        step=1,
                        value=30,
                    )
                with gr.Row():
                    controlnet_conditioning_scale = gr.Slider(
                        label="ControlNet conditioning scale",
                        minimum=0.0,
                        maximum=1.0,
                        step=0.1,
                        value=0.7,
                    )
                    control_guidance_end = gr.Slider(
                        label="Control guidance end",
                        minimum=0.0,
                        maximum=1.0,
                        step=0.1,
                        value=0.9,
                    )
                with gr.Row():
                    strength = gr.Slider(
                        label="Strength",
                        minimum=0.0,
                        maximum=1.0,
                        step=0.1,
                        value=1.0,
                    )
            with gr.Row():
                canny_button = gr.Button("Canny", elem_id="button")
                depth_button = gr.Button("Depth", elem_id="button")
                pose_button = gr.Button("Pose", elem_id="button")

        with gr.Column(elem_id="col-right"):
            result = gr.Gallery(label="Result", show_label=False, columns=2)
            seed_used = gr.Number(label="Seed used")

    with gr.Row():
        gr.Examples(
            fn=infer_canny,
            examples=canny_examples,
            inputs=[prompt, image],
            outputs=[result, seed_used],
            label="Canny"
        )
    with gr.Row():
        gr.Examples(
            fn=infer_depth,
            examples=depth_examples,
            inputs=[prompt, image],
            outputs=[result, seed_used],
            label="Depth"
        )
    with gr.Row():
        gr.Examples(
            fn=infer_pose,
            examples=pose_examples,
            inputs=[prompt, image],
            outputs=[result, seed_used],
            label="Pose"
        )

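    # Wire each button to its ControlNet-specific inference function.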
    canny_button.click(
        fn=infer_canny,
        inputs=[prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
        outputs=[result, seed_used]
    )

    depth_button.click(
        fn=infer_depth,
        inputs=[prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
        outputs=[result, seed_used]
    )

    pose_button.click(
        fn=infer_pose,
        inputs=[prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
        outputs=[result, seed_used]
    )

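# queue() enables Gradio's request queue so concurrent users are served in order;
# debug=True keeps the process attached and prints errors to the log.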
Kolors.queue().launch(debug=True)