Add stage2 support
app.py
CHANGED
@@ -8,7 +8,8 @@ from model import AppModel
 
 DESCRIPTION = '''# <a href="https://github.com/THUDM/CogVideo">CogVideo</a>
 
-
+This Space supports the first stage and the second stage (better quality) of the CogVideo pipeline.
+Note that the second stage of CogVideo is **much slower**.
 
 The model accepts only Chinese as input.
 By checking the "Translate to Chinese" checkbox, the results of English to Chinese translation with [this Space](https://huggingface.co/spaces/chinhon/translation_eng2ch) will be used as input.
@@ -19,7 +20,7 @@ FOOTER = '<img id="visitor-badge" alt="visitor badge" src="https://visitor-badge
 
 
 def main():
-only_first_stage =
+only_first_stage = False
 model = AppModel(only_first_stage)
 
 with gr.Blocks(css='style.css') as demo:
@@ -36,9 +37,9 @@ def main():
 step=1,
 value=1234,
 label='Seed')
-
+run_only_first_stage = gr.Checkbox(
 label='Only First Stage',
-value=
+value=True,
 visible=not only_first_stage)
 image_prompt = gr.Image(type="filepath",
 label="Image Prompt",
@@ -53,10 +54,10 @@ def main():
 result_video = gr.Video(show_label=False)
 
 examples = gr.Examples(
-examples=[['骑滑板的皮卡丘', False, 1234, True,None],
-['a cat playing chess', True, 1253,
+examples=[['骑滑板的皮卡丘', False, 1234, True, None],
+['a cat playing chess', True, 1253, False, None]],
 fn=model.run_with_translation,
-inputs=[text, translate, seed,
+inputs=[text, translate, seed, run_only_first_stage, image_prompt],
 outputs=[translated_text, result_video],
 cache_examples=True)
 
@@ -68,7 +69,7 @@ def main():
 text,
 translate,
 seed,
-
+run_only_first_stage,
 image_prompt
 ],
 outputs=[translated_text, result_video])
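The app.py change relies on Gradio's input wiring: every component named in an inputs list is passed positionally to the callback, which is why run_only_first_stage has to appear in both the Examples inputs and the button's inputs. Below is a minimal, self-contained sketch of that pattern; the component names and the toy handler are illustrative, not code from this Space.

import gradio as gr

def run(text: str, only_first_stage: bool) -> str:
    # The checkbox state arrives as a plain bool, so the handler can decide
    # whether to skip the slower second stage.
    mode = 'stage 1 only' if only_first_stage else 'stage 1 + stage 2'
    return f'{text} ({mode})'

with gr.Blocks() as demo:
    text = gr.Textbox(label='Prompt')
    only_first = gr.Checkbox(label='Only First Stage', value=True)
    result = gr.Textbox(label='Result')
    run_button = gr.Button('Run')
    # Components listed in `inputs` are passed to `fn` in order, mirroring
    # how app.py appends run_only_first_stage and image_prompt to its lists.
    run_button.click(fn=run, inputs=[text, only_first], outputs=result)

demo.launch()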
model.py
CHANGED
@@ -62,8 +62,8 @@ if os.getenv('SYSTEM') == 'spaces':
 
 download_and_extract_icetk_models()
 download_and_extract_cogvideo_models('cogvideo-stage1.zip')
-
-
+download_and_extract_cogvideo_models('cogvideo-stage2.zip')
+download_and_extract_cogview2_models('cogview2-dsr.zip')
 
 os.environ['SAT_HOME'] = '/home/user/app/pretrained'
 
@@ -299,7 +299,8 @@ def my_filling_sequence(
 # initialize generation
 counter = context_length - 1 # Last fixed index is ``counter''
 index = 0 # Next forward starting index, also the length of cache.
-mems_buffers_on_GPU = False
+# mems_buffers_on_GPU = False
+torch.cuda.empty_cache()
 mems_indexs = [0, 0]
 mems_len = [(400 + 74) if limited_spatial_channel_mem else 5 * 400 + 74,
 5 * 400 + 74]
@@ -308,7 +309,8 @@ def my_filling_sequence(
 batch_size,
 mem_len,
 args.hidden_size * 2,
-dtype=next(model.parameters()).dtype
+dtype=next(model.parameters()).dtype,
+device=next(model.parameters()).device)
 for mem_len in mems_len
 ]
 
@@ -320,13 +322,13 @@ def my_filling_sequence(
 batch_size,
 mem_len,
 args.hidden_size * 2,
-dtype=next(model.parameters()).dtype
+dtype=next(model.parameters()).dtype,
+device=next(model.parameters()).device)
 for mem_len in mems_len
 ]
 guider_mems_indexs = [0, 0]
 guider_mems = None
 
-torch.cuda.empty_cache()
 # step-by-step generation
 while counter < len(seq[0]) - 1:
 # we have generated counter+1 tokens
@@ -448,34 +450,34 @@ def my_filling_sequence(
 ]
 guider_logits = guider_logits_all
 else:
-if not mems_buffers_on_GPU:
-if not mode_stage1:
-torch.cuda.empty_cache()
-for idx, mem in enumerate(mems):
-mems[idx] = mem.to(next(model.parameters()).device)
-if guider_seq is not None:
-for idx, mem in enumerate(guider_mems):
-guider_mems[idx] = mem.to(
-next(model.parameters()).device)
-else:
-torch.cuda.empty_cache()
-for idx, mem_buffer in enumerate(mems_buffers):
-mems_buffers[idx] = mem_buffer.to(
-next(model.parameters()).device)
-mems = [
-mems_buffers[id][:, :, :mems_indexs[id]]
-for id in range(2)
-]
-if guider_seq is not None:
-for idx, guider_mem_buffer in enumerate(
-guider_mems_buffers):
-guider_mems_buffers[idx] = guider_mem_buffer.to(
-next(model.parameters()).device)
-guider_mems = [
-guider_mems_buffers[id]
-[:, :, :guider_mems_indexs[id]] for id in range(2)
-]
-mems_buffers_on_GPU = True
+# if not mems_buffers_on_GPU:
+# if not mode_stage1:
+# torch.cuda.empty_cache()
+# for idx, mem in enumerate(mems):
+# mems[idx] = mem.to(next(model.parameters()).device)
+# if guider_seq is not None:
+# for idx, mem in enumerate(guider_mems):
+# guider_mems[idx] = mem.to(
+# next(model.parameters()).device)
+# else:
+# torch.cuda.empty_cache()
+# for idx, mem_buffer in enumerate(mems_buffers):
+# mems_buffers[idx] = mem_buffer.to(
+# next(model.parameters()).device)
+# mems = [
+# mems_buffers[id][:, :, :mems_indexs[id]]
+# for id in range(2)
+# ]
+# if guider_seq is not None:
+# for idx, guider_mem_buffer in enumerate(
+# guider_mems_buffers):
+# guider_mems_buffers[idx] = guider_mem_buffer.to(
+# next(model.parameters()).device)
+# guider_mems = [
+# guider_mems_buffers[id]
+# [:, :, :guider_mems_indexs[id]] for id in range(2)
+# ]
+# mems_buffers_on_GPU = True
 
 logits, *output_per_layers = model(
 input_tokens[:, index:],
@@ -513,17 +515,17 @@ def my_filling_sequence(
 o['mem_kv'][0] for o in guider_output_per_layers
 ], [o['mem_kv'][1] for o in guider_output_per_layers]
 
-if not mems_buffers_on_GPU:
-torch.cuda.empty_cache()
-for idx, mem_buffer in enumerate(mems_buffers):
-mems_buffers[idx] = mem_buffer.to(
-next(model.parameters()).device)
-if guider_seq is not None:
-for idx, guider_mem_buffer in enumerate(
-guider_mems_buffers):
-guider_mems_buffers[idx] = guider_mem_buffer.to(
-next(model.parameters()).device)
-mems_buffers_on_GPU = True
+# if not mems_buffers_on_GPU:
+# torch.cuda.empty_cache()
+# for idx, mem_buffer in enumerate(mems_buffers):
+# mems_buffers[idx] = mem_buffer.to(
+# next(model.parameters()).device)
+# if guider_seq is not None:
+# for idx, guider_mem_buffer in enumerate(
+# guider_mems_buffers):
+# guider_mems_buffers[idx] = guider_mem_buffer.to(
+# next(model.parameters()).device)
+# mems_buffers_on_GPU = True
 
 mems, mems_indexs = my_update_mems([mem_kv0, mem_kv1],
 mems_buffers, mems_indexs,
@@ -677,7 +679,7 @@ def get_default_args() -> argparse.Namespace:
 '--batch-size',
 '1',
 '--max-inference-batch-size',
-'
+'1',
 ]
 args = get_args(args_list)
 args = argparse.Namespace(**vars(args), **vars(known))
@@ -779,7 +781,7 @@ class Model:
 path = auto_create('cogview2-dsr', path=None)
 dsr = DirectSuperResolution(self.args,
 path,
-max_bz=
+max_bz=4,
 onCUDA=False)
 else:
 dsr = None
@@ -1184,7 +1186,8 @@ class Model:
 else:
 self.args.stage_1 = False
 self.args.both_stages = True
-
+
+torch.cuda.empty_cache()
 parent_given_tokens, res = self.process_stage1(
 self.model_stage1,
 text,
@@ -1231,7 +1234,7 @@ class AppModel(Model):
 
 def run_with_translation(
 self, text: str, translate: bool, seed: int,
-only_first_stage: bool,image_prompt: None) -> tuple[str | None, str | None]:
+only_first_stage: bool, image_prompt: None) -> tuple[str | None, str | None]:
 
 logger.info(f'{text=}, {translate=}, {seed=}, {only_first_stage=},{image_prompt=}')
 if translate: