kevinwang676 committed on
Commit
68fbfa8
1 Parent(s): 8167aaa

Update app.py

Files changed (1)
  1. app.py +561 -94
app.py CHANGED
@@ -5,6 +5,29 @@ from src.gradio_demo import SadTalker
 # from src.utils.text2speech import TTSTalker
 from huggingface_hub import snapshot_download
 
+import torch
+import librosa
+from scipy.io.wavfile import write
+from transformers import WavLMModel
+
+import utils
+from models import SynthesizerTrn
+from mel_processing import mel_spectrogram_torch
+from speaker_encoder.voice_encoder import SpeakerEncoder
+
+import time
+from textwrap import dedent
+
+import mdtex2html
+from loguru import logger
+from transformers import AutoModel, AutoTokenizer
+
+from tts_voice import tts_order_voice
+import edge_tts
+import tempfile
+import anyio
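+# Note: utils, models, mel_processing and speaker_encoder appear to be local
+# modules vendored from the FreeVC codebase, and tts_voice is a local voice
+# table for edge-tts; none of these are pip packages.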
+
+
 def get_source_image(image):
     return image
 
@@ -42,101 +65,545 @@ download_model()
 sad_talker = SadTalker(lazy_load=True)
 
 
-with gr.Blocks(analytics_enabled=False) as sadtalker_interface:
-    gr.Markdown("<div align='center'> <h2> 😭 SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023) </span> </h2> \
-                <a style='font-size:18px;color: #efefef' href='https://arxiv.org/abs/2211.12194'>Arxiv</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
-                <a style='font-size:18px;color: #efefef' href='https://sadtalker.github.io'>Homepage</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
-                <a style='font-size:18px;color: #efefef' href='https://github.com/Winfredy/SadTalker'> Github </div>")
-
-
-    gr.Markdown("""
-    <b>You may duplicate the space and upgrade to GPU in settings for better performance and faster inference without waiting in the queue. <a style='display:inline-block' href="https://huggingface.co/spaces/vinthony/SadTalker?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a></b> \
-    <br/><b>Alternatively, try our GitHub <a href=https://github.com/Winfredy/SadTalker> code </a> on your own GPU. </b> <a style='display:inline-block' href="https://github.com/Winfredy/SadTalker"><img src="https://img.shields.io/github/stars/Winfredy/SadTalker?style=social"/></a> \
-    """)
-
-    with gr.Row().style(equal_height=False):
-        with gr.Column(variant='panel'):
-            with gr.Tabs(elem_id="sadtalker_source_image"):
-                with gr.TabItem('Source image'):
-                    with gr.Row():
-                        source_image = gr.Image(label="Source image", source="upload", type="filepath", elem_id="img2img_image").style(width=512)
-
-
-            with gr.Tabs(elem_id="sadtalker_driven_audio"):
-                with gr.TabItem('Driving Methods'):
-
-                    with gr.Row():
-                        driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath")
-                        driven_audio_no = gr.Audio(label="Use IDLE mode, no audio is required", source="upload", type="filepath", visible=False)
-
-                        with gr.Column():
-                            use_idle_mode = gr.Checkbox(label="Use Idle Animation", visible=False)
-                            length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.", visible=False)
-                            use_idle_mode.change(toggle_audio_file, inputs=use_idle_mode, outputs=[driven_audio, driven_audio_no]) # todo
-
-                    with gr.Row():
-                        ref_video = gr.Video(label="Reference Video", source="upload", type="filepath", elem_id="vidref", visible=False).style(width=512)
-
-                        with gr.Column():
-                            use_ref_video = gr.Checkbox(label="Use Reference Video", visible=False)
-                            ref_info = gr.Radio(['pose', 'blink','pose+blink', 'all'], value='pose', label='Reference Video', info="How to borrow from reference Video?((fully transfer, aka, video driving mode))", visible=False)
-
-                        ref_video.change(ref_video_fn, inputs=ref_video, outputs=[use_ref_video]) # todo
-
-
-        with gr.Column(variant='panel'):
-            with gr.Tabs(elem_id="sadtalker_checkbox"):
-                with gr.TabItem('Settings'):
-                    with gr.Column(variant='panel'):
-                        # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width
-                        # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width
-                        with gr.Row():
-                            pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0, visible=False) #
-                            exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1, visible=False) #
-                            blink_every = gr.Checkbox(label="use eye blink", value=True, visible=False)
-
+# ChatGLM2 & FreeVC
+
+'''
+def get_wavlm():
+    os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU')
+    shutil.move('WavLM-Large.pt', 'wavlm')
+'''
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
+
+print("Loading FreeVC(24k)...")
+hps = utils.get_hparams_from_file("configs/freevc-24.json")
+freevc_24 = SynthesizerTrn(
+    hps.data.filter_length // 2 + 1,
+    hps.train.segment_size // hps.data.hop_length,
+    **hps.model).to(device)
+_ = freevc_24.eval()
+_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)
+
+print("Loading WavLM for content...")
+cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
+
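+# FreeVC pipeline: a speaker embedding (or mel spectrogram) is taken from the
+# target voice, WavLM extracts content features from the source speech, and
+# SynthesizerTrn re-renders that content in the target timbre.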
+def convert(model, src, tgt):
+    with torch.no_grad():
+        # tgt
+        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
+        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
+        if model == "FreeVC" or model == "FreeVC (24kHz)":
+            g_tgt = smodel.embed_utterance(wav_tgt)
+            g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
+        else:
+            wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
+            mel_tgt = mel_spectrogram_torch(
+                wav_tgt,
+                hps.data.filter_length,
+                hps.data.n_mel_channels,
+                hps.data.sampling_rate,
+                hps.data.hop_length,
+                hps.data.win_length,
+                hps.data.mel_fmin,
+                hps.data.mel_fmax
+            )
+        # src
+        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
+        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
+        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
+        # infer
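+        # Note: only freevc_24 is loaded in this Space. The "FreeVC" and
+        # "FreeVC-s" branches below reference models (freevc, freevc_s) that
+        # are never initialized here; the hidden model dropdown is pinned to
+        # "FreeVC (24kHz)", so only the final branch is reachable in practice.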
+        if model == "FreeVC":
+            audio = freevc.infer(c, g=g_tgt)
+        elif model == "FreeVC-s":
+            audio = freevc_s.infer(c, mel=mel_tgt)
+        else:
+            audio = freevc_24.infer(c, g=g_tgt)
+        audio = audio[0][0].data.cpu().float().numpy()
+        if model == "FreeVC" or model == "FreeVC-s":
+            write("out.wav", hps.data.sampling_rate, audio)
+        else:
+            write("out.wav", 24000, audio)
+        out = "out.wav"
+        return out
+
+# GLM2
+
+language_dict = tts_order_voice
+
+# fix timezone in Linux
+os.environ["TZ"] = "Asia/Shanghai"
+try:
+    time.tzset()  # type: ignore # pylint: disable=no-member
+except Exception:
+    # Windows
+    logger.warning("Windows, can't run time.tzset()")
+
+# model_name = "THUDM/chatglm2-6b"
+model_name = "THUDM/chatglm2-6b-int4"
+
+RETRY_FLAG = False
+
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+# model = AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda()
+
+# 4/8 bit
+# model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).quantize(4).cuda()
+
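+# THUDM/chatglm2-6b-int4 ships 4-bit quantized weights: half precision on GPU
+# (~4 GB per the note below), float32 on the CPU fallback, where half precision
+# is typically very slow or unsupported.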
+has_cuda = torch.cuda.is_available()
+
+# has_cuda = False  # force cpu
+
+if has_cuda:
+    model_glm = (
+        AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda().half()
+    )  # 3.92G
+else:
+    model_glm = AutoModel.from_pretrained(
+        model_name, trust_remote_code=True
+    ).float()  # .float() .half().float()
+
+model_glm = model_glm.eval()
+
+_ = """Override Chatbot.postprocess"""
+
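+# Monkey-patch: replace gr.Chatbot.postprocess so chat messages are rendered
+# as HTML (markdown + LaTeX) via mdtex2html rather than as plain text.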
+def postprocess(self, y):
+    if y is None:
+        return []
+    for i, (message, response) in enumerate(y):
+        y[i] = (
+            None if message is None else mdtex2html.convert(message),
+            None if response is None else mdtex2html.convert(response),
+        )
+    return y
+
+
+gr.Chatbot.postprocess = postprocess
+
+
+def parse_text(text):
+    """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
+    lines = text.split("\n")
+    lines = [line for line in lines if line != ""]
+    count = 0
+    for i, line in enumerate(lines):
+        if "```" in line:
+            count += 1
+            items = line.split("`")
+            if count % 2 == 1:
+                lines[i] = f'<pre><code class="language-{items[-1]}">'
+            else:
+                lines[i] = "<br></code></pre>"
+        else:
+            if i > 0:
+                if count % 2 == 1:
+                    line = line.replace("`", r"\`")
+                    line = line.replace("<", "&lt;")
+                    line = line.replace(">", "&gt;")
+                    line = line.replace(" ", "&nbsp;")
+                    line = line.replace("*", "&ast;")
+                    line = line.replace("_", "&lowbar;")
+                    line = line.replace("-", "&#45;")
+                    line = line.replace(".", "&#46;")
+                    line = line.replace("!", "&#33;")
+                    line = line.replace("(", "&#40;")
+                    line = line.replace(")", "&#41;")
+                    line = line.replace("$", "&#36;")
+                lines[i] = "<br>" + line
+    text = "".join(lines)
+    return text
+
+
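+# Streaming chat: predict() is a generator. Each partial response from
+# model_glm.stream_chat is re-yielded so the Chatbot, the history and KV-cache
+# states, and the editable "latest answer" textbox all update incrementally.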
+def predict(
+    RETRY_FLAG, input, chatbot, max_length, top_p, temperature, history, past_key_values
+):
+    try:
+        chatbot.append((parse_text(input), ""))
+    except Exception as exc:
+        logger.error(exc)
+        logger.debug(f"{chatbot=}")
+        _ = """
+        if chatbot:
+            chatbot[-1] = (parse_text(input), str(exc))
+            yield chatbot, history, past_key_values
+        # """
+        yield chatbot, history, past_key_values, ""  # four values, matching the outputs wired below
+
+    for response, history, past_key_values in model_glm.stream_chat(
+        tokenizer,
+        input,
+        history,
+        past_key_values=past_key_values,
+        return_past_key_values=True,
+        max_length=max_length,
+        top_p=top_p,
+        temperature=temperature,
+    ):
+        chatbot[-1] = (parse_text(input), parse_text(response))
+        # chatbot[-1][-1] = parse_text(response)
+
+        yield chatbot, history, past_key_values, parse_text(response)
+
+
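+# trans_api backs the hidden "tr" API endpoint below; out-of-range sampling
+# parameters are reset to sane defaults before calling model_glm.chat.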
+def trans_api(input, max_length=4096, top_p=0.8, temperature=0.2):
+    if max_length < 10:
+        max_length = 4096
+    if top_p < 0.1 or top_p > 1:
+        top_p = 0.85
+    if temperature <= 0 or temperature > 1:
+        temperature = 0.01
+    try:
+        res, _ = model_glm.chat(
+            tokenizer,
+            input,
+            history=[],
+            past_key_values=None,
+            max_length=max_length,
+            top_p=top_p,
+            temperature=temperature,
+        )
+        # logger.debug(f"{res=} \n{_=}")
+    except Exception as exc:
+        logger.error(f"{exc=}")
+        res = str(exc)
+
+    return res
+
+
+def reset_user_input():
+    return gr.update(value="")
+
+
+def reset_state():
+    return [], [], None, ""
+
+
+# Delete last turn
+def delete_last_turn(chat, history):
+    if chat and history:
+        chat.pop(-1)
+        history.pop(-1)
+    return chat, history
+
+
+# Regenerate response
+def retry_last_answer(
+    user_input, chatbot, max_length, top_p, temperature, history, past_key_values
+):
+    if chatbot and history:
+        # Removing the previous conversation from chat
+        chatbot.pop(-1)
+        # Setting up a flag to capture a retry
+        RETRY_FLAG = True
+        # Getting last message from user
+        user_input = history[-1][0]
+        # Removing bot response from the history
+        history.pop(-1)
+
+    yield from predict(
+        RETRY_FLAG,  # type: ignore
+        user_input,
+        chatbot,
+        max_length,
+        top_p,
+        temperature,
+        history,
+        past_key_values,
+    )
+
+
+# print
+def print(text):
+    # NB: deliberately shadows the builtin print; apparently just a
+    # passthrough helper for Gradio wiring.
+    return text
+
+# TTS
+
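+# edge_tts is async; Gradio can drive the coroutine directly. delete=False
+# keeps the temporary .mp3 around after the handle closes, so communicate.save()
+# can write it and Gradio can read it back as a filepath.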
+async def text_to_speech_edge(text, language_code):
+    voice = language_dict[language_code]
+    communicate = edge_tts.Communicate(text, voice)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        tmp_path = tmp_file.name
+
+    await communicate.save(tmp_path)
+
+    return tmp_path
+
+
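+# UI: two tabs. The chat tab chains ChatGLM2 (text), edge-tts (speech) and
+# FreeVC (voice cloning); the video tab reuses the SadTalker widgets so the
+# generated audio can drive a talking-face video.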
+with gr.Blocks(title="ChatGLM2-6B-int4", theme=gr.themes.Soft(text_size="sm")) as demo:
+    gr.HTML("<center>"
+            "<h1>🥳💕🎶 - ChatGLM2 + 声音克隆:和你喜欢的角色畅所欲言吧!</h1>"
+            "</center>")
+    gr.Markdown("## <center>💡 - 第二代ChatGLM大语言模型 + FreeVC变声,为您打造独一无二的沉浸式对话体验,支持中英双语</center>")
+    gr.Markdown("## <center>🌊 - 更多精彩应用,尽在[滔滔AI](http://www.talktalkai.com);滔滔AI,为爱滔滔!💕</center>")
+    gr.Markdown("### <center>⭐ - 如果您喜欢这个程序,欢迎给我的[Github项目](https://github.com/KevinWang676/ChatGLM2-Voice-Cloning)点赞支持!</center>")
+
+    with gr.Tab("🍻 - ChatGLM2聊天区"):
+        with gr.Accordion("📒 相关信息", open=False):
+            _ = f""" ChatGLM2的可选参数信息:
+            * Low temperature: responses will be more deterministic and focused; High temperature: responses more creative.
+            * Suggested temperatures -- translation: up to 0.3; chatting: > 0.4
+            * Top P controls dynamic vocabulary selection based on context.\n
+            如果您想让ChatGLM2进行角色扮演并与之对话,请先输入恰当的提示词,如“请你扮演成动漫角色蜡笔小新并和我进行对话”;您也可以为ChatGLM2提供自定义的角色设定\n
+            当您使用声音克隆功能时,请先在此程序的对应位置上传一段您喜欢的音频
+            """
+            gr.Markdown(dedent(_))
+        chatbot = gr.Chatbot(height=300)
+        with gr.Row():
+            with gr.Column(scale=4):
+                with gr.Column(scale=12):
+                    user_input = gr.Textbox(
+                        label="请在此处和GLM2聊天 (按回车键即可发送)",
+                        placeholder="聊点什么吧",
+                    )
+                    RETRY_FLAG = gr.Checkbox(value=False, visible=False)
+                with gr.Column(min_width=32, scale=1):
+                    with gr.Row():
+                        submitBtn = gr.Button("开始和GLM2交流吧", variant="primary")
+                        deleteBtn = gr.Button("删除最新一轮对话", variant="secondary")
+                        retryBtn = gr.Button("重新生成最新一轮对话", variant="secondary")
+
+        with gr.Accordion("🔧 更多设置", open=False):
+            with gr.Row():
+                emptyBtn = gr.Button("清空所有聊天记录")
+                max_length = gr.Slider(
+                    0,
+                    32768,
+                    value=8192,
+                    step=1.0,
+                    label="Maximum length",
+                    interactive=True,
+                )
+                top_p = gr.Slider(
+                    0, 1, value=0.85, step=0.01, label="Top P", interactive=True
+                )
+                temperature = gr.Slider(
+                    0.01, 1, value=0.95, step=0.01, label="Temperature", interactive=True
+                )
+
+        with gr.Row():
+            test1 = gr.Textbox(label="GLM2的最新回答 (可编辑)", lines=3)
+            with gr.Column():
+                language = gr.Dropdown(choices=list(language_dict.keys()), value="普通话 (中国大陆)-Xiaoxiao-女", label="请选择文本对应的语言及您喜欢的说话人")
+                tts_btn = gr.Button("生成对应的音频吧", variant="primary")
+                output_audio = gr.Audio(type="filepath", label="为您生成的音频", interactive=False)
+
+        tts_btn.click(text_to_speech_edge, inputs=[test1, language], outputs=[output_audio])
+
+        with gr.Row():
+            model_choice = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC (24kHz)", label="Model", visible=False)
+            audio1 = output_audio
+            audio2 = gr.Audio(label="请上传您喜欢的声音进行声音克隆", type='filepath')
+            clone_btn = gr.Button("开始AI声音克隆吧", variant="primary")
+            audio_cloned = gr.Audio(label="为您生成的专属声音克隆音频", type='filepath')
+
+        clone_btn.click(convert, inputs=[model_choice, audio1, audio2], outputs=[audio_cloned])
+
+        history = gr.State([])
+        past_key_values = gr.State(None)
+
+        user_input.submit(
+            predict,
+            [
+                RETRY_FLAG,
+                user_input,
+                chatbot,
+                max_length,
+                top_p,
+                temperature,
+                history,
+                past_key_values,
+            ],
+            [chatbot, history, past_key_values, test1],
+            show_progress="full",
+        )
+        submitBtn.click(
+            predict,
+            [
+                RETRY_FLAG,
+                user_input,
+                chatbot,
+                max_length,
+                top_p,
+                temperature,
+                history,
+                past_key_values,
+            ],
+            [chatbot, history, past_key_values, test1],
+            show_progress="full",
+            api_name="predict",
+        )
+        submitBtn.click(reset_user_input, [], [user_input])
+
+        emptyBtn.click(
+            reset_state, outputs=[chatbot, history, past_key_values, test1], show_progress="full"
+        )
+
+        retryBtn.click(
+            retry_last_answer,
+            inputs=[
+                user_input,
+                chatbot,
+                max_length,
+                top_p,
+                temperature,
+                history,
+                past_key_values,
+            ],
+            # outputs = [chatbot, history, last_user_message, user_message]
+            outputs=[chatbot, history, past_key_values, test1],
+        )
+        deleteBtn.click(delete_last_turn, [chatbot, history], [chatbot, history])
+
+        with gr.Accordion("📔 提示词示例", open=False):
+            etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
+            examples = gr.Examples(
+                examples=[
+                    ["Explain the plot of Cinderella in a sentence."],
+                    [
+                        "How long does it take to become proficient in French, and what are the best methods for retaining information?"
+                    ],
+                    ["What are some common mistakes to avoid when writing code?"],
+                    ["Build a prompt to generate a beautiful portrait of a horse"],
+                    ["Suggest four metaphors to describe the benefits of AI"],
+                    ["Write a pop song about leaving home for the sandy beaches."],
+                    ["Write a summary demonstrating my ability to tame lions"],
+                    ["鲁迅和周树人什么关系"],
+                    ["从前有一头牛,这头牛后面有什么?"],
+                    ["正无穷大加一大于正无穷大吗?"],
+                    ["正无穷大加正无穷大大于正无穷大吗?"],
+                    ["-2的平方根等于什么"],
+                    ["树上有5只鸟,猎人开枪打死了一只。树上还有几只鸟?"],
+                    ["树上有11只鸟,猎人开枪打死了一只。树上还有几只鸟?提示:需考虑鸟可能受惊吓飞走。"],
+                    ["鲁迅和周树人什么关系 用英文回答"],
+                    ["以红楼梦的行文风格写一张委婉的请假条。不少于320字。"],
+                    [f"{etext} 翻成中文,列出3个版本"],
+                    [f"{etext} \n 翻成中文,保留原意,但使用文学性的语言。不要写解释。列出3个版本"],
+                    ["js 判断一个数是不是质数"],
+                    ["js 实现python 的 range(10)"],
+                    ["js 实现python 的 [*(range(10)]"],
+                    ["假定 1 + 2 = 4, 试求 7 + 8"],
+                    ["Erkläre die Handlung von Cinderella in einem Satz."],
+                    ["Erkläre die Handlung von Cinderella in einem Satz. Auf Deutsch"],
+                ],
+                inputs=[user_input],
+                examples_per_page=30,
+            )
+
+        with gr.Accordion("For Chat/Translation API", open=False, visible=False):
+            input_text = gr.Text()
+            tr_btn = gr.Button("Go", variant="primary")
+            out_text = gr.Text()
+            tr_btn.click(
+                trans_api,
+                [input_text, max_length, top_p, temperature],
+                out_text,
+                # show_progress="full",
+                api_name="tr",
+            )
+            _ = """
+            input_text.submit(
+                trans_api,
+                [input_text, max_length, top_p, temperature],
+                out_text,
+                show_progress="full",
+                api_name="tr1",
+            )
+            # """
+    with gr.Tab("📺 - 视频聊天区"):
+        with gr.Row().style(equal_height=False):
+            with gr.Column(variant='panel'):
+                with gr.Tabs(elem_id="sadtalker_source_image"):
+                    with gr.TabItem('Source image'):
                         with gr.Row():
-                            size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?", visible=False) #
-                            preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?", visible=False)
-
+                            source_image = gr.Image(label="Source image", source="upload", type="filepath", elem_id="img2img_image").style(width=512)
+
+
+                with gr.Tabs(elem_id="sadtalker_driven_audio"):
+                    with gr.TabItem('Driving Methods'):
+
                         with gr.Row():
-                            is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)", value=True)
-                            facerender = gr.Radio(['facevid2vid','pirender'], value='facevid2vid', label='facerender', info="which face render?", visible=False)
-
+                            driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath")
+                            driven_audio_no = gr.Audio(label="Use IDLE mode, no audio is required", source="upload", type="filepath", visible=False)
+
+                            with gr.Column():
+                                use_idle_mode = gr.Checkbox(label="Use Idle Animation", visible=False)
+                                length_of_audio = gr.Number(value=5, label="The length (seconds) of the generated video.", visible=False)
+                                use_idle_mode.change(toggle_audio_file, inputs=use_idle_mode, outputs=[driven_audio, driven_audio_no])  # todo
+
                         with gr.Row():
-                            batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=2)
-                            enhancer = gr.Checkbox(label="GFPGAN as Face enhancer", value=True, visible=False)
-
-                        submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
-
-                with gr.Tabs(elem_id="sadtalker_genearted"):
-                    gen_video = gr.Video(label="Generated video", format="mp4").style(width=256)
-
-
-
-    submit.click(
-        fn=sad_talker.test,
-        inputs=[source_image,
-                driven_audio,
-                preprocess_type,
-                is_still_mode,
-                enhancer,
-                batch_size,
-                size_of_image,
-                pose_style,
-                facerender,
-                exp_weight,
-                use_ref_video,
-                ref_video,
-                ref_info,
-                use_idle_mode,
-                length_of_audio,
-                blink_every
-                ],
-        outputs=[gen_video]
-    )
-
-
-sadtalker_interface.queue().launch(debug=True)
-
+                            ref_video = gr.Video(label="Reference Video", source="upload", type="filepath", elem_id="vidref", visible=False).style(width=512)
+
+                            with gr.Column():
+                                use_ref_video = gr.Checkbox(label="Use Reference Video", visible=False)
+                                ref_info = gr.Radio(['pose', 'blink','pose+blink', 'all'], value='pose', label='Reference Video', info="How to borrow from reference Video? (fully transfer, aka, video driving mode)", visible=False)
+
+                            ref_video.change(ref_video_fn, inputs=ref_video, outputs=[use_ref_video])  # todo
+
+
+            with gr.Column(variant='panel'):
+                with gr.Tabs(elem_id="sadtalker_checkbox"):
+                    with gr.TabItem('Settings'):
+                        with gr.Column(variant='panel'):
+                            # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width
+                            # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width
+                            with gr.Row():
+                                pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0, visible=False)
+                                exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1, visible=False)
+                                blink_every = gr.Checkbox(label="use eye blink", value=True, visible=False)
+
+                            with gr.Row():
+                                size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?", visible=False)
+                                preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?", visible=False)
+
+                            with gr.Row():
+                                is_still_mode = gr.Checkbox(label="Still Mode (less head motion; works with preprocess `full`)", value=True)
+                                facerender = gr.Radio(['facevid2vid','pirender'], value='facevid2vid', label='facerender', info="which face render?", visible=False)
+
+                            with gr.Row():
+                                batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=2)
+                                enhancer = gr.Checkbox(label="GFPGAN as Face enhancer", value=True, visible=False)
+
+                            submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
+
+                with gr.Tabs(elem_id="sadtalker_genearted"):
+                    gen_video = gr.Video(label="Generated video", format="mp4").style(width=256)
+
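+        # Positional wiring: this inputs list must stay in the exact order of
+        # SadTalker.test's parameters, since Gradio passes them positionally.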
+        submit.click(
+            fn=sad_talker.test,
+            inputs=[source_image,
+                    driven_audio,
+                    preprocess_type,
+                    is_still_mode,
+                    enhancer,
+                    batch_size,
+                    size_of_image,
+                    pose_style,
+                    facerender,
+                    exp_weight,
+                    use_ref_video,
+                    ref_video,
+                    ref_info,
+                    use_idle_mode,
+                    length_of_audio,
+                    blink_every
+                    ],
+            outputs=[gen_video]
+        )
+    gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。</center>")
+    gr.Markdown("<center>💡 - 如何使用此程序:输入您对ChatGLM的提问后,依次点击“开始和GLM2交流吧”、“生成对应的音频吧”、“开始AI声音克隆吧”三个按键即可;使用声音克隆功能时,请先上传一段您喜欢的音频</center>")
+    gr.HTML('''
+    <div class="footer">
+        <p>🌊🏞️🎶 - 江水东流急,滔滔无尽声。 明·顾璘
+        </p>
+    </div>
+    ''')
+
+
+demo.queue().launch(show_error=True, debug=True)
 