Yijun-Yang committed on
Commit
3702bb5
1 Parent(s): dbe9ffc

pythonversion

.ipynb_checkpoints/app-checkpoint.py ADDED
@@ -0,0 +1,644 @@
+ import argparse
+ import json
+ import time
+ import os
+ import glob
+ import random
+ import shutil
+ from enum import Enum
+ from threading import Thread
+ from multiprocessing import Process, Value
+
+ import gradio as gr
+ import pytoml
+ from loguru import logger
+ import spaces
+
+ from huixiangdou.service import Worker, llm_serve, ArticleRetrieval, CacheRetriever, FeatureStore, FileOperation
+
+ class PARAM_CODE(Enum):
+     """Parameter code."""
+     SUCCESS = 0
+     FAILED = 1
+     ERROR = 2
+
+ def parse_args():
+     """Parse command-line args."""
+     parser = argparse.ArgumentParser(description='Worker.')
+     parser.add_argument('--work_dir',
+                         type=str,
+                         default='workdir',
+                         help='Working directory.')
+     parser.add_argument('--repo_dir',
+                         type=str,
+                         default='repodir',
+                         help='Repository directory.')
+     parser.add_argument(
+         '--config_path',
+         default='config.ini',
+         type=str,
+         help='Worker configuration path. Default value is config.ini')
+     parser.add_argument('--standalone',
+                         action='store_true',
+                         default=True,
+                         help='Auto deploy required Hybrid LLM Service.')
+     args = parser.parse_args()
+     return args
+
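+ # NOTE: because --standalone combines action='store_true' with default=True,
+ # args.standalone is always True, so the hybrid LLM service is always deployed.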
+ def update_remote_buttons(remote):
+     if remote:
+         return [
+             gr.Markdown("[如何配置API](https://github.com/jabberwockyang/MedicalReviewAgent/blob/main/README.md)",
+                         visible=True),
+             gr.Dropdown(["kimi", "deepseek", "zhipuai", "gpt"],
+                         label="选择大模型提供商",
+                         interactive=True, visible=True),
+             gr.Textbox(label="您的API", lines=1,
+                        interactive=True, visible=True),
+             gr.Dropdown([], label="选择模型",
+                         interactive=True, visible=True)
+         ]
+     else:
+         return [
+             gr.Markdown("[如何配置API](https://github.com/jabberwockyang/MedicalReviewAgent/blob/main/README.md)",
+                         visible=False),
+             gr.Dropdown(["kimi", "deepseek", "zhipuai", "gpt"],
+                         label="选择大模型提供商",
+                         interactive=False, visible=False),
+             gr.Textbox(label="您的API", lines=1,
+                        interactive=False, visible=False),
+             gr.Dropdown([], label="选择模型",
+                         interactive=False, visible=False)
+         ]
+
+ def update_model_dropdown(remote_company):
+     model_choices = {
+         'kimi': ['moonshot-v1-128k'],
+         'deepseek': ['deepseek-chat'],
+         'zhipuai': ['glm-4'],
+         'gpt': ['gpt-4-32k-0613', 'gpt-3.5-turbo']
+     }
+     return gr.Dropdown(choices=model_choices[remote_company])
+
+ def update_remote_config(remote_ornot, remote_company=None, api=None, model=None):
+     with open(CONFIG_PATH, encoding='utf8') as f:
+         config = pytoml.load(f)
+
+     if remote_ornot:
+         if remote_company is None or api is None or model is None:
+             raise ValueError('remote_company, api and model must all be provided')
+         config['llm']['enable_local'] = 0
+         config['llm']['enable_remote'] = 1
+         config['llm']['server']['remote_type'] = remote_company
+         config['llm']['server']['remote_api_key'] = api
+         config['llm']['server']['remote_llm_model'] = model
+     else:
+         config['llm']['enable_local'] = 1
+         config['llm']['enable_remote'] = 0
+     with open(CONFIG_PATH, 'w') as f:
+         pytoml.dump(config, f)
+     return gr.Button("配置已保存")
+
+ @spaces.GPU
+ def get_ready(query: str, chunksize=None, k=None):
+     """Load the config and build the objects a given stage needs.
+
+     Returns (repodir, workdir, config) for 'repo_work',
+     (clusterdir, samples, assistant, theme) for 'annotation',
+     (clusterdir, annoresult, assistant, theme) for 'inspiration',
+     and (assistant, theme) for 'summarize'.
+     """
+     with open(CONFIG_PATH, encoding='utf8') as f:
+         config = pytoml.load(f)
+     workdir = config['feature_store']['work_dir']
+     repodir = config['feature_store']['repo_dir']
+
+     if query == 'repo_work':  # no assistant needed
+         return repodir, workdir, config
+     theme = ''
+     try:
+         with open(os.path.join(repodir, 'config.json'), 'r') as f:
+             repo_config = json.load(f)
+         theme = ' '.join(repo_config['keywords'])
+     except (FileNotFoundError, json.JSONDecodeError, KeyError):
+         pass
+
+     if query == 'annotation':
+         if not chunksize or not k:
+             raise ValueError('chunksize or k not provided')
+         chunkdir = os.path.join(workdir, f'chunksize_{chunksize}')
+         clusterdir = os.path.join(chunkdir, 'cluster_features', f'cluster_features_{k}')
+         assistant = Worker(work_dir=chunkdir, config_path=CONFIG_PATH, language='en')
+         samples_json = os.path.join(clusterdir, 'samples.json')
+         with open(samples_json, 'r') as f:
+             samples = json.load(f)
+         return clusterdir, samples, assistant, theme
+
+     elif query == 'inspiration':
+         if not chunksize or not k:
+             raise ValueError('chunksize or k not provided')
+
+         chunkdir = os.path.join(workdir, f'chunksize_{chunksize}')
+         clusterdir = os.path.join(chunkdir, 'cluster_features', f'cluster_features_{k}')
+         assistant = Worker(work_dir=chunkdir, config_path=CONFIG_PATH, language='en')
+         annofile = os.path.join(clusterdir, 'annotation.jsonl')
+         with open(annofile, 'r') as f:
+             annoresult = f.readlines()
+         annoresult = [json.loads(obj) for obj in annoresult]
+         return clusterdir, annoresult, assistant, theme
+     elif query == 'summarize':  # k is not needed here
+         if not chunksize:
+             raise ValueError('chunksize not provided')
+         chunkdir = os.path.join(workdir, f'chunksize_{chunksize}')
+         assistant = Worker(work_dir=chunkdir, config_path=CONFIG_PATH, language='en')
+         return assistant, theme
+
+     else:
+         raise ValueError('query not recognized')
+
+ def update_repo_info():
+     with open(CONFIG_PATH, encoding='utf8') as f:
+         config = pytoml.load(f)
+     repodir = config['feature_store']['repo_dir']
+     if os.path.exists(repodir):
+         pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
+         number_of_pdf = len(pdffiles)
+         if os.path.exists(os.path.join(repodir, 'config.json')):
+             with open(os.path.join(repodir, 'config.json'), 'r') as f:
+                 repo_config = json.load(f)
+             keywords = repo_config['keywords']
+             length = repo_config['len']
+             retmax = repo_config['retmax']
+             return keywords, length, retmax, number_of_pdf
+         else:
+             return None, None, None, number_of_pdf
+     else:
+         return None, None, None, None
+
+ def upload_file(files):
+     repodir, workdir, _ = get_ready('repo_work')
+     if not os.path.exists(repodir):
+         os.makedirs(repodir)
+
+     for file in files:
+         destination_path = os.path.join(repodir, os.path.basename(file.name))
+         shutil.copy(file.name, destination_path)
+
+     return files
+
+ def generate_articles_repo(keywords: str, retmax: int):
+     keys = [k.strip() for k in keywords.split('\n')]
+     repodir, _, _ = get_ready('repo_work')
+
+     article_finder = ArticleRetrieval(keywords=keys,
+                                       repo_dir=repodir,
+                                       retmax=retmax)
+     article_finder.initiallize()  # sic: 'initiallize' is the upstream ArticleRetrieval method name
+     return update_repo()
+
+ def delete_articles_repo():
+     # Delete the article repository and any databases derived from it.
+     repodir, workdir, _ = get_ready('repo_work')
+     if os.path.exists(repodir):
+         shutil.rmtree(repodir)
+     if os.path.exists(workdir):
+         shutil.rmtree(workdir)
+
+     return gr.Textbox(label="文献库概况", lines=3,
+                       value='文献库和相关数据库已删除',
+                       visible=True)
+
+ def update_repo():
+     keys, length, retmax, pdflen = update_repo_info()  # avoid shadowing built-in len()
+     if keys:
+         newinfo = f"搜索得到文献:\n 关键词:{keys}\n 文献数量:{length}\n 获取上限:{retmax}\n\n上传文献:\n 数量:{pdflen}"
+     else:
+         if pdflen:
+             newinfo = f'搜索得到文献:无\n上传文献:\n 数量:{pdflen}'
+         else:
+             newinfo = '目前还没有文献库'
+
+     return gr.Textbox(label="文献库概况", lines=1,
+                       value=newinfo,
+                       visible=True)
+
+ def update_database_info():
+     with open(CONFIG_PATH, encoding='utf8') as f:
+         config = pytoml.load(f)
+     workdir = config['feature_store']['work_dir']
+     chunkdirs = glob.glob(os.path.join(workdir, 'chunksize_*'))
+     chunkdirs.sort()
+     list_of_chunksize = [int(chunkdir.split('_')[-1]) for chunkdir in chunkdirs]
+     jsonobj = {}
+     for chunkdir in chunkdirs:
+         k_dir = glob.glob(os.path.join(chunkdir, 'cluster_features', 'cluster_features_*'))
+         k_dir.sort()
+         list_of_k = [int(k.split('_')[-1]) for k in k_dir]
+         jsonobj[int(chunkdir.split('_')[-1])] = list_of_k
+
+     new_options = [f"chunksize:{chunksize}, k:{k}" for chunksize in list_of_chunksize for k in jsonobj[chunksize]]
+
+     return new_options, jsonobj
+
+ @spaces.GPU
+ def generate_database(chunksize: int, nclusters: str | list[str]):
+     # Build the vector database from the article repository.
+     repodir, workdir, _ = get_ready('repo_work')
+     if not os.path.exists(repodir):
+         return gr.Textbox(label="数据库已生成", value='请先生成文献库', visible=True)
+     nclusters = [int(i) for i in nclusters]
+     # Overwrite/delete policy is still undecided:
+     # in theory the article repo is built only once, so rebuilding it should drop the old repo and databases;
+     # databases can be regenerated from the repo many times, so they are not deleted here; repeated runs
+     # simply recompute and overwrite (no compute-saving logic yet, to be optimized later);
+     # different chunksize/ncluster combinations live in separate folders and never overwrite each other.
+     # if os.path.exists(workdir):
+     #     shutil.rmtree(workdir)
+
+     cache = CacheRetriever(config_path=CONFIG_PATH)
+     fs_init = FeatureStore(embeddings=cache.embeddings,
+                            reranker=cache.reranker,
+                            chunk_size=chunksize,
+                            n_clusters=nclusters,
+                            config_path=CONFIG_PATH)
+
+     # walk all files in repo dir
+     file_opr = FileOperation()
+     files = file_opr.scan_dir(repo_dir=repodir)
+     fs_init.initialize(files=files, work_dir=workdir, file_opr=file_opr)
+     file_opr.summarize(files)
+     del fs_init
+     cache.pop('default')
+     texts, _ = update_database_info()
+     return gr.Textbox(label="数据库概况", value='\n'.join(texts), visible=True)
+
+ def delete_database():
+     _, workdir, _ = get_ready('repo_work')
+     if os.path.exists(workdir):
+         shutil.rmtree(workdir)
+     return gr.Textbox(label="数据库概况", lines=3, value='数据库已删除', visible=True)
+
+ def update_database_textbox():
+     texts, _ = update_database_info()
+     if not texts:
+         return gr.Textbox(label="数据库概况", value='目前还没有数据库', visible=True)
+     else:
+         return gr.Textbox(label="数据库概况", value='\n'.join(texts), visible=True)
+
+ def update_chunksize_dropdown():
+     _, jsonobj = update_database_info()
+     return gr.Dropdown(choices=list(jsonobj.keys()))
+
+ def update_ncluster_dropdown(chunksize: int):
+     _, jsonobj = update_database_info()
+     nclusters = jsonobj[chunksize]
+     return gr.Dropdown(choices=nclusters)
+
+ @spaces.GPU
+ def annotation(n, chunksize: int, nclusters: int, remote_ornot: bool):
+     """Use the LLM to annotate clusters.
+
+     n: fraction of clusters to annotate (0-1).
+     """
+     query = 'annotation'
+     if remote_ornot:
+         backend = 'remote'
+     else:
+         backend = 'local'
+
+     clusterdir, samples, assistant, theme = get_ready('annotation', chunksize, nclusters)
+     new_obj_list = []
+     n = round(n * len(samples.keys()))
+     # random.sample() requires a sequence; dict views are no longer accepted as of Python 3.11
+     for cluster_no in random.sample(list(samples.keys()), n):
+         chunk = '\n'.join(samples[cluster_no]['samples'][:10])
+
+         code, reply, cluster_no = assistant.annotate_cluster(
+             theme=theme,
+             cluster_no=cluster_no,
+             chunk=chunk,
+             history=[],
+             groupname='',
+             backend=backend)
+         references = f"cluster_no: {cluster_no}"
+         new_obj = {
+             'cluster_no': cluster_no,
+             'chunk': chunk,
+             'annotation': reply
+         }
+         new_obj_list.append(new_obj)
+         logger.info(f'{code}, {query}, {reply}, {references}')
+
+         with open(os.path.join(clusterdir, 'annotation.jsonl'), 'a') as f:
+             json.dump(new_obj, f, ensure_ascii=False)
+             f.write('\n')
+
+     return '\n\n'.join([obj['annotation'] for obj in new_obj_list])
+
+ @spaces.GPU
+ def inspiration(annotation: str, chunksize: int, nclusters: int, remote_ornot: bool):
+     query = 'inspiration'
+     if remote_ornot:
+         backend = 'remote'
+     else:
+         backend = 'local'
+
+     clusterdir, annoresult, assistant, theme = get_ready('inspiration', chunksize, nclusters)
+     new_obj_list = []
+
+     if annotation is not None:  # the user wants inspiration from specific clusters only
+         annoresult = [obj for obj in annoresult if obj['annotation'] in [txt.strip() for txt in annotation.split('\n')]]
+
+     for index in random.sample(range(len(annoresult)), min(5, len(annoresult))):
+         cluster_no = annoresult[index]['cluster_no']
+         chunks = annoresult[index]['annotation']
+
+         code, reply = assistant.getinspiration(
+             theme=theme,
+             annotations=chunks,
+             history=[],
+             groupname='', backend=backend)
+         new_obj = {
+             'inspiration': reply,
+             'cluster_no': cluster_no
+         }
+         new_obj_list.append(new_obj)
+         logger.info(f'{code}, {query}, {cluster_no}, {reply}')
+
+         with open(os.path.join(clusterdir, 'inspiration.jsonl'), 'a') as f:
+             json.dump(new_obj, f, ensure_ascii=False)
+             f.write('\n')  # keep one JSON object per line
+         with open(os.path.join(clusterdir, 'inspiration.txt'), 'a') as f:
+             f.write(f'{reply}\n')
+
+     return '\n\n'.join(list(set([obj['inspiration'] for obj in new_obj_list])))
+
+ def getpmcurls(references):
+     urls = []
+     for ref in references:
+         if ref.startswith('PMC'):
+             refid = ref.replace('.txt', '')
+             urls.append(f'https://www.ncbi.nlm.nih.gov/pmc/articles/{refid}/')
+         else:
+             urls.append(ref)
+     return urls
+
+ @spaces.GPU
+ def summarize_text(query, chunksize: int, remote_ornot: bool):
+     if remote_ornot:
+         backend = 'remote'
+     else:
+         backend = 'local'
+
+     assistant, _ = get_ready('summarize', chunksize=chunksize, k=None)
+     code, reply, references = assistant.generate(query=query,
+                                                  history=[],
+                                                  groupname='', backend=backend)
+
+     logger.info(f'{code}, {query}, {reply}, {references}')
+     urls = getpmcurls(references)
+     mds = '\n'.join([f'[{ref}]({url})' for ref, url in zip(references, urls)])
+     return reply, gr.Markdown(label="参考文献", value=mds)
+
+ def main_interface():
+     with gr.Blocks() as demo:
+         with gr.Row():
+             gr.Markdown(
+                 """
+                 # 医学文献综述助手 (又名 不想看文献)
+                 """
+             )
+
+         with gr.Tab("模型服务配置"):
+             gr.Markdown("""
+             #### 配置模型服务 🛠️
+
+             1. **是否使用远程大模型**
+                - 勾选此项,如果你想使用远程的大模型服务。
+                - 如果不勾选,将默认使用本地模型服务。
+
+             2. **API配置**
+                - 配置大模型提供商和API,确保模型服务能够正常运行。
+                - 提供商选择:kimi、deepseek、zhipuai、gpt。
+                - 输入您的API密钥和选择对应模型。
+                - 点击“保存配置”按钮以保存您的设置。
+
+             📝 **备注**:请参考[如何使用](https://github.com/jabberwockyang/MedicalReviewAgent/blob/main/README.md)获取更多信息。
+
+             """)
+
+             remote_ornot = gr.Checkbox(label="是否使用远程大模型")
+             with gr.Accordion("API配置", open=True):
+                 apimd = gr.Markdown("[如何配置API](https://github.com/jabberwockyang/MedicalReviewAgent/blob/main/README.md)", visible=False)
+                 remote_company = gr.Dropdown(["kimi", "deepseek", "zhipuai", "gpt"],
+                                              label="选择大模型提供商", interactive=False, visible=False)
+                 api = gr.Textbox(label="您的API", lines=1, interactive=False, visible=False)
+                 model = gr.Dropdown([], label="选择模型", interactive=False, visible=False)
+
+             confirm_button = gr.Button("保存配置")
+
+             remote_ornot.change(update_remote_buttons, inputs=[remote_ornot], outputs=[apimd, remote_company, api, model])
+             remote_company.change(update_model_dropdown, inputs=[remote_company], outputs=[model])
+             confirm_button.click(update_remote_config, inputs=[remote_ornot, remote_company, api, model], outputs=[confirm_button])
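+             # The checkbox toggles the remote-API widgets, the provider dropdown
+             # repopulates the model list, and the save button writes the choices
+             # back to config.ini via update_remote_config.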
+
+
+         with gr.Tab("文献查找+数据库生成"):
+             gr.Markdown("""
+             #### 查找文献 📚
+
+             1. **输入关键词批量查找PubMed PMC文献**
+                - 在“感兴趣的关键词”框中输入您感兴趣的关键词,每行一个。
+                - 设置查找数量(0-1000)。
+                - 点击“搜索PubMed PMC”按钮进行文献查找。
+
+             2. **上传PDF**
+                - 通过“上传PDF”按钮上传您已有的PDF文献文件。
+
+             3. **更新文献库情况 删除文献库**
+                - 点击“更新文献库情况”按钮,查看当前文献库的概况。
+                - 如果需要重置或删除现有文献库,点击“删除文献库”按钮。
+
+
+             #### 生成数据库 🗂️
+
+             1. **设置数据库构建参数 生成数据库**
+                - 选择块大小(Chunk Size)和聚类数(Number of Clusters)。
+                - 提供选项用于选择合适的块大小和聚类数。
+                - 点击“生成数据库”按钮开始数据库生成过程。
+
+             2. **更新数据库情况 删除数据库**
+                - 点击“更新数据库情况”按钮,查看当前数据库的概况。
+                - 点击“删除数据库”按钮移除现有数据库。
+
+             📝 **备注**:请参考[如何选择数据库构建参数](https://github.com/jabberwockyang/MedicalReviewAgent/tree/main)获取更多信息。
+             """)
+             with gr.Row(equal_height=True):
+                 with gr.Column(scale=1):
+                     input_keys = gr.Textbox(label="感兴趣的关键词",
+                                             lines=5)
+                     retmax = gr.Slider(
+                         minimum=0,
+                         maximum=1000,
+                         value=500,
+                         interactive=True,
+                         label="查多少",
+                     )
+                     generate_repo_button = gr.Button("搜索PubMed PMC")
+                 with gr.Column(scale=2):
+                     file_output = gr.File(scale=2)
+                     upload_button = gr.UploadButton("上传PDF",
+                                                     file_types=[".pdf", ".csv", ".doc"],
+                                                     file_count="multiple", scale=0)
+
+             with gr.Row(equal_height=True):
+                 with gr.Column(scale=0):
+                     delete_repo_button = gr.Button("删除文献库")
+                     update_repo_button = gr.Button("更新文献库情况")
+                 with gr.Column(scale=2):
+                     repo_summary = gr.Textbox(label='文献库概况', value="目前还没有文献库")
+
+             generate_repo_button.click(generate_articles_repo,
+                                        inputs=[input_keys, retmax],
+                                        outputs=[repo_summary])
+
+             delete_repo_button.click(delete_articles_repo, inputs=None,
+                                      outputs=repo_summary)
+             update_repo_button.click(update_repo, inputs=None,
+                                      outputs=repo_summary)
+             upload_button.upload(upload_file, upload_button, file_output)
+
+             with gr.Accordion("数据库构建参数", open=True):
+                 gr.Markdown("[如何选择数据库构建参数](https://github.com/jabberwockyang/MedicalReviewAgent/tree/main)")
+                 chunksize = gr.Slider(label="Chunk Size",
+                                       info='How long you want the chunk to be?',
+                                       minimum=128, maximum=4096, value=1024, step=1,
+                                       interactive=True)
+                 ncluster = gr.CheckboxGroup(["10", "20", "50", "100", "200", "500", "1000"],
+                                             label="Number of Clusters",
+                                             info="How many Clusters you want to generate")
+
+             with gr.Row():
+                 gene_database_button = gr.Button("生成数据库")
+                 delete_database_button = gr.Button("删除数据库")
+                 update_database_button = gr.Button("更新数据库情况")
+
+             database_summary = gr.Textbox(label="数据库概况", lines=1, value="目前还没有数据库")
+
+             gene_database_button.click(generate_database, inputs=[chunksize, ncluster],
+                                        outputs=database_summary)
+
+             update_database_button.click(update_database_textbox, inputs=None,
+                                          outputs=[database_summary])
+
+             delete_database_button.click(delete_database, inputs=None,
+                                          outputs=database_summary)
+         with gr.Tab("写综述"):
+             gr.Markdown("""
+             #### 写综述 ✍️
+
+             1. **更新数据库情况**
+                - 点击“更新数据库情况”按钮,确保使用最新的数据库信息。
+
+             2. **选择块大小和聚类数**
+                - 从下拉菜单中选择合适的块大小和聚类数。
+
+             3. **抽样标注文章聚类**
+                - 设置抽样标注比例(0-1)。
+                - 点击“抽样标注文章聚类”按钮开始标注过程。
+
+             4. **获取灵感**
+                - 如果不知道写什么,点击“获取灵感”按钮。
+                - 系统将基于标注的文章聚类提供相应的综述子问题。
+
+             5. **写综述**
+                - 输入您想写的内容或主题。
+                - 点击“写综述”按钮,生成综述文本。
+
+             6. **查看生成结果**
+                - 生成的综述文本将显示在“看看”文本框中。
+                - 参考文献将显示在“参考文献”框中。
+
+             📝 **备注**:可以尝试不同的参数进行标注和灵感获取,有助于提高综述的质量和相关性。
+             """)
+
+             with gr.Accordion("聚类标注相关参数", open=True):
+                 with gr.Row():
+                     update_options = gr.Button("更新数据库情况", scale=0)
+                     chunksize = gr.Dropdown([], label="选择块大小", scale=0)
+                     nclusters = gr.Dropdown([], label="选择聚类数", scale=0)
+                     ntoread = gr.Slider(
+                         minimum=0, maximum=1, value=0.5,
+                         interactive=True,
+                         label="抽样标注比例",
+                     )
+
+             annotation_button = gr.Button("抽样标注文章聚类")
+             annotation_output = gr.Textbox(label="文章聚类标注/片段摘要",
+                                            lines=5,
+                                            interactive=True,
+                                            show_copy_button=True)
+             inspiration_button = gr.Button("获取灵感")
+             inspiration_output = gr.Textbox(label="灵光一现",
+                                             lines=5,
+                                             show_copy_button=True)
+
+             query = gr.Textbox(label="想写什么")
+
+             write_button = gr.Button("写综述")
+             output_text = gr.Textbox(label="看看", lines=10)
+             output_references = gr.Markdown(label="参考文献")
+
+             update_options.click(update_chunksize_dropdown,
+                                  outputs=[chunksize])
+
+             chunksize.change(update_ncluster_dropdown,
+                              inputs=[chunksize],
+                              outputs=[nclusters])
+
+             annotation_button.click(annotation,
+                                     inputs=[ntoread, chunksize, nclusters, remote_ornot],
+                                     outputs=[annotation_output])
+
+             inspiration_button.click(inspiration,
+                                      inputs=[annotation_output, chunksize, nclusters, remote_ornot],
+                                      outputs=[inspiration_output])
+
+             write_button.click(summarize_text,
+                                inputs=[query, chunksize, remote_ornot],
+                                outputs=[output_text, output_references])
+
+     demo.launch(share=False, server_name='0.0.0.0', debug=True, show_error=True, allowed_paths=['img_0.jpg'])
+
+ # start service
+ if __name__ == '__main__':
+     args = parse_args()
+     # copy config from config-bak
+     shutil.copy('config-bak.ini', args.config_path)  # yyj
+     CONFIG_PATH = args.config_path  # module-level path used by the UI callbacks above
+
+     if args.standalone is True:
+         # hybrid llm serve
+         server_ready = Value('i', 0)
+         server_process = Process(target=llm_serve,
+                                  args=(args.config_path, server_ready))
+         server_process.start()
+         while True:
+             if server_ready.value == 0:
+                 logger.info('waiting for server to be ready..')
+                 time.sleep(3)
+             elif server_ready.value == 1:
+                 break
+             else:
+                 logger.error('start local LLM server failed, quit.')
+                 raise Exception('failed to start local LLM server')
+         logger.info('Hybrid LLM Server start.')
+
+     main_interface()
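
Aside on the `random.sample` call in `annotation()` above, which is likely what the commit message "pythonversion" refers to (an inference, not stated in the commit): from Python 3.11 onward, `random.sample` accepts only sequences, so passing a `dict_keys` view raises `TypeError`. A minimal, self-contained sketch of the pitfall and the portable form; the `samples` dict here is a stand-in, not the app's real cluster data:

    import random

    samples = {'0': ['a'], '1': ['b'], '2': ['c']}  # stand-in for the cluster samples dict
    # random.sample(samples.keys(), 2)   # TypeError on Python >= 3.11: population must be a sequence
    picked = random.sample(list(samples.keys()), 2)  # works on all supported versions
    print(picked)
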
.ipynb_checkpoints/config-bak-checkpoint.ini ADDED
@@ -0,0 +1,63 @@
+ [feature_store]
+ reject_throttle = 0
+ embedding_model_path = "maidalun1020/bce-embedding-base_v1"
+ reranker_model_path = "maidalun1020/bce-reranker-base_v1"
+ repo_dir = "repodir"
+ work_dir = "workdir"
+ n_clusters = [20, 50]
+ chunk_size = 1024
+
+ [web_search]
+ x_api_key = "${YOUR-API-KEY}"
+ domain_partial_order = ["openai.com", "pytorch.org", "readthedocs.io", "nvidia.com", "stackoverflow.com", "juejin.cn", "zhuanlan.zhihu.com", "www.cnblogs.com"]
+ save_dir = "logs/web_search_result"
+
+ [llm]
+ enable_local = 1
+ enable_remote = 1
+ client_url = "http://127.0.0.1:8888/inference"
+
+ [llm.server]
+ local_llm_path = "Qwen/Qwen1.5-7B-Chat"
+ local_llm_max_text_length = 32000
+ local_llm_bind_port = 8888
+ remote_type = ""
+ remote_api_key = ""
+ remote_llm_max_text_length = 32000
+ remote_llm_model = ""
+ rpm = 500
+
+ [worker]
+ enable_sg_search = 0
+ save_path = "logs/work.txt"
+
+ [worker.time]
+ start = "00:00:00"
+ end = "23:59:59"
+ has_weekday = 1
+
+ [sg_search]
+ binary_src_path = "/usr/local/bin/src"
+ src_access_token = "${YOUR-SRC-ACCESS-TOKEN}"
+
+ [sg_search.opencompass]
+ github_repo_id = "open-compass/opencompass"
+ introduction = "用于评测大型语言模型(LLM). 它提供了完整的开源可复现的评测框架,支持大语言模型、多模态模型的一站式评测,基于分布式技术,对大参数量模型亦能实现高效评测。评测方向汇总为知识、语言、理解、推理、考试五大能力维度,整合集纳了超过70个评测数据集,合计提供了超过40万个模型评测问题,并提供长文本、安全、代码3类大模型特色技术能力评测。"
+
+ [sg_search.lmdeploy]
+ github_repo_id = "internlm/lmdeploy"
+ introduction = "lmdeploy 是一个用于压缩、部署和服务 LLM(Large Language Model)的工具包。是一个服务端场景下,transformer 结构 LLM 部署工具,支持 GPU 服务端部署,速度有保障,支持 Tensor Parallel,多并发优化,功能全面,包括模型转换、缓存历史会话的 cache feature 等. 它还提供了 WebUI、命令行和 gRPC 客户端接入。"
+
+ [frontend]
+ type = "none"
+ webhook_url = "https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxx"
+ message_process_policy = "immediate"
+
+ [frontend.lark_group]
+ app_id = "cli_a53a34dcb778500e"
+ app_secret = "2ajhg1ixSvlNm1bJkH4tJhPfTCsGGHT1"
+ encrypt_key = "abc"
+ verification_token = "def"
+
+ [frontend.wechat_personal]
+ bind_port = 9527
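
Despite the .ini extension, this file is TOML, and the app reads it with pytoml. A minimal sketch of the read-modify-write cycle that `update_remote_config` performs against it, assuming pytoml is installed and the file has been copied to config.ini (as the app's `__main__` block does); the provider and model strings are example values, not defaults:

    import pytoml

    with open('config.ini', encoding='utf8') as f:
        config = pytoml.load(f)

    # switch from the local Qwen model to a remote provider
    config['llm']['enable_local'] = 0
    config['llm']['enable_remote'] = 1
    config['llm']['server']['remote_type'] = 'kimi'                    # example value
    config['llm']['server']['remote_api_key'] = '${YOUR-API-KEY}'      # placeholder
    config['llm']['server']['remote_llm_model'] = 'moonshot-v1-128k'   # example value

    with open('config.ini', 'w') as f:
        pytoml.dump(config, f)
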