Commit 2d754ab by future-xy
1 parent: a89d71b
connect front and backend

Files changed:
- backend-cli.py +27 -25
- src/backend/envs.py +3 -3
- src/backend/moe_infinity.py +3 -3
- src/backend/run_eval_suite.py +1 -0
- src/display/utils.py +5 -4
backend-cli.py
CHANGED
@@ -2,6 +2,7 @@

 import os
 import json
+import argparse

 import socket
 import random

@@ -33,7 +34,8 @@ def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
     try:
         set_eval_request(api=api, eval_request=eval_request, set_to_status=set_to_status, hf_repo=hf_repo, local_dir=local_dir)
         return
-    except Exception:
+    except Exception as e:
+        print(f"Error setting eval request to {set_to_status}: {e}. Retrying in 60 seconds")
         time.sleep(60)
         return

@@ -262,14 +264,21 @@ def process_pending_requests() -> bool:
     return True


+def get_args():
+    parser = argparse.ArgumentParser(description='Run the backend')
+    parser.add_argument('--debug', action='store_true', help='Run in debug mode')
+    return parser.parse_args()
+
+
 if __name__ == "__main__":
-
+    args = get_args()
+    local_debug = args.debug
     #debug specific task by ping
     if local_debug:
         debug_model_names = ['mistralai/Mixtral-8x7B-Instruct-v0.1']
         # debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
         # debug_task_name = 'ifeval'
-        debug_task_name = '
+        debug_task_name = 'mmlu'
         task_lst = TASKS_HARNESS.copy()
         for task in task_lst:
             for debug_model_name in debug_model_names:

@@ -279,31 +288,24 @@ if __name__ == "__main__":
                 eval_request = EvalRequest(model=debug_model_name, private=False, status='', json_filepath='', precision='float16')
                 results = process_evaluation(task, eval_request)

-
-
-    if socket.gethostname() in {'hamburg', 'neuromancer'} or os.path.isdir("/home/pminervi"):
-        wait = False
-        hard_task_lst = ['nq', 'trivia', 'tqa']
-
-        if wait:
-            time.sleep(60 * random.randint(5, 10))
+    while True:
+        res = False

-
-
-        if random.randint(0, 10) == 0:
+        # if random.randint(0, 10) == 0:
         res = process_pending_requests()
+        print(f"waiting for 60 seconds")
         time.sleep(60)

-
-        if random.randint(0, 5) == 0:
-
-        else:
-
+        # if res is False:
+        #     if random.randint(0, 5) == 0:
+        #         res = maybe_refresh_results(100)
+        #     else:
+        #         res = process_finished_requests(100)

-
+        #     time.sleep(60)

-
-        if random.randint(0, 5) == 0:
-
-        else:
-
+        #     if res is False:
+        #         if random.randint(0, 5) == 0:
+        #             res = maybe_refresh_results(0)
+        #         else:
+        #             res = process_finished_requests(0)
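In short, backend-cli.py now takes a --debug flag instead of gating debug behaviour on the host name, and the entry point loops forever polling the request queue. A minimal, self-contained sketch of that control flow follows; process_pending_requests is a stub standing in for the repository's real helper, and the exact nesting of the polling loop relative to the debug branch is an assumption, since indentation is not recoverable from this view.

# Hedged sketch of the new backend-cli.py entry point; the stub below replaces
# the repository's real queue-polling helper.
import argparse
import time


def get_args():
    parser = argparse.ArgumentParser(description='Run the backend')
    parser.add_argument('--debug', action='store_true', help='Run in debug mode')
    return parser.parse_args()


def process_pending_requests() -> bool:
    """Stub: the real function pulls pending eval requests from the Hub queue."""
    return False


if __name__ == "__main__":
    args = get_args()
    local_debug = args.debug
    if local_debug:
        print("debug mode: run the hard-coded debug model/task pairs once")
    while True:
        res = process_pending_requests()
        print("waiting for 60 seconds")
        time.sleep(60)

Launching with python backend-cli.py --debug would then exercise the debug path; without the flag the process simply keeps polling.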
src/backend/envs.py
CHANGED
@@ -35,7 +35,7 @@ class Tasks(Enum):
     # task8 = Task("xsum", "rougeL", "XSum", 2)
     # task9 = Task("cnndm", "rougeL", "CNN/DM", 2)

-    task8_1 = Task("xsum_v2", "rougeL", "XSum", 0)
+    # task8_1 = Task("xsum_v2", "rougeL", "XSum", 0)
     # task9_1 = Task("cnndm_v2", "rougeL", "CNN/DM", 0)

     # task10 = Task("memo-trap", "acc", "memo-trap", 0)

@@ -43,7 +43,7 @@ class Tasks(Enum):

     # task13 = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)

-
+    task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)

     # task15 = Task("fever10", "acc", "FEVER", 16)
     # task15_1 = Task("fever11", "acc", "FEVER", 8)

@@ -56,7 +56,7 @@ class Tasks(Enum):
     # task19 = Task("faithdial_hallu_v2", "acc", "FaithDial", 8)

     # task20 = Task("race", "acc", "RACE", 0)
-    task21 = Task("
+    task21 = Task("mmlu", "acc", "MMLU", 5)


 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
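For readers unfamiliar with the Task entries being toggled above: judging from the constructor calls, each entry bundles the harness benchmark name, the metric to report, a display label, and the number of few-shot examples. A minimal sketch under that assumption follows; the field names are illustrative, not the repository's actual definition.

# Illustrative only: field names are guesses based on Task("mmlu", "acc", "MMLU", 5).
from dataclasses import dataclass
from enum import Enum


@dataclass(frozen=True)
class Task:
    benchmark: str      # lm-eval-harness task name, e.g. "mmlu"
    metric: str         # metric key to read from the harness results, e.g. "acc"
    col_name: str       # display name used by the frontend, e.g. "MMLU"
    num_fewshot: int    # few-shot examples passed to the harness


class Tasks(Enum):
    task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)
    task21 = Task("mmlu", "acc", "MMLU", 5)


print([t.value.benchmark for t in Tasks])  # ['selfcheckgpt', 'mmlu']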
src/backend/moe_infinity.py
CHANGED
@@ -1,7 +1,6 @@
 import torch
 import os
 from transformers import AutoTokenizer
-import transformers
 from transformers import AutoModelForCausalLM
 from moe_infinity import MoE
 from typing import List, Tuple, Optional, Union

@@ -29,7 +28,7 @@ class MoEHFLM(HFLM):
         self.use_chat_template = use_chat_template
         if "device" in kwargs:
             kwargs.pop("device")
-        super().__init__(*args, **kwargs, pretrained=pretrained,
+        super().__init__(*args, **kwargs, pretrained=pretrained, device_map="cuda:0")  # Assuming HFLM accepts a 'pretrained' arg and handles it
         # self._create_model()

     def _create_model(self, *args, **kwargs):

@@ -43,7 +42,8 @@ class MoEHFLM(HFLM):
         }
         # Update default config with any user-provided config
         final_moe_config = {**default_moe_config, **self.moe_config}
-        self._model = MoE(self.checkpoint, final_moe_config)
+        # self._model = MoE(self.checkpoint, final_moe_config)
+        self._model = AutoModelForCausalLM.from_pretrained(self.checkpoint, torch_dtype=torch.float16, device_map="auto")

     @property
     def max_length(self):
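The _create_model change above sidesteps the MoE-Infinity loader and falls back to a plain transformers load in float16 with automatic device placement. A minimal, self-contained sketch of that fallback is below; the checkpoint name is just the debug model named elsewhere in this commit, not a requirement of the loader.

# Hedged sketch of the fallback load path: fp16 weights, with device_map="auto"
# letting accelerate place layers across available GPUs (and CPU if needed).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # debug model used in backend-cli.py

model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.float16,  # half-precision weights
    device_map="auto",          # automatic multi-device placement (requires accelerate)
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(model.config.model_type)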
src/backend/run_eval_suite.py
CHANGED
@@ -33,6 +33,7 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz

     print(f"Selected Tasks: {task_names}")
     print(f"Eval Request: {eval_request.get_model_args()}")
+    print(f"Num Fewshot: {num_fewshot}, Batch Size: {batch_size}, Device: {device}, Use Cache: {use_cache}, Limit: {limit}")
     # hf-chat is implemented to use apply_chat_template
     results = evaluator.simple_evaluate(model="moe-infinity",  # "hf-causal-experimental", # "hf-causal", hf-chat
                                         model_args=eval_request.get_model_args(),
src/display/utils.py
CHANGED
@@ -24,7 +24,7 @@ class Tasks(Enum):
     # truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthQA MC2/Acc")
     # truthfulqa_gen = Task("truthfulqa_gen", "rougeL_acc", "TruthQA Gen/ROUGE")

-    xsum_r = Task("xsum_v2", "rougeL", "XSum/ROUGE")
+    # xsum_r = Task("xsum_v2", "rougeL", "XSum/ROUGE")
     # xsum_f = Task("xsum_v2", "factKB", "XSum/factKB")
     # xsum_b = Task("xsum_v2", "bertscore_precision", "XSum/BERT-P")

@@ -45,8 +45,8 @@ class Tasks(Enum):
     # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")

     # # XXX include me back at some point
-
-
+    selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
+    mmlu = Task("hendrycksTest", "acc", "MMLU")


 # These classes are for user facing column names,

@@ -62,7 +62,8 @@ class ColumnContent:
     dummy: bool = False

 auto_eval_column_dict = []
-auto_eval_column_dict.append(["
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["hardware", ColumnContent, ColumnContent("Hardware", "str", True, never_hidden=True)])
 # Init
 # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 # auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
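For context on the two new leaderboard columns: each auto_eval_column_dict entry pairs an attribute name with a ColumnContent describing how the column renders. Below is a minimal sketch under the assumption that ColumnContent's positional fields are the display header, a type string, and a show-by-default flag; only dummy and never_hidden are visible in this diff, so the other field names are guesses.

# Illustrative sketch; field names other than never_hidden and dummy are assumptions.
from dataclasses import dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str                          # column header shown in the UI
    type: str                          # rendering hint, e.g. "str" or "markdown"
    displayed_by_default: bool = True
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False


auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)])
auto_eval_column_dict.append(["hardware", ColumnContent, ColumnContent("Hardware", "str", True, never_hidden=True)])

for attr, _, col in auto_eval_column_dict:
    print(f"{attr}: header={col.name!r}, type={col.type!r}, never_hidden={col.never_hidden}")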