future-xy committed
Commit 2d754ab
1 Parent(s): a89d71b

connect front and backend

backend-cli.py CHANGED
@@ -2,6 +2,7 @@
 
 import os
 import json
+import argparse
 
 import socket
 import random
@@ -33,7 +34,8 @@ def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
        try:
            set_eval_request(api=api, eval_request=eval_request, set_to_status=set_to_status, hf_repo=hf_repo, local_dir=local_dir)
            return
-       except Exception:
+       except Exception as e:
+           print(f"Error setting eval request to {set_to_status}: {e}. Retrying in 60 seconds")
            time.sleep(60)
    return
 
@@ -262,14 +264,21 @@ def process_pending_requests() -> bool:
    return True
 
 
+def get_args():
+    parser = argparse.ArgumentParser(description='Run the backend')
+    parser.add_argument('--debug', action='store_true', help='Run in debug mode')
+    return parser.parse_args()
+
+
 if __name__ == "__main__":
-    local_debug = True
+    args = get_args()
+    local_debug = args.debug
     #debug specific task by ping
     if local_debug:
         debug_model_names = ['mistralai/Mixtral-8x7B-Instruct-v0.1']
         # debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
         # debug_task_name = 'ifeval'
-        debug_task_name = 'selfcheckgpt'
+        debug_task_name = 'mmlu'
         task_lst = TASKS_HARNESS.copy()
         for task in task_lst:
             for debug_model_name in debug_model_names:
@@ -279,31 +288,24 @@ if __name__ == "__main__":
                 eval_request = EvalRequest(model=debug_model_name, private=False, status='', json_filepath='', precision='float16')
                 results = process_evaluation(task, eval_request)
 
-    wait = True
-    hard_task_lst = None
-    if socket.gethostname() in {'hamburg', 'neuromancer'} or os.path.isdir("/home/pminervi"):
-        wait = False
-        hard_task_lst = ['nq', 'trivia', 'tqa']
-
-    if wait:
-        time.sleep(60 * random.randint(5, 10))
+    while True:
+        res = False
 
-    res = False
-
-    if random.randint(0, 10) == 0:
+        # if random.randint(0, 10) == 0:
         res = process_pending_requests()
+        print(f"waiting for 60 seconds")
         time.sleep(60)
 
-    if res is False:
-        if random.randint(0, 5) == 0:
-            res = maybe_refresh_results(100, hard_task_lst=hard_task_lst)
-        else:
-            res = process_finished_requests(100, hard_task_lst=hard_task_lst)
+        # if res is False:
+        #     if random.randint(0, 5) == 0:
+        #         res = maybe_refresh_results(100)
+        #     else:
+        #         res = process_finished_requests(100)
 
-        time.sleep(60)
+        # time.sleep(60)
 
-    if res is False:
-        if random.randint(0, 5) == 0:
-            res = maybe_refresh_results(0, hard_task_lst=hard_task_lst)
-        else:
-            res = process_finished_requests(0, hard_task_lst=hard_task_lst)
+        # if res is False:
+        #     if random.randint(0, 5) == 0:
+        #         res = maybe_refresh_results(0)
+        #     else:
+        #         res = process_finished_requests(0)
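
With `get_args()`, debug mode is now opt-in from the command line rather than hard-coded, and the serving loop runs continuously under `while True`. A quick usage sketch (assuming the script is launched directly; `--debug` is the only flag the parser defines):

    python backend-cli.py            # normal mode: poll and process pending eval requests
    python backend-cli.py --debug    # debug mode: run the hard-coded debug models/tasks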
src/backend/envs.py CHANGED
@@ -35,7 +35,7 @@ class Tasks(Enum):
     # task8 = Task("xsum", "rougeL", "XSum", 2)
     # task9 = Task("cnndm", "rougeL", "CNN/DM", 2)
 
-    task8_1 = Task("xsum_v2", "rougeL", "XSum", 0)
+    # task8_1 = Task("xsum_v2", "rougeL", "XSum", 0)
     # task9_1 = Task("cnndm_v2", "rougeL", "CNN/DM", 0)
 
     # task10 = Task("memo-trap", "acc", "memo-trap", 0)
@@ -43,7 +43,7 @@ class Tasks(Enum):
 
     # task13 = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
 
-    # task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)
+    task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)
 
     # task15 = Task("fever10", "acc", "FEVER", 16)
     # task15_1 = Task("fever11", "acc", "FEVER", 8)
@@ -56,7 +56,7 @@ class Tasks(Enum):
     # task19 = Task("faithdial_hallu_v2", "acc", "FaithDial", 8)
 
     # task20 = Task("race", "acc", "RACE", 0)
-    task21 = Task("gsm8k", "acc", "GSM8K", 0)
+    task21 = Task("mmlu", "acc", "MMLU", 5)
 
 
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
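
For context on what these entries encode: `Task` here appears to be a small dataclass whose last field is the few-shot count passed to the harness, which is why MMLU gets 5 while SelfCheckGPT stays at 0. A minimal sketch under that assumption (field names are inferred from usage, not confirmed by this diff):

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str    # lm-eval-harness task name, e.g. "mmlu"
    metric: str       # metric key to read from the harness results
    col_name: str     # display name used by the frontend
    num_fewshot: int  # few-shot examples handed to the harness

class Tasks(Enum):
    task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)
    task21 = Task("mmlu", "acc", "MMLU", 5)

# The backend builds its task list by iterating the enum:
TASKS_HARNESS = [task.value for task in Tasks]
print([(t.benchmark, t.num_fewshot) for t in TASKS_HARNESS])  # [('selfcheckgpt', 0), ('mmlu', 5)]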
src/backend/moe_infinity.py CHANGED
@@ -1,7 +1,6 @@
 import torch
 import os
 from transformers import AutoTokenizer
-import transformers
 from transformers import AutoModelForCausalLM
 from moe_infinity import MoE
 from typing import List, Tuple, Optional, Union
@@ -29,7 +28,7 @@ class MoEHFLM(HFLM):
         self.use_chat_template = use_chat_template
         if "device" in kwargs:
             kwargs.pop("device")
-        super().__init__(*args, **kwargs, pretrained=pretrained, device="cuda:0") # Assuming HFLM accepts a 'pretrained' arg and handles it
+        super().__init__(*args, **kwargs, pretrained=pretrained, device_map="cuda:0") # Assuming HFLM accepts a 'pretrained' arg and handles it
         # self._create_model()
 
     def _create_model(self, *args, **kwargs):
@@ -43,7 +42,8 @@ class MoEHFLM(HFLM):
         }
         # Update default config with any user-provided config
         final_moe_config = {**default_moe_config, **self.moe_config}
-        self._model = MoE(self.checkpoint, final_moe_config)
+        # self._model = MoE(self.checkpoint, final_moe_config)
+        self._model = AutoModelForCausalLM.from_pretrained(self.checkpoint, torch_dtype=torch.float16, device_map="auto")
 
     @property
     def max_length(self):
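
The `_create_model` change sidesteps the MoE-Infinity runtime and loads the checkpoint as a plain fp16 Transformers model, with `device_map="auto"` letting accelerate place the weights across the available devices. A standalone sketch of that load path (the checkpoint name is illustrative; any causal-LM checkpoint behaves the same):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # illustrative
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.float16,  # half precision, as in the diff
    device_map="auto",          # requires accelerate; shards across GPUs/CPU as needed
)

Note the contrast with the `super().__init__` change: `device_map="cuda:0"` pins everything to one GPU, while `device_map="auto"` lets a large MoE checkpoint spread over whatever hardware is visible.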
src/backend/run_eval_suite.py CHANGED
@@ -33,6 +33,7 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
 
     print(f"Selected Tasks: {task_names}")
     print(f"Eval Request: {eval_request.get_model_args()}")
+    print(f"Num Fewshot: {num_fewshot}, Batch Size: {batch_size}, Device: {device}, Use Cache: {use_cache}, Limit: {limit}")
     # hf-chat is implemented to use apply_chat_template
     results = evaluator.simple_evaluate(model="moe-infinity", # "hf-causal-experimental", # "hf-causal", hf-chat
                                         model_args=eval_request.get_model_args(),
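
The added `print` logs the harness parameters up front, which makes failed runs easier to reproduce. For orientation, a hedged sketch of how the surrounding `simple_evaluate` call is assembled (lm-eval-harness's exact keyword set varies by version; the `model_args` and `tasks` values here are illustrative):

from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="moe-infinity",  # the custom model type this backend registers
    model_args="pretrained=mistralai/Mixtral-8x7B-Instruct-v0.1",  # illustrative
    tasks=["mmlu"],
    num_fewshot=5,
    batch_size=1,
    device="cuda:0",
    limit=8,  # small cap for smoke tests; None runs the full benchmark
)
print(results["results"])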
src/display/utils.py CHANGED
@@ -24,7 +24,7 @@ class Tasks(Enum):
     # truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthQA MC2/Acc")
     # truthfulqa_gen = Task("truthfulqa_gen", "rougeL_acc", "TruthQA Gen/ROUGE")
 
-    xsum_r = Task("xsum_v2", "rougeL", "XSum/ROUGE")
+    # xsum_r = Task("xsum_v2", "rougeL", "XSum/ROUGE")
     # xsum_f = Task("xsum_v2", "factKB", "XSum/factKB")
     # xsum_b = Task("xsum_v2", "bertscore_precision", "XSum/BERT-P")
 
@@ -45,8 +45,8 @@ class Tasks(Enum):
     # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
 
     # # XXX include me back at some point
-    # selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
-    gsm8k = Task("gsm8k", "acc", "GSM8K")
+    selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
+    mmlu = Task("hendrycksTest", "acc", "MMLU")
 
 
 # These classes are for user facing column names,
@@ -62,7 +62,8 @@ class ColumnContent:
     dummy: bool = False
 
 auto_eval_column_dict = []
-auto_eval_column_dict.append(["system", ColumnContent, ColumnContent("System", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["hardware", ColumnContent, ColumnContent("Hardware", "str", True, never_hidden=True)])
 # Init
 # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 # auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
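
The two new `append` calls follow the leaderboard-template pattern of declaring columns as `[attribute_name, type, default]` triples and then materializing them into a frozen dataclass. A minimal sketch of that pattern (the full `ColumnContent` definition is assumed from the standard Hugging Face leaderboard template; only the `dummy` field is visible in this diff):

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                  # user-facing column header
    type: str                  # column type, e.g. "str" or "markdown"
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)])
auto_eval_column_dict.append(["hardware", ColumnContent, ColumnContent("Hardware", "str", True, never_hidden=True)])

# make_dataclass turns each [name, type, default] triple into a field:
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.model.name)     # "Model"
print(AutoEvalColumn.hardware.name)  # "Hardware"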