Miaoran000 committed
Commit e071b26
1 Parent(s): 5a86006

update for HF HHEM2.1

.gitignore CHANGED
@@ -16,6 +16,7 @@ eval-results-bk/
 eval-results-bk_hhem21/
 eval-results_hhem21/
 hhem21_server/
+leaderboard_results/

 src/assets/model_counts.html

src/backend/evaluate_model.py CHANGED
@@ -56,8 +56,8 @@ class Evaluator:
         self.write_out = write_out
         self.output_base_path = output_base_path
         try:
-            self.summary_generator = SummaryGenerator(model, revision)
-            self.eval_model = EvaluationModel(envs.HEM_PATH)
+            self.summary_generator = SummaryGenerator(model, revision, self.device)
+            self.eval_model = EvaluationModel(envs.HEM_PATH, self.device)
         except Exception as e:
             logging.error(f"Error initializing Evaluator: {e}")
             raise
@@ -72,9 +72,6 @@ class Evaluator:
         """
         try:
             df = pd.read_csv(envs.DATASET_PATH)
-            # print(envs.DATASET_PATH)
-            # print(df.shape)
-            # print(df.iloc[-1])
            self.generated_summaries_df = self.summary_generator.generate_summaries(df, save_path=f"generation_results/{self.model}.csv")

            avg_summary_len = self.summary_generator.avg_length
@@ -103,7 +100,7 @@ class Evaluator:
         print('Updating result files')
         leaderboard_path = os.getcwd() # the path of leaderboard folder
         print(leaderboard_path)
-        working_path = os.path.join(leaderboard_path, 'Hallucination Leaderboard Results')
+        working_path = os.path.join(leaderboard_path, 'leaderboard_results')
         if not os.path.exists(working_path):
             logging.error(f"Need to first download the results from google drive to the learderboard folder")
             raise
@@ -124,19 +121,5 @@ class Evaluator:
         leaderboard_summaries_df.to_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), mode='a', index=False, header=False)
         print('leaderboard_summaries.csv has been updated')

-        # update leaderboard_summaries_with_scores.csv
-        # BUG: get error when opening the file
-        existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
-                                  encoding='utf-8', sep=",", quotechar='"', quoting=2)
-        print(existing_df.shape)
-        score_doc = set(existing_df['model'].values.tolist())
-        print(score_doc)
-        mask = existing_df['model'] == self.model
-        existing_df = existing_df[~mask]
-        # get new result
-        leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
-        leaderboard_summaries_with_scores_df.insert(3, "model", [self.model]*leaderboard_summaries_with_scores_df.shape[0])
-        leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'), mode='a', index=False, header=False)
-        print('leaderboard_summaries_with_scores.csv has been updated')


src/backend/model_operations.py CHANGED
@@ -11,11 +11,8 @@ import pandas as pd
 import spacy
 from sentence_transformers import CrossEncoder
 import litellm
-# from litellm import completion
 from tqdm import tqdm
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig, pipeline
-# from accelerate import PartialState
-# from accelerate.inference import prepare_pippy
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForTokenClassification
 import torch
 import cohere
 from openai import OpenAI
@@ -41,20 +38,6 @@ nlp = spacy.load("en_core_web_sm")

 os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN

-
-def load_evaluation_model(model_path):
-    """Load the evaluation model from the given path
-
-    Args:
-        model_path (str): Path to the evaluation model
-
-    Returns:
-        CrossEncoder: The evaluation model
-    """
-    model = CrossEncoder(model_path)
-    return model
-
-
 class ModelLoadingException(Exception):
     """Exception raised for errors in loading a model.

@@ -81,7 +64,7 @@ class SummaryGenerator:
         answer_rate (float): Rate of non-empty summaries.
     """

-    def __init__(self, model_id, revision):
+    def __init__(self, model_id, revision, device):
         """
         Initializes the SummaryGenerator with a model.

@@ -94,6 +77,7 @@ class SummaryGenerator:
         self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
         self.summaries_df = pd.DataFrame()
         self.revision = revision
+        self.device = device
         self.avg_length = None
         self.answer_rate = None
         self.exceptions = None
@@ -206,10 +190,9 @@ class SummaryGenerator:

         payload = {
             "model": self.model_id,
-            # "max_tokens": 4096,
             'max_new_tokens': 250,
             "temperature": 0.0,
-            # 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
+
         }
         payload['messages'] = [{"role": "system", "content": system_prompt},
                                {"role": "user", "content": user_prompt}]
@@ -365,39 +348,40 @@ class SummaryGenerator:

        # Using HF API or download checkpoints
        elif self.local_model is None and self.local_pipeline is None:
-            try: # try use HuggingFace API
-                print('** using huggingface api')
-                response = litellm.completion(
-                    model=self.model,
-                    messages=[{"role": "system", "content": system_prompt},
-                              {"role": "user", "content": user_prompt}],
-                    temperature=0.0,
-                    max_tokens=250,
-                    api_base=self.api_base,
+            # try: # try use HuggingFace API
+            #     print('** using huggingface api')
+            #     response = litellm.completion(
+            #         model=self.model,
+            #         messages=[{"role": "system", "content": system_prompt},
+            #                   {"role": "user", "content": user_prompt}],
+            #         temperature=0.0,
+            #         max_tokens=250,
+            #         api_base=self.api_base,
+            #     )
+            #     result = response['choices'][0]['message']['content']
+            #     result = result.split('<|im_end|>')[0]
+            #     print(result)
+            #     return result
+            # except Exception as e:
+            #     if 'Rate limit reached' in str(e) :
+            #         wait_time = 300
+            #         current_time = datetime.now().strftime('%H:%M:%S')
+            #         print(f"Rate limit hit at {current_time}. Waiting for 5 minutes before retrying...")
+            #         time.sleep(wait_time)
+            #     else:
+            if using_pipeline:
+                self.local_pipeline = pipeline(
+                    "text-generation",
+                    model=self.model_id,
+                    model_kwargs={"torch_dtype": torch.bfloat16},
+                    device_map="auto",
                 )
-                result = response['choices'][0]['message']['content']
-                result = result.split('<|im_end|>')[0]
-                print(result)
-                return result
-            except Exception as e:
-                if 'Rate limit reached' in str(e) and 'yi-1.5' not in self.model_id.lower():
-                    wait_time = 300
-                    current_time = datetime.now().strftime('%H:%M:%S')
-                    print(f"Rate limit hit at {current_time}. Waiting for 5 minutes before retrying...")
-                    time.sleep(wait_time)
-                else:
-                    if using_pipeline:
-                        self.local_pipeline = pipeline(
-                            "text-generation",
-                            model=self.model_id,
-                            model_kwargs={"torch_dtype": torch.bfloat16},
-                            device_map="auto",
-                        )
-                    else:
-                        self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf" if 'openelm' in self.model_id.lower() else self.model_id, trust_remote_code=True)
-                        print("Tokenizer loaded")
-                        self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto")
-                        print("Local model loaded")
+            else:
+                self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf" if 'openelm' in self.model_id.lower() else self.model_id, trust_remote_code=True)
+                print("Tokenizer loaded")
+                self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto")
+                print(self.local_model.device)
+                print("Local model loaded")


        # Using local model/pipeline
@@ -438,7 +422,7 @@ class SummaryGenerator:
                prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
                # print(prompt)
                # print('-'*50)
-                input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
+                input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
                with torch.no_grad():
                    outputs = self.local_model.generate(**input_ids, max_new_tokens=250, do_sample=True, temperature=0.01, pad_token_id=self.tokenizer.eos_token_id)
                if 'glm' in self.model_id.lower():
@@ -451,6 +435,8 @@ class SummaryGenerator:
                    result = result.split("### Assistant:\n")[-1]

                else:
+                    print(prompt)
+                    print('-'*50)
                    result = result.replace(prompt.strip(), '')

                print(result)
@@ -494,17 +480,43 @@ class EvaluationModel:
         hallucination_rate (float): Rate of hallucination in summaries.
     """

-    def __init__(self, model_path):
+    def __init__(self, model_path, device):
         """
         Initializes the EvaluationModel with a CrossEncoder model.

         Args:
             model_path (str): Path to the CrossEncoder model.
         """
-        self.model = load_evaluation_model(model_path)
+        self.model = AutoModelForTokenClassification.from_pretrained(model_path)
+        self.device = device
+        self.model.to(self.device)
         self.scores = []
         self.factual_consistency_rate = None
         self.hallucination_rate = None
+
+    def predict(self, text_pairs):
+        """Load LoRA adapters of HHEM and make predictions
+        All HHEM 2.1 settings, e.g., prompt template, are hardcoded in this function.
+        Args:
+            text_pairs: list of tuples, each tuple contains two strings (premise, hypothesis)
+            checkpoint: model ID on Hugging Face
+        """
+
+        prompt = "<pad> Determine if the hypothesis is true given the premise?\n\nPremise: {text1}\n\nHypothesis: {text2}"
+
+        tokenizer = AutoTokenizer.from_pretrained('t5-base')
+        inputs = tokenizer(
+            [prompt.format(text1=pair[0], text2=pair[1]) for pair in text_pairs],
+            return_tensors='pt', padding='longest').to(self.device)
+
+        self.model.eval()
+        with torch.no_grad():
+            output = self.model(**inputs)
+        logits = output.logits
+        logits = logits[:,0,:] # get the logits on the first token
+        logits = torch.softmax(logits, dim=-1)
+        scores = [round(x, 5) for x in logits[:, 1].tolist()] # list of float
+        return scores

     def evaluate_hallucination(self, summaries_df):
         """
@@ -525,22 +537,14 @@ class EvaluationModel:
         for doc, summary in source_summary_pairs:
             if util.is_summary_valid(summary):
                 try:
-                    # summary_pieces = summary.split('\n')
-                    # summary = summary_pieces[0] if len(summary_pieces[0].strip()) > 0 else summary_pieces[1]
                     summary = summary.replace('<bos>','').replace('<eos>','').strip()
-                    score = self.model.predict([doc, summary])# [0]
-                    if not isinstance(score, float):
-                        try:
-                            score = score.item()
-                        except:
-                            logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
-                            continue
-                    # print inconsistent summaries for checking
-                    if score < 0.5:
-                        print(doc)
-                        print('-'*10)
-                        print(summary)
-                        print('='*20)
+                    score = self.predict([(doc, summary)])[0]
+                    # print(score)
+                    # if score < 0.5:
+                    #     print(doc)
+                    #     print('-'*10)
+                    #     print(summary)
+                    #     print('='*20)
                     hem_scores.append(score)
                     sources.append(doc)
                     summaries.append(summary)
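
For reference, a minimal usage sketch of the new EvaluationModel.predict path, assuming the vectara/HHEM-2.1 checkpoint referenced in src/envs.py loads with AutoModelForTokenClassification as in the diff above; the premise/hypothesis pairs here are illustrative only:

    import torch
    from src.backend.model_operations import EvaluationModel

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    hhem = EvaluationModel('vectara/HHEM-2.1', device)

    pairs = [
        ("The capital of France is Paris.", "Paris is the capital of France."),   # consistent
        ("The capital of France is Paris.", "Berlin is the capital of France."),  # hallucinated
    ]
    scores = hhem.predict(pairs)  # one probability per (premise, hypothesis) pair
    for (premise, hypothesis), score in zip(pairs, scores):
        print(f"{score:.5f}  {hypothesis}")

evaluate_hallucination() calls the same method one pair at a time via self.predict([(doc, summary)])[0]; the commented-out debugging block above treats scores below 0.5 as inconsistent summaries.
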
src/envs.py CHANGED
@@ -23,10 +23,10 @@ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #"cpu"
 API = HfApi(token=TOKEN)

-LEADERBOARD_DATASET_PATH = "Hallucination Leaderboard Results/leaderboard_summaries.csv"
+LEADERBOARD_DATASET_PATH = "leaderboard_results/leaderboard_summaries.csv"
 DATASET_PATH = "src/datasets/leaderboard_dataset.csv"
 SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
-HEM_PATH = 'vectara/hallucination_evaluation_model'
+HEM_PATH = 'vectara/HHEM-2.1'

 SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
 USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
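
Taken together, the updated constants and constructor signatures thread one torch device through both backend helpers. A sketch of the resulting call pattern, assuming the Evaluator passes envs.DEVICE as self.device (the diff does not show where self.device is assigned) and using a placeholder model ID:

    import src.envs as envs
    from src.backend.model_operations import SummaryGenerator, EvaluationModel

    model_id = "some-org/some-model"  # placeholder, not a real leaderboard entry
    revision = "main"

    summary_generator = SummaryGenerator(model_id, revision, envs.DEVICE)  # new third argument
    eval_model = EvaluationModel(envs.HEM_PATH, envs.DEVICE)               # loads 'vectara/HHEM-2.1' onto the same device
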