Commit e071b26
Miaoran000 committed
Parent(s): 5a86006

update for HF HHEM2.1

Files changed:
- .gitignore (+1, -0)
- src/backend/evaluate_model.py (+3, -20)
- src/backend/model_operations.py (+75, -71)
- src/envs.py (+2, -2)
.gitignore CHANGED

@@ -16,6 +16,7 @@ eval-results-bk/
 eval-results-bk_hhem21/
 eval-results_hhem21/
 hhem21_server/
+leaderboard_results/
 
 src/assets/model_counts.html
 
src/backend/evaluate_model.py CHANGED

@@ -56,8 +56,8 @@ class Evaluator:
         self.write_out = write_out
         self.output_base_path = output_base_path
         try:
-            self.summary_generator = SummaryGenerator(model, revision)
-            self.eval_model = EvaluationModel(envs.HEM_PATH)
+            self.summary_generator = SummaryGenerator(model, revision, self.device)
+            self.eval_model = EvaluationModel(envs.HEM_PATH, self.device)
         except Exception as e:
             logging.error(f"Error initializing Evaluator: {e}")
             raise

@@ -72,9 +72,6 @@ class Evaluator:
         """
         try:
             df = pd.read_csv(envs.DATASET_PATH)
-            # print(envs.DATASET_PATH)
-            # print(df.shape)
-            # print(df.iloc[-1])
             self.generated_summaries_df = self.summary_generator.generate_summaries(df, save_path=f"generation_results/{self.model}.csv")
 
             avg_summary_len = self.summary_generator.avg_length

@@ -103,7 +100,7 @@ class Evaluator:
         print('Updating result files')
         leaderboard_path = os.getcwd() # the path of leaderboard folder
         print(leaderboard_path)
-        working_path = os.path.join(leaderboard_path, '
+        working_path = os.path.join(leaderboard_path, 'leaderboard_results')
         if not os.path.exists(working_path):
             logging.error(f"Need to first download the results from google drive to the learderboard folder")
             raise

@@ -124,19 +121,5 @@ class Evaluator:
         leaderboard_summaries_df.to_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), mode='a', index=False, header=False)
         print('leaderboard_summaries.csv has been updated')
 
-        # update leaderboard_summaries_with_scores.csv
-        # BUG: get error when opening the file
-        existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
-                                  encoding='utf-8', sep=",", quotechar='"', quoting=2)
-        print(existing_df.shape)
-        score_doc = set(existing_df['model'].values.tolist())
-        print(score_doc)
-        mask = existing_df['model'] == self.model
-        existing_df = existing_df[~mask]
-        # get new result
-        leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
-        leaderboard_summaries_with_scores_df.insert(3, "model", [self.model]*leaderboard_summaries_with_scores_df.shape[0])
-        leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'), mode='a', index=False, header=False)
-        print('leaderboard_summaries_with_scores.csv has been updated')
 
 
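The working-path hunk above hard-codes the results directory as 'leaderboard_results' and keeps the guard that aborts when it is missing. A minimal standalone sketch of that precondition (illustration only; the directory and file names come from this commit, everything else is assumed):

import os
import sys

# The evaluator expects ./leaderboard_results to exist in the leaderboard
# folder (downloaded beforehand); results are then appended to
# leaderboard_summaries.csv inside it.
working_path = os.path.join(os.getcwd(), 'leaderboard_results')
if not os.path.exists(working_path):
    sys.exit("Need to first download the results to the leaderboard folder")
print(f"Appending to {os.path.join(working_path, 'leaderboard_summaries.csv')}")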
src/backend/model_operations.py CHANGED

@@ -11,11 +11,8 @@ import pandas as pd
 import spacy
 from sentence_transformers import CrossEncoder
 import litellm
-# from litellm import completion
 from tqdm import tqdm
-from transformers import AutoTokenizer, AutoModelForCausalLM,
-# from accelerate import PartialState
-# from accelerate.inference import prepare_pippy
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForTokenClassification
 import torch
 import cohere
 from openai import OpenAI

@@ -41,20 +38,6 @@ nlp = spacy.load("en_core_web_sm")
 
 os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
 
-
-def load_evaluation_model(model_path):
-    """Load the evaluation model from the given path
-
-    Args:
-        model_path (str): Path to the evaluation model
-
-    Returns:
-        CrossEncoder: The evaluation model
-    """
-    model = CrossEncoder(model_path)
-    return model
-
-
 class ModelLoadingException(Exception):
     """Exception raised for errors in loading a model.
 

@@ -81,7 +64,7 @@ class SummaryGenerator:
         answer_rate (float): Rate of non-empty summaries.
     """
 
-    def __init__(self, model_id, revision):
+    def __init__(self, model_id, revision, device):
         """
         Initializes the SummaryGenerator with a model.
 

@@ -94,6 +77,7 @@ class SummaryGenerator:
         self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
         self.summaries_df = pd.DataFrame()
         self.revision = revision
+        self.device = device
         self.avg_length = None
         self.answer_rate = None
         self.exceptions = None

@@ -206,10 +190,9 @@ class SummaryGenerator:
 
         payload = {
             "model": self.model_id,
-            # "max_tokens": 4096,
             'max_new_tokens': 250,
             "temperature": 0.0,
-
+
         }
         payload['messages'] = [{"role": "system", "content": system_prompt},
                                {"role": "user", "content": user_prompt}]

@@ -365,39 +348,40 @@ class SummaryGenerator:
 
         # Using HF API or download checkpoints
         elif self.local_model is None and self.local_pipeline is None:
-            try: # try use HuggingFace API
-
-
-
-
-
-
-
-
+            # try: # try use HuggingFace API
+            #     print('** using huggingface api')
+            #     response = litellm.completion(
+            #         model=self.model,
+            #         messages=[{"role": "system", "content": system_prompt},
+            #                   {"role": "user", "content": user_prompt}],
+            #         temperature=0.0,
+            #         max_tokens=250,
+            #         api_base=self.api_base,
+            #     )
+            #     result = response['choices'][0]['message']['content']
+            #     result = result.split('<|im_end|>')[0]
+            #     print(result)
+            #     return result
+            # except Exception as e:
+            #     if 'Rate limit reached' in str(e) :
+            #         wait_time = 300
+            #         current_time = datetime.now().strftime('%H:%M:%S')
+            #         print(f"Rate limit hit at {current_time}. Waiting for 5 minutes before retrying...")
+            #         time.sleep(wait_time)
+            #     else:
+            if using_pipeline:
+                self.local_pipeline = pipeline(
+                    "text-generation",
+                    model=self.model_id,
+                    model_kwargs={"torch_dtype": torch.bfloat16},
+                    device_map="auto",
             )
-
-
-                print(
-
-
-
-                wait_time = 300
-                current_time = datetime.now().strftime('%H:%M:%S')
-                print(f"Rate limit hit at {current_time}. Waiting for 5 minutes before retrying...")
-                time.sleep(wait_time)
-            else:
-                if using_pipeline:
-                    self.local_pipeline = pipeline(
-                        "text-generation",
-                        model=self.model_id,
-                        model_kwargs={"torch_dtype": torch.bfloat16},
-                        device_map="auto",
-                    )
-                else:
-                    self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf" if 'openelm' in self.model_id.lower() else self.model_id, trust_remote_code=True)
-                    print("Tokenizer loaded")
-                    self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto")
-                    print("Local model loaded")
+            else:
+                self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf" if 'openelm' in self.model_id.lower() else self.model_id, trust_remote_code=True)
+                print("Tokenizer loaded")
+                self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto")
+                print(self.local_model.device)
+                print("Local model loaded")
 
 
         # Using local model/pipeline

@@ -438,7 +422,7 @@ class SummaryGenerator:
         prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
         # print(prompt)
         # print('-'*50)
-        input_ids = self.tokenizer(prompt, return_tensors="pt").to(
+        input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
         with torch.no_grad():
             outputs = self.local_model.generate(**input_ids, max_new_tokens=250, do_sample=True, temperature=0.01, pad_token_id=self.tokenizer.eos_token_id)
         if 'glm' in self.model_id.lower():

@@ -451,6 +435,8 @@ class SummaryGenerator:
             result = result.split("### Assistant:\n")[-1]
 
         else:
+            print(prompt)
+            print('-'*50)
             result = result.replace(prompt.strip(), '')
 
         print(result)

@@ -494,17 +480,43 @@ class EvaluationModel:
         hallucination_rate (float): Rate of hallucination in summaries.
     """
 
-    def __init__(self, model_path):
+    def __init__(self, model_path, device):
         """
         Initializes the EvaluationModel with a CrossEncoder model.
 
         Args:
             model_path (str): Path to the CrossEncoder model.
         """
-        self.model =
+        self.model = AutoModelForTokenClassification.from_pretrained(model_path)
+        self.device = device
+        self.model.to(self.device)
         self.scores = []
         self.factual_consistency_rate = None
        self.hallucination_rate = None
+
+    def predict(self, text_pairs):
+        """Load LoRA adapters of HHEM and make predictions
+        All HHEM 2.1 settings, e.g., prompt template, are hardcoded in this function.
+        Args:
+            text_pairs: list of tuples, each tuple contains two strings (premise, hypothesis)
+            checkpoint: model ID on Hugging Face
+        """
+
+        prompt = "<pad> Determine if the hypothesis is true given the premise?\n\nPremise: {text1}\n\nHypothesis: {text2}"
+
+        tokenizer = AutoTokenizer.from_pretrained('t5-base')
+        inputs = tokenizer(
+            [prompt.format(text1=pair[0], text2=pair[1]) for pair in text_pairs],
+            return_tensors='pt', padding='longest').to(self.device)
+
+        self.model.eval()
+        with torch.no_grad():
+            output = self.model(**inputs)
+        logits = output.logits
+        logits = logits[:,0,:] # get the logits on the first token
+        logits = torch.softmax(logits, dim=-1)
+        scores = [round(x, 5) for x in logits[:, 1].tolist()] # list of float
+        return scores
 
     def evaluate_hallucination(self, summaries_df):
         """

@@ -525,22 +537,14 @@ class EvaluationModel:
         for doc, summary in source_summary_pairs:
             if util.is_summary_valid(summary):
                 try:
-                    # summary_pieces = summary.split('\n')
-                    # summary = summary_pieces[0] if len(summary_pieces[0].strip()) > 0 else summary_pieces[1]
                     summary = summary.replace('<bos>','').replace('<eos>','').strip()
-                    score = self.
-
-
-
-
-
-
-                    # print inconsistent summaries for checking
-                    if score < 0.5:
-                        print(doc)
-                        print('-'*10)
-                        print(summary)
-                        print('='*20)
+                    score = self.predict([(doc, summary)])[0]
+                    # print(score)
+                    # if score < 0.5:
+                    #     print(doc)
+                    #     print('-'*10)
+                    #     print(summary)
+                    #     print('='*20)
                     hem_scores.append(score)
                     sources.append(doc)
                     summaries.append(summary)
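The new EvaluationModel.predict() replaces the old CrossEncoder scoring: each (premise, hypothesis) pair is formatted into the hard-coded T5 prompt, the first-token logits of the token-classification head are softmaxed, and the class-1 probability is returned as the consistency score. A minimal usage sketch (the import paths and the example pair are illustrative assumptions, not part of the commit):

from src import envs  # assumed import path for src/envs.py
from src.backend.model_operations import EvaluationModel

# envs.HEM_PATH is now 'vectara/HHEM-2.1'; envs.DEVICE is cuda if available, else cpu
eval_model = EvaluationModel(envs.HEM_PATH, envs.DEVICE)
pairs = [("The trial enrolled 120 patients in 2019.",   # premise (source document)
          "The trial enrolled 120 patients.")]          # hypothesis (generated summary)
scores = eval_model.predict(pairs)  # floats in [0, 1]; higher means more consistent
print(scores)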
src/envs.py CHANGED

@@ -23,10 +23,10 @@ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #"cpu"
 API = HfApi(token=TOKEN)
 
-LEADERBOARD_DATASET_PATH = "
+LEADERBOARD_DATASET_PATH = "leaderboard_results/leaderboard_summaries.csv"
 DATASET_PATH = "src/datasets/leaderboard_dataset.csv"
 SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
-HEM_PATH = 'vectara/
+HEM_PATH = 'vectara/HHEM-2.1'
 
 SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
 USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
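Taken together with the .gitignore and evaluate_model.py changes above, the new paths assume a local layout roughly like this (a sketch; only leaderboard_summaries.csv is named explicitly in this commit):

<leaderboard folder>/                  # os.getcwd() when the evaluator runs
    leaderboard_results/               # git-ignored; must be downloaded before evaluation
        leaderboard_summaries.csv      # LEADERBOARD_DATASET_PATH; appended to after each run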