import logging

import pandas as pd

import src.envs as envs
from src.backend.model_operations import SummaryGenerator, EvaluationModel
import src.backend.util as util

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')


class Evaluator:
    """A class to evaluate summaries generated by a language model.

    Attributes:
        model (str): The name or path of the model.
        revision (str): The model revision.
        precision (str): The precision setting of the model.
        batch_size (int): Batch size for processing.
        device (str): The device to run the model on.
        no_cache (bool): Flag to disable caching.
        limit (int): Limit on the number of items to process.
        write_out (bool): Whether to write results to a file.
        output_base_path (str): Base path for output files.
        summary_generator (SummaryGenerator): Instance for generating summaries.
        eval_model (EvaluationModel): Instance for evaluating summaries.
    """

    def __init__(self, model, revision, precision, batch_size, device,
                 no_cache, limit, write_out=True, output_base_path='logs'):
        """Initializes the Evaluator with the given model and settings.

        Args:
            model (str): The name or path of the model.
            revision (str): The model revision.
            precision (str): The precision setting of the model.
            batch_size (int): Batch size for processing.
            device (str): The device to run the model on.
            no_cache (bool): Flag to disable caching.
            limit (int): Limit on the number of items to process.
            write_out (bool): Whether to write results to a file.
            output_base_path (str): Base path for output files.
        """
        self.model = model
        self.revision = revision
        self.precision = precision
        self.batch_size = batch_size
        self.device = device
        self.no_cache = no_cache
        self.limit = limit
        self.write_out = write_out
        self.output_base_path = output_base_path
        try:
            self.summary_generator = SummaryGenerator(model, revision)
            self.eval_model = EvaluationModel(envs.HEM_PATH)
        except Exception as e:
            logging.error(f"Error initializing Evaluator: {e}")
            raise

    def evaluate(self):
        """Performs the evaluation by generating summaries and computing metrics.

        Returns:
            dict: A dictionary containing the formatted evaluation results.
        """
        try:
            # Load the source documents and generate one summary per row.
            df = pd.read_csv(envs.DATASET_PATH)
            generated_summaries_df = self.summary_generator.generate_summaries(df)

            # Summary-generation statistics collected by the generator.
            avg_summary_len = self.summary_generator.avg_length
            answer_rate = self.summary_generator.answer_rate

            # Score the generated summaries; the evaluation model keeps the
            # per-summary scores and exposes the aggregate rates afterwards.
            hallucination_scores = self.eval_model.evaluate_hallucination(
                generated_summaries_df)
            factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
            hallucination_rate = self.eval_model.hallucination_rate

            results = util.format_results(
                model_name=self.model,
                revision=self.revision,
                precision=self.precision,
                factual_consistency_rate=factual_consistency_rate,
                hallucination_rate=hallucination_rate,
                answer_rate=answer_rate,
                avg_summary_len=avg_summary_len)
            return results
        except FileNotFoundError:
            logging.error(f"File not found: {envs.DATASET_PATH}")
            raise
        except Exception as e:
            logging.error(f"Error during evaluation: {e}")
            raise
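

# Illustrative usage sketch, not a required entry point: it shows how the
# Evaluator defined above might be driven from a script. The model name,
# revision, precision, and hardware settings below are placeholder
# assumptions; only the constructor signature and evaluate() call come from
# the class itself.
if __name__ == "__main__":
    evaluator = Evaluator(
        model="example-org/example-model",  # placeholder model identifier
        revision="main",
        precision="float16",
        batch_size=1,
        device="cuda",
        no_cache=True,
        limit=None,  # no cap on the number of items to process
        write_out=True,
        output_base_path="logs",
    )
    results = evaluator.evaluate()
    logging.info("Evaluation results: %s", results)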