from typing import Any, Dict, List, Optional

from datasets import load_dataset

# Default config name in the "JetBrains-Research/lca-results" dataset.
MODEL = 'cmg_gpt_4_0613'
# Default directory passed to `load_dataset` as `cache_dir`.
CACHE_DIR = 'cache'


def load_data(
    model: Optional[str] = None,
    cache_dir: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Load the benchmark test split joined with one model's predictions.

    The "commitchronicle-py-long" config of "JetBrains-Research/lca-cmg" is
    loaded, indexed by ``(hash, repo)``, and its ``message`` column is renamed
    to ``reference``. The ``prediction`` column of the requested config of
    "JetBrains-Research/lca-results" is then joined onto it by that index.

    Args:
        model: Config name in "JetBrains-Research/lca-results" to read
            predictions from. ``None`` (the default) uses the module-level
            ``MODEL``.
        cache_dir: Directory handed to ``load_dataset`` as ``cache_dir``.
            ``None`` (the default) uses the module-level ``CACHE_DIR``.

    Returns:
        One dict per row of the benchmark split (``DataFrame.to_dict('records')``),
        with ``hash`` and ``repo`` restored as regular keys alongside
        ``reference`` and ``prediction``.
    """
    # Resolve defaults at call time so that reassigning the module constants
    # after import keeps affecting `load_data()` exactly as before.
    if model is None:
        model = MODEL
    if cache_dir is None:
        cache_dir = CACHE_DIR

    index_columns = ['hash', 'repo']

    references = load_dataset(
        "JetBrains-Research/lca-cmg",
        "commitchronicle-py-long",
        split="test",
        cache_dir=cache_dir,
    ).to_pandas()
    references = references.set_index(index_columns).rename(
        columns={'message': 'reference'}
    )

    predictions = load_dataset(
        "JetBrains-Research/lca-results",
        model,
        split="test",
        cache_dir=cache_dir,
    ).to_pandas()
    predictions = predictions.set_index(index_columns)[["prediction"]]

    # Keep only the first prediction per (hash, repo); duplicated keys on the
    # right-hand side would otherwise multiply rows in the join below.
    predictions = predictions[~predictions.index.duplicated(keep='first')]

    # `DataFrame.join` is a left join on the index by default, so every
    # benchmark row is kept; rows with no matching prediction get a missing
    # value in the `prediction` column.
    joined = references.join(other=predictions)
    return joined.reset_index().to_dict('records')