|
<!DOCTYPE html> |
|
<html lang="en"> |
|
<head> |
|
<meta charset="UTF-8"> |
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
|
<title>Stick To Your Role! Leaderboard</title> |
|
|
|
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/5.1.3/css/bootstrap.min.css"> |
|
|
|
<link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/dataTables.bootstrap5.min.css"> |
|
|
|
<style> |
|
/* Page canvas: light grey background with an Arial / sans-serif type stack. */
body {
  font-family: 'Arial', sans-serif;
  background-color: #f8f9fa;
}

/* White, rounded, softly shadowed card that wraps the page content. */
.container {
  max-width: 1200px;
  margin: auto;
  padding: 15px;
  border-radius: 8px;
  background: #fff;
  box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
|
|
|
/* Both heading levels share colour and centering. */
h1,
h2 {
  color: #333;
  text-align: center;
}

/* Second-level headings additionally get space above them. */
h2 {
  margin-top: 30px;
}

/* Paragraphs: centered column capped at 1000px, text left-aligned.
   Shorthand order is top | left/right | bottom. */
p {
  margin: 20px auto 10px;
  max-width: 1000px;
  text-align: left;
}
|
|
|
/* Wrapper around each rendered table: centered, capped at 1000px, with a
   20px gap above. Spacing and centering are expressed in one `margin`
   shorthand on purpose: a separate `margin: auto` declared after
   `margin-top` would reset the top margin and drop the gap. */
.table-responsive {
  max-width: 1000px;
  margin: 20px auto 0;
}
|
|
|
/* Font sizes applied inside the two table wrappers. */
.main-table {
  font-size: 15px;
}

.full-table {
  font-size: 12px;
}
|
|
|
/* Base table box: fixed 1000px width, centered, no outer border.
   `border-collapse: separate` with zero spacing keeps cell borders adjacent
   while still allowing rounded header corners (see the th rules below). */
table {
  width: 1000px;
  margin: auto;
  border: none;
  border-collapse: separate;
  border-spacing: 0;
}

/* Header cells: purple band with white, left-aligned text. */
table thead th {
  color: white;
  background-color: #610b5d;
  border: 1px solid #dee2e6;
  text-align: left;
}

/* Body rows: white with a light shadow ... */
table tbody tr {
  background-color: #fff;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

/* ... turning light grey under the pointer. */
table tbody tr:hover {
  background-color: #f1f1f1;
}

/* Every cell gets compact padding and a thin grey border. */
table td,
table th {
  padding: 5px;
  border: 1px solid #dee2e6;
}

/* Round the two top corners of the header row. */
table th:first-child {
  border-top-left-radius: 10px;
}

table th:last-child {
  border-top-right-radius: 10px;
}
|
/* Left-aligned text block with 150px of horizontal padding on each side. */
.section {
  text-align: left;
  padding-right: 150px;
  padding-left: 150px;
}
|
/* Citation area: centered column capped at 1000px with a 5px gap above.
   The gap and the centering share one `margin` shorthand so that a later
   `margin: auto` cannot reset the top margin. */
.citation-section {
  max-width: 1000px;
  margin: 5px auto 0;
  text-align: center;
}
|
/* Monospace box holding the BibTeX entry. */
.citation-box {
  margin-top: 5px;
  padding: 5px;
  border: 1px solid #dee2e6;
  border-radius: 8px;
  background-color: #f8f9fa;
  font-family: 'Courier New', Courier, monospace;
  font-size: 12px;
  text-align: left;
  /* `pre` preserves the entry's line breaks but also disables wrapping ... */
  white-space: pre;
  /* ... so let long lines scroll inside the box instead of spilling out of it. */
  overflow-x: auto;
}
|
/* Row of figures: centered flex row, capped at 1100px, 10px between items
   and 40px of space below. The bottom spacing is part of the `margin`
   shorthand because a `margin: auto` declared after `margin-bottom` would
   reset it. */
.image-container {
  display: flex;
  justify-content: center;
  gap: 10px;
  max-width: 1100px;
  margin: 0 auto 40px;
}

/* Each linked figure takes an equal share of the row. */
.image-container a {
  flex: 1;
}

/* Images shrink to fit their link, keep their aspect ratio and are centered. */
.image-container img {
  display: block;
  max-width: 100%;
  height: auto;
  margin: auto;
}
|
/* Centered wrapper with 50px of space above and below the call-to-action link. */
.about-button {
  margin-top: 50px;
  margin-bottom: 50px;
  text-align: center;
}

/* Pill-shaped purple link styled as a button. */
.custom-button {
  padding: 10px 20px;
  border-radius: 15px;
  font-size: 18px;
  text-decoration: none;
  color: #fff;
  background-color: #610b5d;
}

/* Lighter purple on hover; the text stays white. */
.custom-button:hover {
  color: #fff;
  background-color: #812b7d;
}
|
|
|
|
|
/* Suppress the :before/:after pseudo-elements on sortable header cells
   (presumably the arrows drawn by the DataTables stylesheet; verify against
   the loaded dataTables.bootstrap5 CSS). */
table.dataTable thead .sorting:before,
table.dataTable thead .sorting:after,
table.dataTable thead .sorting_asc:before,
table.dataTable thead .sorting_asc:after,
table.dataTable thead .sorting_desc:before,
table.dataTable thead .sorting_desc:after {
  display: none;
}

/* Show a custom icon at the right edge of the header that is currently
   sorted ascending ... */
table.dataTable thead .sorting_asc {
  background-image: url("{{ url_for('static', filename='icons/sort_asc_gray.png') }}");
  background-position: center right;
  background-repeat: no-repeat;
}

/* ... or descending. */
table.dataTable thead .sorting_desc {
  background-image: url("{{ url_for('static', filename='icons/sort_desc_gray.png') }}");
  background-position: center right;
  background-repeat: no-repeat;
}

/* Use a 5px right padding on sortable header cells. */
table.dataTable > thead > tr > th:not(.sorting_disabled),
table.dataTable > thead > tr > td:not(.sorting_disabled) {
  padding-right: 5px;
}

/* The first two columns are left-aligned in both header and body ... */
table.dataTable thead th:first-child,
table.dataTable thead th:nth-child(2),
table.dataTable tbody td:first-child,
table.dataTable tbody td:nth-child(2) {
  text-align: left;
}

/* ... and every remaining column is centered. */
table.dataTable thead th:not(:first-child):not(:nth-child(2)),
table.dataTable tbody td:not(:first-child):not(:nth-child(2)) {
  text-align: center;
}
|
|
|
</style> |
|
</head> |
|
<body> |
|
<div class="container"> |
|
<h1 class="mt-5">Stick To Your Role! Leaderboard</h1> |
|
<p> |
|
The Stick to Your Role! leaderboard compares LLMs based on <b>undesired sensitivity to context change</b>. |
|
It focuses on the stability of personal value expression in simulated personas. |
|
As proposed in our <a href="https://arxiv.org/abs/2402.14846">paper</a>, |
|
unwanted context-dependence should be seen as a <b>property of LLMs</b> - a dimension of LLM comparison (alongside others such as model size, speed, or expressed knowledge).
|
This leaderboard aims to provide such a comparison and extends our paper with a more focused and elaborate experimental setup. |
|
Whereas standard benchmarks present MANY questions from the SAME MINIMAL contexts (e.g. multiple-choice questions),
we present the SAME questions from MANY different contexts.
|
</p> |
|
<div class="table-responsive main-table"> |
|
|
|
{{ main_table_html|safe }} |
|
</div> |
|
<div class="image-container"> |
|
<a href="{{ url_for('static', filename='figures/cardinal.svg') }}" target="_blank"> |
|
<img src="{{ url_for('static', filename='figures/cardinal.svg') }}" alt="Cardinal"> |
|
</a> |
|
<a href="{{ url_for('static', filename='figures/ordinal.svg') }}" target="_blank"> |
|
<img src="{{ url_for('static', filename='figures/ordinal.svg') }}" alt="Ordinal"> |
|
</a> |
|
</div> |
|
<p> |
|
We leverage Schwartz's theory of <a href="https://www.sciencedirect.com/science/article/abs/pii/S0065260108602816">Basic Personal Values</a>,
which defines 10 values (Self-Direction, Stimulation, Hedonism, Achievement, Power, Security, Conformity, Tradition, Benevolence, Universalism),
|
and the associated PVQ-40 and SVS questionnaires (available <a href="https://www.researchgate.net/publication/354384463_A_Repository_of_Schwartz_Value_Scales_with_Instructions_and_an_Introduction">here</a>). |
|
</p> |
|
<p> |
|
Using the <a href="https://pubmed.ncbi.nlm.nih.gov/31402448/">methodology from psychology</a>, we focus on population-level (interpersonal) value stability, i.e. <b>Rank-Order stability</b> (RO stability). |
|
Rank-Order stability refers to the extent to which the order of different personas (in terms of expression of some value) remains the same across different contexts.
|
Refer <a href="{{ url_for('about', _anchor='rank_order_stability') }}">here</a> or to our <a href="https://arxiv.org/abs/2402.14846">paper</a> for more details. |
|
</p> |
|
<p> |
|
In addition to Rank-Order stability we compute <b>validity metrics</b>, which are a common practice in psychology. |
|
Validity refers to the extent to which the questionnaire measures what it purports to measure.
It can be seen as the questionnaire's accuracy in measuring the intended factors, i.e. values.
|
For example, basic personal values should be organized in a circular structure, and questions measuring the same value should be correlated. |
|
The table below additionally shows the validity metrics; refer <a href="{{ url_for('about', _anchor='metrics') }}">here</a> for more details.
|
</p> |
|
<p> |
|
We <b>aggregate</b> Rank-Order stability and validity metrics to rank the models. We do so in two ways: <b>Cardinal</b> and <b>Ordinal</b>.
Following <a href="https://arxiv.org/abs/2405.01719">this paper</a>, we compute the stability and diversity of those rankings. See <a href="{{ url_for('about', _anchor='aggregate_metrics') }}">here</a> for more details.
|
</p> |
|
<div class="table-responsive full-table"> |
|
|
|
{{ full_table_html|safe }} |
|
</div> |
|
<div class="about-button"> |
|
<a href="{{ url_for('about') }}" class="custom-button mt-3">Learn More About This Project</a> |
|
</div> |
|
<div class="citation-section"> |
|
<p> |
|
If you found this project useful, please cite our related paper, |
|
which this leaderboard extends with a more focused and elaborate experimental setup. |
|
Refer <a href="{{ url_for('about', _anchor='paper') }}">here</a> for details. |
|
</p> |
|
<div class="citation-box" id="citation-text"> |
|
@article{kovavc2024stick, |
|
title={Stick to your Role! Stability of Personal Values Expressed in Large Language Models}, |
|
author={Kova{\v{c}}, Grgur and Portelas, R{\'e}my and Sawayama, Masataka and Dominey, Peter Ford and Oudeyer, Pierre-Yves}, |
|
journal={arXiv preprint arXiv:2402.14846}, |
|
year={2024} |
|
} |
|
</div> |
|
</div> |
|
</div> |
|
|
|
|
|
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script> |
|
|
|
<script src="https://stackpath.bootstrapcdn.com/bootstrap/5.1.3/js/bootstrap.bundle.min.js"></script> |
|
|
|
<script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script> |
|
<script src="https://cdn.datatables.net/1.11.5/js/dataTables.bootstrap5.min.js"></script> |
|
|
|
<script> |
|
// Once the DOM is ready, turn every <table> on the page into a DataTable.
$(function () {
  // Column 0 is filled in by the draw callback below, so it is excluded
  // from both sorting and searching.
  const indexColumn = 0;

  $('table').DataTable({
    paging: false,
    info: false,
    columnDefs: [
      { targets: indexColumn, orderable: false, searchable: false }
    ],
    // Initial sort: third column (index 2), descending.
    order: [[2, 'desc']],
    // After every draw, write 1, 2, 3, ... into column 0 following the
    // currently applied sort order.
    drawCallback: function () {
      this.api()
        .column(indexColumn, { order: 'applied' })
        .nodes()
        .each(function (cell, position) {
          cell.innerHTML = position + 1;
        });
    }
  });
});
|
</script> |
|
</body> |
|
</html> |
|
|