File size: 23,737 Bytes
b42419d
 
 
 
 
40c29ba
b42419d
 
 
 
1c7b1ad
 
b42419d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b19d196
 
 
 
 
 
 
 
 
 
 
 
 
 
b42419d
 
b19d196
 
 
b42419d
b19d196
b42419d
b19d196
b42419d
b19d196
 
b42419d
 
 
 
 
 
 
 
 
 
 
 
 
 
40c29ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b42419d
 
 
 
40c29ba
 
b42419d
 
 
 
40c29ba
 
 
 
 
 
 
b42419d
 
 
 
 
 
 
 
 
 
 
 
b19d196
b42419d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ace58da
 
 
 
 
ca60da9
fa96c3a
ace58da
 
 
fa96c3a
ca60da9
 
ace58da
 
 
 
 
 
 
 
 
 
 
ca60da9
b42419d
 
40c29ba
 
b42419d
 
 
 
 
 
b19d196
b42419d
 
 
 
 
40c29ba
b42419d
 
 
 
 
 
 
40c29ba
 
b42419d
 
 
ca60da9
 
40c29ba
ca60da9
 
b42419d
 
40c29ba
 
 
 
ace58da
40c29ba
 
 
 
 
 
 
 
 
 
 
1047c44
 
 
 
 
40c29ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b42419d
1c7b1ad
 
 
 
b42419d
40c29ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3219568
40c29ba
 
1047c44
 
40c29ba
 
 
 
 
 
b42419d
 
215d189
b42419d
 
 
 
1c7b1ad
b42419d
 
1c7b1ad
 
b42419d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Stick To Your Role! About</title>
    <!-- Include Bootstrap CSS for styling -->
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/5.1.3/css/bootstrap.min.css">
    <!-- Include DataTables CSS -->
    <link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/dataTables.bootstrap5.min.css">
    <!-- Include mathjax -->
    <script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/3.1.2/es5/tex-mml-chtml.js"></script>
    <!-- Custom CSS for additional styling -->
    <style>
        /* Page canvas: light grey backdrop behind the white content card. */
        body {
            font-family: 'Arial', sans-serif;
            background-color: #f8f9fa;
        }
        /* Content card: horizontally centred, capped at 1200px wide, with
           rounded corners and a soft drop shadow. */
        .container {
            max-width: 1200px;
            margin: auto;
            padding: 20px;
            border-radius: 8px;
            background: #fff;
            box-shadow: 0 4px 8px rgba(0,0,0,0.1);
        }
        /* Both heading levels share colour and centring. */
        h1,
        h2 {
            color: #333;
            text-align: center;
        }
        /* Only second-level headings add extra space above. */
        h2 {
            margin-top: 30px;
        }
        /* Gap between the page heading and the table wrapper. */
        .table-responsive {
            margin-top: 20px;
        }
        /* Base table: full width, smaller text, no outer border. Cell borders
           are kept separate (needed for the rounded header corners below)
           with zero spacing between cells. */
        table {
            width: 100%;
            font-size: 14px;
            border: none;
            border-collapse: separate;
            border-spacing: 0;
        }
        /* Every cell: compact padding and a thin light-grey border. */
        table td,
        table th {
            padding: 10px;
            border: 1px solid #dee2e6;
        }
        /* Header cells: white, left-aligned text on the purple brand colour. */
        table thead th {
            color: #fff;
            background-color: #610b5d;
            border: 1px solid #dee2e6;
            text-align: left;
        }
        /* Round the two outer top corners of the header row. */
        table th:first-child {
            border-top-left-radius: 10px;
        }
        table th:last-child {
            border-top-right-radius: 10px;
        }
        /* Body rows: white with a faint shadow, grey while hovered. */
        table tbody tr {
            background-color: #fff;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        table tbody tr:hover {
            background-color: #f1f1f1;
        }
        /* A content section: left-aligned with a little space above. */
        .section {
            text-align: left;
            padding-top: 19px;
        }

        /* Section paragraphs: wide side gutters, first-line indent, and a
           10px gap below each paragraph. */
        .section p {
            margin: auto auto 10px;
            padding-left: 150px;
            padding-right: 150px;
            text-indent: 2em;
            text-align: left;
        }

        /* Ordered and unordered lists inside a section: same side gutters as
           section paragraphs, shifted 50px to the right, no space above and
           20px below.
           Each selector in a comma-separated list stands alone, so both list
           types must carry the `.section` prefix; a bare `ul` would restyle
           every unordered list on the page, not only those inside a section. */
        .section ol,
        .section ul {
            padding-left: 150px;
            padding-right: 150px;
            margin: auto;
            margin-bottom: 20px;
            margin-left: 50px;
            margin-top: 0px;
            text-align: left;
        }

        /* Wrapper around the "please cite" prompt and the BibTeX box. */
        .citation-section {
            margin-top: 50px;
            width: 100%;
            text-align: center;
        }
        /* BibTeX entry: monospace, left-aligned, whitespace preserved exactly
           as written in the markup (white-space: pre). */
        .citation-box {
            margin-top: 5px;
            padding: 10px;
            border: 1px solid #dee2e6;
            border-radius: 8px;
            background-color: #f8f9fa;
            font-family: 'Courier New', Courier, monospace;
            font-size: 15px;
            text-align: left;
            white-space: pre;
        }

        /* Side-by-side figure row: a centred flex container with a 10px gap
           between figures, limited to 70% of the parent width.
           The margin is written as one shorthand (0 top, auto left/right,
           40px bottom) so the horizontal centring and the bottom spacing
           cannot overwrite one another; a `margin: auto` placed after
           `margin-bottom` would reset the bottom margin. */
        .image-container-structure {
            display: flex;
            justify-content: center;
            gap: 10px;
            max-width: 70%; /* Adjust the width as needed */
            margin: 0 auto 40px;
        }

        /* Each linked figure in the side-by-side row takes an equal share. */
        .image-container-structure a {
            flex: 1;
        }

        /* Full-width wrapper for a single figure, with space below it. */
        .image-container {
            width: 100%;
            margin-bottom: 40px;
        }

        /* Common figure treatment: block-level, centred, aspect ratio kept. */
        .image-container-structure img,
        .image-container #admin-questionnaire,
        .image-container #ro-image {
            display: block;
            height: auto;
            margin: auto;
        }

        /* Per-figure sizing. */
        .image-container-structure img {
            max-width: 100%;
        }
        .image-container #admin-questionnaire {
            width: 50%;
        }
        .image-container #ro-image {
            width: 70%;
        }

        /* Brand treatment shared by section banners and the navigation
           button: white text on purple with rounded edges. */
        .section-title,
        .custom-button {
            color: #fff;
            background-color: #610b5d;
            border-radius: 15px;
        }

        /* Section banner: large, bold, centred title with generous padding. */
        .section-title {
            margin-bottom: 40px;
            padding: 20px;
            font-size: 24px;
            font-weight: bold;
            text-align: center;
        }
        /* Wrapper that centres the "Main page" button and spaces it from the
           content above. */
        .back-button {
            margin-top: 50px;
            text-align: center;
        }
        /* Link styled as a button: padded, larger text, no underline. */
        .custom-button {
            padding: 10px 20px;
            font-size: 18px;
            text-decoration: none;
        }
        /* Lighter purple while hovered; text stays white. */
        .custom-button:hover {
            color: #fff;
            background-color: #812b7d;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1 class="mt-5">Stick To Your Role! Leaderboard</h1>
        <div class="table-responsive">
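            <!-- Leaderboard table: table_html is inserted unescaped (|safe), so it must only ever contain trusted, application-generated markup -->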
            {{ table_html|safe }}
        </div>
        <div class="section">
            <div class="section-title">Motivation</div>
            <p>
                LLMs can role-play different personas by simulating their values and behavior, but can they stick to their role whatever the context?
                Is simulated Joan of Arc more tradition-driven than Elvis?
                Will it still be the case after playing chess?
            </p>
            <p>
                Benchmarks usually compare models with <b>many questions</b> from <b>a single minimal context</b>, e.g. as multiple-choice questions.
                This kind of evaluation is not very informative about LLMs' behavior in deployment, where they are exposed to new contexts (especially considering LLMs' highly context-dependent nature).
                We argue that <b>undesired context-dependence</b> can be seen as a <b>property of LLMs</b>: a dimension of LLM comparison alongside others like size, speed, or knowledge.
                While some context-dependence is desired (e.g. following instructions), some is undesired (e.g. drastically changing the simulated value expression based on the interlocutor).
                We evaluate LLMs by asking the <b> same questions </b> from <b> many different contexts </b>.
            </p>
            <p>
                LLMs are often used to simulate personas and populations. We study the stability and coherence of a simulated <b>population</b>: in contrast to evaluating each persona separately, <b>we evaluate the stability of simulated personas relative to each other, i.e. as a population</b>.
                We study simulated populations over different contexts, i.e. conversations on different topics.
                To do that, we leverage psychological methodology to study the interpersonal stability of personal value expression in a simulated population.
                We adopt the <b>Schwartz Theory of Basic Personal Values</b>, which defines 10 values: Self-Direction, Stimulation, Hedonism, Achievement, Power, Security, Conformity, Tradition, Benevolence, and Universalism.
                To evaluate their expression we use the associated questionnaires: <b>PVQ-40</b>, and <b>SVS</b>.
            </p>
            <p>
                The Stick to Your Role! leaderboard aims to provide an up-to-date comparison of recent LLMs based on their ability to coherently simulate populations.
                It, in tandem with other minimal-context benchmarks, should enable you to choose the best-suited model for your use case!
                If you want to evaluate or add your model you can follow the instructions
                <a href="{{ url_for('new_model') }}">here</a>.
            </p>
        </div>
        <div class="section">
            <div class="section-title">Administering a questionnaire to a simulated persona in context</div>
            <p>To evaluate the stability on a population level we need to be able to evaluate a <b>value profile</b> expressed by a <b>simulated individual</b> in a <b>specific context</b> (conversation topic). We do so with the following procedure:</p>
            <ol>
                <li> The Tested model is instructed to simulate a persona</li>
                <li> A separate model instance - The Interlocutor - is instructed to simulate a “human using a chatbot”</li>
                <li> A conversation topic is induced by manually setting the Interlocutor’s first message (e.g. Tell me a
                joke)</li>
                <li> A conversation is simulated</li>
                <li> A question from the questionnaire is set as the Interlocutor’s last message and The Tested model’s
                response is recorded (this is repeated for every item in the questionnaire)</li>
                <li> The questionnaire is scored to obtain scores for the 10 personal values</li>
            </ol>
            <div class="image-container">
                <a href="{{ url_for('static', filename='figures/admin_questionnaire.svg') }}" target="_blank">
                    <img id="admin-questionnaire"  src="{{ url_for('static', filename='figures/admin_questionnaire.svg') }}" alt="Diagram of administering a questionnaire to a simulated persona in context">
                </a>
            </div>
        </div>
        <div class="section">
            <div class="section-title">Contexts</div>
            <p>
                We aim to score the expressed value profile for each simulated persona in different contexts.
                More precisely a population (50 personas) is evaluated with a context chunk (50 topics: one per persona).
                Then, the simulated population in one context chunk is compared to the same population in another context chunk.
                Here are the considered context chunks:
            </p>
            <ul>
                <li> <b> no_conv </b>: no conversation is simulated; the questions from the PVQ-40 questionnaire are given directly </li>
                <li> <b> no_conv_svs </b>: no conversation is simulated; the questions from the SVS questionnaire are given directly </li>
                <li> <b> chunk_0 - chunk_4 </b>: <a href="https://gitlab.inria.fr/gkovac/value_stability/-/tree/master/contexts/leaderboard_reddit_chunks?ref_type=heads">50 Reddit posts</a> used as the initial Interlocutor model messages (one per persona). chunk_0 contains the longest posts, chunk_4 the shortest.</li>
                <li> <b> chess </b>: "1. e4" is given as the initial message to all personas, but for each persona the Interlocutor model is instructed to simulate a different persona (instead of a human user) </li>
                <li> <b> grammar </b>: like chess, but "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding." is given as the initial message.</li>
            </ul>
        </div>
        <div class="section" id="validation">
            <div class="section-title">Validation</div>
            <p>
                Validity refers to the extent the questionnaire measures what it purports to measure.
                It can be seen as the questionnaire's accuracy in measuring the intended factors, i.e. values.
                Following the recommendations in <a href="https://pubmed.ncbi.nlm.nih.gov/22329443/">this paper</a>,
                the validation consists of two phases: Theory-Based Multidimensional Scaling (MDS) and Confirmatory Factor Analysis (CFA).
            </p>
            <p>
                <b>Theory-Based Multidimensional Scaling (MDS)</b> tests that the expressed values are organized in a circular structure as predicted by the theory.
                Values should be ordered in a circle in the same order as shown on the figure below (Tradition and Conformity should be on the same angle with Tradition closer to the center).
                To compute the structure in our data, we calculate the intercorrelations between different items (questions).
                This provides us with 40 points in a 40D space (for PVQ-40), and this space is then reduced to 2D by <a href="https://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html">MDS</a>.
                Crucially, MDS is initialized with the theoretical circular value structure, i.e. items corresponding to the same value are assigned the same angle.
                When MDS is fit, it provides the <b>Stress (&darr;) </b> metric ('Stress-1 index') indicating the goodness of the fit.
                A value of 0 indicates 'perfect' fit, 0.025 excellent, 0.05 good, 0.1 fair, and 0.2 poor.
<!--                It is common to also qualitatively analyze this structure to see if the items are organized in distinct regions.-->
<!--                Since a leaderboard cannot contain qualitative measures,-->
<!--                we construct a quantitative measure with the same intuition.-->
<!--                The <b>Separability (&uarr;)</b> metric is the accuracy of a linear SVM OvO classifier.-->
<!--                The intuition is that all values should be linearly separable.-->
            </p>

            <div class="image-container-structure">
                <a href="{{ url_for('static', filename='figures/theoretical_structure.svg') }}" target="_blank">
                    <img id="theoretical-structure"  src="{{ url_for('static', filename='figures/theoretical_structure.svg') }}" alt="Theoretical Structure">
                </a>
                <a href="{{ url_for('static', filename='figures/gpt_no_conv_structure.svg') }}" target="_blank">
                    <img id="gpt4o-structure"  src="{{ url_for('static', filename='figures/gpt_no_conv_structure.svg') }}" alt="GPT-4o structure">
                </a>
            </div>

            <p>
                <b>Confirmatory Factor Analysis (CFA) </b> fits a model on the data.
                The model is defined according to the theory
                and the fit of this model is used as a metric.
                Due to the circular structure of basic personal values,
                it is <a href="https://pubmed.ncbi.nlm.nih.gov/22329443/">recommended</a> to employ a Magnifying glass CFA strategy.
                Four separate models are fit, one for each of the high level values (consisting of several low-level values):
                Conservation (security, conformity, tradition),
                Openness to Change (self-direction, stimulation, hedonism),
                Self-transcendence (benevolence, universalism),
                Self-enhancement (achievement, power).
                Fit is measured with three standard metrics:
                <b>Comparative Fit Index - CFI (&uarr;) - </b> compares the fit of a model to a more restricted baseline model
                (&gt;.90 considered acceptable fit).
                <b>Standardized root mean square residual - SRMR (&darr;) </b> compares the sample variances and covariances to the estimated ones
                (&lt;.05 considered good fit, &lt;.08 considered reasonable fit).
                <b>Root mean square error of approximation - RMSEA (&darr;)</b>
                reflects the degree to which a model fits the population covariance matrix,
                while taking into account the degrees of freedom and sample size
                (&lt;.05 considered good fit; &lt;.08 considered reasonable fit).
            </p>

        </div>
        <div class="section" id="rank_order_stability">
            <div class="section-title">Rank-Order stability</div>
            <p>
                <b>Rank-Order stability (&uarr;)</b> is used to estimate the stability of some value inside a population.
                In psychology, it is computed as the correlation in the order of individuals at two points in time (individuals are ordered based on
their expression of that value).
                Intuitively, this can be seen as addressing the following question: "Does Jack always value Tradition more than Jane does?".
                As shown below, instead of comparing two points in time, we compare
                <a href="https://gitlab.inria.fr/gkovac/value_stability/-/blob/master/personas/real_world_people/personas.json?ref_type=heads">the simulated population</a>
                in different contexts (simulated conversations of different topics).
                We then average over different context pairs and values to obtain the final estimate.
            </p>
            <div class="image-container" >
            <a href="{{ url_for('static', filename='figures/rank_order_stability_computation.svg') }}" target="_blank">
                <img id="ro-image" src="{{ url_for('static', filename='figures/rank_order_stability_computation.svg') }}" alt="Rank-order stability computation">
            </a>
            </div>
        </div>
        <div class="section" id="aggregate_metrics">
            <div class="section-title">Aggregate Metrics</div>
            <p>
            To rank models, we aggregate the rank-order and validity metrics in two ways:
            </p>
            <ul>
                <li> <b> Cardinal - Score (&uarr;) </b> - the score is averaged over all metrics (with descending metrics inverted), context pairs (for stability) and contexts (for validity metrics),
                    i.e. the average of \( \binom{n\_context\_chunks}{2} + n\_validity\_metrics*n\_context\_chunks \) values.</li>
                <li> <b> Ordinal - Win rate (&uarr;) </b> - for each metric, each context pair (for stability) and each context (for validity metrics) is considered as a game between two models; the win rate of a model is the percentage of games it wins against all other models,
                    i.e. the average over \( (n\_models-1) * ( \binom{n\_context\_chunks}{2} + n\_validity\_metrics*n\_context\_chunks) \) games.</li>
            </ul>
            <p>
                Following this <a href="https://arxiv.org/abs/2405.01719">paper</a> and associated <a href="https://github.com/socialfoundations/benchbench">benchbench</a> library,
                we can compute the diversity and the sensitivity of the two ranking methods.
                A benchmark is considered <b>diverse</b> if different tasks order models in different ways.
                We use the reversed Kendall’s coefficient of concordance (W) diversity metric.
                A benchmark is considered <b>sensitive</b> if the model ordering is sensitive to
                the addition of new irrelevant models (for ordinal benchmarks),
                or to the label noise (for cardinal benchmarks).
                We use the max rank change (MRC) sensitivity metric.
            </p>
        </div>
        <div class="section" id="paper">
            <div class="section-title">Differences with the paper</div>
            <p>
                This leaderboard is grounded in the methodology presented in our <a href="https://arxiv.org/abs/2402.14846">research paper</a>.
                The paper contains various experiments which are not included in the leaderboard such as:
                multiple populations,
                within-person stability,
                stability on downstream tasks,
                correlations of value expression and behavior on downstream tasks, and so on.
                The leaderboard focuses on population-level stability (Rank-Order) and contains various additions to the methodology.
                These changes were made to keep up with newly released models and to make the evaluation more detailed.
                We describe additions made in the leaderboard here for clarity:
            </p>
            <ol>
                <li>a new population was created and was balanced with respect to gender</li>
                <li>context chunks - instead of evaluating the stability of a population between pairs of contexts, where all personas are given the same topic (e.g. chess), we evaluate it between pairs of context chunks, where each participant is given a different random context</li>
                <li>more diverse and longer contexts (up to 6k tokens) were created with Reddit posts from the <a href="https://webis.de/data/webis-tldr-17.html">webis dataset</a> (the dataset was cleaned to exclude posts from NSFW subreddits)</li>
                <li>different interlocutors - chess and grammar topics were still introduced as in the paper (same context for all participants), but the interlocutor model was instructed to simulate a random persona from the same population (as opposed to a human user in other settings)</li>
                <li>in the paper, multiple seeds for the order of suggested answers were used; given that the results didn't vary much between seeds, a single seed is used here, facilitating the analysis with more and longer contexts</li>
                <li>evaluations were also done without simulating conversations (no_conv setting)</li>
                <li>evaluations were also done with the SVS questionnaire (in the no_conv setting)</li>
<!--                <li>validation metrics - Stress, Separability, CFI, SRMR, RMSEA metrics were introduced </li>-->
                <li>validation metrics - Stress, CFI, SRMR, RMSEA metrics were introduced </li>
                <li>cardinal and ordinal ordering with sensitivity and diversity estimates were added</li>
                <li>newer models were evaluated</li>
            </ol>
        </div>
        <div class="back-button">
            <a href="{{ url_for('index') }}" class="custom-button mt-3">Main page</a>
        </div>
        <div class="citation-section">
            <p>If you found this project useful, please cite our related paper:</p>
            <div class="citation-box" id="citation-text">
@inproceedings{kovavc2024stick,
  title={Stick to your Role! Stability of Personal Values Expressed in Large Language Models},
  author={Kova{\v{c}}, Grgur and Portelas, R{\'e}my and Sawayama, Masataka and Dominey, Peter Ford and Oudeyer, Pierre-Yves},
  booktitle={Proceedings of the Annual Meeting of the Cognitive Science Society},
  volume={46},
  year={2024}
}
            </div>
        </div>
    </div>

    <!-- Include jQuery -->
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <!-- Include Bootstrap JS -->
    <script src="https://stackpath.bootstrapcdn.com/bootstrap/5.1.3/js/bootstrap.bundle.min.js"></script>
    <!-- Include DataTables JS -->
    <script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
    <script src="https://cdn.datatables.net/1.11.5/js/dataTables.bootstrap5.min.js"></script>
    <!-- Initialize DataTables -->
    <script>
        // When the DOM is ready, enhance every <table> on the page with
        // DataTables: no paging, no info footer, initially ordered by the
        // column at index 2, descending.
        $(function () {
            // Index of the column that is renumbered after every draw.
            const rankColumn = 0;

            $('table').DataTable({
                paging: false,
                info: false,
                // The renumbered column is excluded from ordering and search.
                columnDefs: [
                    { targets: rankColumn, orderable: false, searchable: false }
                ],
                order: [[2, 'desc']],
                // Regular function (not an arrow) because the callback reads
                // the DataTables API from `this`.
                drawCallback: function () {
                    this.api()
                        .column(rankColumn, { order: 'applied' })
                        .nodes()
                        .each(function (cell, position) {
                            // Write 1-based positions following the currently
                            // applied ordering.
                            cell.innerHTML = position + 1;
                        });
                }
            });
        });

    </script>
</body>
</html>