grg committed on
Commit
ca60da9
1 Parent(s): b42419d

Add cardinal and ordinal figures, update descriptions on per-model pages and index, fix leaderboard.csv

Browse files
app.py CHANGED
@@ -11,6 +11,17 @@ def index():
11
  df = df.round(3)
12
  df.insert(0, '#', '')
13
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Generate the table HTML with clickable model names
15
  table_html = df.to_html(classes='table table-striped table-bordered', escape=False, index=False)
16
 
 
11
  df = df.round(3)
12
  df.insert(0, '#', '')
13
 
14
+ df = df.rename(columns={
15
+ "Ordinal (Win rate)": "Ordinal (Win rate) (+)",
16
+ "Cardinal (Score)": "Cardinal (Score) (+)",
17
+ "RO Stability": "RO Stability (+)",
18
+ "Rank Distance": "Rank Distance (-)",
19
+ "CFI": "CFI (+)",
20
+ "SRMR": "SRMR (-)",
21
+ "RMSEA": "RMSEA (-)",
22
+ "Cronbach alpha": "Cronbach alpha (+)"
23
+ })
24
+
25
  # Generate the table HTML with clickable model names
26
  table_html = df.to_html(classes='table table-striped table-bordered', escape=False, index=False)
27
 
static/figures/Qwen2-7B-Instruct/matrix.svg CHANGED
static/figures/cardinal.svg ADDED
static/figures/ordinal.svg ADDED
static/leaderboard.csv CHANGED
@@ -1,15 +1,15 @@
1
  Model,Ordinal (Win rate),Cardinal (Score),RO Stability,Rank Distance,CFI,SRMR,RMSEA,Cronbach alpha
2
- phi-3-mini-128k-instruct,0.258309591642925,0.18707102480796897,0.16880341880341881,0.7264957264957265,0.32478632478632474,0.5555555555555556,0.6239316239316239,0.23076923076923084
3
- phi-3-medium-128k-instruct,0.28490028490028496,0.18789267301588508,0.26282051282051283,0.7521367521367521,0.2820512820512821,0.7692307692307692,0.7606837606837606,0.5128205128205128
4
- Mistral-7B-Instruct-v0.1,0.3418803418803419,0.23063750442486428,0.12393162393162394,0.4786324786324786,0.49572649572649574,0.3076923076923077,0.2222222222222222,0.09401709401709403
5
- Mistral-7B-Instruct-v0.2,0.3342830009496676,0.1747138068267554,0.41666666666666674,0.4358974358974358,0.1282051282051282,0.829059829059829,0.8205128205128205,0.2991452991452992
6
- Mistral-7B-Instruct-v0.3,0.3133903133903134,0.20131219867252867,0.23504273504273498,0.4273504273504273,0.28205128205128205,0.6324786324786325,0.5897435897435896,0.24786324786324787
7
- Mixtral-8x7B-Instruct-v0.1,0.43114909781576455,0.2437400779497571,0.5811965811965812,0.641025641025641,0.2136752136752137,0.7863247863247863,0.7521367521367521,0.5213675213675214
8
- Mixtral-8x22B-Instruct-v0.1,0.29629629629629634,0.18791617935864172,0.37820512820512825,0.6837606837606838,0.1794871794871795,0.9230769230769231,0.9145299145299145,0.49572649572649574
9
- command_r_plus,0.560303893637227,0.3737946817620246,0.6880341880341879,0.6923076923076923,0.45299145299145294,0.5128205128205128,0.5811965811965811,0.6239316239316239
10
- llama_3_8b_instruct,0.4691358024691358,0.28828624999947805,0.5747863247863247,0.5470085470085471,0.3162393162393162,0.7008547008547008,0.6923076923076923,0.5470085470085471
11
- llama_3_70b_instruct,0.7701804368471036,0.5976823900754995,0.9380341880341881,0.7264957264957265,0.8376068376068376,0.4273504273504274,0.49572649572649574,0.9914529914529915
12
- Qwen2-7B-Instruct,0.5251661918328584,0.3400513233761655,0.5769230769230768,0.5811965811965811,0.4188034188034188,0.45299145299145294,0.4871794871794872,0.5213675213675214
13
- Qwen2-72B-Instruct,0.5906932573599241,0.42123592516768155,0.9658119658119655,0.5811965811965811,0.07692307692307693,0.9658119658119658,0.9914529914529915,0.9145299145299146
14
- gpt-3.5-turbo-0125,0.23741690408357075,0.14920836189480854,0.24145299145299137,0.7777777777777778,0.1965811965811966,0.717948717948718,0.7094017094017093,0.1794871794871795
15
- gpt-4o-0513,0.7340930674264008,0.5383734693976642,0.8482905982905984,0.6666666666666667,0.811965811965812,0.41025641025641024,0.3418803418803419,0.8205128205128205
 
1
  Model,Ordinal (Win rate),Cardinal (Score),RO Stability,Rank Distance,CFI,SRMR,RMSEA,Cronbach alpha
2
+ phi-3-mini-128k-instruct,0.258309591642925,0.18707102480796897,0.039299993295009855,0.7387387387387387,0.18377777777777776,0.5171888888888889,0.5129444444444444,0.11153354438597567
3
+ phi-3-medium-128k-instruct,0.28490028490028496,0.18789267301588508,0.09692037989916814,0.7207207207207207,0.13975555555555555,0.7127888888888889,0.7074888888888888,0.30459548048923546
4
+ Mistral-7B-Instruct-v0.1,0.3418803418803419,0.23063750442486428,0.027216280472015988,0.6636636636636636,0.22268888888888888,0.32623333333333326,0.3056444444444445,0.039724970488267154
5
+ Mistral-7B-Instruct-v0.2,0.3342830009496676,0.1747138068267554,0.14417876497818388,0.6516516516516517,0.08117777777777778,0.8068555555555555,0.8051,0.1781386309574924
6
+ Mistral-7B-Instruct-v0.3,0.3133903133903134,0.20131219867252867,0.07960539866974455,0.6126126126126126,0.15823333333333334,0.6148222222222223,0.6058111111111111,0.1684008059863923
7
+ Mixtral-8x7B-Instruct-v0.1,0.43114909781576455,0.2437400779497571,0.21473356319081474,0.6846846846846846,0.13543333333333335,0.7121888888888889,0.7086555555555556,0.3048222445803502
8
+ Mixtral-8x22B-Instruct-v0.1,0.29629629629629634,0.18791617935864172,0.1414001940345544,0.6696696696696696,0.11905555555555554,0.8129333333333333,0.8111777777777778,0.30037006331478344
9
+ command_r_plus,0.560303893637227,0.3737946817620246,0.3429686514651868,0.6726726726726726,0.2763777777777778,0.5177555555555555,0.5135777777777777,0.41990575822570303
10
+ llama_3_8b_instruct,0.4691358024691358,0.28828624999947805,0.24527785038654715,0.6996996996996997,0.21058888888888885,0.6220777777777777,0.6159777777777777,0.34063121481548086
11
+ llama_3_70b_instruct,0.7701804368471036,0.5976823900754995,0.607020698814379,0.6966966966966968,0.4662222222222222,0.25131111111111115,0.2503333333333333,0.6831776343408991
12
+ Qwen2-7B-Instruct,0.5251661918328584,0.3400513233761655,0.25108519506513916,0.7057057057057057,0.2508,0.4224444444444444,0.41727777777777786,0.35074905805286155
13
+ Qwen2-72B-Instruct,0.5906932573599241,0.42123592516768155,0.6465993243020925,0.6426426426426426,0.06031111111111111,0.9069111111111111,0.9069555555555555,0.6009242274989618
14
+ gpt-3.5-turbo-0125,0.23741690408357075,0.14920836189480854,0.08240359836763214,0.7297297297297297,0.08906666666666668,0.7106777777777777,0.7033,0.06790170442358906
15
+ gpt-4o-0513,0.7340930674264008,0.5383734693976642,0.5122163952167618,0.6546546546546546,0.4056777777777778,0.24250000000000016,0.23266666666666658,0.5206391872554754
templates/about.html CHANGED
@@ -134,20 +134,23 @@
134
  </div>
135
  <div class="section">
136
  <div class="section-title">Motivation</div>
137
- <ul>
138
- <li> LLMs are often used to simulate personas and populations</li>
139
- <li> Benchmarks usually compare models with MANY QUESTIONS from A SINGLE MINIMAL CONTEXT, e.g. as multiple choices questions
140
- -> this kind of evaluation is little informative of LLMs' behavior in deployment when exposed to new contexts
141
- (especially when we consider the LLMs highly context-dependant nature)
142
- <li> CONTEXT-DEPENDENCE can be seen as a PROPERTY of LLMs: a dimension of LLM comparison alongside others like size, speed, or knowledge
143
- <li> We evaluate LLMs by asking the SAME QUESTIONS from MANY DIFFERENT CONTEXTS
144
- <li> We study the coherence of simulated populations over different contexts (conversations on different topics)
145
- <li> We leverage the psychological methodology to study the interpersonal stability of personal value expression of those simulated populations
146
- </ul>
 
 
 
147
  </div>
148
  <div class="section">
149
  <div class="section-title">Administering a questionnaire in context to a simulated persona</div>
150
- <p>We use the following procedure:</p>
151
  <ol>
152
  <li> The Tested model is instructed to simulate a persona</li>
153
  <li> A separate model instance - The Interlocutor - is instructed to simulate a “human using a chatbot”
@@ -175,25 +178,25 @@
175
  Here are the considered context chunks:
176
  </p>
177
  <ul>
178
- <li> no_conv: no conversation is simulated the questions from the PVQ-40 questionnaire are given directly </li>
179
- <li> no_conv_svs: no conversation is simulated the questions from the SVS questionnaire are given directly </li>
180
- <li> chunk_0-chunk-4: 50 reddit posts are used as the initial Interlocutor model messages (one per persona). chunk_0 contains the longest posts, chunk_4 the shortest </li>
181
- <li> chess: "1. e4" is given as the initial message to all personas, but for each persona the Interlocutor model is instructed to simulate a different persona (instead of a human user) </li>
182
- <li> grammar: like chess, but "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding." is given as the initial message.
183
  </ul>
184
  </div>
185
  <div class="section">
186
  <div class="section-title">Metrics</div>
187
  <p>We evaluate the following metrics (+ denotes higher is better; - denotes lower is better) </p>
188
  <ul>
189
- <li> RO Stability (+) - Average Rank-Order stability between each pair of context chunks (see our <a href="https://arxiv.org/abs/2402.14846">paper</a> for details) </li>
190
- <li> Rank Distance (-) - Average distance between the theoretical and the observed order of values in a circular space. </li>
191
- <li> CFI (+) - a common Validity metric </li>
192
- <li> SRMR (-) - a common Validity metric </li>
193
- <li> RMSEA (-) - a common Validity metric </li>
194
- <li> Cronbach alpha (+) - a common Reliability metric </li>
195
- <li> Ordinal (Win rate) (+) - each context pair and each metric is considered as a game between models, the metric shows the average win rate over all such games</li>
196
- <li> Cardinal (Score) (+) - the average over all context pairs and metrics (with descending metric inverted) </li>
197
  </ul>
198
  </div>
199
  <div class="back-button">
 
134
  </div>
135
  <div class="section">
136
  <div class="section-title">Motivation</div>
137
+ <p>
138
+ Benchmarks usually compare models with MANY QUESTIONS from A SINGLE MINIMAL CONTEXT, e.g. as multiple choices questions.
139
+ This kind of evaluation is little informative of LLMs' behavior in deployment when exposed to new contexts (especially when we consider the LLMs' highly context-dependent nature).
140
+ We argue that CONTEXT-DEPENDENCE can be seen as a PROPERTY of LLMs: a dimension of LLM comparison alongside others like size, speed, or knowledge.
141
+ We evaluate LLMs by asking the SAME QUESTIONS from MANY DIFFERENT CONTEXTS.
142
+ </p>
143
+ <p>
144
+ LLMs are often used to simulate personas and populations.
145
+ We study the coherence of simulated populations over different contexts (conversations on different topics).
146
+ To do that we leverage the psychological methodology to study the interpersonal stability of personal value expression of those simulated populations.
147
+ We adopt the Schwartz Theory of Basic Personal Values that defines 10 values: Self-Direction, Stimulation, Hedonism, Achievement, Power, Security, Conformity, Tradition, Benevolence, and Universalism.
148
+ To score those values we use the associated questionnaires: PVQ-40, and SVS.
149
+ </p>
150
  </div>
151
  <div class="section">
152
  <div class="section-title">Administering a questionnaire in context to a simulated persona</div>
153
+ <p>To evaluate the stability on a population level we need to be able to evaluate a value profile expressed by a simulated individual in a specific context (conversation topic).</p>
154
  <ol>
155
  <li> The Tested model is instructed to simulate a persona</li>
156
  <li> A separate model instance - The Interlocutor - is instructed to simulate a “human using a chatbot”
 
178
  Here are the considered context chunks:
179
  </p>
180
  <ul>
181
+ <li> <b> no_conv </b>: no conversation is simulated; the questions from the PVQ-40 questionnaire are given directly </li>
182
+ <li> <b> no_conv_svs </b>: no conversation is simulated; the questions from the SVS questionnaire are given directly </li>
183
+ <li> <b> chunk_0-chunk-4 </b>: 50 reddit posts are used as the initial Interlocutor model messages (one per persona). chunk_0 contains the longest posts, chunk_4 the shortest </li>
184
+ <li> <b> chess </b>: "1. e4" is given as the initial message to all personas, but for each persona the Interlocutor model is instructed to simulate a different persona (instead of a human user) </li>
185
+ <li> <b> grammar </b>: like chess, but "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding." is given as the initial message.
186
  </ul>
187
  </div>
188
  <div class="section">
189
  <div class="section-title">Metrics</div>
190
  <p>We evaluate the following metrics (+ denotes higher is better; - denotes lower is better) </p>
191
  <ul>
192
+ <li> <b> RO Stability (+) </b> - Average Rank-Order stability between each pair of context chunks. More details are given in the per-model pages (e.g. <a href="model/gpt-4o-0513">gpt-4o-0513</a>) and in our <a href="https://arxiv.org/abs/2402.14846">paper</a> </li>
193
+ <li> <b> Rank Distance (-) </b> - Average distance between the theoretical and the observed order of values in a circular space. More details are given in the per-model pages (e.g. <a href="model/gpt-4o-0513">gpt-4o-0513</a>) </li>
194
+ <li> <b> CFI (+) </b> - a common Validity metric </li>
195
+ <li> <b> SRMR (-) </b> - a common Validity metric </li>
196
+ <li> <b> RMSEA (-) </b> - a common Validity metric </li>
197
+ <li> <b> Cronbach alpha (+) </b> - a common Reliability metric </li>
198
+ <li> <b> Ordinal (Win rate) (+) </b> - each context pair and each metric is considered as a game between models, the metric shows the average win rate over all such games</li>
199
+ <li> <b> Cardinal (Score) (+) </b> - the average over all context pairs and metrics (with descending metric inverted) </li>
200
  </ul>
201
  </div>
202
  <div class="back-button">
templates/index.html CHANGED
@@ -17,7 +17,7 @@
17
  .container {
18
  max-width: 1200px; /* Limit the width of the container */
19
  margin: auto; /* Center the container */
20
- padding: 20px; /* Add some padding */
21
  background: #fff;
22
  border-radius: 8px;
23
  box-shadow: 0 4px 8px rgba(0,0,0,0.1);
@@ -37,7 +37,7 @@
37
  table {
38
  border-collapse: separate;
39
  border-spacing: 0;
40
- font-size: 14px; /* Reduce the font size */
41
  width: 100%;
42
  border: none; /* Remove any default border */
43
  }
@@ -55,7 +55,7 @@
55
  background-color: #f1f1f1;
56
  }
57
  table td, table th {
58
- padding: 10px; /* Reduce padding */
59
  border: 1px solid #dee2e6;
60
  }
61
  table th:first-child {
@@ -84,14 +84,17 @@
84
  font-family: 'Courier New', Courier, monospace;
85
  white-space: pre;
86
  }
87
-
88
  .image-container {
89
- width: 100%;
 
 
90
  margin-bottom: 40px;
91
  }
 
 
 
92
  .image-container img {
93
- width: 90%;
94
- max-width: 650px;
95
  height: auto;
96
  display: block;
97
  margin: auto;
@@ -122,6 +125,14 @@
122
  <!-- Render the table HTML here -->
123
  {{ table_html|safe }}
124
  </div>
 
 
 
 
 
 
 
 
125
  <div class="about-button">
126
  <a href="{{ url_for('about') }}" class="custom-button mt-3">Learn More About This Project</a>
127
  </div>
@@ -164,7 +175,6 @@
164
  }
165
  });
166
  });
167
-
168
  </script>
169
  </body>
170
  </html>
 
17
  .container {
18
  max-width: 1200px; /* Limit the width of the container */
19
  margin: auto; /* Center the container */
20
+ padding: 15px; /* Add some padding */
21
  background: #fff;
22
  border-radius: 8px;
23
  box-shadow: 0 4px 8px rgba(0,0,0,0.1);
 
37
  table {
38
  border-collapse: separate;
39
  border-spacing: 0;
40
+ font-size: 12px; /* Reduce the font size */
41
  width: 100%;
42
  border: none; /* Remove any default border */
43
  }
 
55
  background-color: #f1f1f1;
56
  }
57
  table td, table th {
58
+ padding: 5px; /* Reduce padding */
59
  border: 1px solid #dee2e6;
60
  }
61
  table th:first-child {
 
84
  font-family: 'Courier New', Courier, monospace;
85
  white-space: pre;
86
  }
 
87
  .image-container {
88
+ display: flex;
89
+ justify-content: center;
90
+ gap: 10px;
91
  margin-bottom: 40px;
92
  }
93
+ .image-container a {
94
+ flex: 1;
95
+ }
96
  .image-container img {
97
+ max-width: 100%;
 
98
  height: auto;
99
  display: block;
100
  margin: auto;
 
125
  <!-- Render the table HTML here -->
126
  {{ table_html|safe }}
127
  </div>
128
+ <div class="image-container">
129
+ <a href="{{ url_for('static', filename='figures/cardinal.svg') }}" target="_blank">
130
+ <img src="{{ url_for('static', filename='figures/cardinal.svg') }}" alt="Cardinal">
131
+ </a>
132
+ <a href="{{ url_for('static', filename='figures/ordinal.svg') }}" target="_blank">
133
+ <img src="{{ url_for('static', filename='figures/ordinal.svg') }}" alt="Ordinal">
134
+ </a>
135
+ </div>
136
  <div class="about-button">
137
  <a href="{{ url_for('about') }}" class="custom-button mt-3">Learn More About This Project</a>
138
  </div>
 
175
  }
176
  });
177
  });
 
178
  </script>
179
  </body>
180
  </html>
templates/model_detail.html CHANGED
@@ -43,6 +43,12 @@
43
  font-size: 30px;
44
  margin-bottom: 20px;
45
  }
 
 
 
 
 
 
46
  .image-container {
47
  width: 100%;
48
  margin-bottom: 40px;
@@ -89,7 +95,13 @@
89
  <div class="model-name">{{ model_name }}</div>
90
  <div class="image-section">
91
  <h2>Structure</h2>
92
- <p>This image shows .....</p>
 
 
 
 
 
 
93
  <div class="image-container">
94
  <a href="{{ url_for('static', filename='figures/' + model_name + '/structure.svg') }}" target="_blank">
95
  <img src="{{ url_for('static', filename='figures/' + model_name + '/structure.svg') }}" alt="Structure">
@@ -98,7 +110,13 @@
98
  </div>
99
  <div class="image-section">
100
  <h2>Pairwise Rank-Order stability</h2>
101
- <p>This image shows .....</p>
 
 
 
 
 
 
102
  <div class="matrix-image-container">
103
  <a href="{{ url_for('static', filename='figures/' + model_name + '/matrix.svg') }}" target="_blank">
104
  <img src="{{ url_for('static', filename='figures/' + model_name + '/matrix.svg') }}" alt="Matrix" >
@@ -106,8 +124,13 @@
106
  </div>
107
  </div>
108
  <div class="image-section">
109
- <h2>Ranks</h2>
110
- <p>This image shows .....</p>
 
 
 
 
 
111
  <div class="image-container">
112
  <a href="{{ url_for('static', filename='figures/' + model_name + '/ranks.svg') }}" target="_blank">
113
  <img src="{{ url_for('static', filename='figures/' + model_name + '/ranks.svg') }}" alt="Ranks">
 
43
  font-size: 30px;
44
  margin-bottom: 20px;
45
  }
46
+ .image-section p {
47
+ width: 80%;
48
+ margin: auto;
49
+ margin-bottom: 20px;
50
+ text-align: left;
51
+ }
52
  .image-container {
53
  width: 100%;
54
  margin-bottom: 40px;
 
95
  <div class="model-name">{{ model_name }}</div>
96
  <div class="image-section">
97
  <h2>Structure</h2>
98
+ <p>
99
+ This image shows the circular value structure projected on a 2D plane.
100
+ This was done by computing the intercorrelations between different values; this space was then reduced with an SVD-based approach and varimax rotation (`FactorAnalysis` object from `scikit-learn`).
101
+ The theoretical order is shown in the top left figure.
102
+ The distance is computed as the average distance of each value to its rank in the theoretical order.
103
+ The minimal distance with the theoretical order in the clockwise and counter-clockwise direction was taken as the final distance.
104
+ </p>
105
  <div class="image-container">
106
  <a href="{{ url_for('static', filename='figures/' + model_name + '/structure.svg') }}" target="_blank">
107
  <img src="{{ url_for('static', filename='figures/' + model_name + '/structure.svg') }}" alt="Structure">
 
110
  </div>
111
  <div class="image-section">
112
  <h2>Pairwise Rank-Order stability</h2>
113
+ <p>
114
+ This image shows the Rank-Order stability between each pair of context chunks.
115
+ Rank-Order stability is computed by ordering the personas based on their expression of some value,
116
+ and then computing the correlation between their orders in two different context chunks.
117
+ The stability estimates for the ten values are then averaged to get the final Rank-Order stability measure.
118
+ Refer to our <a href="https://arxiv.org/abs/2402.14846">paper</a> for details.
119
+ </p>
120
  <div class="matrix-image-container">
121
  <a href="{{ url_for('static', filename='figures/' + model_name + '/matrix.svg') }}" target="_blank">
122
  <img src="{{ url_for('static', filename='figures/' + model_name + '/matrix.svg') }}" alt="Matrix" >
 
124
  </div>
125
  </div>
126
  <div class="image-section">
127
+ <h2>Visualizing the order of simulated personas</h2>
128
+ <p>
129
+ This image visualizes the order of personas in each context chunk and for each value.
130
+ For each value (row), the personas are ordered on the x-axis by their expression of this value in the `no_conv` setting (gray).
131
+ Therefore, the Rank-Order stability between the `no_conv` chunk and some chunk corresponds to the extent to which the curve is increasing in that chunk.
132
+ (Only some personas are marked on the x-axis for clarity).
133
+ </p>
134
  <div class="image-container">
135
  <a href="{{ url_for('static', filename='figures/' + model_name + '/ranks.svg') }}" target="_blank">
136
  <img src="{{ url_for('static', filename='figures/' + model_name + '/ranks.svg') }}" alt="Ranks">