Adding cardinal and ordinal figures, updating descriptions on per_model pages and index, fixed leaderboard.csv
Browse files- app.py +11 -0
- static/figures/Qwen2-7B-Instruct/matrix.svg +136 -136
- static/figures/cardinal.svg +1753 -0
- static/figures/ordinal.svg +1769 -0
- static/leaderboard.csv +14 -14
- templates/about.html +27 -24
- templates/index.html +18 -8
- templates/model_detail.html +27 -4
app.py
CHANGED
@@ -11,6 +11,17 @@ def index():
|
|
11 |
df = df.round(3)
|
12 |
df.insert(0, '#', '')
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
# Generate the table HTML with clickable model names
|
15 |
table_html = df.to_html(classes='table table-striped table-bordered', escape=False, index=False)
|
16 |
|
|
|
11 |
df = df.round(3)
|
12 |
df.insert(0, '#', '')
|
13 |
|
14 |
+
df = df.rename(columns={
|
15 |
+
"Ordinal (Win rate)": "Ordinal (Win rate) (+)",
|
16 |
+
"Cardinal (Score)": "Cardinal (Score) (+)",
|
17 |
+
"RO Stability": "RO Stability (+)",
|
18 |
+
"Rank Distance": "Rank Distance (-)",
|
19 |
+
"CFI": "CFI (+)",
|
20 |
+
"SRMR": "SRMR (-)",
|
21 |
+
"RMSEA": "RMSEA (-)",
|
22 |
+
"Cronbach alpha": "Cronbach alpha (+)"
|
23 |
+
})
|
24 |
+
|
25 |
# Generate the table HTML with clickable model names
|
26 |
table_html = df.to_html(classes='table table-striped table-bordered', escape=False, index=False)
|
27 |
|
static/figures/Qwen2-7B-Instruct/matrix.svg
CHANGED
static/figures/cardinal.svg
ADDED
static/figures/ordinal.svg
ADDED
static/leaderboard.csv
CHANGED
@@ -1,15 +1,15 @@
|
|
1 |
Model,Ordinal (Win rate),Cardinal (Score),RO Stability,Rank Distance,CFI,SRMR,RMSEA,Cronbach alpha
|
2 |
-
phi-3-mini-128k-instruct,0.258309591642925,0.18707102480796897,0.
|
3 |
-
phi-3-medium-128k-instruct,0.28490028490028496,0.18789267301588508,0.
|
4 |
-
Mistral-7B-Instruct-v0.1,0.3418803418803419,0.23063750442486428,0.
|
5 |
-
Mistral-7B-Instruct-v0.2,0.3342830009496676,0.1747138068267554,0.
|
6 |
-
Mistral-7B-Instruct-v0.3,0.3133903133903134,0.20131219867252867,0.
|
7 |
-
Mixtral-8x7B-Instruct-v0.1,0.43114909781576455,0.2437400779497571,0.
|
8 |
-
Mixtral-8x22B-Instruct-v0.1,0.29629629629629634,0.18791617935864172,0.
|
9 |
-
command_r_plus,0.560303893637227,0.3737946817620246,0.
|
10 |
-
llama_3_8b_instruct,0.4691358024691358,0.28828624999947805,0.
|
11 |
-
llama_3_70b_instruct,0.7701804368471036,0.5976823900754995,0.
|
12 |
-
Qwen2-7B-Instruct,0.5251661918328584,0.3400513233761655,0.
|
13 |
-
Qwen2-72B-Instruct,0.5906932573599241,0.42123592516768155,0.
|
14 |
-
gpt-3.5-turbo-0125,0.23741690408357075,0.14920836189480854,0.
|
15 |
-
gpt-4o-0513,0.7340930674264008,0.5383734693976642,0.
|
|
|
1 |
Model,Ordinal (Win rate),Cardinal (Score),RO Stability,Rank Distance,CFI,SRMR,RMSEA,Cronbach alpha
|
2 |
+
phi-3-mini-128k-instruct,0.258309591642925,0.18707102480796897,0.039299993295009855,0.7387387387387387,0.18377777777777776,0.5171888888888889,0.5129444444444444,0.11153354438597567
|
3 |
+
phi-3-medium-128k-instruct,0.28490028490028496,0.18789267301588508,0.09692037989916814,0.7207207207207207,0.13975555555555555,0.7127888888888889,0.7074888888888888,0.30459548048923546
|
4 |
+
Mistral-7B-Instruct-v0.1,0.3418803418803419,0.23063750442486428,0.027216280472015988,0.6636636636636636,0.22268888888888888,0.32623333333333326,0.3056444444444445,0.039724970488267154
|
5 |
+
Mistral-7B-Instruct-v0.2,0.3342830009496676,0.1747138068267554,0.14417876497818388,0.6516516516516517,0.08117777777777778,0.8068555555555555,0.8051,0.1781386309574924
|
6 |
+
Mistral-7B-Instruct-v0.3,0.3133903133903134,0.20131219867252867,0.07960539866974455,0.6126126126126126,0.15823333333333334,0.6148222222222223,0.6058111111111111,0.1684008059863923
|
7 |
+
Mixtral-8x7B-Instruct-v0.1,0.43114909781576455,0.2437400779497571,0.21473356319081474,0.6846846846846846,0.13543333333333335,0.7121888888888889,0.7086555555555556,0.3048222445803502
|
8 |
+
Mixtral-8x22B-Instruct-v0.1,0.29629629629629634,0.18791617935864172,0.1414001940345544,0.6696696696696696,0.11905555555555554,0.8129333333333333,0.8111777777777778,0.30037006331478344
|
9 |
+
command_r_plus,0.560303893637227,0.3737946817620246,0.3429686514651868,0.6726726726726726,0.2763777777777778,0.5177555555555555,0.5135777777777777,0.41990575822570303
|
10 |
+
llama_3_8b_instruct,0.4691358024691358,0.28828624999947805,0.24527785038654715,0.6996996996996997,0.21058888888888885,0.6220777777777777,0.6159777777777777,0.34063121481548086
|
11 |
+
llama_3_70b_instruct,0.7701804368471036,0.5976823900754995,0.607020698814379,0.6966966966966968,0.4662222222222222,0.25131111111111115,0.2503333333333333,0.6831776343408991
|
12 |
+
Qwen2-7B-Instruct,0.5251661918328584,0.3400513233761655,0.25108519506513916,0.7057057057057057,0.2508,0.4224444444444444,0.41727777777777786,0.35074905805286155
|
13 |
+
Qwen2-72B-Instruct,0.5906932573599241,0.42123592516768155,0.6465993243020925,0.6426426426426426,0.06031111111111111,0.9069111111111111,0.9069555555555555,0.6009242274989618
|
14 |
+
gpt-3.5-turbo-0125,0.23741690408357075,0.14920836189480854,0.08240359836763214,0.7297297297297297,0.08906666666666668,0.7106777777777777,0.7033,0.06790170442358906
|
15 |
+
gpt-4o-0513,0.7340930674264008,0.5383734693976642,0.5122163952167618,0.6546546546546546,0.4056777777777778,0.24250000000000016,0.23266666666666658,0.5206391872554754
|
templates/about.html
CHANGED
@@ -134,20 +134,23 @@
|
|
134 |
</div>
|
135 |
<div class="section">
|
136 |
<div class="section-title">Motivation</div>
|
137 |
-
<
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
|
|
|
|
|
|
147 |
</div>
|
148 |
<div class="section">
|
149 |
<div class="section-title">Administering a questionnaire in context to a simulated persona</div>
|
150 |
-
<p>
|
151 |
<ol>
|
152 |
<li> The Tested model is instructed to simulate a persona</li>
|
153 |
<li> A separate model instance - The Interlocutor - is instructed to simulate a “human using a chatbot”
|
@@ -175,25 +178,25 @@
|
|
175 |
Here are the considered context chunks:
|
176 |
</p>
|
177 |
<ul>
|
178 |
-
<li> no_conv
|
179 |
-
<li> no_conv_svs
|
180 |
-
<li> chunk_0-chunk-4
|
181 |
-
<li> chess
|
182 |
-
<li> grammar
|
183 |
</ul>
|
184 |
</div>
|
185 |
<div class="section">
|
186 |
<div class="section-title">Metrics</div>
|
187 |
<p>We evaluate the following metrics (+ denotes higher is better; - denotes lower is better) </p>
|
188 |
<ul>
|
189 |
-
<li> RO Stability (+) - Average Rank-Order stability between each pair of context chunks (
|
190 |
-
<li> Rank Distance (-) - Average distance between the theoretical and the observed order of values in a circular space. </li>
|
191 |
-
<li> CFI (+) - a common Validity metric </li>
|
192 |
-
<li> SRMR (-) - a common Validity metric </li>
|
193 |
-
<li> RMSEA (-) - a common Validity metric </li>
|
194 |
-
<li> Cronbach alpha (+) - a common Reliability metric </li>
|
195 |
-
<li> Ordinal (Win rate) (+) - each context pair and each metric is considered as a game between models, the metric shows the average win rate over all such games</li>
|
196 |
-
<li> Cardinal (Score) (+) - the average over all context pairs and metrics (with descending metric inverted) </li>
|
197 |
</ul>
|
198 |
</div>
|
199 |
<div class="back-button">
|
|
|
134 |
</div>
|
135 |
<div class="section">
|
136 |
<div class="section-title">Motivation</div>
|
137 |
+
<p>
|
138 |
+
Benchmarks usually compare models with MANY QUESTIONS from A SINGLE MINIMAL CONTEXT, e.g. as multiple-choice questions.
|
139 |
+
This kind of evaluation is not very informative about LLMs' behavior in deployment when exposed to new contexts (especially when we consider LLMs' highly context-dependent nature).
|
140 |
+
We argue that CONTEXT-DEPENDENCE can be seen as a PROPERTY of LLMs: a dimension of LLM comparison alongside others like size, speed, or knowledge.
|
141 |
+
We evaluate LLMs by asking the SAME QUESTIONS from MANY DIFFERENT CONTEXTS.
|
142 |
+
</p>
|
143 |
+
<p>
|
144 |
+
LLMs are often used to simulate personas and populations.
|
145 |
+
We study the coherence of simulated populations over different contexts (conversations on different topics).
|
146 |
+
To do that we leverage the psychological methodology to study the interpersonal stability of personal value expression of those simulated populations.
|
147 |
+
We adopt the Schwartz Theory of Basic Personal Values that defines 10 values: Self-Direction, Stimulation, Hedonism, Achievement, Power, Security, Conformity, Tradition, Benevolence, and Universalism.
|
148 |
+
To score those values we use the associated questionnaires: PVQ-40, and SVS.
|
149 |
+
</p>
|
150 |
</div>
|
151 |
<div class="section">
|
152 |
<div class="section-title">Administering a questionnaire in context to a simulated persona</div>
|
153 |
+
<p>To evaluate the stability on a population level we need to be able to evaluate a value profile expressed by a simulated individual in a specific context (conversation topic).</p>
|
154 |
<ol>
|
155 |
<li> The Tested model is instructed to simulate a persona</li>
|
156 |
<li> A separate model instance - The Interlocutor - is instructed to simulate a “human using a chatbot”
|
|
|
178 |
Here are the considered context chunks:
|
179 |
</p>
|
180 |
<ul>
|
181 |
+
<li> <b> no_conv </b>: no conversation is simulated; the questions from the PVQ-40 questionnaire are given directly </li>
|
182 |
+
<li> <b> no_conv_svs </b>: no conversation is simulated; the questions from the SVS questionnaire are given directly </li>
|
183 |
+
<li> <b> chunk_0-chunk_4 </b>: 50 Reddit posts are used as the initial Interlocutor model messages (one per persona). chunk_0 contains the longest posts, chunk_4 the shortest </li>
|
184 |
+
<li> <b> chess </b>: "1. e4" is given as the initial message to all personas, but for each persona the Interlocutor model is instructed to simulate a different persona (instead of a human user) </li>
|
185 |
+
<li> <b> grammar </b>: like chess, but "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding." is given as the initial message. </li>
|
186 |
</ul>
|
187 |
</div>
|
188 |
<div class="section">
|
189 |
<div class="section-title">Metrics</div>
|
190 |
<p>We evaluate the following metrics (+ denotes higher is better; - denotes lower is better) </p>
|
191 |
<ul>
|
192 |
+
<li> <b> RO Stability (+) </b> - Average Rank-Order stability between each pair of context chunks. More details are given in the per-model pages (e.g. <a href="model/gpt-4o-0513">gpt-4o-0513</a>) and in our <a href="https://arxiv.org/abs/2402.14846">paper</a> </li>
|
193 |
+
<li> <b> Rank Distance (-) </b> - Average distance between the theoretical and the observed order of values in a circular space. More details are given in the per-model pages (e.g. <a href="model/gpt-4o-0513">gpt-4o-0513</a>) </li>
|
194 |
+
<li> <b> CFI (+) </b> - a common Validity metric </li>
|
195 |
+
<li> <b> SRMR (-) </b> - a common Validity metric </li>
|
196 |
+
<li> <b> RMSEA (-) </b> - a common Validity metric </li>
|
197 |
+
<li> <b> Cronbach alpha (+) </b> - a common Reliability metric </li>
|
198 |
+
<li> <b> Ordinal (Win rate) (+) </b> - each context pair and each metric is considered as a game between models, the metric shows the average win rate over all such games</li>
|
199 |
+
<li> <b> Cardinal (Score) (+) </b> - the average over all context pairs and metrics (with the descending, i.e. lower-is-better, metrics inverted) </li>
|
200 |
</ul>
|
201 |
</div>
|
202 |
<div class="back-button">
|
templates/index.html
CHANGED
@@ -17,7 +17,7 @@
|
|
17 |
.container {
|
18 |
max-width: 1200px; /* Limit the width of the container */
|
19 |
margin: auto; /* Center the container */
|
20 |
-
padding:
|
21 |
background: #fff;
|
22 |
border-radius: 8px;
|
23 |
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
|
@@ -37,7 +37,7 @@
|
|
37 |
table {
|
38 |
border-collapse: separate;
|
39 |
border-spacing: 0;
|
40 |
-
font-size:
|
41 |
width: 100%;
|
42 |
border: none; /* Remove any default border */
|
43 |
}
|
@@ -55,7 +55,7 @@
|
|
55 |
background-color: #f1f1f1;
|
56 |
}
|
57 |
table td, table th {
|
58 |
-
padding:
|
59 |
border: 1px solid #dee2e6;
|
60 |
}
|
61 |
table th:first-child {
|
@@ -84,14 +84,17 @@
|
|
84 |
font-family: 'Courier New', Courier, monospace;
|
85 |
white-space: pre;
|
86 |
}
|
87 |
-
|
88 |
.image-container {
|
89 |
-
|
|
|
|
|
90 |
margin-bottom: 40px;
|
91 |
}
|
|
|
|
|
|
|
92 |
.image-container img {
|
93 |
-
width:
|
94 |
-
max-width: 650px;
|
95 |
height: auto;
|
96 |
display: block;
|
97 |
margin: auto;
|
@@ -122,6 +125,14 @@
|
|
122 |
<!-- Render the table HTML here -->
|
123 |
{{ table_html|safe }}
|
124 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
<div class="about-button">
|
126 |
<a href="{{ url_for('about') }}" class="custom-button mt-3">Learn More About This Project</a>
|
127 |
</div>
|
@@ -164,7 +175,6 @@
|
|
164 |
}
|
165 |
});
|
166 |
});
|
167 |
-
|
168 |
</script>
|
169 |
</body>
|
170 |
</html>
|
|
|
17 |
.container {
|
18 |
max-width: 1200px; /* Limit the width of the container */
|
19 |
margin: auto; /* Center the container */
|
20 |
+
padding: 15px; /* Add some padding */
|
21 |
background: #fff;
|
22 |
border-radius: 8px;
|
23 |
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
|
|
|
37 |
table {
|
38 |
border-collapse: separate;
|
39 |
border-spacing: 0;
|
40 |
+
font-size: 12px; /* Reduce the font size */
|
41 |
width: 100%;
|
42 |
border: none; /* Remove any default border */
|
43 |
}
|
|
|
55 |
background-color: #f1f1f1;
|
56 |
}
|
57 |
table td, table th {
|
58 |
+
padding: 5px; /* Reduce padding */
|
59 |
border: 1px solid #dee2e6;
|
60 |
}
|
61 |
table th:first-child {
|
|
|
84 |
font-family: 'Courier New', Courier, monospace;
|
85 |
white-space: pre;
|
86 |
}
|
|
|
87 |
.image-container {
|
88 |
+
display: flex;
|
89 |
+
justify-content: center;
|
90 |
+
gap: 10px;
|
91 |
margin-bottom: 40px;
|
92 |
}
|
93 |
+
.image-container a {
|
94 |
+
flex: 1;
|
95 |
+
}
|
96 |
.image-container img {
|
97 |
+
max-width: 100%;
|
|
|
98 |
height: auto;
|
99 |
display: block;
|
100 |
margin: auto;
|
|
|
125 |
<!-- Render the table HTML here -->
|
126 |
{{ table_html|safe }}
|
127 |
</div>
|
128 |
+
<div class="image-container">
|
129 |
+
<a href="{{ url_for('static', filename='figures/cardinal.svg') }}" target="_blank">
|
130 |
+
<img src="{{ url_for('static', filename='figures/cardinal.svg') }}" alt="Cardinal">
|
131 |
+
</a>
|
132 |
+
<a href="{{ url_for('static', filename='figures/ordinal.svg') }}" target="_blank">
|
133 |
+
<img src="{{ url_for('static', filename='figures/ordinal.svg') }}" alt="Ordinal">
|
134 |
+
</a>
|
135 |
+
</div>
|
136 |
<div class="about-button">
|
137 |
<a href="{{ url_for('about') }}" class="custom-button mt-3">Learn More About This Project</a>
|
138 |
</div>
|
|
|
175 |
}
|
176 |
});
|
177 |
});
|
|
|
178 |
</script>
|
179 |
</body>
|
180 |
</html>
|
templates/model_detail.html
CHANGED
@@ -43,6 +43,12 @@
|
|
43 |
font-size: 30px;
|
44 |
margin-bottom: 20px;
|
45 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
.image-container {
|
47 |
width: 100%;
|
48 |
margin-bottom: 40px;
|
@@ -89,7 +95,13 @@
|
|
89 |
<div class="model-name">{{ model_name }}</div>
|
90 |
<div class="image-section">
|
91 |
<h2>Structure</h2>
|
92 |
-
<p>
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
<div class="image-container">
|
94 |
<a href="{{ url_for('static', filename='figures/' + model_name + '/structure.svg') }}" target="_blank">
|
95 |
<img src="{{ url_for('static', filename='figures/' + model_name + '/structure.svg') }}" alt="Structure">
|
@@ -98,7 +110,13 @@
|
|
98 |
</div>
|
99 |
<div class="image-section">
|
100 |
<h2>Pairwise Rank-Order stability</h2>
|
101 |
-
<p>
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
<div class="matrix-image-container">
|
103 |
<a href="{{ url_for('static', filename='figures/' + model_name + '/matrix.svg') }}" target="_blank">
|
104 |
<img src="{{ url_for('static', filename='figures/' + model_name + '/matrix.svg') }}" alt="Matrix" >
|
@@ -106,8 +124,13 @@
|
|
106 |
</div>
|
107 |
</div>
|
108 |
<div class="image-section">
|
109 |
-
<h2>
|
110 |
-
<p>
|
|
|
|
|
|
|
|
|
|
|
111 |
<div class="image-container">
|
112 |
<a href="{{ url_for('static', filename='figures/' + model_name + '/ranks.svg') }}" target="_blank">
|
113 |
<img src="{{ url_for('static', filename='figures/' + model_name + '/ranks.svg') }}" alt="Ranks">
|
|
|
43 |
font-size: 30px;
|
44 |
margin-bottom: 20px;
|
45 |
}
|
46 |
+
.image-section p {
|
47 |
+
width: 80%;
|
48 |
+
margin: auto;
|
49 |
+
margin-bottom: 20px;
|
50 |
+
text-align: left;
|
51 |
+
}
|
52 |
.image-container {
|
53 |
width: 100%;
|
54 |
margin-bottom: 40px;
|
|
|
95 |
<div class="model-name">{{ model_name }}</div>
|
96 |
<div class="image-section">
|
97 |
<h2>Structure</h2>
|
98 |
+
<p>
|
99 |
+
This image shows the circular value structure projected on a 2D plane.
|
100 |
+
This was done by computing the intercorrelations between different values; this space was then reduced with an SVD-based approach and varimax rotation (`FactorAnalysis` object from `scikit-learn`).
|
101 |
+
The theoretical order is shown in the top left figure.
|
102 |
+
The distance is computed as the average distance of each value to its rank in the theoretical order.
|
103 |
+
The minimal distance with the theoretical order in the clockwise and counter-clockwise direction was taken as the final distance.
|
104 |
+
</p>
|
105 |
<div class="image-container">
|
106 |
<a href="{{ url_for('static', filename='figures/' + model_name + '/structure.svg') }}" target="_blank">
|
107 |
<img src="{{ url_for('static', filename='figures/' + model_name + '/structure.svg') }}" alt="Structure">
|
|
|
110 |
</div>
|
111 |
<div class="image-section">
|
112 |
<h2>Pairwise Rank-Order stability</h2>
|
113 |
+
<p>
|
114 |
+
This image shows the Rank-Order stability between each pair of context chunks.
|
115 |
+
Rank-Order stability is computed by ordering the personas based on their expression of some value,
|
116 |
+
and then computing the correlation between their orders in two different context chunks.
|
117 |
+
The stability estimates for the ten values are then averaged to get the final Rank-Order stability measure.
|
118 |
+
Refer to our <a href="https://arxiv.org/abs/2402.14846">paper</a> for details.
|
119 |
+
</p>
|
120 |
<div class="matrix-image-container">
|
121 |
<a href="{{ url_for('static', filename='figures/' + model_name + '/matrix.svg') }}" target="_blank">
|
122 |
<img src="{{ url_for('static', filename='figures/' + model_name + '/matrix.svg') }}" alt="Matrix" >
|
|
|
124 |
</div>
|
125 |
</div>
|
126 |
<div class="image-section">
|
127 |
+
<h2>Visualizing the order of simulated personas</h2>
|
128 |
+
<p>
|
129 |
+
This image visualizes the order of personas in each context chunk and for each value.
|
130 |
+
For each value (row), the personas are ordered on the x-axis by their expression of this value in the `no_conv` setting (gray).
|
131 |
+
Therefore, the Rank-Order stability between the `no_conv` chunk and some chunk corresponds to the extent to which the curve is increasing in that chunk.
|
132 |
+
(Only some personas are marked on the x-axis for clarity).
|
133 |
+
</p>
|
134 |
<div class="image-container">
|
135 |
<a href="{{ url_for('static', filename='figures/' + model_name + '/ranks.svg') }}" target="_blank">
|
136 |
<img src="{{ url_for('static', filename='figures/' + model_name + '/ranks.svg') }}" alt="Ranks">
|