Spaces:

flowers-team
/

StickToYourRoleLeaderboard

Running

App Files Files Community

grg commited on Jul 8

Commit

ca60da9

•

1 Parent(s): b42419d

Adding cardinal and ordinal figures, updating descriptions on per_model pages and index, fixed leaderboard.csv

Browse files

Files changed (8) hide show

app.py +11 -0
static/figures/Qwen2-7B-Instruct/matrix.svg +136 -136
static/figures/cardinal.svg +1753 -0
static/figures/ordinal.svg +1769 -0
static/leaderboard.csv +14 -14
templates/about.html +27 -24
templates/index.html +18 -8
templates/model_detail.html +27 -4

app.py CHANGED Viewed

@@ -11,6 +11,17 @@ def index():
     df = df.round(3)
     df.insert(0, '#', '')
     # Generate the table HTML with clickable model names
     table_html = df.to_html(classes='table table-striped table-bordered', escape=False, index=False)

     df = df.round(3)
     df.insert(0, '#', '')
+    df = df.rename(columns={
+        "Ordinal (Win rate)": "Ordinal (Win rate) (+)",
+        "Cardinal (Score)": "Cardinal (Score) (+)",
+        "RO Stability": "RO Stability (+)",
+        "Rank Distance": "Rank Distance (-)",
+        "CFI": "CFI (+)",
+        "SRMR": "SRMR (-)",
+        "RMSEA": "RMSEA (-)",
+        "Cronbach alpha": "Cronbach alpha (+)"
+    })
     # Generate the table HTML with clickable model names
     table_html = df.to_html(classes='table table-striped table-bordered', escape=False, index=False)

static/figures/Qwen2-7B-Instruct/matrix.svg CHANGED Viewed

static/figures/cardinal.svg ADDED Viewed

static/figures/ordinal.svg ADDED Viewed

static/leaderboard.csv CHANGED Viewed

@@ -1,15 +1,15 @@
 Model,Ordinal (Win rate),Cardinal (Score),RO Stability,Rank Distance,CFI,SRMR,RMSEA,Cronbach alpha
-phi-3-mini-128k-instruct,0.258309591642925,0.18707102480796897,0.16880341880341881,0.7264957264957265,0.32478632478632474,0.5555555555555556,0.6239316239316239,0.23076923076923084
-phi-3-medium-128k-instruct,0.28490028490028496,0.18789267301588508,0.26282051282051283,0.7521367521367521,0.2820512820512821,0.7692307692307692,0.7606837606837606,0.5128205128205128
-Mistral-7B-Instruct-v0.1,0.3418803418803419,0.23063750442486428,0.12393162393162394,0.4786324786324786,0.49572649572649574,0.3076923076923077,0.2222222222222222,0.09401709401709403
-Mistral-7B-Instruct-v0.2,0.3342830009496676,0.1747138068267554,0.41666666666666674,0.4358974358974358,0.1282051282051282,0.829059829059829,0.8205128205128205,0.2991452991452992
-Mistral-7B-Instruct-v0.3,0.3133903133903134,0.20131219867252867,0.23504273504273498,0.4273504273504273,0.28205128205128205,0.6324786324786325,0.5897435897435896,0.24786324786324787
-Mixtral-8x7B-Instruct-v0.1,0.43114909781576455,0.2437400779497571,0.5811965811965812,0.641025641025641,0.2136752136752137,0.7863247863247863,0.7521367521367521,0.5213675213675214
-Mixtral-8x22B-Instruct-v0.1,0.29629629629629634,0.18791617935864172,0.37820512820512825,0.6837606837606838,0.1794871794871795,0.9230769230769231,0.9145299145299145,0.49572649572649574
-command_r_plus,0.560303893637227,0.3737946817620246,0.6880341880341879,0.6923076923076923,0.45299145299145294,0.5128205128205128,0.5811965811965811,0.6239316239316239
-llama_3_8b_instruct,0.4691358024691358,0.28828624999947805,0.5747863247863247,0.5470085470085471,0.3162393162393162,0.7008547008547008,0.6923076923076923,0.5470085470085471
-llama_3_70b_instruct,0.7701804368471036,0.5976823900754995,0.9380341880341881,0.7264957264957265,0.8376068376068376,0.4273504273504274,0.49572649572649574,0.9914529914529915
-Qwen2-7B-Instruct,0.5251661918328584,0.3400513233761655,0.5769230769230768,0.5811965811965811,0.4188034188034188,0.45299145299145294,0.4871794871794872,0.5213675213675214
-Qwen2-72B-Instruct,0.5906932573599241,0.42123592516768155,0.9658119658119655,0.5811965811965811,0.07692307692307693,0.9658119658119658,0.9914529914529915,0.9145299145299146
-gpt-3.5-turbo-0125,0.23741690408357075,0.14920836189480854,0.24145299145299137,0.7777777777777778,0.1965811965811966,0.717948717948718,0.7094017094017093,0.1794871794871795
-gpt-4o-0513,0.7340930674264008,0.5383734693976642,0.8482905982905984,0.6666666666666667,0.811965811965812,0.41025641025641024,0.3418803418803419,0.8205128205128205

 Model,Ordinal (Win rate),Cardinal (Score),RO Stability,Rank Distance,CFI,SRMR,RMSEA,Cronbach alpha
+phi-3-mini-128k-instruct,0.258309591642925,0.18707102480796897,0.039299993295009855,0.7387387387387387,0.18377777777777776,0.5171888888888889,0.5129444444444444,0.11153354438597567
+phi-3-medium-128k-instruct,0.28490028490028496,0.18789267301588508,0.09692037989916814,0.7207207207207207,0.13975555555555555,0.7127888888888889,0.7074888888888888,0.30459548048923546
+Mistral-7B-Instruct-v0.1,0.3418803418803419,0.23063750442486428,0.027216280472015988,0.6636636636636636,0.22268888888888888,0.32623333333333326,0.3056444444444445,0.039724970488267154
+Mistral-7B-Instruct-v0.2,0.3342830009496676,0.1747138068267554,0.14417876497818388,0.6516516516516517,0.08117777777777778,0.8068555555555555,0.8051,0.1781386309574924
+Mistral-7B-Instruct-v0.3,0.3133903133903134,0.20131219867252867,0.07960539866974455,0.6126126126126126,0.15823333333333334,0.6148222222222223,0.6058111111111111,0.1684008059863923
+Mixtral-8x7B-Instruct-v0.1,0.43114909781576455,0.2437400779497571,0.21473356319081474,0.6846846846846846,0.13543333333333335,0.7121888888888889,0.7086555555555556,0.3048222445803502
+Mixtral-8x22B-Instruct-v0.1,0.29629629629629634,0.18791617935864172,0.1414001940345544,0.6696696696696696,0.11905555555555554,0.8129333333333333,0.8111777777777778,0.30037006331478344
+command_r_plus,0.560303893637227,0.3737946817620246,0.3429686514651868,0.6726726726726726,0.2763777777777778,0.5177555555555555,0.5135777777777777,0.41990575822570303
+llama_3_8b_instruct,0.4691358024691358,0.28828624999947805,0.24527785038654715,0.6996996996996997,0.21058888888888885,0.6220777777777777,0.6159777777777777,0.34063121481548086
+llama_3_70b_instruct,0.7701804368471036,0.5976823900754995,0.607020698814379,0.6966966966966968,0.4662222222222222,0.25131111111111115,0.2503333333333333,0.6831776343408991
+Qwen2-7B-Instruct,0.5251661918328584,0.3400513233761655,0.25108519506513916,0.7057057057057057,0.2508,0.4224444444444444,0.41727777777777786,0.35074905805286155
+Qwen2-72B-Instruct,0.5906932573599241,0.42123592516768155,0.6465993243020925,0.6426426426426426,0.06031111111111111,0.9069111111111111,0.9069555555555555,0.6009242274989618
+gpt-3.5-turbo-0125,0.23741690408357075,0.14920836189480854,0.08240359836763214,0.7297297297297297,0.08906666666666668,0.7106777777777777,0.7033,0.06790170442358906
+gpt-4o-0513,0.7340930674264008,0.5383734693976642,0.5122163952167618,0.6546546546546546,0.4056777777777778,0.24250000000000016,0.23266666666666658,0.5206391872554754

templates/about.html CHANGED Viewed

@@ -134,20 +134,23 @@
         </div>
         <div class="section">
             <div class="section-title">Motivation</div>
-            <ul>
-                <li> LLMs are often used to simulate personas and populations</li>
-                <li> Benchmarks usually compare models with MANY QUESTIONS from A SINGLE MINIMAL CONTEXT, e.g. as multiple choices questions
-                    -> this kind of evaluation is little informative of LLMs' behavior in deployment when exposed to new contexts
-                    (especially when we consider the LLMs highly context-dependant nature)
-                <li> CONTEXT-DEPENDENCE can be seen as a PROPERTY of LLMs: a dimension of LLM comparison alongside others like size, speed, or knowledge
-                <li> We evaluate LLMs by asking the SAME QUESTIONS from MANY DIFFERENT CONTEXTS
-                <li> We study the coherence of simulated populations over different contexts (conversations on different topics)
-                <li> We leverage the psychological methodology to study the interpersonal stability of personal value expression of those simulated populations
-            </ul>
         </div>
         <div class="section">
             <div class="section-title">Administering a questionnaire in context to a simulated persona</div>
-            <p>We use the following procedure:</p>
             <ol>
                 <li> The Tested model is instructed to simulate a persona</li>
                 <li> A separate model instance - The Interlocutor - is instructed to simulate a “human using a chatbot”
@@ -175,25 +178,25 @@
                 Here are the considered context chunks:
             </p>
             <ul>
-                <li> no_conv: no conversation is simulated the questions from the PVQ-40 questionnaire are given directly </li>
-                <li> no_conv_svs: no conversation is simulated the questions from the SVS questionnaire are given directly </li>
-                <li> chunk_0-chunk-4: 50 reddit posts are used as the initial Interlocutor model messages (one per persona). chunk_0 contains the longest posts, chunk_4 the shortest </li>
-                <li> chess: "1. e4" is given as the initial message to all personas, but for each persona the Interlocutor model is instructed to simulate a different persona (instead of a human user) </li>
-                <li> grammar: like chess, but "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding." is given as the initial message.
             </ul>
         </div>
         <div class="section">
             <div class="section-title">Metrics</div>
             <p>We evaluate the following metrics (+ denotes higher is better; - denotes lower is better) </p>
             <ul>
-                <li> RO Stability (+) - Average Rank-Order stability between each pair of context chunks (see our <a href="https://arxiv.org/abs/2402.14846">paper</a> for details) </li>
-                <li> Rank Distance (-) - Average distance between the theoretical and the observed order of values in a circular space. </li>
-                <li> CFI (+) - a common Validity metric </li>
-                <li> SRMR (-) - a common Validity metric </li>
-                <li> RMSEA (-) - a common Validity metric </li>
-                <li> Cronbach alpha (+) - a common Reliability metric </li>
-                <li> Ordinal (Win rate) (+) - each context pair and each metric is considered as a game between models, the metric shows the average win rate over all such games</li>
-                <li> Cardinal (Score) (+) - the average over all context pairs and metrics (with descending metric inverted) </li>
             </ul>
         </div>
         <div class="back-button">

         </div>
         <div class="section">
             <div class="section-title">Motivation</div>
+            <p>
+                Benchmarks usually compare models with MANY QUESTIONS from A SINGLE MINIMAL CONTEXT, e.g. as multiple-choice questions.
+                This kind of evaluation is not very informative about LLMs' behavior in deployment when exposed to new contexts (especially when we consider LLMs' highly context-dependent nature).
+                We argue that CONTEXT-DEPENDENCE can be seen as a PROPERTY of LLMs: a dimension of LLM comparison alongside others like size, speed, or knowledge.
+                We evaluate LLMs by asking the SAME QUESTIONS from MANY DIFFERENT CONTEXTS.
+            </p>
+            <p>
+                LLMs are often used to simulate personas and populations.
+                We study the coherence of simulated populations over different contexts (conversations on different topics).
+                To do that we leverage the psychological methodology to study the interpersonal stability of personal value expression of those simulated populations.
+                We adopt the Schwartz Theory of Basic Personal Values that defines 10 values: Self-Direction, Stimulation, Hedonism, Achievement, Power, Security, Conformity, Tradition, Benevolence, and Universalism.
+                To score those values we use the associated questionnaires: PVQ-40, and SVS.
+            </p>
         </div>
         <div class="section">
             <div class="section-title">Administering a questionnaire in context to a simulated persona</div>
+            <p>To evaluate the stability on a population level we need to be able to evaluate a value profile expressed by a simulated individual in a specific context (conversation topic).</p>
             <ol>
                 <li> The Tested model is instructed to simulate a persona</li>
                 <li> A separate model instance - The Interlocutor - is instructed to simulate a “human using a chatbot”
                 Here are the considered context chunks:
             </p>
             <ul>
+                <li> <b> no_conv </b>: no conversation is simulated; the questions from the PVQ-40 questionnaire are given directly </li>
+                <li> <b> no_conv_svs </b>: no conversation is simulated; the questions from the SVS questionnaire are given directly </li>
+                <li> <b> chunk_0-chunk_4 </b>: 50 reddit posts are used as the initial Interlocutor model messages (one per persona). chunk_0 contains the longest posts, chunk_4 the shortest </li>
+                <li> <b> chess </b>: "1. e4" is given as the initial message to all personas, but for each persona the Interlocutor model is instructed to simulate a different persona (instead of a human user) </li>
+                <li> <b> grammar </b>: like chess, but "Can you check this sentence for grammar? \n Whilst Jane was waiting to meet hers friend their nose started bleeding." is given as the initial message. </li>
             </ul>
         </div>
         <div class="section">
             <div class="section-title">Metrics</div>
             <p>We evaluate the following metrics (+ denotes higher is better; - denotes lower is better) </p>
             <ul>
+                <li> <b> RO Stability (+) </b> - Average Rank-Order stability between each pair of context chunks. More details are given in the per-model pages (e.g. <a href="model/gpt-4o-0513">gpt-4o-0513</a>) and in our <a href="https://arxiv.org/abs/2402.14846">paper</a> </li>
+                <li> <b> Rank Distance (-) </b> - Average distance between the theoretical and the observed order of values in a circular space. More details are given in the per-model pages (e.g. <a href="model/gpt-4o-0513">gpt-4o-0513</a>) </li>
+                <li> <b> CFI (+) </b> - a common Validity metric </li>
+                <li> <b> SRMR (-) </b> - a common Validity metric </li>
+                <li> <b> RMSEA (-) </b> - a common Validity metric </li>
+                <li> <b> Cronbach alpha (+) </b> - a common Reliability metric </li>
+                <li> <b> Ordinal (Win rate) (+) </b> - each context pair and each metric is considered as a game between models, the metric shows the average win rate over all such games</li>
+                <li> <b> Cardinal (Score) (+) </b> - the average over all context pairs and metrics (with descending metric inverted) </li>
             </ul>
         </div>
         <div class="back-button">

templates/index.html CHANGED Viewed

@@ -17,7 +17,7 @@
         .container {
             max-width: 1200px; /* Limit the width of the container */
             margin: auto; /* Center the container */
-            padding: 20px; /* Add some padding */
             background: #fff;
             border-radius: 8px;
             box-shadow: 0 4px 8px rgba(0,0,0,0.1);
@@ -37,7 +37,7 @@
         table {
             border-collapse: separate;
             border-spacing: 0;
-            font-size: 14px; /* Reduce the font size */
             width: 100%;
             border: none; /* Remove any default border */
         }
@@ -55,7 +55,7 @@
             background-color: #f1f1f1;
         }
         table td, table th {
-            padding: 10px; /* Reduce padding */
             border: 1px solid #dee2e6;
         }
         table th:first-child {
@@ -84,14 +84,17 @@
             font-family: 'Courier New', Courier, monospace;
             white-space: pre;
         }
         .image-container {
-            width: 100%;
             margin-bottom: 40px;
         }
         .image-container img {
-            width: 90%;
-            max-width: 650px;
             height: auto;
             display: block;
             margin: auto;
@@ -122,6 +125,14 @@
             <!-- Render the table HTML here -->
             {{ table_html|safe }}
         </div>
         <div class="about-button">
             <a href="{{ url_for('about') }}" class="custom-button mt-3">Learn More About This Project</a>
         </div>
@@ -164,7 +175,6 @@
                 }
             });
         });
     </script>
 </body>
 </html>

         .container {
             max-width: 1200px; /* Limit the width of the container */
             margin: auto; /* Center the container */
+            padding: 15px; /* Add some padding */
             background: #fff;
             border-radius: 8px;
             box-shadow: 0 4px 8px rgba(0,0,0,0.1);
         table {
             border-collapse: separate;
             border-spacing: 0;
+            font-size: 12px; /* Reduce the font size */
             width: 100%;
             border: none; /* Remove any default border */
         }
             background-color: #f1f1f1;
         }
         table td, table th {
+            padding: 5px; /* Reduce padding */
             border: 1px solid #dee2e6;
         }
         table th:first-child {
             font-family: 'Courier New', Courier, monospace;
             white-space: pre;
         }
         .image-container {
+            display: flex;
+            justify-content: center;
+            gap: 10px;
             margin-bottom: 40px;
         }
+        .image-container a {
+            flex: 1;
+        }
         .image-container img {
+            max-width: 100%;
             height: auto;
             display: block;
             margin: auto;
             <!-- Render the table HTML here -->
             {{ table_html|safe }}
         </div>
+        <div class="image-container">
+            <a href="{{ url_for('static', filename='figures/cardinal.svg') }}" target="_blank">
+                <img src="{{ url_for('static', filename='figures/cardinal.svg') }}" alt="Cardinal">
+            </a>
+            <a href="{{ url_for('static', filename='figures/ordinal.svg') }}" target="_blank">
+                <img src="{{ url_for('static', filename='figures/ordinal.svg') }}" alt="Ordinal">
+            </a>
+        </div>
         <div class="about-button">
             <a href="{{ url_for('about') }}" class="custom-button mt-3">Learn More About This Project</a>
         </div>
                 }
             });
         });
     </script>
 </body>
 </html>

templates/model_detail.html CHANGED Viewed

@@ -43,6 +43,12 @@
             font-size: 30px;
             margin-bottom: 20px;
         }
         .image-container {
             width: 100%;
             margin-bottom: 40px;
@@ -89,7 +95,13 @@
         <div class="model-name">{{ model_name }}</div>
         <div class="image-section">
             <h2>Structure</h2>
-            <p>This image shows .....</p>
             <div class="image-container">
                 <a href="{{ url_for('static', filename='figures/' + model_name + '/structure.svg') }}" target="_blank">
                     <img src="{{ url_for('static', filename='figures/' + model_name + '/structure.svg') }}" alt="Structure">
@@ -98,7 +110,13 @@
         </div>
         <div class="image-section">
             <h2>Pairwise Rank-Order stability</h2>
-            <p>This image shows .....</p>
             <div class="matrix-image-container">
                 <a href="{{ url_for('static', filename='figures/' + model_name + '/matrix.svg') }}" target="_blank">
                     <img src="{{ url_for('static', filename='figures/' + model_name + '/matrix.svg') }}" alt="Matrix" >
@@ -106,8 +124,13 @@
             </div>
         </div>
         <div class="image-section">
-            <h2>Ranks</h2>
-            <p>This image shows .....</p>
             <div class="image-container">
                 <a href="{{ url_for('static', filename='figures/' + model_name + '/ranks.svg') }}" target="_blank">
                     <img src="{{ url_for('static', filename='figures/' + model_name + '/ranks.svg') }}" alt="Ranks">

             font-size: 30px;
             margin-bottom: 20px;
         }
+        .image-section p {
+            width: 80%;
+            margin: auto;
+            margin-bottom: 20px;
+            text-align: left;
+        }
         .image-container {
             width: 100%;
             margin-bottom: 40px;
         <div class="model-name">{{ model_name }}</div>
         <div class="image-section">
             <h2>Structure</h2>
+            <p>
+                This image shows the circular value structure projected on a 2D plane.
+                This was done by computing the intercorrelations between different values; this space was then reduced with an SVD-based approach and varimax rotation (`FactorAnalysis` object from `scikit-learn`).
+                The theoretical order is shown in the top left figure.
+                The distance is computed as the average distance of each value to its rank in the theoretical order.
+                The minimal distance with the theoretical order in the clockwise and counter-clockwise direction was taken as the final distance.
+            </p>
             <div class="image-container">
                 <a href="{{ url_for('static', filename='figures/' + model_name + '/structure.svg') }}" target="_blank">
                     <img src="{{ url_for('static', filename='figures/' + model_name + '/structure.svg') }}" alt="Structure">
         </div>
         <div class="image-section">
             <h2>Pairwise Rank-Order stability</h2>
+            <p>
+                This image shows the Rank-Order stability between each pair of context chunks.
+                Rank-Order stability is computed by ordering the personas based on their expression of some value,
+                and then computing the correlation between their orders in two different context chunks.
+                The stability estimates for the ten values are then averaged to get the final Rank-Order stability measure.
+                Refer to our <a href="https://arxiv.org/abs/2402.14846">paper</a> for details.
+            </p>
             <div class="matrix-image-container">
                 <a href="{{ url_for('static', filename='figures/' + model_name + '/matrix.svg') }}" target="_blank">
                     <img src="{{ url_for('static', filename='figures/' + model_name + '/matrix.svg') }}" alt="Matrix" >
             </div>
         </div>
         <div class="image-section">
+            <h2>Visualizing the order of simulated personas</h2>
+            <p>
+                This image visualizes the order of personas in each context chunk and for each value.
+                For each value (row), the personas are ordered on the x-axis by their expression of this value in the `no_conv` setting (gray).
+                Therefore, the Rank-Order stability between the `no_conv` chunk and some other chunk corresponds to the extent to which the curve is increasing in that chunk.
+                (Only some personas are marked on the x-axis for clarity).
+            </p>
             <div class="image-container">
                 <a href="{{ url_for('static', filename='figures/' + model_name + '/ranks.svg') }}" target="_blank">
                     <img src="{{ url_for('static', filename='figures/' + model_name + '/ranks.svg') }}" alt="Ranks">