tfrere HF Staff commited on
Commit
6a518ba
·
1 Parent(s): 000658b

update banner, matrix and article modifications

Browse files
app/src/content/assets/data/data_gaia.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e69d6199e9f2d0b6b07db5a90f13b2fd9bc9e0e245d2b9aa60ac929967baddfa
3
+ size 3967
app/src/content/assets/data/data_gaia_backup.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b075362b1571e36a66715126b5223a599ceaca74b0bff0d0d616662fca6a7ff3
3
+ size 335376
app/src/content/assets/data/data_gaia_points.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc205f03ced31cfbc111bc096a5ba6fe4159f1b8365446dfffb70331d38bfdb8
3
+ size 412546
app/src/content/assets/data/leaderboard_scatter_plot.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7b7b0c94cdd535f66dc6ac532fd80b434eddddd0dc8c36076c462c0a72faa9b
3
+ size 7688907
app/src/content/assets/data/leaderboard_scores_over_time.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c6dea46d998952f9cbfeb219afcdb403b90c28f0645e03ae537dfaa8c82b57d
3
+ size 40874
app/src/content/assets/data/leaderboard_scores_over_time_old.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d5b994b4fd6d636b2a58fabced9fcd929427501b86d0937b635a67d34872bb1
3
+ size 55411
app/src/content/chapters/general-knowledge/model-inference-and-evaluation.mdx CHANGED
@@ -11,6 +11,8 @@ import Note from "../../../components/Note.astro";
11
  import Sidenote from "../../../components/Sidenote.astro";
12
  import Accordion from "../../../components/Accordion.astro";
13
  import HtmlEmbed from "../../../components/HtmlEmbed.astro";
 
 
14
 
15
  In this section, we'll look at two steps for models: how input is preprocessed to be given to the model (`tokenization`), and how the model generates a prediction from it (`inference`).
16
 
@@ -94,12 +96,17 @@ First, as some languages do not always use spacing as a word separator (Korean,
94
 
95
  Then, tokenizers in general might be unfair to non-English languages. When training a BPE tokenizer, you use data from the different languages you want to cover, but most of the time this data is unbalanced between languages (with, for example, an order of magnitude more English than Thai or Burmese). Since BPE tokenizers create their vocabulary tokens based on the most frequent words seen, most of the long tokens will be English words - and most of the words from the less frequent languages will only be split at the character level. This effect leads to an unfairness in multilingual tokenization: some (less frequent, or *lower-resourced*) languages require orders of magnitude more tokens to generate a sentence of equivalent length to its English counterpart.
96
 
 
 
97
  <iframe
 
98
  src="https://OpenEvals-tokenizers-languages.hf.space"
99
  frameborder="0"
100
- width="850"
101
- height="450"
102
  ></iframe>
 
 
103
 
104
  If you are in this case, the number of tokens that the model is allowed to generate for an evaluation should also be language-dependent, as not all languages are tokenized into a similar number of tokens.
105
 
 
11
  import Sidenote from "../../../components/Sidenote.astro";
12
  import Accordion from "../../../components/Accordion.astro";
13
  import HtmlEmbed from "../../../components/HtmlEmbed.astro";
14
+ import Wide from "../../../components/Wide.astro";
15
+ import Reference from "../../../components/Reference.astro";
16
 
17
  In this section, we'll look at two steps for models: how input is preprocessed to be given to the model (`tokenization`), and how the model generates a prediction from it (`inference`).
18
 
 
96
 
97
  Then, tokenizers in general might be unfair to non-English languages. When training a BPE tokenizer, you use data from the different languages you want to cover, but most of the time this data is unbalanced between languages (with, for example, an order of magnitude more English than Thai or Burmese). Since BPE tokenizers create their vocabulary tokens based on the most frequent words seen, most of the long tokens will be English words - and most of the words from the less frequent languages will only be split at the character level. This effect leads to an unfairness in multilingual tokenization: some (less frequent, or *lower-resourced*) languages require orders of magnitude more tokens to generate a sentence of equivalent length to its English counterpart.
98
 
99
+ <Wide>
100
+ <Reference align="center" caption="OpenEvals-tokenizers-languages">
101
  <iframe
102
+ className="card"
103
  src="https://OpenEvals-tokenizers-languages.hf.space"
104
  frameborder="0"
105
+ width="100%"
106
+ height="650"
107
  ></iframe>
108
+ </Reference>
109
+ </Wide>
110
 
111
  If you are in this case, the number of tokens that the model is allowed to generate for an evaluation should also be language-dependent, as not all languages are tokenized into a similar number of tokens.
112
 
app/src/content/chapters/troubleshooting/troubleshooting-reproducibility.mdx CHANGED
@@ -4,8 +4,7 @@ title: "Troubleshooting reproducibility"
4
 
5
  import Note from "../../../components/Note.astro";
6
  import Sidenote from "../../../components/Sidenote.astro";
7
- import { Image } from "astro:assets";
8
- import mmluPromptImage from "../../assets/image/mmlu_prompt.png";
9
 
10
  Let's say you have read a recent tech report about a cool new model, and you want to reproduce their results on your machine... but you're not managing to?
11
  Let's explore why.
@@ -57,10 +56,12 @@ For example, for multichoice question answers, common formats include very simpl
57
 
58
  We did some experiments on this (you'll see up to a 7-point difference for the same model on the semantically equivalent prompts, the 5 rightmost columns), and a [paper observed similar results](https://arxiv.org/abs/2310.11324).
59
 
60
- <Image src={mmluPromptImage} alt="Heatmap showing MMLU evaluation scores across different models with different prompt formats. Scores vary by up to 7 points for the same model depending on format." />
61
-
62
  **Other example**: Llama 3.1 models predicted correct MATH-Hard answers but scored poorly on the Open LLM Leaderboard, because they overfit to GSM8K's prompt format and couldn't adapt to the new one for this eval, despite it being provided in few shot examples.
63
 
 
 
 
 
64
  This [great paper](https://arxiv.org/abs/2407.07890)⭐ also highlights a side effect of this: a number of models are now trained to overfit benchmark prompts and answer formats, at the cost of adaptation to other prompts at evaluation time.
65
  </Note>
66
 
 
4
 
5
  import Note from "../../../components/Note.astro";
6
  import Sidenote from "../../../components/Sidenote.astro";
7
+ import HtmlEmbed from "../../../components/HtmlEmbed.astro";
 
8
 
9
  Let's say you have read a recent tech report about a cool new model, and you want to reproduce their results on your machine... but you're not managing to?
10
  Let's explore why.
 
56
 
57
  We did some experiments on this (you'll see up to a 7-point difference for the same model on the semantically equivalent prompts, the 5 rightmost columns), and a [paper observed similar results](https://arxiv.org/abs/2310.11324).
58
 
 
 
59
  **Other example**: Llama 3.1 models predicted correct MATH-Hard answers but scored poorly on the Open LLM Leaderboard, because they overfit to GSM8K's prompt format and couldn't adapt to the new one for this eval, despite it being provided in few shot examples.
60
 
61
+ *Evaluation on MMLU subsets, acc_norm score (seed 0), in 5-shot.*
62
+
63
+ <HtmlEmbed src="d3-mmlu-heatmap.html" />
64
+
65
  This [great paper](https://arxiv.org/abs/2407.07890)⭐ also highlights a side effect of this: a number of models are now trained to overfit benchmark prompts and answer formats, at the cost of adaptation to other prompts at evaluation time.
66
  </Note>
67
 
app/src/content/embeds/banner.html CHANGED
@@ -1,4 +1,111 @@
1
- <div class="d3-galaxy" style="width:100%;margin:10px 0;aspect-ratio:3/1;min-height:260px;"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  <script>
3
  (() => {
4
  const ensureD3 = (cb) => {
@@ -17,230 +124,1060 @@
17
 
18
  const bootstrap = () => {
19
  const mount = document.currentScript ? document.currentScript.previousElementSibling : null;
20
- const container = (mount && mount.querySelector && mount.querySelector('.d3-galaxy')) || document.querySelector('.d3-galaxy');
21
  if (!container) return;
22
  if (container.dataset) {
23
  if (container.dataset.mounted === 'true') return;
24
  container.dataset.mounted = 'true';
25
  }
26
- // Scene params (match previous Plotly ranges)
27
- const cx = 1.5, cy = 0.5;
28
- const a = 1.3, b = 0.45;
29
- const numPoints = 3000;
30
- const numArms = 3;
31
- const numTurns = 2.1;
32
- const angleJitter = 0.12;
33
- const posNoise = 0.015;
34
-
35
- // Circle size settings
36
- const minCircleSize = 4; // minimum diameter in pixels
37
- const maxCircleSize = 12; // maximum diameter in pixels
38
-
39
- // Generate spiral + bulge
40
- const twoPi = Math.PI * 2;
41
- const t = Float64Array.from({ length: numPoints }, () => Math.random() * (twoPi * numTurns));
42
- const armIndices = Int16Array.from({ length: numPoints }, () => Math.floor(Math.random() * numArms));
43
- const armOffsets = Float64Array.from(armIndices, (k) => k * (twoPi / numArms));
44
- const theta = Float64Array.from(t, (tv, i) => tv + armOffsets[i] + d3.randomNormal.source(Math.random)(0, angleJitter)());
45
- const rNorm = Float64Array.from(t, (tv) => Math.pow(tv / (twoPi * numTurns), 0.9));
46
- const noiseScale = (rn) => posNoise * (0.8 + 0.6 * rn);
47
- const noiseX = Float64Array.from(rNorm, (rn) => d3.randomNormal.source(Math.random)(0, noiseScale(rn))());
48
- const noiseY = Float64Array.from(rNorm, (rn) => d3.randomNormal.source(Math.random)(0, noiseScale(rn))());
49
-
50
- const xSpiral = Float64Array.from(theta, (th, i) => cx + a * rNorm[i] * Math.cos(th) + noiseX[i]);
51
- const ySpiral = Float64Array.from(theta, (th, i) => cy + b * rNorm[i] * Math.sin(th) + noiseY[i]);
52
-
53
- const bulgePoints = Math.floor(0.18 * numPoints);
54
- const phiB = Float64Array.from({ length: bulgePoints }, () => twoPi * Math.random());
55
- const rB = Float64Array.from({ length: bulgePoints }, () => Math.pow(Math.random(), 2.2) * 0.22);
56
- const noiseXB = Float64Array.from({ length: bulgePoints }, () => d3.randomNormal.source(Math.random)(0, posNoise * 0.6)());
57
- const noiseYB = Float64Array.from({ length: bulgePoints }, () => d3.randomNormal.source(Math.random)(0, posNoise * 0.6)());
58
- const xBulge = Float64Array.from(phiB, (ph, i) => cx + a * rB[i] * Math.cos(ph) + noiseXB[i]);
59
- const yBulge = Float64Array.from(phiB, (ph, i) => cy + b * rB[i] * Math.sin(ph) + noiseYB[i]);
60
-
61
- // Concatenate
62
- const X = Array.from(xSpiral).concat(Array.from(xBulge));
63
- const Y = Array.from(ySpiral).concat(Array.from(yBulge));
64
- const lenSpiral = xSpiral.length;
65
-
66
- const zSpiral = Array.from(rNorm, (rn) => 1 - rn);
67
- const maxRB = rB && rB.length ? (window.d3 && d3.max ? d3.max(rB) : Math.max.apply(null, Array.from(rB))) : 1;
68
- const zBulge = Array.from(rB, (rb) => 1 - (maxRB ? rb / maxRB : 0));
69
- const Zraw = zSpiral.concat(zBulge);
70
- const sizesPx = Zraw.map((z) => minCircleSize + z * (maxCircleSize - minCircleSize)); // diameter in pixels
71
-
72
- // Labels (same categories as Python version)
73
- const labelOf = (i) => {
74
- const z = Zraw[i];
75
- if (z < 0.25) return 'tiny star';
76
- if (z < 0.5) return 'small star';
77
- if (z < 0.75) return 'medium star';
78
- return 'large star';
79
- };
80
 
81
- // Sort by size ascending for z-index: small first, big last
82
- const idx = d3.range(X.length).sort((i, j) => sizesPx[i] - sizesPx[j]);
83
-
84
- // Colors: piecewise gradient [0 -> 0.5 -> 1]
85
- const c0 = d3.rgb(78, 165, 183); // rgb(78, 165, 183)
86
- const c1 = d3.rgb(206, 192, 250); // rgb(206, 192, 250)
87
- const c2 = d3.rgb(232, 137, 171); // rgb(232, 137, 171)
88
- const interp01 = d3.interpolateRgb(c0, c1);
89
- const interp12 = d3.interpolateRgb(c1, c2);
90
- const colorFor = (v) => {
91
- const t = Math.max(0, Math.min(1, v));
92
- return t <= 0.5 ? interp01(t / 0.5) : interp12((t - 0.5) / 0.5);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  };
94
 
95
  // Create SVG
96
  const svg = d3.select(container).append('svg')
97
  .attr('width', '100%')
98
  .style('display', 'block')
99
- .style('cursor', 'crosshair');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  const render = () => {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  const width = container.clientWidth || 800;
103
- const height = Math.max(260, Math.round(width / 3)); // keep ~3:1, min height
104
  svg.attr('width', width).attr('height', height);
105
 
106
- const xScale = d3.scaleLinear().domain([0, 3]).range([0, width]);
107
- const yScale = d3.scaleLinear().domain([0, 1]).range([height, 0]);
108
 
109
- // Subtle stroke color depending on theme
110
- const isDark = document.documentElement.getAttribute('data-theme') === 'dark';
111
- const strokeColor = isDark ? 'rgba(255,255,255,0.18)' : 'rgba(0,0,0,0.12)';
112
- const glowColor = isDark ? 'rgba(255,255,255,0.35)' : 'rgba(0,0,0,0.25)';
113
-
114
-
115
- // Group for points (no blend mode for better print/PDF visibility)
116
- const g = svg.selectAll('g.points').data([0]).join('g').attr('class', 'points');
117
-
118
- // Ensure container can host an absolute tooltip
119
- container.style.position = container.style.position || 'relative';
120
- let tip = container.querySelector('.d3-tooltip');
121
- let tipInner;
122
- if (!tip) {
123
- tip = document.createElement('div');
124
- tip.className = 'd3-tooltip';
125
- Object.assign(tip.style, {
126
- position: 'absolute',
127
- top: '0px',
128
- left: '0px',
129
- transform: 'translate(-9999px, -9999px)',
130
- pointerEvents: 'none',
131
- padding: '10px 12px',
132
- borderRadius: '12px',
133
- fontSize: '12px',
134
- lineHeight: '1.35',
135
- border: '1px solid var(--border-color)',
136
- background: 'var(--surface-bg)',
137
- color: 'var(--text-color)',
138
- boxShadow: '0 8px 32px rgba(0,0,0,.28), 0 2px 8px rgba(0,0,0,.12)',
139
- opacity: '0',
140
- transition: 'opacity .12s ease',
141
- backdropFilter: 'saturate(1.12) blur(8px)',
142
- zIndex: '20'
143
- });
144
- tipInner = document.createElement('div');
145
- tipInner.className = 'd3-tooltip__inner';
146
- Object.assign(tipInner.style, {
147
- textAlign: 'left',
148
- display: 'flex',
149
- flexDirection: 'column',
150
- gap: '6px',
151
- minWidth: '220px'
152
  });
153
- tip.appendChild(tipInner);
154
- container.appendChild(tip);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  } else {
156
- tipInner = tip.querySelector('.d3-tooltip__inner') || tip;
 
 
 
157
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
- // Final filter: remove small dots very close to the galaxy center (after placement)
160
- const centerHoleRadius = 0.48; // elliptical radius threshold
161
- const smallSizeThreshold = 7.5; // same notion as Python size cut
162
- const rTotal = idx.map((i) => Math.sqrt(((X[i] - cx) / a) ** 2 + ((Y[i] - cy) / b) ** 2));
163
- const idxFiltered = idx.filter((i, k) => !(rTotal[k] <= centerHoleRadius && sizesPx[i] < smallSizeThreshold));
164
-
165
- const sel = g.selectAll('circle').data(idxFiltered, (i) => i);
166
- sel.join(
167
- (enter) => enter.append('circle')
168
- .attr('cx', (i) => xScale(X[i]))
169
- .attr('cy', (i) => yScale(Y[i]))
170
- .attr('r', (i) => sizesPx[i] / 2)
171
- .attr('fill', (i) => colorFor(Zraw[i]))
172
- .attr('fill-opacity', 0.9)
173
- .on('mouseenter', function (ev, i) {
174
- d3.select(this).raise()
175
- .style('filter', `drop-shadow(0 0 8px ${glowColor})`)
176
- .transition().duration(120).ease(d3.easeCubicOut)
177
- .attr('r', (sizesPx[i] / 2) * 1.25)
178
- .attr('fill-opacity', 1);
179
- const r = Math.sqrt(((X[i] - cx) / a) ** 2 + ((Y[i] - cy) / b) ** 2);
180
- const type = i < lenSpiral ? 'spiral' : 'bulge';
181
- const arm = i < lenSpiral ? (armIndices[i] + 1) : null;
182
- tipInner.innerHTML =
183
- `<div style="font-weight:800;letter-spacing:.1px;"><strong>${labelOf(i)}</strong></div>` +
184
- `<div style="font-size:11px;color:var(--muted-color);margin-top:-4px;margin-bottom:2px;letter-spacing:.1px;"><strong>Type</strong> ${type}${arm ? ` (Arm ${arm})` : ''}</div>` +
185
- `<div style="padding-top:6px;border-top:1px solid var(--border-color);"><strong>Position</strong> X ${X[i].toFixed(2)} · <strong>Y</strong> ${Y[i].toFixed(2)}</div>` +
186
- `<div><strong>Distance</strong> Radius ${r.toFixed(3)} · <strong>Z</strong> ${Zraw[i].toFixed(3)}</div>` +
187
- `<div><strong>Size</strong> ${sizesPx[i].toFixed(1)} px</div>`;
188
- tip.style.opacity = '1';
189
- })
190
- .on('mousemove', (ev, i) => {
191
- const [mx, my] = d3.pointer(ev, container);
192
- const offsetX = 10, offsetY = 12;
193
- tip.style.transform = `translate(${Math.round(mx + offsetX)}px, ${Math.round(my + offsetY)}px)`;
194
  })
195
- .on('mouseleave', function () {
196
- tip.style.opacity = '0';
197
- tip.style.transform = 'translate(-9999px, -9999px)';
198
- d3.select(this)
199
- .style('filter', null)
200
- .transition().duration(120).ease(d3.easeCubicOut)
201
- .attr('r', (i2) => sizesPx[i2] / 2)
202
- .attr('fill-opacity', 0.9);
203
- }),
204
- (update) => update
205
- .attr('cx', (i) => xScale(X[i]))
206
- .attr('cy', (i) => yScale(Y[i]))
207
- .attr('r', (i) => sizesPx[i] / 2)
208
- .attr('fill', (i) => colorFor(Zraw[i]))
209
- .attr('fill-opacity', 0.9)
210
- .on('mouseenter', function (ev, i) {
211
- d3.select(this).raise()
212
- .style('filter', `drop-shadow(0 0 8px ${glowColor})`)
213
- .transition().duration(120).ease(d3.easeCubicOut)
214
- .attr('r', (sizesPx[i] / 2) * 1.25)
215
- .attr('fill-opacity', 1);
216
- const r = Math.sqrt(((X[i] - cx) / a) ** 2 + ((Y[i] - cy) / b) ** 2);
217
- const type = i < lenSpiral ? 'spiral' : 'bulge';
218
- const arm = i < lenSpiral ? (armIndices[i] + 1) : null;
219
- tipInner.innerHTML =
220
- `<div style="font-weight:800;letter-spacing:.1px;"><strong>${labelOf(i)}</strong></div>` +
221
- `<div style="font-size:11px;color:var(--muted-color);margin-top:-4px;margin-bottom:2px;letter-spacing:.1px;"><strong>Type</strong> ${type}${arm ? ` (Arm ${arm})` : ''}</div>` +
222
- `<div style="padding-top:6px;border-top:1px solid var(--border-color);"><strong>Position</strong> X ${X[i].toFixed(2)} · <strong>Y</strong> ${Y[i].toFixed(2)}</div>` +
223
- `<div><strong>Distance</strong> Radius ${r.toFixed(3)} · <strong>Z</strong> ${Zraw[i].toFixed(3)}</div>` +
224
- `<div><strong>Size</strong> ${sizesPx[i].toFixed(1)} px</div>`;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  tip.style.opacity = '1';
226
- })
227
- .on('mousemove', (ev, i) => {
228
- const [mx, my] = d3.pointer(ev, container);
229
- const offsetX = 10, offsetY = 12;
230
- tip.style.transform = `translate(${Math.round(mx + offsetX)}px, ${Math.round(my + offsetY)}px)`;
231
- })
232
- .on('mouseleave', function () {
233
  tip.style.opacity = '0';
234
  tip.style.transform = 'translate(-9999px, -9999px)';
235
- d3.select(this)
236
- .style('filter', null)
237
- .transition().duration(120).ease(d3.easeCubicOut)
238
- .attr('r', (i2) => sizesPx[i2] / 2)
239
- .attr('fill-opacity', 0.9);
240
- })
241
- );
242
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
 
 
 
244
  // First render + resize
245
  if (window.ResizeObserver) {
246
  const ro = new ResizeObserver(() => render());
@@ -248,11 +1185,26 @@
248
  } else {
249
  window.addEventListener('resize', render);
250
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  render();
 
252
  };
253
 
254
  if (document.readyState === 'loading') {
255
  document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
256
  } else { ensureD3(bootstrap); }
257
  })();
258
- </script>
 
1
+ <div class="d3-leaderboard-chart-wrapper" style="width:100%;margin:10px 0;padding:10px 5px 5px 5px;border-radius:8px;background:var(--surface-bg);border:1px solid var(--border-color);position:relative;">
2
+ <div class="d3-leaderboard-chart" style="width:100%;aspect-ratio:2.8/1;min-height:320px;"></div>
3
+ </div>
4
+ <style>
5
+ .d3-leaderboard-chart {
6
+ position: relative;
7
+ }
8
+
9
+ .d3-leaderboard-chart .d3-tooltip {
10
+ position: absolute;
11
+ top: 0;
12
+ left: 0;
13
+ transform: translate(-9999px, -9999px);
14
+ pointer-events: none;
15
+ padding: 10px 12px;
16
+ border-radius: 12px;
17
+ font-size: 12px;
18
+ line-height: 1.35;
19
+ border: 1px solid var(--border-color);
20
+ background: var(--surface-bg);
21
+ color: var(--text-color);
22
+ box-shadow: 0 8px 32px rgba(0,0,0,.28), 0 2px 8px rgba(0,0,0,.12);
23
+ opacity: 0;
24
+ transition: opacity .12s ease;
25
+ z-index: 20;
26
+ backdrop-filter: saturate(1.12) blur(8px);
27
+ }
28
+
29
+ .d3-info-icon {
30
+ position: absolute;
31
+ bottom: 15px;
32
+ right: 15px;
33
+ width: 28px;
34
+ height: 28px;
35
+ border-radius: 50%;
36
+ background: var(--surface-bg);
37
+ border: 1px solid var(--border-color);
38
+ display: flex;
39
+ align-items: center;
40
+ justify-content: center;
41
+ cursor: pointer;
42
+ color: var(--muted-color);
43
+ transition: all 0.2s ease;
44
+ z-index: 10;
45
+ }
46
+
47
+ .d3-info-icon:hover {
48
+ color: var(--text-color);
49
+ background: var(--surface-bg);
50
+ border-color: var(--text-color);
51
+ }
52
+
53
+ .d3-info-tooltip {
54
+ position: absolute;
55
+ bottom: 50px;
56
+ right: 15px;
57
+ max-width: 400px;
58
+ padding: 16px;
59
+ background: var(--surface-bg);
60
+ border: 1px solid var(--border-color);
61
+ border-radius: 8px;
62
+ font-size: 12px;
63
+ line-height: 1.6;
64
+ color: var(--text-color);
65
+ opacity: 0;
66
+ pointer-events: none;
67
+ z-index: 10000;
68
+ transition: opacity 0.2s ease;
69
+ box-shadow: 0 4px 12px rgba(0,0,0,0.15);
70
+ backdrop-filter: saturate(1.12) blur(8px);
71
+ text-align: left;
72
+ }
73
+
74
+ .d3-leaderboard-chart .d3-tooltip__inner {
75
+ display: flex;
76
+ flex-direction: column;
77
+ gap: 6px;
78
+ min-width: 180px;
79
+ }
80
+
81
+ .d3-leaderboard-chart .d3-tooltip__inner > div:first-child {
82
+ font-weight: 800;
83
+ letter-spacing: 0.1px;
84
+ margin-bottom: 0;
85
+ }
86
+
87
+ .d3-leaderboard-chart .d3-tooltip__inner > div:nth-child(2) {
88
+ font-size: 11px;
89
+ color: var(--muted-color, #9ca3af);
90
+ display: block;
91
+ margin-top: -4px;
92
+ margin-bottom: 2px;
93
+ letter-spacing: 0.1px;
94
+ }
95
+
96
+ .d3-leaderboard-chart .d3-tooltip__inner > div:nth-child(n+3) {
97
+ padding-top: 6px;
98
+ border-top: 1px solid var(--border-color);
99
+ }
100
+
101
+ .d3-leaderboard-chart .d3-tooltip__color-dot {
102
+ display: inline-block;
103
+ width: 12px;
104
+ height: 12px;
105
+ border-radius: 3px;
106
+ border: 1px solid var(--border-color);
107
+ }
108
+ </style>
109
  <script>
110
  (() => {
111
  const ensureD3 = (cb) => {
 
124
 
125
  const bootstrap = () => {
126
  const mount = document.currentScript ? document.currentScript.previousElementSibling : null;
127
+ const container = (mount && mount.querySelector && mount.querySelector('.d3-leaderboard-chart')) || document.querySelector('.d3-leaderboard-chart');
128
  if (!container) return;
129
  if (container.dataset) {
130
  if (container.dataset.mounted === 'true') return;
131
  container.dataset.mounted = 'true';
132
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
/**
 * Return `n` categorical colors for the chart.
 *
 * Prefers `window.ColorPalettes.getColors('categorical', n)` when that helper
 * exists and yields a non-empty array. Otherwise falls back to d3's
 * `schemeTableau10`, or to a hard-coded copy of it when d3 does not expose it.
 *
 * The fallback palette only holds ten entries. Callers index the result by
 * position, so when more than ten colors are requested the palette is cycled
 * instead of returning a short array (which would hand out `undefined`).
 *
 * @param {number} n - Number of colors requested.
 * @returns {string[]} Array of CSS color strings.
 */
function getCategoricalColors(n) {
  try {
    if (window.ColorPalettes && typeof window.ColorPalettes.getColors === 'function') {
      const colors = window.ColorPalettes.getColors('categorical', n);
      // Only trust the helper when it actually produced something usable.
      if (Array.isArray(colors) && colors.length > 0) {
        return colors;
      }
    }
  } catch (e) {
    console.warn('ColorPalettes not available, using fallback', e);
  }

  // Fallback: Tableau10 palette
  const tableau = (window.d3 && window.d3.schemeTableau10)
    ? window.d3.schemeTableau10
    : ['#4e79a7', '#f28e2b', '#e15759', '#76b7b2', '#59a14f', '#edc948', '#b07aa1', '#ff9da7', '#9c755f', '#bab0ab'];

  // Within the palette size, behave exactly like a plain slice.
  if (!(n > tableau.length)) {
    return tableau.slice(0, n);
  }

  // More colors requested than available: wrap around the palette.
  return Array.from({ length: Math.floor(n) }, (_, i) => tableau[i % tableau.length]);
}
149
+
150
+ // Create color mapping for benchmarks
151
+ let colorMap = {};
152
+
153
+ // Define the benchmark groups once for the whole chart
154
+ // Agentic comes first so that it receives the first palette color (orange)
155
+ const BENCHMARK_GROUPS = [
156
+ {
157
+ name: 'Agentic',
158
+ benchmarks: ['GAIA']
159
+ },
160
+ {
161
+ name: 'Reasoning & Commonsense',
162
+ benchmarks: ['MUSR', 'BBH', 'Winogrande', 'TruthfulQA', 'HellaSwag']
163
+ },
164
+ {
165
+ name: 'Knowledge',
166
+ benchmarks: ['MMLU', 'MMLU-Pro', 'GPQA', 'ARC']
167
+ },
168
+ {
169
+ name: 'Math',
170
+ benchmarks: ['GSM8K', 'MATH']
171
+ },
172
+ {
173
+ name: 'Instruction following',
174
+ benchmarks: ['IFEval']
175
+ }
176
+ ];
177
+
178
/**
 * Derive a lighter or darker variation of a base color.
 *
 * The variations of a group are spread evenly over a brightness factor that
 * goes from -0.15 (first item) to +0.15 (last item). With a single item the
 * factor is -0.15.
 *
 * Accepts `#rrggbb` and `#rgb` hex colors (a trailing alpha component is
 * ignored). Any other input (for example `rgb(...)` or a CSS variable) cannot
 * be decomposed here and is returned unchanged instead of producing an
 * invalid `#NaN...` string.
 *
 * @param {string} baseColor - Base color of the group.
 * @param {number} index - Position of the item inside its group (0-based).
 * @param {number} total - Number of items in the group.
 * @returns {string} A `#rrggbb` color, or `baseColor` untouched when it is not hex.
 */
function createColorVariation(baseColor, index, total) {
  const raw = String(baseColor).trim().replace('#', '');

  let hex;
  if (/^[0-9a-f]{6}([0-9a-f]{2})?$/i.test(raw)) {
    hex = raw.slice(0, 6);
  } else if (/^[0-9a-f]{3}[0-9a-f]?$/i.test(raw)) {
    // Expand shorthand notation: "abc" -> "aabbcc"
    hex = raw[0] + raw[0] + raw[1] + raw[1] + raw[2] + raw[2];
  } else {
    return baseColor;
  }

  const r = parseInt(hex.slice(0, 2), 16);
  const g = parseInt(hex.slice(2, 4), 16);
  const b = parseInt(hex.slice(4, 6), 16);

  // Spread the brightness factor from -15% to +15% across the group
  const variationRange = 0.15;
  const step = total > 1 ? (variationRange * 2) / (total - 1) : 0;
  const brightnessAdjust = -variationRange + (index * step);

  // Move the channel by `factor` times its distance to white (255);
  // a negative factor therefore darkens, a positive one lightens.
  const adjustBrightness = (value, factor) => {
    const adjusted = value + (255 - value) * factor;
    return Math.max(0, Math.min(255, Math.round(adjusted)));
  };

  const toHex = (channel) => channel.toString(16).padStart(2, '0');

  const newR = adjustBrightness(r, brightnessAdjust);
  const newG = adjustBrightness(g, brightnessAdjust);
  const newB = adjustBrightness(b, brightnessAdjust);

  return `#${toHex(newR)}${toHex(newG)}${toHex(newB)}`;
}
210
+
211
/**
 * Look up the group a benchmark belongs to.
 *
 * The data key `MMLU_new` is displayed as `MMLU-Pro`, so it is translated
 * before searching the group definitions.
 *
 * @param {string} benchmark - Benchmark key as found in the data.
 * @returns {string|null} Name of the matching group, or `null` when none lists it.
 */
function getBenchmarkGroup(benchmark) {
  let lookupName = benchmark;
  if (benchmark === 'MMLU_new') {
    lookupName = 'MMLU-Pro';
  }

  const owner = BENCHMARK_GROUPS.find((candidate) => candidate.benchmarks.includes(lookupName));
  return owner ? owner.name : null;
}
223
+
224
+ // Store each group's base color for the legend
225
+ let groupBaseColors = {};
226
+
227
/**
 * Rebuild the group -> base color and benchmark -> color lookups.
 *
 * Groups are considered in BENCHMARK_GROUPS order, and only groups that have
 * at least one benchmark in `benchmarks` receive a palette color. Inside a
 * group, each present benchmark gets a brightness variation of the group's
 * base color, keyed by its data key (`MMLU_new` for the `MMLU-Pro` entry).
 *
 * @param {string[]} benchmarks - Benchmark data keys present in the dataset.
 */
function updateColorMap(benchmarks) {
  // Names of the groups that actually appear in the data, in declaration order
  const presentGroups = BENCHMARK_GROUPS
    .map((group) => group.name)
    .filter((name) => benchmarks.some((key) => getBenchmarkGroup(key) === name));

  // One palette entry per group that is present
  const palette = getCategoricalColors(presentGroups.length);

  // group -> base color (used by the legend)
  groupBaseColors = {};
  for (let i = 0; i < presentGroups.length; i += 1) {
    groupBaseColors[presentGroups[i]] = palette[i];
  }

  // benchmark data key -> variation of its group's base color
  colorMap = {};
  for (const group of BENCHMARK_GROUPS) {
    const baseColor = groupBaseColors[group.name];
    if (!baseColor) continue;

    // Data keys of this group's benchmarks that exist in the dataset;
    // the 'MMLU-Pro' entry is stored under the 'MMLU_new' key.
    const dataKeys = group.benchmarks
      .map((label) => (label === 'MMLU-Pro' ? 'MMLU_new' : label))
      .filter((key) => benchmarks.includes(key));

    dataKeys.forEach((key, position) => {
      colorMap[key] = createColorVariation(baseColor, position, dataKeys.length);
    });
  }
}
276
+
277
/**
 * Return the base color assigned to a group (used by the legend).
 *
 * @param {string} groupName - Name of the benchmark group.
 * @returns {string} The stored base color, or '#000000' when the group has none.
 */
function getGroupBaseColor(groupName) {
  const stored = groupBaseColors[groupName];
  if (stored) {
    return stored;
  }
  return '#000000';
}
281
+
282
/**
 * Resolve the color used to draw a benchmark.
 *
 * Resolution order:
 *   1. the per-benchmark entry in `colorMap`;
 *   2. the palette color at the position of the benchmark's group in
 *      BENCHMARK_GROUPS;
 *   3. a palette color picked from the first character code of the name.
 *
 * An empty or missing benchmark name used to produce a NaN index in step 3
 * and therefore `undefined`; it now maps to the first palette entry. If the
 * palette itself is empty, '#000000' is returned.
 *
 * @param {string} benchmark - Benchmark data key.
 * @returns {string} A CSS color string.
 */
function getColor(benchmark) {
  if (colorMap && colorMap[benchmark]) {
    return colorMap[benchmark];
  }

  // Fallback when the benchmark is not in the map: use its group's color
  const group = getBenchmarkGroup(benchmark);
  if (group) {
    const groupPalette = getCategoricalColors(BENCHMARK_GROUPS.length);
    const groupIndex = BENCHMARK_GROUPS.findIndex((g) => g.name === group);
    if (groupIndex >= 0 && groupPalette.length > 0) {
      return groupPalette[groupIndex % groupPalette.length];
    }
  }

  // Last resort: pick a palette entry from the first character of the name
  const palette = getCategoricalColors(10);
  if (!palette || palette.length === 0) {
    return '#000000';
  }
  const name = String(benchmark || '');
  // charCodeAt(0) is NaN on an empty string, so default to 0 in that case
  const code = name.length > 0 ? name.charCodeAt(0) : 0;
  return palette[code % palette.length];
}
302
+
303
+ let data = null;
304
+ let scatterData = null; // Données du nuage de points
305
+
306
+ // Create the info icon at the bottom-right of the embed (HTML, not SVG)
307
+ const wrapper = container.closest('.d3-leaderboard-chart-wrapper');
308
+ if (wrapper) {
309
+ let infoIcon = wrapper.querySelector('.d3-info-icon');
310
+ if (!infoIcon) {
311
+ infoIcon = document.createElement('div');
312
+ infoIcon.className = 'd3-info-icon';
313
+ infoIcon.innerHTML = `
314
+ <svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
315
+ <path d="M8 6V8M8 10H8.01" stroke="currentColor" stroke-width="1.5" stroke-linecap="round"/>
316
+ </svg>
317
+ `;
318
+ wrapper.appendChild(infoIcon);
319
+
320
+ // Tooltip pour l'icône info
321
+ let infoTooltip = wrapper.querySelector('.d3-info-tooltip');
322
+ if (!infoTooltip) {
323
+ infoTooltip = document.createElement('div');
324
+ infoTooltip.className = 'd3-info-tooltip';
325
+ infoTooltip.innerHTML = `
326
+ <div style="font-weight: 600; margin-bottom: 10px; color: var(--text-color); font-size: 13px; text-align: left;">About this chart</div>
327
+ <div style="color: var(--text-color); font-size: 12px; line-height: 1.6; text-align: left;">
328
+ <p style="margin: 0 0 10px 0; text-align: left;">
329
+ This visualization tracks the evolution of top benchmark scores over time across multiple evaluation frameworks.
330
+ The step-like lines represent the progression of maximum scores achieved for each benchmark, with circular markers
331
+ indicating when a new record was set.
332
+ </p>
333
+ <p style="margin: 0 0 10px 0; text-align: left;">
334
+ The gray scatter plot in the background shows the average scores of all evaluated models, providing context for
335
+ the top performers. Each point represents a model's average performance across all benchmarks at a given time.
336
+ </p>
337
+ <p style="margin: 0; text-align: left;">
338
+ Benchmarks are grouped by category (Reasoning & Commonsense, Knowledge, Math, Agentic, and Instruction following),
339
+ with each group sharing a color family. Variations within a group use different shades of the same base color.
340
+ </p>
341
+ </div>
342
+ `;
343
+ wrapper.appendChild(infoTooltip);
344
+ }
345
+
346
+ // Gestion du hover
347
+ infoIcon.addEventListener('mouseenter', () => {
348
+ infoTooltip.style.opacity = '1';
349
+ infoTooltip.style.pointerEvents = 'auto';
350
+ });
351
+
352
+ infoIcon.addEventListener('mouseleave', () => {
353
+ infoTooltip.style.opacity = '0';
354
+ infoTooltip.style.pointerEvents = 'none';
355
+ });
356
+ }
357
+ }
358
+
359
+ // Tooltip pour les points
360
+ let tip = container.querySelector('.d3-tooltip');
361
+ let tipInner;
362
+ if (!tip) {
363
+ tip = document.createElement('div');
364
+ tip.className = 'd3-tooltip';
365
+ tipInner = document.createElement('div');
366
+ tipInner.className = 'd3-tooltip__inner';
367
+ tip.appendChild(tipInner);
368
+ container.appendChild(tip);
369
+ } else {
370
+ tipInner = tip.querySelector('.d3-tooltip__inner') || tip;
371
+ }
372
+
373
/**
 * Load every dataset used by the chart and publish the result to the
 * enclosing `data` and `scatterData` variables.
 *
 * Datasets (each tried against several candidate URLs, first success wins):
 *   - new leaderboard scores, old leaderboard scores, GAIA scores
 *     (read as `{ benchmarks: { <name>: [{ date, score, ... }] } }`);
 *   - leaderboard scatter points and GAIA scatter points
 *     (read as `{ points: [{ date, average_score, ... }] }`).
 *
 * GAIA values are multiplied by 100 (0-1 -> 0-100). When a benchmark exists in
 * both the old and the new leaderboard, it is split at 2024-06-01: the old
 * period keeps the benchmark name, the new period is stored as `<name>_new`,
 * and each period is reduced to its successive record scores.
 *
 * @returns {Promise<void>}
 */
const loadData = async () => {
  // Date separating the old leaderboard period from the new one.
  const transitionDate = new Date('2024-06-01');

  // Candidate URLs for a data file; the embed can be mounted at several depths.
  const candidatePaths = (fileName, includeAssetsRoot = false) => [
    ...(includeAssetsRoot ? [`/assets/data/${fileName}`] : []),
    `/data/${fileName}`,
    `./assets/data/${fileName}`,
    `../assets/data/${fileName}`,
    `../../assets/data/${fileName}`
  ];

  // Parsed JSON of the first candidate answering with an OK status, else null.
  const fetchFirstJson = async (paths) => {
    for (const path of paths) {
      try {
        const response = await fetch(path, { cache: 'no-cache' });
        if (response.ok) {
          return await response.json();
        }
      } catch (err) {
        // Best effort: a failing candidate is expected, move on to the next.
      }
    }
    return null;
  };

  // Successive records (running maximum) of a series. Points are ordered by
  // date first so that an unsorted file cannot hide earlier records.
  const toRecords = (points) => {
    const ordered = [...points].sort((a, b) => new Date(a.date) - new Date(b.date));
    const records = [];
    let best = 0;
    ordered.forEach((point) => {
      if (point.score > best) {
        best = point.score;
        records.push({ date: point.date, score: best });
      }
    });
    return records;
  };

  // The five datasets are independent, so they are fetched concurrently.
  const [newData, oldData, gaiaData, baseScatter, gaiaScatter] = await Promise.all([
    fetchFirstJson(candidatePaths('leaderboard_scores_over_time.json')),
    fetchFirstJson(candidatePaths('leaderboard_scores_over_time_old.json')),
    fetchFirstJson(candidatePaths('data_gaia.json')),
    fetchFirstJson(candidatePaths('leaderboard_scatter_plot.json', true)),
    fetchFirstJson(candidatePaths('data_gaia_points.json', true))
  ]);

  // Merge the score datasets.
  const benchmarks = {};

  if (newData) {
    Object.assign(benchmarks, newData.benchmarks || {});
  }

  if (gaiaData && gaiaData.benchmarks) {
    Object.assign(benchmarks, gaiaData.benchmarks);
    // Convert GAIA scores from 0-1 to 0-100.
    if (Array.isArray(gaiaData.benchmarks.GAIA)) {
      benchmarks.GAIA = gaiaData.benchmarks.GAIA.map((point) => ({
        ...point,
        score: point.score * 100
      }));
    }
  }

  if (oldData) {
    Object.entries(oldData.benchmarks || {}).forEach(([benchmark, oldSeries]) => {
      const currentSeries = benchmarks[benchmark];

      // Benchmark only present in the old data: add it as is.
      if (!currentSeries) {
        benchmarks[benchmark] = oldSeries;
        return;
      }

      // Present in both (e.g. MMLU): keep two separate series, one per period,
      // with records computed independently for each.
      const oldRecords = toRecords(
        oldSeries.filter((p) => new Date(p.date) < transitionDate)
      );
      const newRecords = toRecords(
        currentSeries.filter((p) => new Date(p.date) >= transitionDate)
      );

      if (oldRecords.length > 0) {
        benchmarks[benchmark] = oldRecords;
      } else {
        delete benchmarks[benchmark];
      }

      if (newRecords.length > 0) {
        benchmarks[`${benchmark}_new`] = newRecords;
      }
    });
  }

  if (Object.keys(benchmarks).length === 0) {
    console.warn('Could not load leaderboard data, using empty dataset');
  }
  data = { benchmarks };

  // Merge the scatter datasets (GAIA points are appended to the leaderboard ones).
  let mergedScatter = baseScatter;
  if (gaiaScatter && gaiaScatter.points) {
    // Convert GAIA scores from 0-1 to 0-100.
    const gaiaPoints = gaiaScatter.points.map((point) => ({
      ...point,
      average_score: point.average_score * 100
    }));
    if (!mergedScatter) {
      mergedScatter = { points: [] };
    }
    if (!mergedScatter.points) {
      mergedScatter.points = [];
    }
    mergedScatter.points = mergedScatter.points.concat(gaiaPoints);
    console.log(`✅ ${gaiaPoints.length} points GAIA ajoutés au nuage de points (total: ${mergedScatter.points.length})`);
  }
  if (mergedScatter) {
    scatterData = mergedScatter;
  }
};
578
 
579
  // Create SVG
580
  const svg = d3.select(container).append('svg')
581
  .attr('width', '100%')
582
  .style('display', 'block')
583
+ .style('cursor', 'default');
584
+
585
/**
 * Read the active theme from `<html data-theme="...">` and return the color
 * tokens the chart uses for that theme.
 *
 * @returns {{isDark: boolean, textColor: string, gridColor: string,
 *   axisColor: string, tooltipBg: string, tooltipBorder: string,
 *   tooltipText: string, tooltipMuted: string}}
 */
const getThemeColors = () => {
  const isDark = document.documentElement.getAttribute('data-theme') === 'dark';
  // Select the dark or the light variant of a token.
  const pick = (darkValue, lightValue) => (isDark ? darkValue : lightValue);

  return {
    isDark,
    textColor: pick('rgba(255,255,255,0.8)', 'rgba(0,0,0,0.6)'),
    gridColor: pick('rgba(255,255,255,0.08)', 'rgba(0,0,0,0.08)'),
    axisColor: pick('rgba(255,255,255,0.4)', 'rgba(0,0,0,0.4)'),
    tooltipBg: pick('rgba(20, 20, 25, 0.98)', 'rgba(255, 255, 255, 0.98)'),
    tooltipBorder: pick('rgba(255, 255, 255, 0.15)', 'rgba(0, 0, 0, 0.1)'),
    tooltipText: pick('rgba(255, 255, 255, 0.95)', 'rgba(0, 0, 0, 0.9)'),
    tooltipMuted: pick('rgba(255, 255, 255, 0.6)', 'rgba(0, 0, 0, 0.5)')
  };
};
599
+
600
+ let themeColors = getThemeColors();
601
+ let legendDivRef = null; // Référence à la légende pour les mises à jour de thème
602
+ let svgRef = null; // Référence au SVG pour les mises à jour de thème
603
+
604
+ // Watch for theme changes
605
+ const themeObserver = new MutationObserver(() => {
606
+ themeColors = getThemeColors();
607
+ // Legend colors are now handled by CSS variables, no manual update needed
608
+ // Update axis colors and grid
609
+ if (svgRef) {
610
+ const g = svgRef.select('g');
611
+ if (g && !g.empty()) {
612
+ g.selectAll('.axis .tick line').attr('stroke', themeColors.axisColor);
613
+ g.selectAll('.axis .tick text').attr('fill', themeColors.textColor);
614
+ g.selectAll('.grid-line').attr('stroke', themeColors.gridColor);
615
+ }
616
+ }
617
+ });
618
+ themeObserver.observe(document.documentElement, {
619
+ attributes: true,
620
+ attributeFilter: ['data-theme']
621
+ });
622
 
623
  const render = () => {
624
+ // Update theme colors before rendering
625
+ themeColors = getThemeColors();
626
+
627
+ if (!data || !data.benchmarks) {
628
+ // Show loading message
629
+ svg.selectAll('*').remove();
630
+ const width = container.clientWidth || 800;
631
+ const height = Math.max(380, Math.round(width / 2.8));
632
+ svg.attr('width', width).attr('height', height);
633
+
634
+ svg.append('text')
635
+ .attr('x', width / 2)
636
+ .attr('y', height / 2)
637
+ .attr('text-anchor', 'middle')
638
+ .attr('fill', themeColors.textColor)
639
+ .text('Chargement des données...');
640
+ return;
641
+ }
642
+
643
  const width = container.clientWidth || 800;
644
+ const height = Math.max(380, Math.round(width / 2.8));
645
  svg.attr('width', width).attr('height', height);
646
 
647
+ // Clear previous render
648
+ svg.selectAll('*').remove();
649
 
650
+ // Margins (augmenter le bottom pour la légende et plus d'espace à gauche/droite)
651
+ // Espacement uniforme à gauche et à droite
652
+ const sidePadding = 25; // Espacement uniforme pour légende et titre
653
+ const chartRightMargin = sidePadding * 2; // Double espacement à droite du chart
654
+ const margin = { top: 10, right: chartRightMargin, bottom: 150, left: 60 };
655
+ const innerWidth = width - margin.left - margin.right;
656
+ const innerHeight = height - margin.top - margin.bottom;
657
+
658
+ // Collect all dates and scores for scaling (limité à décembre 2025)
659
+ const maxDate = new Date('2025-12-31');
660
+ const allDates = [];
661
+ const allScores = [];
662
+
663
+ Object.values(data.benchmarks).forEach(benchmarkData => {
664
+ benchmarkData.forEach(point => {
665
+ const pointDate = new Date(point.date);
666
+ if (pointDate <= maxDate) {
667
+ allDates.push(pointDate);
668
+ allScores.push(point.score);
669
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
670
  });
671
+ });
672
+
673
+ if (allDates.length === 0) {
674
+ svg.append('text')
675
+ .attr('x', width / 2)
676
+ .attr('y', height / 2)
677
+ .attr('text-anchor', 'middle')
678
+ .attr('fill', themeColors.textColor)
679
+ .text('Aucune donnée disponible');
680
+ return;
681
+ }
682
+
683
+ // Scales (store in window for hover functions)
684
+ // Limiter le domaine à décembre 2025
685
+ const maxDateLimit = new Date('2025-12-31');
686
+ const dateExtent = d3.extent(allDates);
687
+ // S'assurer que le domaine ne dépasse pas décembre 2025
688
+ const domainMax = dateExtent[1] && dateExtent[1] > maxDateLimit ? maxDateLimit : (dateExtent[1] || maxDateLimit);
689
+
690
+ window.bannerXScale = d3.scaleTime()
691
+ .domain([dateExtent[0] || new Date('2023-01-01'), domainMax])
692
+ .range([0, innerWidth])
693
+ .nice();
694
+
695
+ // Forcer le domaine maximum à décembre 2025 (après .nice() qui peut étendre le domaine)
696
+ const currentDomain = window.bannerXScale.domain();
697
+ if (currentDomain[1] > maxDateLimit) {
698
+ window.bannerXScale.domain([currentDomain[0], maxDateLimit]);
699
+ }
700
+
701
+ // Réappliquer nice() seulement sur le min si nécessaire, mais garder le max à décembre 2025
702
+ const finalDomain = window.bannerXScale.domain();
703
+ window.bannerXScale.domain([finalDomain[0], maxDateLimit]);
704
+
705
+ const xScale = window.bannerXScale;
706
+
707
+ const yScale = d3.scaleLinear()
708
+ .domain([0, Math.max(100, d3.max(allScores) * 1.1)])
709
+ .range([innerHeight, 0])
710
+ .nice();
711
+
712
+ // Create main group
713
+ const g = svg.append('g')
714
+ .attr('transform', `translate(${margin.left},${margin.top})`);
715
+
716
+ // Stocker la référence au SVG pour les mises à jour de thème
717
+ svgRef = svg;
718
+
719
+ // Hover layer for tooltip
720
+ const gHover = g.append('g').attr('class', 'hover-layer');
721
+ let hoverLine = null;
722
+
723
+ // Grid lines
724
+ const yTicks = yScale.ticks(5);
725
+ g.selectAll('.grid-line')
726
+ .data(yTicks)
727
+ .enter().append('line')
728
+ .attr('class', 'grid-line')
729
+ .attr('x1', 0)
730
+ .attr('x2', innerWidth)
731
+ .attr('y1', d => yScale(d))
732
+ .attr('y2', d => yScale(d))
733
+ .attr('stroke', themeColors.gridColor)
734
+ .attr('stroke-width', 1)
735
+ .attr('stroke-dasharray', '2,2');
736
+
737
+
738
+ // Line generator - courbe en escalier (step) pour afficher des seuils successifs
739
+ // La ligne reste constante jusqu'au prochain point
740
+ const line = d3.line()
741
+ .x(d => xScale(new Date(d.date)))
742
+ .y(d => yScale(d.score))
743
+ .curve(d3.curveStepAfter); // Step function : reste constante jusqu'au prochain point
744
+
745
// Data keys of the series to draw, in alphabetical order.
// MMLU is split into two series: the old period under `MMLU` and the new
// period under `MMLU_new` (displayed as "MMLU-Pro").
const benchmarks = Object.keys(data.benchmarks).sort();

// `MMLU_new` is normally already part of the keys above; it is only appended
// when missing, so the series is never listed (and therefore drawn) twice.
if (data.benchmarks['MMLU_new'] && !benchmarks.includes('MMLU_new')) {
  benchmarks.push('MMLU_new');
}
753
+
754
+ // Update color map avec tous les benchmarks (les couleurs seront assignées par groupe)
755
+ updateColorMap(benchmarks);
756
+
757
// Highlight helpers.

/**
 * Emphasize the series whose display name matches `highlightedBenchmark`
 * and ghost every other series (line, markers and legend entry).
 *
 * @param {string} highlightedBenchmark - Display name (e.g. `MMLU-Pro`).
 */
const highlightBenchmark = (highlightedBenchmark) => {
  for (const benchmark of benchmarks) {
    const displayName = benchmark === 'MMLU_new' ? 'MMLU-Pro' : benchmark;
    const isTarget = displayName === highlightedBenchmark;

    const lineSelection = g.selectAll(`.line-${benchmark}`)
      .style('opacity', isTarget ? 1 : 0.15);
    if (isTarget) {
      // Only the highlighted line gets thicker; ghosted lines keep their width.
      lineSelection.attr('stroke-width', 3);
    }
    g.selectAll(`.marker-${benchmark}`).style('opacity', isTarget ? 1 : 0.15);
    g.selectAll(`.legend-${displayName}`).style('opacity', isTarget ? 1 : 0.3);
  }
};
774
+
775
/**
 * Restore the default appearance of every series: line opacity 0.9 and
 * stroke width 2.5, markers and legend entries fully opaque.
 */
const resetHighlight = () => {
  for (const benchmark of benchmarks) {
    const lineSelection = g.selectAll(`.line-${benchmark}`);
    lineSelection.style('opacity', 0.9).attr('stroke-width', 2.5);

    const markerSelection = g.selectAll(`.marker-${benchmark}`);
    markerSelection.style('opacity', 1);

    // `MMLU_new` is shown under the display name "MMLU-Pro".
    const displayName = benchmark === 'MMLU_new' ? 'MMLU-Pro' : benchmark;
    g.selectAll(`.legend-${displayName}`).style('opacity', 1);
  }
};
783
+
784
+ // Ajouter le nuage de points EN PREMIER (en dessous de tout)
785
+ if (scatterData && scatterData.points && scatterData.points.length > 0) {
786
+ // Utiliser les dates déjà filtrées (allDates est défini plus haut)
787
+ if (allDates.length > 0) {
788
+ const minDate = d3.min(allDates);
789
+ const maxDateLimit = new Date('2025-12-31'); // Limiter à décembre 2025
790
+ const maxDate = d3.min([d3.max(allDates), maxDateLimit]);
791
+
792
+ const filteredPoints = scatterData.points.filter(p => {
793
+ const pointDate = new Date(p.date);
794
+ return pointDate >= minDate && pointDate <= maxDateLimit;
795
+ });
796
+
797
+ // Debug: compter les points GAIA dans les points filtrés
798
+ const gaiaFilteredCount = filteredPoints.filter(p => p.leaderboard === 'gaia').length;
799
+ if (gaiaFilteredCount > 0) {
800
+ console.log(`✅ ${gaiaFilteredCount} points GAIA affichés dans le nuage (sur ${filteredPoints.length} points totaux)`);
801
+ }
802
+
803
+ // Créer un groupe pour le nuage de points (en arrière-plan, en premier)
804
+ const scatterGroup = g.append('g')
805
+ .attr('class', 'scatter-points');
806
+
807
+ // Ajouter les points
808
+ scatterGroup.selectAll('.scatter-point')
809
+ .data(filteredPoints)
810
+ .enter().append('circle')
811
+ .attr('class', d => `scatter-point scatter-${d.leaderboard}`)
812
+ .attr('cx', d => xScale(new Date(d.date)))
813
+ .attr('cy', d => {
814
+ // Utiliser average_score_raw si disponible (nouveau leaderboard), sinon average_score
815
+ const score = d.average_score_raw !== undefined ? d.average_score_raw : d.average_score;
816
+ return yScale(score);
817
+ })
818
+ .attr('r', 0.75) // Taille réduite pour moins de bruit visuel
819
+ .style('fill', 'var(--grid-color)') // Utiliser la variable CSS qui s'adapte au thème
820
+ .style('opacity', '0.5') // Opacité supplémentaire pour light mode (moins visible)
821
+ .attr('stroke', 'none')
822
+ .style('pointer-events', 'none'); // Ne pas interférer avec les interactions
823
+ }
824
+ }
825
+
826
+ benchmarks.forEach(benchmark => {
827
+ const points = data.benchmarks[benchmark];
828
+ if (!points || points.length === 0) return;
829
+
830
+ // Filtrer les points jusqu'en décembre 2025
831
+ const maxDate = new Date('2025-12-31');
832
+ const filteredPoints = points.filter(p => new Date(p.date) <= maxDate);
833
+
834
+ // S'assurer que les points sont triés par date pour le step chart
835
+ const sortedPoints = [...filteredPoints].sort((a, b) => new Date(a.date) - new Date(b.date));
836
+
837
+ // For MMLU_new, display as MMLU-Pro
838
+ const displayName = benchmark === 'MMLU_new' ? 'MMLU-Pro' : benchmark;
839
+ // Utiliser la couleur du groupe (getColor gère déjà MMLU_new -> MMLU-Pro)
840
+ const color = getColor(benchmark);
841
 
842
+ const path = g.append('path')
843
+ .datum(sortedPoints)
844
+ .attr('fill', 'none')
845
+ .attr('stroke', color)
846
+ .attr('stroke-width', 2.5)
847
+ .attr('d', line)
848
+ .attr('class', `line-${benchmark}`)
849
+ .style('opacity', 0.9)
850
+ .style('cursor', 'pointer');
851
+
852
+ // Add markers at every data point for step chart
853
+ g.selectAll(`.marker-${benchmark}`)
854
+ .data(sortedPoints)
855
+ .enter().append('circle')
856
+ .attr('class', `marker-${benchmark}`)
857
+ .attr('cx', d => xScale(new Date(d.date)))
858
+ .attr('cy', d => yScale(d.score))
859
+ .attr('r', 3.5)
860
+ .attr('fill', color)
861
+ .attr('stroke', 'none')
862
+ .style('cursor', 'pointer')
863
+ .on('mouseenter', function(ev, d) {
864
+ showPointTooltip(ev, d, displayName, color);
 
 
 
 
 
 
 
 
 
 
 
 
865
  })
866
+ .on('mouseleave', function() {
867
+ hidePointTooltip();
868
+ });
869
+
870
+ // Hover sur la ligne - afficher tooltip avec le nom du benchmark
871
+ path.on('mouseenter', function(ev) {
872
+ highlightBenchmark(displayName);
873
+ showLineTooltip(ev, displayName, color);
874
+ }).on('mouseleave', function() {
875
+ resetHighlight();
876
+ hideLineTooltip();
877
+ });
878
+ });
879
+
880
+ // X axis
881
+ const xAxis = d3.axisBottom(xScale)
882
+ .ticks(6)
883
+ .tickFormat(d3.timeFormat('%b %Y'))
884
+ .tickSizeOuter(0)
885
+ .tickSize(6) // Taille des barres des tics
886
+ .tickPadding(8); // Plus d'espace entre les ticks et les labels
887
+
888
+ g.append('g')
889
+ .attr('class', 'axis axis-x')
890
+ .attr('transform', `translate(0,${innerHeight})`)
891
+ .call(xAxis)
892
+ .call(g => g.select('.domain').remove()) // Enlever la barre de bordure
893
+ .call(g => g.selectAll('.tick line').attr('stroke', themeColors.axisColor).attr('stroke-width', 1))
894
+ .call(g => g.selectAll('.tick text').attr('fill', themeColors.textColor).attr('font-size', '11px').attr('opacity', 0.6));
895
+
896
+ // Y axis
897
+ const yAxis = d3.axisLeft(yScale)
898
+ .ticks(5)
899
+ .tickFormat(d => d + '%')
900
+ .tickSizeOuter(0)
901
+ .tickSize(6) // Taille des barres des tics
902
+ .tickPadding(8); // Plus d'espace entre les ticks et les labels
903
+
904
+ g.append('g')
905
+ .attr('class', 'axis axis-y')
906
+ .call(yAxis)
907
+ .call(g => g.select('.domain').remove()) // Enlever la barre de bordure
908
+ .call(g => g.selectAll('.tick line').attr('stroke', themeColors.axisColor).attr('stroke-width', 1))
909
+ .call(g => g.selectAll('.tick text').attr('fill', themeColors.textColor).attr('font-size', '11px').attr('opacity', 0.6));
910
+
911
+
912
+
913
+ const legendY = innerHeight + 70; // Plus d'espace entre graphique et légende
914
+
915
+ // Créer un conteneur foreignObject pour utiliser flexbox
916
+ // Augmenter la hauteur pour accommoder tous les éléments (6 items * ~20px + gaps + title)
917
+ const legendContainer = g.append('foreignObject')
918
+ .attr('x', -90) // Aligné à gauche
919
+ .attr('y', legendY - 30)
920
+ .attr('width', innerWidth + margin.left + margin.right) // Toute la largeur
921
+ .attr('height', 200); // Hauteur suffisante pour 6 items + titre + gaps
922
+
923
+ const legendWrapper = legendContainer.append('xhtml:div')
924
+ .style('display', 'flex')
925
+ .style('flex-direction', 'column')
926
+ .style('align-items', 'flex-start')
927
+ .style('width', '100%')
928
+ .style('padding', '12px 54px');
929
+
930
+ // Label "legend" au-dessus
931
+ const legendLabel = legendWrapper.append('xhtml:div')
932
+ .style('font-size', '12px')
933
+ .style('font-weight', '600')
934
+ .style('color', 'var(--text-color)')
935
+ .style('opacity', '0.8')
936
+ .style('margin-bottom', '8px')
937
+ .text('Legend');
938
+
939
+ const legendDiv = legendWrapper.append('xhtml:div')
940
+ .style('display', 'flex')
941
+ .style('flex-direction', 'row') // Une seule ligne horizontale avec tous les groupes
942
+ .style('align-items', 'flex-start')
943
+ .style('justify-content', 'flex-start')
944
+ .style('gap', '30px') // Gap entre les groupes
945
+ .style('width', '100%')
946
+ .style('flex-wrap', 'wrap') // Permettre le wrap si nécessaire
947
+ .style('color', 'var(--text-color)') // Utiliser la variable CSS pour le dark mode
948
+ .style('background-color', 'transparent'); // Fond transparent
949
+
950
+ // Stocker la référence pour les mises à jour de thème
951
+ legendDivRef = legendWrapper;
952
+
953
+ // Filtrer les groupes pour ne garder que ceux qui ont des benchmarks présents dans les données
954
+ const filteredGroups = BENCHMARK_GROUPS.map(group => {
955
+ const availableBenchmarks = group.benchmarks.filter(benchmark => {
956
+ // Gérer MMLU-Pro qui est stocké comme MMLU_new
957
+ const dataKey = benchmark === 'MMLU-Pro' ? 'MMLU_new' : benchmark;
958
+
959
+ if (!data || !data.benchmarks) return false;
960
+ if (!data.benchmarks[dataKey]) return false;
961
+
962
+ const benchmarkData = data.benchmarks[dataKey];
963
+ return Array.isArray(benchmarkData) && benchmarkData.length > 0;
964
+ });
965
+
966
+ return {
967
+ name: group.name,
968
+ benchmarks: availableBenchmarks
969
+ };
970
+ }).filter(group => group.benchmarks.length > 0);
971
+
972
// Build the legend: one entry per benchmark group. Only the group title is
// shown; the member benchmarks are listed in a tooltip on hover.
filteredGroups.forEach((group) => {
  const groupColor = getGroupBaseColor(group.name); // base color of the group

  // Container of the group entry.
  const groupDiv = legendDiv.append('xhtml:div')
    .style('display', 'flex')
    .style('align-items', 'center')
    .style('gap', '8px')
    .style('cursor', 'pointer')
    .style('position', 'relative')
    .attr('class', `legend-group legend-group-${group.name.replace(/\s+/g, '-')}`);

  // Color swatch.
  groupDiv.append('xhtml:div')
    .style('width', '14px')
    .style('height', '14px')
    .style('border-radius', '3px')
    .style('background-color', groupColor)
    .style('flex-shrink', '0');

  // Group title.
  groupDiv.append('xhtml:span')
    .style('font-size', '11px')
    .style('font-weight', '600')
    .style('color', 'var(--text-color)')
    .style('opacity', '0.8')
    .style('white-space', 'nowrap')
    .text(group.name);

  // The tooltip lives outside the foreignObject (directly in the container)
  // to avoid positioning issues; it is created once and then reused.
  let legendTooltip = container.querySelector('.d3-legend-tooltip');
  if (!legendTooltip) {
    legendTooltip = d3.select(container).append('div')
      .attr('class', 'd3-legend-tooltip')
      .style('position', 'absolute')
      .style('padding', '8px 12px')
      .style('background', 'var(--surface-bg)')
      .style('border', '1px solid var(--border-color)')
      .style('border-radius', '6px')
      .style('box-shadow', '0 4px 12px rgba(0,0,0,0.15)')
      .style('font-size', '10px')
      .style('color', 'var(--text-color)')
      .style('white-space', 'nowrap')
      .style('opacity', '0')
      .style('pointer-events', 'none')
      .style('z-index', '10000')
      .style('transition', 'opacity 0.2s ease')
      .style('backdrop-filter', 'saturate(1.12) blur(8px)')
      .node();
  }

  // Display names of the series belonging to this group.
  const groupMembers = new Set(group.benchmarks);

  // Hover: show the tooltip and highlight every benchmark of the group.
  groupDiv.on('mouseenter', function () {
    const tooltip = d3.select(legendTooltip);
    tooltip
      .text(group.benchmarks.join(', '))
      .style('opacity', '1');

    // Center the tooltip above the hovered entry (container coordinates).
    const rect = this.getBoundingClientRect();
    const containerRect = container.getBoundingClientRect();
    const tooltipRect = legendTooltip.getBoundingClientRect();

    const left = rect.left - containerRect.left + (rect.width / 2) - (tooltipRect.width / 2);
    const top = rect.top - containerRect.top - tooltipRect.height - 8;

    tooltip
      .style('left', `${left}px`)
      .style('top', `${top}px`);

    // Highlight all members in a single pass. Calling highlightBenchmark()
    // once per member does not work: each call ghosts every other series, so
    // only the last member of the group would remain highlighted.
    benchmarks.forEach((benchmark) => {
      const displayName = benchmark === 'MMLU_new' ? 'MMLU-Pro' : benchmark;
      const inGroup = groupMembers.has(displayName);
      g.selectAll(`.line-${benchmark}`)
        .style('opacity', inGroup ? 1 : 0.15)
        .attr('stroke-width', inGroup ? 3 : 2.5);
      g.selectAll(`.marker-${benchmark}`).style('opacity', inGroup ? 1 : 0.15);
      g.selectAll(`.legend-${displayName}`).style('opacity', inGroup ? 1 : 0.3);
    });
  }).on('mouseleave', function () {
    d3.select(legendTooltip).style('opacity', '0');
    resetHighlight();
  });
});
1053
+
1054
+ };
1055
+
1056
+ // Hover functions pour les points
1057
+ let hideTipTimer = null;
1058
+
1059
/**
 * Fill and display the marker tooltip next to the pointer.
 *
 * @param {MouseEvent} ev - Pointer event; `clientX` / `clientY` are read.
 * @param {{score?: number, model?: string}} pointData - Hovered data point.
 *   `model` may contain HTML markup, of which only the text is displayed.
 * @param {string} benchmarkName - Display name of the benchmark.
 * @param {string} color - Color of the series, used for the tooltip dot.
 */
function showPointTooltip(ev, pointData, benchmarkName, color) {
  // Cancel a pending hide so the tooltip does not vanish right after showing.
  if (hideTipTimer) {
    clearTimeout(hideTipTimer);
    hideTipTimer = null;
  }

  // Update theme colors
  themeColors = getThemeColors();

  // Escape a value before interpolating it into the tooltip markup.
  const htmlEscapes = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => htmlEscapes[ch]);

  // Extract the model name. When it contains markup, keep only its text.
  // DOMParser builds an inert document, so unlike `element.innerHTML` no
  // resource is loaded and no inline handler can run while parsing.
  let modelName = String(pointData.model || 'N/A');
  if (modelName.includes('<')) {
    const parsed = new DOMParser().parseFromString(modelName, 'text/html');
    modelName = (parsed.body && parsed.body.textContent) || 'N/A';
  }

  // A missing / non-numeric score is shown as N/A instead of throwing.
  const scoreLabel = Number.isFinite(pointData.score)
    ? `${pointData.score.toFixed(2)}%`
    : 'N/A';

  // Tooltip content (left-aligned): benchmark header, score, model.
  const html = `
    <div style="display:flex;align-items:center;gap:8px;margin-bottom:12px;padding-bottom:12px;border-bottom:1px solid var(--border-color);">
      <span class="d3-tooltip__color-dot" style="background:${escapeHtml(color)};width:12px;height:12px;border-radius:2px;flex-shrink:0;"></span>
      <span style="font-weight:600;font-size:13px;color:var(--text-color);">${escapeHtml(benchmarkName)}</span>
    </div>
    <div style="margin-bottom:10px;text-align:left;">
      <div style="font-size:10px;color:var(--muted-color);text-transform:uppercase;letter-spacing:0.05em;margin-bottom:5px;">Score</div>
      <div style="font-size:18px;font-weight:700;color:var(--text-color);line-height:1.2;text-align:left;">${scoreLabel}</div>
    </div>
    <div style="text-align:left;">
      <div style="font-size:10px;color:var(--muted-color);text-transform:uppercase;letter-spacing:0.05em;margin-bottom:5px;">Model</div>
      <div style="font-size:12px;color:var(--text-color);line-height:1.5;word-break:break-word;text-align:left;">${escapeHtml(modelName)}</div>
    </div>
  `;

  tipInner.innerHTML = html;

  // Background and border follow the theme through CSS variables.
  tip.style.background = 'var(--surface-bg)';
  tip.style.borderColor = 'var(--border-color)';

  // Position the tooltip relative to the pointer, in container coordinates.
  const rect = container.getBoundingClientRect();
  const tipRect = tip.getBoundingClientRect();
  const offsetX = 15;
  const offsetY = -10;

  const tipX = ev.clientX - rect.left + offsetX;
  const tipY = ev.clientY - rect.top + offsetY;

  // Keep the tooltip from running off the screen.
  const maxX = window.innerWidth - tipRect.width - 20;
  const maxY = window.innerHeight - tipRect.height - 20;
  const finalX = Math.min(tipX, maxX);
  const finalY = Math.max(10, Math.min(tipY, maxY));

  tip.style.opacity = '1';
  tip.style.transform = `translate(${Math.round(finalX)}px, ${Math.round(finalY)}px)`;
}
1118
+
1119
/**
 * Hide the marker tooltip after a 100 ms grace period. The delay lets
 * showPointTooltip() cancel the hide when the pointer moves straight onto
 * another marker.
 */
function hidePointTooltip() {
  // Never stack timers: a second call replaces the pending hide, so the
  // handle kept in `hideTipTimer` is always the only one in flight.
  if (hideTipTimer) {
    clearTimeout(hideTipTimer);
  }
  hideTipTimer = setTimeout(() => {
    hideTipTimer = null; // the timer has fired, drop the stale handle
    tip.style.opacity = '0';
    tip.style.transform = 'translate(-9999px, -9999px)';
  }, 100);
}
1125
+
1126
+ // Tooltip pour les lignes (nom du benchmark uniquement)
1127
+ let lineTooltip = null;
1128
+
1129
/**
 * Show a small label with the benchmark name next to the pointer while a
 * line is hovered. The tooltip element is created on first use and stored in
 * `lineTooltip`.
 *
 * @param {MouseEvent} ev - Pointer event; `clientX` / `clientY` are read.
 * @param {string} benchmarkName - Text displayed in the tooltip.
 * @param {string} color - Series color (currently unused by this function).
 */
function showLineTooltip(ev, benchmarkName, color) {
  if (!lineTooltip) {
    // Inline styles of the tooltip, applied in declaration order.
    const tooltipStyles = {
      'position': 'absolute',
      'padding': '6px 10px',
      'background': 'var(--surface-bg)',
      'border': '1px solid var(--border-color)',
      'border-radius': '4px',
      'font-size': '11px',
      'font-weight': '600',
      'color': 'var(--text-color)',
      'white-space': 'nowrap',
      'opacity': '0',
      'pointer-events': 'none',
      'z-index': '10000',
      'transition': 'opacity 0.15s ease',
      'box-shadow': '0 2px 8px rgba(0,0,0,0.1)'
    };
    const created = d3.select(container).append('div').attr('class', 'd3-line-tooltip');
    lineTooltip = Object.entries(tooltipStyles)
      .reduce((selection, [property, value]) => selection.style(property, value), created)
      .node();
  }

  const tooltipSelection = d3.select(lineTooltip);
  tooltipSelection.text(benchmarkName);

  // Pointer position in container coordinates, shifted up and to the right.
  const containerBox = container.getBoundingClientRect();
  const tooltipBox = lineTooltip.getBoundingClientRect();
  const rawX = ev.clientX - containerBox.left + 10;
  const rawY = ev.clientY - containerBox.top - 25;

  // Keep the tooltip from running off the screen.
  const limitX = window.innerWidth - tooltipBox.width - 20;
  const limitY = window.innerHeight - tooltipBox.height - 20;
  const clampedX = Math.min(rawX, limitX);
  const clampedY = Math.max(10, Math.min(rawY, limitY));

  tooltipSelection
    .style('opacity', '1')
    .style('left', `${clampedX}px`)
    .style('top', `${clampedY}px`);
}
1172
+
1173
/**
 * Fade out the line tooltip. Does nothing when the tooltip has not been
 * created yet.
 */
function hideLineTooltip() {
  if (!lineTooltip) {
    return;
  }
  const tooltipSelection = d3.select(lineTooltip);
  tooltipSelection.style('opacity', '0');
}
1178
 
1179
+ // Load data and render
1180
+ loadData().then(() => {
1181
  // First render + resize
1182
  if (window.ResizeObserver) {
1183
  const ro = new ResizeObserver(() => render());
 
1185
  } else {
1186
  window.addEventListener('resize', render);
1187
  }
1188
+
1189
+ // Re-render when color palette changes (if ColorPalettes supports events)
1190
+ if (window.ColorPalettes && typeof window.ColorPalettes.refresh === 'function') {
1191
+ // Update colors and re-render when palette refreshes
1192
+ const updateColors = () => {
1193
+ if (data && data.benchmarks) {
1194
+ updateColorMap(Object.keys(data.benchmarks).sort());
1195
+ render();
1196
+ }
1197
+ };
1198
+ // Listen for custom event if available
1199
+ window.addEventListener('colorpalettechange', updateColors);
1200
+ }
1201
+
1202
  render();
1203
+ });
1204
  };
1205
 
1206
  if (document.readyState === 'loading') {
1207
  document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
1208
  } else { ensureD3(bootstrap); }
1209
  })();
1210
+ </script>
app/src/content/embeds/d3-mmlu-heatmap.html ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="d3-mmlu-heatmap">
2
+ <div class="heatmap-container"></div>
3
+ <div class="legend-container"></div>
4
+ </div>
5
+ <style>
6
+ .d3-mmlu-heatmap {
7
+ position: relative;
8
+ margin: 24px 0;
9
+ }
10
+
11
+ .d3-mmlu-heatmap .heatmap-container {
12
+ width: 100%;
13
+ }
14
+
15
+ .d3-mmlu-heatmap .legend-container {
16
+ margin-top: 8px;
17
+ padding: 0 8px;
18
+ }
19
+
20
+ .d3-mmlu-heatmap .legend-title {
21
+ font-size: 12px;
22
+ font-weight: 600;
23
+ color: var(--text-color);
24
+ margin-bottom: 12px;
25
+ text-align: center;
26
+ }
27
+
28
+ .d3-mmlu-heatmap .legend-grid {
29
+ display: grid;
30
+ grid-template-columns: 1fr 1fr;
31
+ gap: 8px 24px;
32
+ font-size: 11px;
33
+ color: var(--text-color);
34
+ }
35
+
36
+ .d3-mmlu-heatmap .legend-column {
37
+ display: flex;
38
+ flex-direction: column;
39
+ gap: 8px;
40
+ }
41
+
42
+ .d3-mmlu-heatmap .legend-item {
43
+ display: flex;
44
+ align-items: flex-start;
45
+ gap: 8px;
46
+ }
47
+
48
+ .d3-mmlu-heatmap .legend-label {
49
+ font-weight: 700;
50
+ min-width: 20px;
51
+ }
52
+
53
+ .d3-mmlu-heatmap .legend-text {
54
+ flex: 1;
55
+ line-height: 1.4;
56
+ }
57
+
58
+ .d3-mmlu-heatmap .axis-label {
59
+ fill: var(--text-color);
60
+ font-size: 11px;
61
+ font-weight: 600;
62
+ }
63
+
64
+ .d3-mmlu-heatmap .cell-text {
65
+ fill: var(--text-color);
66
+ font-size: 10px;
67
+ font-weight: 600;
68
+ pointer-events: none;
69
+ }
70
+
71
+ @media (max-width: 768px) {
72
+ .d3-mmlu-heatmap .legend-grid {
73
+ grid-template-columns: 1fr;
74
+ }
75
+ }
76
+ </style>
77
+ <script>
78
+ (() => {
79
+ // Load D3 from CDN once
80
+ const ensureD3 = (cb) => {
81
+ if (window.d3 && typeof window.d3.select === 'function') return cb();
82
+ let s = document.getElementById('d3-cdn-script');
83
+ if (!s) {
84
+ s = document.createElement('script');
85
+ s.id = 'd3-cdn-script';
86
+ s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
87
+ document.head.appendChild(s);
88
+ }
89
+ const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
90
+ s.addEventListener('load', onReady, { once: true });
91
+ if (window.d3) onReady();
92
+ };
93
+
94
+ const bootstrap = () => {
95
+ const scriptEl = document.currentScript;
96
+ let container = scriptEl ? scriptEl.previousElementSibling : null;
97
+ if (!(container && container.classList && container.classList.contains('d3-mmlu-heatmap'))) {
98
+ const cs = Array.from(document.querySelectorAll('.d3-mmlu-heatmap')).filter(el => !(el.dataset && el.dataset.mounted === 'true'));
99
+ container = cs[cs.length - 1] || null;
100
+ }
101
+ if (!container) return;
102
+ if (container.dataset) {
103
+ if (container.dataset.mounted === 'true') return;
104
+ container.dataset.mounted = 'true';
105
+ }
106
+
107
+ // Tooltip
108
+ container.style.position = container.style.position || 'relative';
109
+ let tip = container.querySelector('.d3-tooltip');
110
+ let tipInner;
111
+ if (!tip) {
112
+ tip = document.createElement('div');
113
+ tip.className = 'd3-tooltip';
114
+ Object.assign(tip.style, {
115
+ position: 'absolute',
116
+ top: '0px',
117
+ left: '0px',
118
+ transform: 'translate(-9999px, -9999px)',
119
+ pointerEvents: 'none',
120
+ padding: '8px 10px',
121
+ borderRadius: '8px',
122
+ fontSize: '12px',
123
+ lineHeight: '1.35',
124
+ border: '1px solid var(--border-color)',
125
+ background: 'var(--surface-bg)',
126
+ color: 'var(--text-color)',
127
+ boxShadow: '0 4px 24px rgba(0,0,0,.18)',
128
+ opacity: '0',
129
+ transition: 'opacity .12s ease'
130
+ });
131
+ tipInner = document.createElement('div');
132
+ tipInner.className = 'd3-tooltip__inner';
133
+ tipInner.style.textAlign = 'left';
134
+ tip.appendChild(tipInner);
135
+ container.appendChild(tip);
136
+ } else {
137
+ tipInner = tip.querySelector('.d3-tooltip__inner') || tip;
138
+ }
139
+
140
+ // Heatmap container (no card)
141
+ const heatmapContainer = container.querySelector('.heatmap-container');
142
+ const svg = d3.select(heatmapContainer).append('svg').attr('width', '100%').style('display', 'block');
143
+ const defs = svg.append('defs');
144
+ const gRoot = svg.append('g');
145
+ const gCells = gRoot.append('g');
146
+ const gAxes = gRoot.append('g');
147
+
148
+ // Data from the image (5 models)
149
+ const models = [
150
+ 'Mistral-7B-v0.1',
151
+ 'Qwen1.5-7B',
152
+ 'gemma-7b',
153
+ 'phi-2',
154
+ 'DeciLM-7B'
155
+ ];
156
+
157
+ const promptFormats = [
158
+ '...? -> choice1/choice2/...',
159
+ 'Q:...? A: -> choice1/choice2/...',
160
+ 'Question: ...? Answer: -> choice1/choice2/...',
161
+ 'Question: ...? Choices: ... Answer: -> choice1/choice2/...',
162
+ 'Question: ...? Choices: A. ... Answer: -> choice1/choice2/...',
163
+ 'Question: ...? Choices: (A) ... Answer: -> choice1/choice2/...',
164
+ 'Question: ...? Choices: A. ... Answer: -> A/B/C/D',
165
+ 'Question: ...? Choices: (A) Answer: -> (A)/(B)/(C)/(D)'
166
+ ];
167
+
168
+ const matrix = [
169
+ [49.0, 50.5, 52.1, 54.5, 56.4, 55.4, 55.5, 57.0], // Mistral-7B-v0.1
170
+ [37.6, 41.8, 43.5, 47.9, 50.8, 51.2, 22.9, 47.7], // Qwen1.5-7B
171
+ [44.6, 48.0, 47.6, 53.5, 54.2, 54.9, 56.4, 50.7], // gemma-7b
172
+ [39.1, 44.3, 46.5, 46.1, 47.1, 48.4, 51.7, 45.8], // phi-2
173
+ [43.6, 48.9, 49.5, 51.0, 51.3, 52.0, 52.8, 52.3] // DeciLM-7B
174
+ ];
175
+
176
+ // Colors: red to green palette (red for low, green for high)
177
+ const getDivergingColors = (count) => {
178
+ try {
179
+ if (window.ColorPalettes && typeof window.ColorPalettes.getColors === 'function') {
180
+ return window.ColorPalettes.getColors('diverging', count);
181
+ }
182
+ } catch (_) { }
183
+ // Fallback: red to green scale
184
+ const colors = [];
185
+ for (let i = 0; i < count; i++) {
186
+ const t = i / (count - 1);
187
+ // Red (low) -> Yellow (mid) -> Green (high)
188
+ if (t < 0.5) {
189
+ // Red to yellow
190
+ const r = 255;
191
+ const g = Math.round(t * 2 * 255);
192
+ const b = 0;
193
+ colors.push(`rgb(${r}, ${g}, ${b})`);
194
+ } else {
195
+ // Yellow to green
196
+ const t2 = (t - 0.5) * 2;
197
+ const r = Math.round(255 - t2 * 255);
198
+ const g = 255;
199
+ const b = 0;
200
+ colors.push(`rgb(${r}, ${g}, ${b})`);
201
+ }
202
+ }
203
+ return colors;
204
+ };
205
+
206
+ const palette = getDivergingColors(10);
207
+
208
+ let width = 900;
209
+ const margin = { top: 0, right: 0, bottom: 0, left: 100 }; // Only left margin for model names
210
+
211
+ function updateSize() {
212
+ width = container.clientWidth || 900;
213
+
214
+ // Calculate actual content dimensions
215
+ const nRows = models.length;
216
+ const nCols = promptFormats.length;
217
+ const innerWidth = width - margin.left - margin.right;
218
+ const maxDim = Math.max(nRows, nCols);
219
+ const availableSize = Math.min(innerWidth, 600);
220
+ const cellSize = availableSize / maxDim;
221
+ const gridWidth = cellSize * nCols;
222
+ const gridHeight = cellSize * nRows;
223
+ const labelsHeight = 15; // space for X-axis labels
224
+
225
+ // Calculate exact SVG dimensions needed
226
+ const actualWidth = margin.left + gridWidth + margin.right;
227
+ const actualHeight = margin.top + gridHeight + labelsHeight + margin.bottom;
228
+
229
+ svg
230
+ .attr('viewBox', `0 0 ${actualWidth} ${actualHeight}`)
231
+ .attr('preserveAspectRatio', 'xMidYMin meet')
232
+ .style('width', '100%')
233
+ .style('height', 'auto');
234
+
235
+ gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
236
+ return { innerWidth: gridWidth, innerHeight: gridHeight + labelsHeight };
237
+ }
238
+
239
+ function getColorScale(values, minV, maxV) {
240
+ // Always use the custom red-to-green palette (fallback)
241
+ // Don't use ColorPalettes for this specific heatmap
242
+ const linearScale = d3.scaleLinear()
243
+ .domain([minV, maxV])
244
+ .range([0, 1])
245
+ .clamp(true);
246
+
247
+ return (v) => {
248
+ const t = linearScale(v);
249
+ // Apply power transformation to emphasize extremes
250
+ // Values near min/max get more extreme colors
251
+ let transformedT;
252
+ if (t < 0.5) {
253
+ // Compress lower values, making extremes more distinct
254
+ transformedT = Math.pow(t * 2, 1.8) / 2;
255
+ } else {
256
+ // Expand upper values, making extremes more distinct
257
+ transformedT = 0.5 + Math.pow((t - 0.5) * 2, 1.8) / 2;
258
+ }
259
+
260
+ // Red to green scale: red (low scores = bad) -> yellow (mid) -> green (high scores = good)
261
+ // Less flashy: reduce saturation
262
+ if (transformedT < 0.5) {
263
+ // Red to yellow (less saturated)
264
+ const r = 220;
265
+ const g = Math.round(80 + transformedT * 2 * 140);
266
+ const b = Math.round(60 + transformedT * 2 * 40);
267
+ return `rgb(${r}, ${g}, ${b})`;
268
+ } else {
269
+ // Yellow to green (less saturated)
270
+ const t2 = (transformedT - 0.5) * 2;
271
+ const r = Math.round(220 - t2 * 100);
272
+ const g = 220;
273
+ const b = Math.round(100 - t2 * 60);
274
+ return `rgb(${r}, ${g}, ${b})`;
275
+ }
276
+ };
277
+ }
278
+
279
+ function chooseReadableTextColor(bgColor) {
280
+ try {
281
+ const m = String(bgColor || '').match(/rgb\(([^)]+)\)/);
282
+ if (!m) return '#0e1116';
283
+ const [r, g, b] = m[1].split(',').map(s => parseFloat(s.trim()));
284
+ const luminance = (0.299 * r + 0.587 * g + 0.114 * b) / 255;
285
+ return luminance < 0.5 ? '#ffffff' : '#0e1116';
286
+ } catch (_) {
287
+ return '#0e1116';
288
+ }
289
+ }
290
+
291
+ function render() {
292
+ const { innerWidth, innerHeight } = updateSize();
293
+ const nRows = models.length;
294
+ const nCols = promptFormats.length;
295
+
296
+ // Calculate cell size to make each cell square
297
+ const maxDim = Math.max(nRows, nCols);
298
+ const cellSize = innerWidth / maxDim;
299
+ const gridWidth = cellSize * nCols;
300
+ const gridHeight = cellSize * nRows;
301
+
302
+ const gridOffsetX = 0;
303
+ const gridOffsetY = 0;
304
+
305
+ const x = d3.scaleBand()
306
+ .domain(d3.range(nCols))
307
+ .range([0, gridWidth])
308
+ .paddingInner(0);
309
+
310
+ const y = d3.scaleBand()
311
+ .domain(d3.range(nRows))
312
+ .range([0, gridHeight])
313
+ .paddingInner(0);
314
+
315
+ // Flatten matrix data
316
+ const flatData = [];
317
+ let minVal = Infinity, maxVal = -Infinity;
318
+ for (let r = 0; r < nRows; r++) {
319
+ for (let c = 0; c < nCols; c++) {
320
+ const value = matrix[r][c];
321
+ if (value < minVal) minVal = value;
322
+ if (value > maxVal) maxVal = value;
323
+ flatData.push({ r, c, value, model: models[r], format: promptFormats[c] });
324
+ }
325
+ }
326
+
327
+ const colorScale = getColorScale(flatData.map(d => d.value), minVal, maxVal);
328
+
329
+ gCells.attr('transform', `translate(${gridOffsetX}, ${gridOffsetY})`);
330
+
331
+ // Add rounded corners only on the outer edges of the matrix using clipPath
332
+ const cornerRadius = 6;
333
+ defs.selectAll('#matrix-clip').remove();
334
+ const clipPath = defs.append('clipPath')
335
+ .attr('id', 'matrix-clip');
336
+ clipPath.append('rect')
337
+ .attr('x', 0)
338
+ .attr('y', 0)
339
+ .attr('width', gridWidth)
340
+ .attr('height', gridHeight)
341
+ .attr('rx', cornerRadius)
342
+ .attr('ry', cornerRadius);
343
+
344
+ gCells.attr('clip-path', 'url(#matrix-clip)');
345
+
346
+ const cells = gCells.selectAll('g.cell')
347
+ .data(flatData, d => `${d.r}-${d.c}`);
348
+
349
+ const cellsEnter = cells.enter()
350
+ .append('g')
351
+ .attr('class', 'cell');
352
+
353
+ cellsEnter.append('rect')
354
+ .attr('rx', 0)
355
+ .attr('ry', 0)
356
+ .on('mousemove', (event, d) => {
357
+ const [px, py] = d3.pointer(event, container);
358
+ tipInner.innerHTML = `<strong>${d.model}</strong><br/>${d.format}<br/>Score: ${d.value.toFixed(1)}`;
359
+ tip.style.transform = `translate(${px + 10}px, ${py + 10}px)`;
360
+ tip.style.opacity = '1';
361
+ })
362
+ .on('mouseleave', () => {
363
+ tip.style.opacity = '0';
364
+ });
365
+
366
+ cellsEnter.append('text')
367
+ .attr('class', 'cell-text')
368
+ .attr('text-anchor', 'middle')
369
+ .attr('dominant-baseline', 'middle');
370
+
371
+ const cellsMerged = cellsEnter.merge(cells);
372
+
373
+ cellsMerged.select('rect')
374
+ .attr('x', d => x(d.c))
375
+ .attr('y', d => y(d.r))
376
+ .attr('width', Math.max(1, x.bandwidth()))
377
+ .attr('height', Math.max(1, y.bandwidth()))
378
+ .attr('fill', d => colorScale(d.value));
379
+
380
+ cellsMerged.select('text')
381
+ .attr('x', d => x(d.c) + x.bandwidth() / 2)
382
+ .attr('y', d => y(d.r) + y.bandwidth() / 2)
383
+ .text(d => d.value.toFixed(1))
384
+ .style('fill', function(d) {
385
+ try {
386
+ const rect = this.parentNode.querySelector('rect');
387
+ const bg = rect ? getComputedStyle(rect).fill : colorScale(d.value);
388
+ return chooseReadableTextColor(bg);
389
+ } catch (_) {
390
+ return '#0e1116';
391
+ }
392
+ });
393
+
394
+ cells.exit().remove();
395
+
396
+ // Axes
397
+ gAxes.selectAll('*').remove();
398
+ gAxes.attr('transform', `translate(${gridOffsetX}, ${gridOffsetY})`);
399
+
400
+ // X-axis labels (prompt formats)
401
+ gAxes.append('g')
402
+ .selectAll('text')
403
+ .data(promptFormats)
404
+ .join('text')
405
+ .attr('class', 'axis-label')
406
+ .attr('text-anchor', 'middle')
407
+ .attr('x', (_, i) => x(i) + x.bandwidth() / 2)
408
+ .attr('y', gridHeight + 12)
409
+ .text((d, i) => String.fromCharCode(65 + i)); // A, B, C, D, E, F, G
410
+
411
+ // Y-axis labels (models)
412
+ gAxes.append('g')
413
+ .selectAll('text')
414
+ .data(models)
415
+ .join('text')
416
+ .attr('class', 'axis-label')
417
+ .attr('text-anchor', 'end')
418
+ .attr('x', -10)
419
+ .attr('y', (_, i) => y(i) + y.bandwidth() / 2)
420
+ .attr('dominant-baseline', 'middle')
421
+ .text(d => d);
422
+
423
+ // Update HTML legend
424
+ const legendContainer = container.querySelector('.legend-container');
425
+ legendContainer.innerHTML = '';
426
+
427
+ const legendTitle = document.createElement('div');
428
+ legendTitle.className = 'legend-title';
429
+ legendTitle.textContent = 'Prompt Formats:';
430
+ legendContainer.appendChild(legendTitle);
431
+
432
+ const legendGrid = document.createElement('div');
433
+ legendGrid.className = 'legend-grid';
434
+
435
+ // Column 1: A, B, C, D (first 4)
436
+ const column1 = document.createElement('div');
437
+ column1.className = 'legend-column';
438
+
439
+ // Column 2: E, F, G, H (last 4)
440
+ const column2 = document.createElement('div');
441
+ column2.className = 'legend-column';
442
+
443
+ promptFormats.forEach((format, i) => {
444
+ const item = document.createElement('div');
445
+ item.className = 'legend-item';
446
+
447
+ const label = document.createElement('span');
448
+ label.className = 'legend-label';
449
+ label.textContent = `${String.fromCharCode(65 + i)}.`;
450
+
451
+ const text = document.createElement('span');
452
+ text.className = 'legend-text';
453
+ text.textContent = format;
454
+
455
+ item.appendChild(label);
456
+ item.appendChild(text);
457
+
458
+ // First 4 go to column 1, rest go to column 2
459
+ if (i < 4) {
460
+ column1.appendChild(item);
461
+ } else {
462
+ column2.appendChild(item);
463
+ }
464
+ });
465
+
466
+ legendGrid.appendChild(column1);
467
+ legendGrid.appendChild(column2);
468
+
469
+ legendContainer.appendChild(legendGrid);
470
+ }
471
+
472
+ // Initial render + resize handling
473
+ render();
474
+ const rerender = () => render();
475
+ if (window.ResizeObserver) {
476
+ const ro = new ResizeObserver(() => rerender());
477
+ ro.observe(container);
478
+ } else {
479
+ window.addEventListener('resize', rerender);
480
+ }
481
+ };
482
+
483
+ if (document.readyState === 'loading') {
484
+ document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
485
+ } else {
486
+ ensureD3(bootstrap);
487
+ }
488
+ })();
489
+ </script>
app/src/content/embeds/d3-two-lines-chart.html CHANGED
@@ -572,8 +572,33 @@
572
  .attr('d', d => line(applySmoothing(d.values, smoothEnabled)));
573
 
574
  // Update axes
575
- gAxes.select('.x-axis').call(d3.axisBottom(newXScale).ticks(5).tickSizeOuter(0).tickFormat(formatStep));
576
- gAxes.select('.y-axis').call(d3.axisLeft(newYScale).ticks(5).tickSizeOuter(0).tickFormat(formatValue));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
 
578
  // Update baseline position
579
  if (baseline !== null) {
@@ -639,7 +664,26 @@
639
  // Create smart formatters
640
  const stepValues = allData.map(d => d.step);
641
  const metricValues = allData.map(d => d.value);
642
- formatStep = createSmartFormatter(stepValues);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643
  formatValue = createSmartFormatter(metricValues);
644
 
645
  // Update clip
@@ -660,10 +704,38 @@
660
 
661
  // Axes
662
  gAxes.selectAll('*').remove();
 
 
 
 
 
 
 
 
663
  gAxes.append('g').attr('class', 'x-axis').attr('transform', `translate(0,${innerHeight})`)
664
- .call(d3.axisBottom(xScale).ticks(5).tickSizeOuter(0).tickFormat(formatStep));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
  gAxes.append('g').attr('class', 'y-axis')
666
- .call(d3.axisLeft(yScale).ticks(5).tickSizeOuter(0).tickFormat(formatValue));
667
  gAxes.selectAll('.domain, .tick line').attr('stroke', 'var(--axis-color)');
668
  gAxes.selectAll('text').attr('fill', 'var(--tick-color)');
669
 
@@ -940,7 +1012,7 @@
940
  const mean = values.reduce((a, b) => a + b, 0) / values.length;
941
  allData.push({
942
  run: runName,
943
- step: parseFloat(tokenKey) / 1e9, // Convert to billions
944
  value: mean
945
  });
946
  });
 
572
  .attr('d', d => line(applySmoothing(d.values, smoothEnabled)));
573
 
574
  // Update axes
575
+ const newXTicks = newXScale.ticks(5);
576
+ const newXAxis = d3.axisBottom(newXScale)
577
+ .tickValues(newXTicks)
578
+ .tickSizeOuter(0)
579
+ .tickFormat(formatStep);
580
+ gAxes.select('.x-axis').call(newXAxis);
581
+
582
+ // Format Y axis to round appropriately based on value magnitude
583
+ const formatValueRounded = (v) => {
584
+ if (v === 0) return '0';
585
+ // Determine rounding precision based on value magnitude
586
+ if (v < 1) {
587
+ // For values < 1, round to nearest 0.1
588
+ return d3.format('.1f')(Math.round(v * 10) / 10);
589
+ } else if (v < 10) {
590
+ // For values 1-10, round to nearest 1
591
+ return d3.format('d')(Math.round(v));
592
+ } else if (v < 100) {
593
+ // For values 10-100, round to nearest 10
594
+ return d3.format('d')(Math.round(v / 10) * 10);
595
+ } else {
596
+ // For larger values, round to nearest 10
597
+ return d3.format('d')(Math.round(v / 10) * 10);
598
+ }
599
+ };
600
+
601
+ gAxes.select('.y-axis').call(d3.axisLeft(newYScale).ticks(5).tickSizeOuter(0).tickFormat(formatValueRounded));
602
 
603
  // Update baseline position
604
  if (baseline !== null) {
 
664
  // Create smart formatters
665
  const stepValues = allData.map(d => d.step);
666
  const metricValues = allData.map(d => d.value);
667
+
668
+ // For X axis (tokens already in billions), use a specific formatter
669
+ const stepMin = d3.min(stepValues);
670
+ const stepMax = d3.max(stepValues);
671
+
672
+ // Tokens are already in billions, format appropriately
673
+ formatStep = (v) => {
674
+ if (v === 0) return '0';
675
+ // Format with appropriate precision based on value
676
+ if (v < 0.01) return d3.format('.3f')(v);
677
+ if (v < 0.1) return d3.format('.2f')(v);
678
+ if (v < 1) return d3.format('.2f')(v);
679
+ if (v < 10) return d3.format('.1f')(v);
680
+ // For larger values, check if integer
681
+ if (Math.abs(v - Math.round(v)) < 0.01) {
682
+ return d3.format('d')(Math.round(v));
683
+ }
684
+ return d3.format('.1f')(v);
685
+ };
686
+
687
  formatValue = createSmartFormatter(metricValues);
688
 
689
  // Update clip
 
704
 
705
  // Axes
706
  gAxes.selectAll('*').remove();
707
+
708
+ // Generate explicit ticks for X axis
709
+ const xTicks = xScale.ticks(5);
710
+ const xAxis = d3.axisBottom(xScale)
711
+ .tickValues(xTicks)
712
+ .tickSizeOuter(0)
713
+ .tickFormat(formatStep);
714
+
715
  gAxes.append('g').attr('class', 'x-axis').attr('transform', `translate(0,${innerHeight})`)
716
+ .call(xAxis);
717
+
718
+ // Format Y axis to round appropriately based on value magnitude
719
+ const formatValueRounded = (v) => {
720
+ if (v === 0) return '0';
721
+ // Determine rounding precision based on value magnitude
722
+ if (v < 1) {
723
+ // For values < 1, round to nearest 0.1
724
+ return d3.format('.1f')(Math.round(v * 10) / 10);
725
+ } else if (v < 10) {
726
+ // For values 1-10, round to nearest 1
727
+ return d3.format('d')(Math.round(v));
728
+ } else if (v < 100) {
729
+ // For values 10-100, round to nearest 10
730
+ return d3.format('d')(Math.round(v / 10) * 10);
731
+ } else {
732
+ // For larger values, round to nearest 10
733
+ return d3.format('d')(Math.round(v / 10) * 10);
734
+ }
735
+ };
736
+
737
  gAxes.append('g').attr('class', 'y-axis')
738
+ .call(d3.axisLeft(yScale).ticks(5).tickSizeOuter(0).tickFormat(formatValueRounded));
739
  gAxes.selectAll('.domain, .tick line').attr('stroke', 'var(--axis-color)');
740
  gAxes.selectAll('text').attr('fill', 'var(--tick-color)');
741
 
 
1012
  const mean = values.reduce((a, b) => a + b, 0) / values.length;
1013
  allData.push({
1014
  run: runName,
1015
+ step: parseFloat(tokenKey), // Tokens are already in billions in CSV
1016
  value: mean
1017
  });
1018
  });