Clémentine committed on
Commit adc672a · 1 Parent(s): ad0a935
Files changed (22)
  1. app/src/content/article.mdx +79 -12
  2. app/src/content/chapters/2025-evaluations-for-useful-models.mdx +165 -0
  3. app/src/content/chapters/automated-benchmarks/basics.mdx +0 -21
  4. app/src/content/chapters/automated-benchmarks/designing-your-automatic-evaluation.mdx +34 -39
  5. app/src/content/chapters/automated-benchmarks/some-evaluation-datasets.mdx +11 -6
  6. app/src/content/chapters/automated-benchmarks/tips-and-tricks.mdx +24 -37
  7. app/src/content/chapters/general-knowledge/model-inference-and-evaluation.mdx +178 -25
  8. app/src/content/chapters/general-knowledge/tokenization.mdx +0 -76
  9. app/src/content/chapters/human-evaluation/basics.mdx +26 -2
  10. app/src/content/chapters/human-evaluation/tips-and-tricks.mdx +13 -0
  11. app/src/content/chapters/human-evaluation/using-human-annotators.mdx +19 -2
  12. app/src/content/chapters/intro.mdx +71 -57
  13. app/src/content/chapters/model-as-a-judge/basics.mdx +14 -0
  14. app/src/content/chapters/model-as-a-judge/designing-your-evaluation-prompt.mdx +45 -4
  15. app/src/content/chapters/model-as-a-judge/evaluating-your-evaluator.mdx +31 -3
  16. app/src/content/chapters/model-as-a-judge/getting-a-judge-llm.mdx +38 -2
  17. app/src/content/chapters/model-as-a-judge/tips-and-tricks.mdx +22 -3
  18. app/src/content/chapters/model-as-a-judge/what-about-reward-models.mdx +37 -4
  19. app/src/content/chapters/picking-your-evaluation.mdx +156 -85
  20. app/src/content/chapters/troubleshooting/troubleshooting-inference.mdx +29 -1
  21. app/src/content/chapters/troubleshooting/troubleshooting-reproducibility.mdx +43 -7
  22. app/src/content/embeds/d3-decision-tree.html +363 -0
app/src/content/article.mdx CHANGED
@@ -16,9 +16,14 @@ tags:
16
  tableOfContentsAutoCollapse: true
17
  ---
18
19
  import Intro from "./chapters/intro.mdx";
20
- import AutomatedBenchmarksBasics from "./chapters/automated-benchmarks/basics.mdx";
21
  import DesigningAutomaticEvaluation from "./chapters/automated-benchmarks/designing-your-automatic-evaluation.mdx";
 
 
22
  import AutomatedBenchmarksTips from "./chapters/automated-benchmarks/tips-and-tricks.mdx";
23
  import HumanEvaluationBasics from "./chapters/human-evaluation/basics.mdx";
24
  import UsingHumanAnnotators from "./chapters/human-evaluation/using-human-annotators.mdx";
@@ -32,28 +37,95 @@ import ModelAsJudgeTips from "./chapters/model-as-a-judge/tips-and-tricks.mdx";
32
  import TroubleshootingInference from "./chapters/troubleshooting/troubleshooting-inference.mdx";
33
  import TroubleshootingReproducibility from "./chapters/troubleshooting/troubleshooting-reproducibility.mdx";
34
  import ModelInferenceAndEvaluation from "./chapters/general-knowledge/model-inference-and-evaluation.mdx";
35
- import Tokenization from "./chapters/general-knowledge/tokenization.mdx";
36
 
37
- ## Introduction
 
 
38
  <Intro />
39
 
40
- ## Automated Benchmarks
41
- <AutomatedBenchmarksBasics />
42
 
43
  <DesigningAutomaticEvaluation />
44
 
45
 
46
  <AutomatedBenchmarksTips />
47
 
48
- ## Human Evaluations
49
  <HumanEvaluationBasics />
50
 
51
  <UsingHumanAnnotators />
52
 
53
  <HumanEvaluationTips />
54
 
55
- ## Model judges
56
 
 
57
  <ModelAsJudgeBasics />
58
 
59
  <GettingJudgeLLM />
@@ -68,10 +140,5 @@ import Tokenization from "./chapters/general-knowledge/tokenization.mdx";
68
 
69
  <TroubleshootingInference />
70
 
71
- <TroubleshootingReproducibility />
72
-
73
- ## Appendix
74
- <ModelInferenceAndEvaluation />
75
 
76
- <Tokenization />
77
 
 
16
  tableOfContentsAutoCollapse: true
17
  ---
18
 
19
+ import Note from "../components/Note.astro";
20
+ import Sidenote from "../components/Sidenote.astro";
21
+ import HtmlEmbed from "../components/HtmlEmbed.astro";
22
+
23
  import Intro from "./chapters/intro.mdx";
 
24
  import DesigningAutomaticEvaluation from "./chapters/automated-benchmarks/designing-your-automatic-evaluation.mdx";
25
+ import PickingYourEval from "./chapters/picking-your-evaluation.mdx";
26
+ import EvalsIn2025 from "./chapters/2025-evaluations-for-useful-models.mdx"
27
  import AutomatedBenchmarksTips from "./chapters/automated-benchmarks/tips-and-tricks.mdx";
28
  import HumanEvaluationBasics from "./chapters/human-evaluation/basics.mdx";
29
  import UsingHumanAnnotators from "./chapters/human-evaluation/using-human-annotators.mdx";
 
37
  import TroubleshootingInference from "./chapters/troubleshooting/troubleshooting-inference.mdx";
38
  import TroubleshootingReproducibility from "./chapters/troubleshooting/troubleshooting-reproducibility.mdx";
39
  import ModelInferenceAndEvaluation from "./chapters/general-knowledge/model-inference-and-evaluation.mdx";
 
40
 
41
+ https://x.com/sasuke___420/status/1984168256568226286
42
+
43
+
44
  <Intro />
45
 
46
+ ## LLM basics to understand evaluation
47
+
48
+ Now that you have an idea of why evaluation is important and how it's done, let's look at how we prompt models to get answers out of them for evaluation. You can most likely skim this section if you have already run evaluations.
49
+
50
+ <ModelInferenceAndEvaluation />
51
+
52
+ ## Doing evaluations with existing benchmarks
53
+
54
+ ### State of evaluations in 2025
55
+
56
+ <EvalsIn2025 />
57
+
58
+ ### Understanding what's in an eval
59
+
60
+ Ok, we made a list of benchmarks, now what? Well, now you need to check if these benchmarks are relevant for you and your specific use cases (unless you just want to compare your model to other models, in which case you can skim and go to the next section).
61
+
62
+ The first and most important step is, and always will be, to look at the data. You want to study the following.
63
+
64
+ #### Creation process
65
+ - **Who created the actual samples?**
66
+ Ideally, you want a dataset created by experts; the next tier is paid annotators, then crowdsourced, then synthetic, then MTurked data. You also want to look for a data card, where you'll find annotator demographics - this can be important to understand the dataset's language diversity or potential cultural bias.
67
+
68
+ - **Were they all examined by other annotators or by the authors?**
69
+ You want to know if the inter-annotator score on samples is high (= are annotators in agreement?) and/or if the full dataset has been examined by the authors.
70
+ This is especially important for datasets built with the help of underpaid annotators who are usually not native speakers of your target language (think AWS Mechanical Turk), as you might otherwise find typos, grammatical errors, or nonsensical answers.
71
+
72
+ - **Were the annotators provided with clear data creation guidelines?**
73
+ In other words, is your dataset consistent?
74
+
75
+ #### Samples
76
+ Take 50 random samples and manually inspect them; and I mean do it yourself, not "prompt an LLM to find unusual stuff in the data for you".
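If the benchmark is hosted on the Hugging Face Hub, pulling a random sample to read takes a few lines (a minimal sketch using the `datasets` library; swap in the benchmark you're actually vetting):

```python
from datasets import load_dataset

# Placeholder dataset/config/split: replace with the benchmark you are inspecting.
ds = load_dataset("cais/mmlu", "all", split="test")

# Draw 50 random samples and dump them for manual reading.
sample = ds.shuffle(seed=42).select(range(50))
for i, row in enumerate(sample):
    print(f"--- sample {i} ---")
    print(row)
```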
77
+
78
+ First, you want to check the content quality. Are the prompts clear and unambiguous? Are the answers correct? (*Eg: TriviaQA contains several gold answers (aliases field) per question, sometimes conflicting.*) Is information missing? (*Eg: MMLU misses reference schematics in a number of questions.*) Keep in mind that a dataset being a standard does not make it a good one - and such issues persist precisely because most people skip this step.
79
+
80
+ Then, you want to check for relevance to your task. Are these questions the kind of questions you want to evaluate an LLM on? Are these examples relevant to your use case?
81
+
82
+ You might also want to check sample consistency (especially if you're planning on using few-shot examples or computing aggregated statistics): do all samples have the same number of choices if it's a multiple choice evaluation? Is the spacing consistent before and after the prompt? If your evaluation comes with an additional environment, ideally you want to use it to understand tool calls.
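Some of these consistency checks can be scripted before the manual read, for instance to flag samples with a different number of choices or stray whitespace (a sketch assuming `question` and `choices` fields, which will differ per benchmark):

```python
from collections import Counter

def consistency_report(samples):
    """Flag obvious inconsistencies in a multiple-choice eval set."""
    n_choices = Counter(len(s["choices"]) for s in samples)
    odd_spacing = [
        i for i, s in enumerate(samples)
        if s["question"] != s["question"].strip()
    ]
    return {
        "choice_count_distribution": dict(n_choices),
        "samples_with_leading_or_trailing_spaces": odd_spacing,
    }

# Example usage: report = consistency_report(list(ds))
```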
83
+
84
+ Lastly, you also want to quickly check how many samples are present (to make sure results are statistically significant - 100 samples is usually a minimum for automatic benchmarks).
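To see why very small evaluation sets are noisy, a back-of-the-envelope standard error on an accuracy estimate is enough (using the usual binomial approximation):

```python
import math

def accuracy_std_error(accuracy: float, n_samples: int) -> float:
    """Standard error of an accuracy estimated on n_samples items."""
    return math.sqrt(accuracy * (1 - accuracy) / n_samples)

# With 100 samples and ~70% accuracy, the 95% confidence interval is roughly +/- 9 points;
# with 1000 samples it shrinks to roughly +/- 3 points.
print(1.96 * accuracy_std_error(0.7, 100))   # ~0.09
print(1.96 * accuracy_std_error(0.7, 1000))  # ~0.028
```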
85
+
86
+ TODO: ADD A VIEWER
87
+
88
+ #### Task and metrics
89
+
90
+ You want to check what metrics are used: are they automatic, functional, or using a model judge? The answer will change the cost of running evaluations for you, as well as the reproducibility and bias type.
91
+
92
+ The best (but rarest) metrics are functional or based on rule-based verifiers (though beware of pass/fail for coding models and code evaluations, as recent LLMs have become very good at overwriting globals to 'cheat' on such tests, especially in languages like Python where it's easy to mess up variable scope).
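To make "rule-based verifier" concrete, here is a minimal sketch that extracts the last number from a generation and compares it to the gold answer (real verifiers handle many more answer formats than this):

```python
import re

def extract_last_number(text: str) -> str | None:
    """Grab the last integer or decimal appearing in the generation."""
    matches = re.findall(r"-?\d+(?:\.\d+)?", text.replace(",", ""))
    return matches[-1] if matches else None

def rule_based_verify(generation: str, gold: str) -> bool:
    pred = extract_last_number(generation)
    return pred is not None and float(pred) == float(gold)

print(rule_based_verify("... so the answer is 42.", "42"))  # True
```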
93
+
94
+ ### Troubleshooting reproducibility
95
+
96
+ <TroubleshootingReproducibility />
97
+
98
+ ### Selecting good evaluations for ablations
99
+
100
+ <PickingYourEval />
101
+ For these ablations, it's good to focus on tasks that give good early signal and avoid noisy benchmarks. In [FineTasks](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fine-tasks) and [FineWeb2](https://arxiv.org/pdf/2506.20920), reliable evaluation tasks are defined by four key principles:
102
+
103
+ - **Monotonicity:** The benchmark scores should consistently improve as models train longer.
104
+ - **Low noise:** When we train models with the same setup but different random seeds, the benchmark scores shouldn't vary wildly.
105
+ - **Above-random performance:** Many capabilities only emerge later in training, so tasks that show random-level performance for extended periods aren't useful for ablations. This is the case, for example, for MMLU in multiple choice format as we will explain later.
106
+ - **Ranking consistency:** If one approach outperforms another at early stages, this ordering should remain stable as training continues.
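As a rough sketch of how you might check the first two properties on your own task, assuming you have per-checkpoint scores for several seeds (the score array and shapes below are illustrative, not taken from the papers):

```python
import numpy as np
from scipy.stats import spearmanr

# scores[seed, checkpoint]: accuracy of each seed at each training checkpoint.
scores = np.array([
    [0.25, 0.31, 0.36, 0.41, 0.45],
    [0.26, 0.30, 0.37, 0.40, 0.46],
    [0.24, 0.32, 0.35, 0.42, 0.44],
])

steps = np.arange(scores.shape[1])
# Monotonicity: rank correlation between training progress and score, averaged over seeds.
monotonicity = np.mean([spearmanr(steps, s)[0] for s in scores])
# Noise: spread across seeds at each checkpoint.
seed_noise = scores.std(axis=0).mean()

print(f"monotonicity (Spearman): {monotonicity:.2f}")  # close to 1.0 is good
print(f"seed noise (std): {seed_noise:.3f}")           # small vs. the score gaps you care about is good
```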
107
+
108
+
109
+
110
+ ## So you want to create your own evaluation
111
+
112
+ ### Automated Benchmarks
113
 
114
  <DesigningAutomaticEvaluation />
115
 
116
 
117
  <AutomatedBenchmarksTips />
118
 
119
+ ### Human Evaluations
120
  <HumanEvaluationBasics />
121
 
122
  <UsingHumanAnnotators />
123
 
124
  <HumanEvaluationTips />
125
 
126
+ ### Model judges
127
 
128
+ https://x.com/Kangwook_Lee/status/1993438649963164121
129
  <ModelAsJudgeBasics />
130
 
131
  <GettingJudgeLLM />
 
140
 
141
  <TroubleshootingInference />
142
 
 
 
 
 
143
 
 
144
 
app/src/content/chapters/2025-evaluations-for-useful-models.mdx ADDED
@@ -0,0 +1,165 @@
1
+ ---
2
+ title: "2025 evaluations"
3
+ ---
4
+
5
+ import Note from "../../components/Note.astro";
6
+ import Sidenote from "../../components/Sidenote.astro";
7
+
8
+ You can evaluate **specific capabilities** on their own - it's usually quite interesting to get signal while training, or when comparing base/pretrained models. (However, if you select and validate your training methods with the following evaluations, reporting them for the final model is slightly biased, as you have already oriented your training method towards good results on them.)
9
+
10
+ #### Reasoning and commonsense
11
+
12
+ Reasoning and commonsense datasets are often "historic" datasets, built in the age of BERT and embedding models, before the LLM craze. They were quite challenging at the time (especially because they were often adversarially built against the models of the time), but now they are 1) too easy and 2) contaminated/saturated, and should only be used for ablations or as pretraining evaluations. The bigger datasets also sometimes contain errors or low-quality questions, as they tend to have been built through Amazon Mechanical Turk in order to scale up fast and at low cost (what is now done by using LLMs to generate evaluation questions).
13
+
14
+ [ARC](https://arxiv.org/abs/1803.05457) (2018) (not to be confused with ARC-AGI) is a grade school science MCQA dataset built from human tests. The choices were selected adversarially against the word co-occurrence systems of the time. It has several subsets; the higher-quality `challenge` one is still in use today for pretraining. [WinoGrande](https://arxiv.org/pdf/1907.10641) (2019) is a crowdsourced (Mechanical Turk + validation) pronoun resolution/fill-in-the-blank dataset, using adversarial pairs of items to trick models. Both of these datasets remained quite hard for models until 2022-2023.
15
+
16
+ A number of historic datasets look specifically at reasoning requiring some sort of commonsense understanding and grounding. [HellaSwag](https://arxiv.org/abs/1905.07830) (2019) requires LLMs to select the correct next sentence from a list of adversarial choices, where the text comes from captions in ActivityNet and from tutorials in WikiHow. (It's the follow-up to a dataset called SWAG.) As most sentences come from tutorials or descriptions of activities, they often require physical commonsense grounding to solve. In the same vein, [CommonsenseQA](https://arxiv.org/abs/1811.00937) (2018) is a commonsense MCQA dataset built from ConceptNet - annotators write questions, then use conceptually close distractors as options. [PIQA](https://arxiv.org/abs/1911.11641) (2019) looks specifically at physical commonsense questions (created from examples from [Instructables.com](http://Instructables.com), again with adversarial choices from semantic perturbation or rewriting). [OpenBookQA](https://arxiv.org/abs/1809.02789) (2018) provides open book facts to help answer MCQA questions - however, these questions also require latent commonsense knowledge.
17
+
18
+ #### Knowledge
19
+ The main evaluation dataset for knowledge has been [MMLU](https://arxiv.org/abs/2009.03300) (2020). It reached saturation/contamination, and after more in-depth examination, a number of issues were identified: incomplete questions referring to absent documents, incorrect ground truths, ambiguous questions, and blatant US-centrism in the topics chosen. It was therefore cleaned in [MMLU-Redux](https://arxiv.org/abs/2406.04127) (2024), extended with more complex questions and more answers in [**MMLU-Pro**](https://arxiv.org/abs/2406.01574) (2024, the main replacement used by the community at the moment), and translated/annotated for cultural bias in [Global-MMLU](https://arxiv.org/abs/2412.03304) (2024). These are mostly used for pretraining evaluations and ablations.
20
+
21
+ For post training, people look at harder, high-quality knowledge datasets. [**GPQA**](https://arxiv.org/abs/2311.12022) (2023) contains custom PhD-level questions in biology/chemistry/physics, made to be answerable by PhD students in the relevant domain and not otherwise. The most used subset is the `diamond` one, but since its publication in 2023 it has also started showing signs of contamination.
22
+
23
+ Last but not least, the pompously named but very high quality [**Humanity's Last Exam**](https://agi.safe.ai/) (2025) contains 2.5K questions crowdsourced from experts in their fields, across domains. It is mostly private, and questions require both complex knowledge and reasoning. It has not been broken yet, and it's imo a cool dataset. The only issue is that since there is no way to get a model scored quickly, people now evaluate against it by using an LLM judge to assess their answers instead of checking against the ground truth, so it's one of those evaluations where the results you find in the wild are not really comparable.
24
+
25
+ However, though testing models for the raw quality of their latent knowledge made a lot of sense a couple of years back (and is still interesting during training to test model quality, with evals like MMLU-Pro during pretraining and GPQA/HLE for post training), I think we will slowly phase out of such benchmarks in the coming years, for two reasons.
26
+
27
+ 1. They are becoming more and more indecipherable for humans: questions are becoming so complex that it's almost impossible for non-experts to understand what performance on each question means (or to make sure the datasets themselves do not contain mistakes).
28
+ 2. Now that our models are connected to tools, such as internet access, latent knowledge evaluations are increasingly becoming web search and retrieval evaluations, so they make less sense as such. In short, we're moving from closed book to open book evaluations. As a comparison, in the French school system, you get closed-book examinations in high school, but as you enter university, it's often assumed that you will have access to databases and the internet, and scoring becomes less about what you learnt by heart and more about how you reason given free access to information. I believe we will see the same shift in LLM evaluation as model capabilities increase.
29
+
30
+ #### Math
31
+ Math evaluation datasets have been used as proxies for reasoning and logic benchmarking, independently of, obviously, also checking if models can solve math problems.
32
+
33
+ The two reference math evaluation datasets were [GSM8K](https://arxiv.org/abs/2110.14168) (2021), containing grade school math problems, and [MATH](https://arxiv.org/abs/2103.03874) (2021), an aggregation of Olympiad problems present on the web; both reached saturation/contamination in the last years. The former was extended by [GSM1K](https://arxiv.org/abs/2405.00332) (2024), a recreation with 1K new problems to test which models were contaminated on the original, [GSM-Plus](https://arxiv.org/pdf/2402.19255), a rewriting of the problems with adversarial changes (distractors, numerical variations, and so forth), and [GSM-Symbolic](https://arxiv.org/abs/2410.05229) (2024), less used, but a very interesting rewriting of GSM8K as problem templates to prevent contamination: problems can be regenerated ad infinitum.
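The templated-problem idea is easy to reproduce for your own evals: sample the surface details and the numbers, and derive the gold answer from the same variables (a toy sketch, not the actual GSM-Symbolic templates):

```python
import random

NAMES = ["Ava", "Karim", "Mei", "Tom"]
ITEMS = ["apples", "pens", "stickers", "marbles"]

def generate_problem(rng: random.Random):
    """Generate one templated word problem and its gold answer."""
    name, item = rng.choice(NAMES), rng.choice(ITEMS)
    start, bought, friends = rng.randint(10, 50), rng.randint(5, 20), rng.randint(2, 6)
    question = (
        f"{name} has {start} {item}, buys {bought} more, "
        f"then shares them equally among {friends} friends. "
        f"How many {item} does each friend get (ignoring leftovers)?"
    )
    gold = (start + bought) // friends  # answer computed from the same variables
    return question, gold

print(generate_problem(random.Random(0)))
```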
34
+
35
+ The community has now been focusing on:
36
+ - The follow-ups to MATH, either [**MATH-500**](https://huggingface.co/datasets/HuggingFaceH4/MATH-500) (a representative subset of 500 problems sampled to avoid overfitting) or MATH-Hard (only the 500 hardest questions)
37
+ - **AIME** ([24](https://huggingface.co/datasets/HuggingFaceH4/aime_2024), [25](https://huggingface.co/datasets/math-ai/aime25)), American olympiad datasets for high schoolers, taken as is at publication. These datasets are interesting because, since they are made of problems renewed every year at equivalent difficulty, they allow testing for contamination by comparing results at publication with results on the previous year's dataset
38
+ - [**Math-Arena**](https://matharena.ai/), an up-to-date compilation of competitions and olympiads, refreshed regularly (it contains AIME25, but a lot of other competitions too!)
39
+
40
+ Most of these datasets are actually no longer "that hard", since they stop at grade school level (even though GSM-Symbolic allows generating problems with more recursion levels, making them synthetically harder). On the other end of the spectrum, [FrontierMath](https://arxiv.org/abs/2411.04872) (2024) was an attempt at providing considerably harder math problems, written individually by mathematicians for the occasion. The dataset was theoretically private (but it turned out OpenAI had access to parts of it - such a shame). [Humanity's Last Exam](https://agi.safe.ai/) (2025) (introduced in the knowledge section) also contains interesting "made for the occasion" math problems requiring complex reasoning (notably some theorem proving).
41
+
42
+ I would personally use AIME25 and MATH-500 for pretraining evaluations, and the Math-Arena for post training.
43
+
44
+ #### Code
45
+ Since agents need to interact with tools, they need coding abilities, either to call tools directly if they are code agents, or to understand how to debug tool output in case of problems (for both code and JSON agents, see the difference [here](https://huggingface.co/learn/agents-course/en/unit2/smolagents/tool_calling_agents)). Coding evaluation sets are also good proxies for reasoning.
46
+
47
+ Historically, in 2021, the code evaluation sets were [MBPP](https://arxiv.org/abs/2108.07732), 1K crowdsourced Python-only entry-level programming problems, [APPS](https://arxiv.org/abs/2105.09938), 10K code generation problems curated from programming interviews and sharing websites, and [HumanEval](https://arxiv.org/abs/2107.03374), introduced with the Codex model, which, contrary to the previous ones, is made of problems written specifically for the release - which was super neat at the time! It also came with a sandbox to avoid problematic code execution on the evaluator's machine. (The last thing this paper introduced was an unbiased estimator for `pass@k`, which before that was computed by literally checking whether a success occurred among k samples out of n.)
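For reference, the unbiased `pass@k` estimator from the HumanEval paper: given n sampled generations of which c pass the tests, the probability that at least one of k samples passes is estimated as 1 - C(n-c, k) / C(n, k). A direct sketch:

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased estimator of pass@k with n samples, c of which are correct."""
    if n - c < k:
        # Any subset of size k necessarily contains a correct sample.
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

print(pass_at_k(n=20, c=3, k=1))   # 0.15
print(pass_at_k(n=20, c=3, k=10))  # much higher
```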
48
+
49
+
50
+ The [EvalPlus](https://openreview.net/pdf?id=1qvx610Cu7) (2023) team made HumanEval+ and MBPP+, extensions of HumanEval and MBPP, by adding more test cases and inputs and fixing bugs in the original datasets. [EvoEval](https://arxiv.org/abs/2403.19114) (2024) also introduced a variation on HumanEval by semantically rewriting the problems and adding difficulty labels.
51
+
52
+ For final models, you might want harder or uncontaminated problems.
53
+
54
+ [**LiveCodeBench**](https://arxiv.org/abs/2403.07974) (2024) follows a similar "grabbing from leetcode websites" approach, but is very interesting because it stores each problem's date, to compare model performance on problems created before and after a model finished training. This was an excellent contamination-free benchmark, and I'm looking forward to an update!
55
+
56
+ [**AiderBench**](https://aider.chat/docs/leaderboards/) (online since the end of 2024, I think?) also uses data from existing coding websites (Exercism, to be specific), but goes beyond problem solving by specifically testing code editing and refactoring.
57
+
58
+ For post training, you want more holistic evaluations, and a couple of benchmarks moved beyond standalone problems, which do not evaluate complex coding abilities. [RepoBench](https://arxiv.org/abs/2306.03091) (2023) tests repository-level auto-completion systems in Python or Java, using code from GitHub as source. It was built by masking random lines in code bases and asking for completions, either a cross-file or in-file function, and defines several test levels (retrieval, completion, and a combination).
59
+
60
+ [**SweBench**](https://openreview.net/pdf?id=VTF8yNQM66) (2024) is a better-known and more complete version of this, also using GitHub, but this time testing whether models can solve existing issues, so logic understanding, cross-file editing and execution, long context reasoning, etc.
61
+
62
+ At this time, I would recommend following LiveCodeBench, AiderBench and the higher-quality subset of SWE-Bench (SWE-Bench Verified), and reading the [METR report](https://metr.org/blog/2025-07-10-early-2025-ai-experienced-os-dev-study/) on the actual usefulness of code assistants.
63
+
64
+ #### Long context
65
+ To correctly interact with users over a long discussion, without losing track, you need good long context management. (Funny to think that 3 years ago, maximum context lengths for models were 2048 tokens, whereas now we're largely at 128K and beyond.)
66
+
67
+ The evaluation which started testing this in 2023 is probably [NIAH](https://github.com/gkamradt/LLMTest_NeedleInAHaystack) (Needle in a Haystack), where you place a random fact in a long unrelated text and ask the model to retrieve it. It provides a neat framework to evaluate where in the context a model is most likely to forget things, and from which context length. In 2023 models were really bad at it; in 2025 it's close to solved.
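The core mechanic is simple to sketch: insert a "needle" fact at a controlled depth of a long filler context, then check whether the model's answer contains it (the filler text, needle and question below are placeholders):

```python
def build_niah_prompt(haystack: str, needle: str, depth: float, question: str) -> str:
    """Insert the needle at a relative depth (0.0 = start, 1.0 = end) of the haystack."""
    cut = int(len(haystack) * depth)
    context = haystack[:cut] + "\n" + needle + "\n" + haystack[cut:]
    return f"{context}\n\nQuestion: {question}\nAnswer:"

needle = "The secret ingredient of the cake is cardamom."
prompt = build_niah_prompt(
    haystack="lorem ipsum " * 5000,
    needle=needle,
    depth=0.35,
    question="What is the secret ingredient of the cake?",
)
# Scoring: 1 if "cardamom" appears in the model's answer, else 0.
```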
68
+
69
+ More complex long context extensions have emerged since. [RULER](https://arxiv.org/pdf/2404.06654) (2024) adds multi-hop tracing (requiring the model to follow chains of variables to get the correct value), word frequency changes, and a QA variation of NIAH. It's also close to solved now. [Michelangelo](https://arxiv.org/pdf/2409.12640v2) (2024, also sometimes called MRCR for multi-round co-reference) also uses synthetic long context data: tasks (of varying length) test whether models can precisely reproduce unique portions of the context (as well as identify whether relevant information is present) and understand sequences of modifications to a text. It was then extended in the [OpenAI MRCR](https://huggingface.co/datasets/openai/mrcr) (2025). [InfinityBench](https://arxiv.org/abs/2402.13718) (2024) is multilingual (En and Zh), and provides 100K-token synthetic tasks across a variety of objectives (QA, retrieval as in NIAH, computations over very long context, ...). InfinityBench still provides some signal.
70
+
71
+ [**HELMET**](https://arxiv.org/abs/2410.02694) (2024) combines tasks and existing benchmarks to get a big single dataset with more signal: RAG and QA datasets (Natural Questions, TriviaQA, PopQA, HotpotQA, NarrativeQA and InfinityBench), recall (RULER and JSONKV), generation with citation (subsets of ALCE), summarisation, reranking passages (MS MARCO), in context learning (TREC, NLU, Banking77, CLINIC150). Benchmark aggregations are exhaustive but present the risk of measuring things twice: don't test your model against both HELMET and InfinityBench and then aggregate the results, for example, as you would be running the same evaluation twice! In 2025, it still has enough discriminative power to compare models.
72
+
73
+ My favorite long context evaluation ideas are the [Novel Challenge](https://arxiv.org/abs/2406.16264) (2024), 1K true/false claims about fictional books published in the last year (written by readers of said books!) that require having read and understood the full text to answer properly, and the [**Kalamang translation dataset**](https://arxiv.org/abs/2309.16575) (2024), where models need to properly translate from English to Kalamang from reading a grammar book (Kalamang is such a low resource language that it has no online presence - only 200 speakers). The Kalamang translation set could notably be expanded to other low resource languages (and it would be cool to add a rule-based grammar checker to test generation validity and get a strict accuracy instead of relying on BLEU...).
74
+
75
+ #### Instruction Following
76
+ The two main instruction following datasets are [**IFEval**](https://arxiv.org/abs/2311.07911) (2023) and its extension [**IFBench**](https://arxiv.org/abs/2507.02833) (2025). IFEval is one of the smartest evaluation ideas of the last few years, in my opinion: models are asked to follow formatting instructions (about keywords, punctuation, number of words/sentences, file type formatting such as markdown or html, etc). Each of these conditions can be checked with a specific parsing test: this makes it one of the rare free-form generative evaluations where you can get a strict score without relying on a model judge.
77
+
78
+ More generally, it falls into the functional correctness/unit test evaluation type, which is my personal favorite way to evaluate models. It's also very easy to regenerate or extend to prevent contamination.
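To make this concrete, here are two IFEval-style verifiers written as plain parsing checks (the specific instructions are invented for the example; the real benchmark ships many more):

```python
def check_bullet_count(response: str, expected: int) -> bool:
    """Instruction: 'Answer with exactly N bullet points.'"""
    bullets = [line for line in response.splitlines()
               if line.lstrip().startswith(("-", "*"))]
    return len(bullets) == expected

def check_all_lowercase(response: str) -> bool:
    """Instruction: 'Your entire response must be in lowercase.'"""
    return response == response.lower()

response = "- first point\n- second point\n- third point"
print(check_bullet_count(response, 3), check_all_lowercase(response))  # True True
```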
79
+
80
+ Side note, but some benchmarks also test "non instruction following" (non-compliance): [CoCoNot](https://www.arxiv.org/pdf/2407.12043) (2024) notably tests whether models will or won't comply with incomplete (underspecified/unclear), unanswerable (due to lack of information or AI-humanizing requests, which often trigger hallucinations), or unsafe requests. It combined manually written queries with model-written non-compliant requests, then filtered them to create an eval set presented as a classification problem.
81
+
82
+ #### Tool-calling
83
+ The emergence of tools is one of the features which started moving LLMs into the agentic realm.
84
+
85
+ [**TauBench**](https://arxiv.org/pdf/2406.12045) (2024) evaluates a model on its ability to answer a user's query in the retail and airline domains (order/book/look for products/etc). The database mimics real domain data with synthetic samples, and the model is considered correct when 1) its actions updated the database correctly and 2) it answered the user appropriately. To make this benchmark automatic, the user is mocked up by an LLM, which makes the evaluation quite costly to run and prone to errors. Despite these limitations, it's widely used, notably because it reflects real use cases well.
86
+
87
+ [ToolBench](https://arxiv.org/pdf/2305.16504) (2023) requires calling APIs (OpenWeather, Cat, HomeSearch, TripBooking, GoogleSheets, WebShop, Tabletop, etc) to solve 100 test cases across datasets, each requiring between one and ten tool calls to solve. Some of these APIs are mock-ups and some are real, which makes the dataset susceptible to accidental failures. It was therefore fixed and extended in [StableToolBench](https://arxiv.org/pdf/2403.07714) (2025), which introduces a general VirtualAPIServer mocking everything up to ensure evaluation stability; however, it relies on an LLM judge for evaluation, introducing another layer of bias.
88
+
89
+ [**BFCL**](https://openreview.net/pdf?id=2GmDdhBdDk) (2025, though the benchmark has actually existed for a couple of years) evolved considerably over the years, and in its current version contains 4 subsets: single turn (simple tool calls), crowdsourced real life function calls from users, multi-turn conversations (to test accuracy in long context and query answering with tool calls) and agentic (web search, memory, SQL data interaction). It uses a combination of abstract syntax trees, execution responses and state matching (is the final state the expected one?) to evaluate whether calls are correct. People focus on v3 to test tool calling specifically, while v4 tests web and search tool use.
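As a minimal illustration of the call-matching idea (BFCL's actual checkers use AST matching, execution and state comparison; the function name and arguments below are made up for the example):

```python
import json

def tool_call_matches(model_output: str, expected: dict) -> bool:
    """Compare an emitted tool call (JSON) to the expected call by name and arguments."""
    try:
        call = json.loads(model_output)
    except json.JSONDecodeError:
        return False
    return (call.get("name") == expected["name"]
            and call.get("arguments") == expected["arguments"])

expected = {"name": "get_weather", "arguments": {"city": "Paris", "unit": "celsius"}}
output = '{"name": "get_weather", "arguments": {"city": "Paris", "unit": "celsius"}}'
print(tool_call_matches(output, expected))  # True
```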
90
+
91
+ Lastly, with the creation of MCPs, some benchmarks arose to test MCP-oriented tool calling - however, they mostly rely on model judges and use real world APIs, which can introduce failure cases and a lack of reproducibility due to network issues (the added load for website owners doesn't seem to be much of an issue, as the user base of most covered MCP servers is big enough).
92
+
93
+ [MCPBench](https://arxiv.org/abs/2508.20453) (2025) connects LLMs to live, real world MCP servers (Wikipedia, HF, Reddit, Steam, arxiv, ...) with synthetically created tasks requiring multiple turns to solve. The evaluation combines rule-based checks on tool call validity and success with an LLM judge to assess whether queries were properly answered.
94
+
95
+ [**MCP-Universe**](https://arxiv.org/abs/2508.14704) (2025) uses 11 MCP servers across varied real world topics (IRL navigation, 3D design, web search, etc). What's cool in this one is that evaluation relies on several strict evaluators, one for format correctness and two for answer correctness: as tasks can be static (asking things that do not change) or dynamic (GitHub stars on a repo, weather, ...), in the latter case answer correctness uses a task-dependent, execution-based evaluation framework which automatically grabs the latest correct answer from the relevant source and compares the model output to it. This is way neater than relying on an LLM judge!
96
+
97
+ [**LiveMCPBench**](https://arxiv.org/abs/2508.01780) (2025) provides a large, locally deployable collection of MCP servers to test how good models are at discriminating between tools to accomplish tasks. The best models are already reaching 80% - so we're close to saturation. However, testing whether models can select the proper tools from very long lists is a good use case which will become increasingly important as the web goes MCP.
98
+
99
+ (By the way, here's a cool [doc](https://www.anthropic.com/engineering/writing-tools-for-agents) on how to write good tools.)
100
+
101
+ While testing individual capabilities provides valuable signal, real-world assistant performance comes from how these capabilities combine. A model might excel at reasoning but fail when that reasoning must be integrated with tool calling and long context management simultaneously, so we need evaluations requiring the orchestration of multiple capabilities together.
102
+
103
+ #### Assistant tasks
104
+ I believe that **assistant tasks** are going to be one of the main ways to do next-level evaluations: solving them requires a combination of many capabilities (long context, reasoning, tool calling, ...), while the benchmarks themselves provide insight into domain-specific performance in a useful real world setup. They also tend to be more understandable (by the general public) than specific capability benchmarks. If the benchmarks are general enough, they do not check which precise tools were used, but instead whether the end result is correct, as complex tasks allow several paths to success.
105
+
106
+ **Real life information retrieval**
107
+
108
+ [**GAIA**](https://arxiv.org/abs/2311.12983) (2023) kickstarted modern agentic evaluation by requiring models to use a combination of tools, reasoning and retrieval to solve real life queries (sometimes including documents). Questions are split into 3 levels, the first now saturated and the third still hard for models. It's also one of those benchmarks where the numbers you find will be spread across evaluation methods, because people either report on the public validation set or use LLM judges to evaluate against the private test set (even though there is a public leaderboard [here](https://huggingface.co/spaces/gaia-benchmark/leaderboard)).
109
+
110
+ It was later replicated in [BrowseComp](https://cdn.openai.com/pdf/5e10f4ab-d6f7-442e-9508-59515c65e35d/browsecomp.pdf) (2025), which tests the same thing (can a model find the adequate answer to a specific query using tools and online information) but does not guarantee uniqueness of the result, as questions were constructed by starting from the result and building a question from it, with varying levels of difficulty: starting from a specific paper to retrieve, for example, a question would be created by combining metadata, such as "which paper about Topic was published at Conference with one Nationality author and two people from Entity?". However, the benchmark is probably also harder at the moment.
111
+
112
+ Lastly, [GAIA2](https://huggingface.co/blog/gaia2) went beyond simple information retrieval, using a mock-up mobile environment to test whether assistants can correctly answer queries relying on chains of events and tool calls. As of now, the time-sensitive and deliberately noisy subsets (mocking up failing API calls) are the hardest for models, while search and execution seem extremely easy for SOTA models.
113
+
114
+ **Science assistants**
115
+
116
+ [SciCode](https://arxiv.org/abs/2407.13168) (2024) tests whether models can solve real life scientific problems by writing appropriate scientific code, across STEM fields (from biology to math/chem/...). Problems are drawn from real life workflows, and each core issue is decomposed into easier subproblems. For the first version, evaluation was done by scientists and a model judge - models were quite bad at it at publication (scores below 5%), but I'm unsure where up-to-date results can be found.
117
+
118
+ [PaperBench](https://arxiv.org/abs/2504.01848) (2025) similarly tests whether models can replicate ML research, but this time with a harder setup: given high quality ICML papers, models must reconstruct the matching code base (8K individually graded tasks have been contributed by the authors of said papers, grouped as rubric trees with weights for the final grades). The benchmark is evaluated with an LLM judge (though I suspect some of it could be done automatically by constraining the shape of the requested code a bit).
119
+
120
+ [DSBench](https://arxiv.org/pdf/2409.07703) (2025) is a multimodal data analysis benchmark using Kaggle and ModelOff (financial data) samples. From the examples in the appendix, it seems that questions from ModelOff are provided in a multiple choice setup, which likely makes the task easier, whereas the Kaggle tasks each have their own metric.
121
+
122
+ [**DABStep**](https://arxiv.org/abs/2506.23719) (2025) evaluates models on previously private (therefore uncontaminated) operational data analysis workloads using real life questions and data. All problems require multi-step reasoning and varied document parsing, as well of course as specific data manipulation skills. It's a neat eval because it's hard and replicates actually useful real world use cases, and because each problem has a ground truth, so evaluation is unbiased and not too costly.
123
+
124
+ Assistant tasks test integrated capabilities in realistic scenarios, but they're either dynamic and read-only, or static in an environment which doesn't change. To evaluate adaptability and dynamic decision-making, we need environments that can "surprise" the model.
125
+
126
+ #### Game based evaluations
127
+ **Game-based** benchmarks are very interesting for several reasons: they usually evaluate adaptability to a changing environment (contrary to most assistant tasks, which are static), require long context reasoning, and, last but not least, are **understandable** by most people. However, they are not grounded in real life, nor do they necessarily reflect good performance on actually useful use cases.
128
+
129
+ The most famous formal evaluation among these is probably [ARC-AGI](https://arcprize.org/arc-agi). The first version (2019) was made of sequences of puzzle grids, where models had to find the last item of the sequence without explicit rules being provided. This benchmark is to me very reminiscent of logic-oriented IQ tests, and it was almost solved in 2024. A similar benchmark (extrapolation of rules) is [Baba is AI](https://arxiv.org/abs/2407.13729) (2024). The latest version, ARC-AGI3 (2025, ongoing), is still in development, and contains entirely new games (requiring exploration, complex planning, memory management, ...) made specifically for the benchmark. Current best solutions on the available problems brute-force the games.
130
+
131
+ The community and model providers have explored a number of existing games with LLMs. Single player adventure games/RPGs like [TextQuests](https://huggingface.co/blog/textquests) (2025) or [Pokemon](https://github.com/benchflow-ai/benchflow/tree/main/libs/pokemon-gym) (2024) (on Twitch for [Claude](https://www.twitch.tv/claudeplayspokemon) and [Gemini](https://www.twitch.tv/gemini_plays_pokemon), for example) require very long range planning to reach objectives, which in turn requires adequate long context memory management, reasoning, and backtracking abilities. The same abilities are needed for single player survival games like [Crafter](https://arxiv.org/abs/2109.06780) (2021, Minecraft-inspired). A number of single player game environments have been integrated into the [Balrog](https://arxiv.org/pdf/2411.13543) (2024) benchmark.
132
+
133
+ Competitive bluffing games like [Poker](https://arxiv.org/html/2501.08328v1) (2025) or Mafia variations like [Town of Salem](https://github.com/summersonnn/Town-Of-Salem-with-LLMs) (2025) and Werewolf (2025, [here](https://arxiv.org/abs/2407.13943)/[there](https://werewolf.foaster.ai/)) are very interesting for testing logic and reasoning, as well as deception abilities. Claude Opus 4, for example, is incapable of winning Town of Salem as a vampire (a deceptive role) but does well as a peasant (a non-deceptive role). Cooperative games like Hanabi can also be used to test adaptability and communication ability in a constrained environment.
134
+
135
+ What's also very neat about these is that they have a single and unambiguous pass/fail metric: did the LLM win the game or not? At the moment, if I were to use these to evaluate models I would probably look at TextQuests for abilities and Town of Salem for safety.
136
+
137
+ Beyond testing capabilities in controlled environments, there's one type of evaluation that's inherently impossible to game: predicting the future. (Ok it's a tangent but I find these super fun and they could be relevant!)
138
+
139
+ #### Forecasters
140
+ In the last year, a new category of impossible-to-contaminate tasks emerged: forecasting. (I guess technically forecasting on the stock markets can be cheated on through some manipulation, but hopefully we're not there yet in terms of financial incentives to mess up evals.) These tasks should require reasoning across sources to answer questions about events that have not yet occurred, but it's uncertain that these benchmarks are discriminative enough to have strong value, and they likely reinforce the "slot machine success" vibe of LLMs. (Is the performance on some events close to random because they are impossible to predict or because models are bad at it? In the other direction, if models are able to predict an event correctly, is the question too easy or too formulaic?)
141
+
142
+ [FutureBench](https://huggingface.co/blog/futurebench) tests whether models can predict future newsworthy events. It uses 2 sources: an LLM browsing and generating questions with a weekly time horizon, and user predictions from betting markets. All data is heavily filtered and cleaned before use. For now, models are barely better than random on human-created bets, and succeed three quarters of the time on model-generated questions (likely easier).
143
+
144
+ [FutureX](https://arxiv.org/abs/2508.11987) is similar, but uses an array of specific websites (prediction markets, government websites, general ranking websites and real time data platforms), then uses templates to generate questions about potential future events (`when will STOCK reach POINT?`). 500 questions are generated daily, with filtering of accidentally irrelevant questions.
145
+
146
+ A similar approach is used to generate questions in [Arbitrage](https://arxiv.org/pdf/2412.18544), the core difference being the time horizon: events there should be resolved in 2028.
147
+
148
+ In a similar vein, you'll also find arenas where LLMs are provided with money to actively trade on financial markets - these experiments are less likely to give meaningful results because, due to their cost, they tend to be run only once per model, so you get no statistical significance there.
149
+
150
+ <Note title="TLDR" emoji="🎯">
151
+ The landscape of evaluation has evolved with the jumps in capabilities, from testing isolated skills to measuring integrated performance in more realistic scenarios.
152
+
153
+ As of Nov 2025, I recommend using:
154
+
155
+ - **Core capabilities** (for model builders): old capability evals for training; for post training, MATH500/AIME24, GPQA, IFEval, SWE-Bench, a long range eval of your choice like HELMET, and TauBench or BFCL if you're targeting tool use
156
+ - **Core capabilities** (for comparing models at inference): IFBench, HLE, MathArena, AiderBench and LiveCodeBench, MCP-Universe
157
+ - **Long horizon tasks** (for real-world performance): GAIA, DABStep, SciCode, or domain specific evaluations for your use cases
158
+ - **Games** (for some extra fun in measuring robustness and adaptability): ARC-AGI3 when it's out, TextQuests, Town of Salem if you're interested in safety, or any other game you like which goes beyond Poker/Chess/Go.
159
+
160
+ The field is moving toward evaluations that test capability orchestration rather than isolated skills, for actual use. This matches our goal of building models that "work well": systems that can reliably combine core capabilities and tool use with good orchestration to solve actual problems.
161
+
162
+ <Sidenote>
163
+ I hope the field moves towards putting more emphasis on functional testing rather than model judges, and generally understandable datasets and tasks.
164
+ </Sidenote>
165
+ </Note>
app/src/content/chapters/automated-benchmarks/basics.mdx DELETED
@@ -1,21 +0,0 @@
1
- ---
2
- title: "Automated Benchmarks: Basics"
3
- ---
4
-
5
- Automated benchmarks usually works the following way: you'd like to know how well your model performs on something. This something can be a well-defined concrete **task**, such as `How well can my model classify spam from non spam emails?`, or a more abstract and general **capability**, such as `How good is my model at math?`.
6
-
7
- From this, you construct an evaluation, using:
8
- - a **dataset**, made of **samples**.
9
- - These samples contain an input for the model, sometimes coupled with a reference (called gold) to compare the model's output with.
10
- - Samples are usually designed to try to emulate what you want to test the model on: for example, if you are looking at email classification, you create a dataset of spam and non spam emails, try to include some hard edge cases, etc.
11
- - a **metric**.
12
- - The metric is a way to score your model.
13
- Example: how accurately can your model classify spam (score of well classified sample = 1, badly classified = 0).
14
- - Metrics use your model's outputs to do this scoring. In the case of LLMs, people mostly consider two kind of outputs:
15
- - the text generated by the model following the input (*generative evaluation*)
16
- - the log-probability of one or several sequences provided to the model (*multiple-choice evaluations*, sometimes called MCQA, or *perplexity evaluations*)
17
- - For more info on this, you should check out the [Model inference and evaluation](https://github.com/huggingface/evaluation-guidebook/blob/main/contents/general-knowledge/model-inference-and-evaluation.md) page.
18
-
19
- This is more interesting to do on data that the model has never been exposed to before (data absent from the model training set), because you want to test if it **generalizes** well. For example, if it can classify spam emails about 'health' products after having seen only spam emails about fake banks.
20
-
21
- Note: *A model which can only predict well on its training data (and has not latently learnt more high-level general patterns) is said to be **overfitting**. Similarly to a student who learned test questions by heart without understanding the topic, evaluating LLMs on data that was already present in their training set is scoring them on capabilities they do not possess.*
app/src/content/chapters/automated-benchmarks/designing-your-automatic-evaluation.mdx CHANGED
@@ -2,48 +2,24 @@
2
  title: "Designing your automatic evaluation"
3
  ---
4
 
 
 
 
5
  ### Designing your automatic evaluation
6
 
7
 
8
  #### Selecting or creating a dataset
9
  For your evaluation, you can either select an existing dataset or design your own. Through this process, it's very important to keep in mind that **your evaluation result will only be as good as your evaluation dataset**.
10
 
11
- ##### Inspecting an existing dataset.
12
-
13
- You want to study the following.
14
-
15
- 1. Creation process
16
- - **Who created the actual samples?**
17
- Imo, expert created dataset > paid annotator dataset ~ crowdsourced dataset > MTurked dataset.
18
- You also want to look for a data card, where you'll find annotator demographics - this can be important to understand the dataset language diversity.
19
-
20
- - **Were they all examined by other annotators or by the authors?**
21
- You want to know:
22
- - if the inter-annotator score on samples is high (= are annotators in agreement?)
23
- - and/or if the full dataset has been examined by the authors.
24
- This is especially important for datasets with the help of underpaid annotators who usually are not native speakers of your target language (think AWS Mechanical Turk), as you might otherwise find typos/grammatical errors/nonsensical answers.
25
-
26
- - **Were the annotators provided with clear data creation guidelines?**
27
- In other words, is your dataset consistent?
28
-
29
- 2. Samples
30
- Take 50 random samples and manually inspect them:
31
- - *For quality*:
32
- - are the prompts clear and unambiguous?
33
- - are the answers correct? (*Eg: TriviaQA contains several gold answers (aliases field) per question, sometimes conflicting.*)
34
- - is information missing? (*Eg: MMLU misses reference schematics in a number of questions.*)
35
- - *For relevance to your task*:
36
- - are these
37
- questions the kind of questions you want to evaluate an LLM on?
38
- - are these examples relevant to your use case?
39
-
40
- 3. Quantity
41
- You also want to know how many samples are present there (to make sure results are statistically significant - 100 samples is usually a minimum for automatic benchmarks).
42
-
43
  ##### Designing your own
44
- You can go 3 ways when designing your own dataset.
45
  - **Aggregating existing data**: You can aggregate existing data from different sources, evaluating a relevant capability for your task. A number of evaluation datasets are for example constructed from aggregating human evaluation datasets (such as MATH, LSAT, etc). In this case, follow the steps above.
46
  - **Using human annotators**: There's a whole section on using human annotators in `Human evaluation`, see [Using human annotators](https://github.com/huggingface/evaluation-guidebook/blob/main/contents/human-evaluation/using-human-annotators.md).
 
 
 
 
 
47
  - **Using synthetic data from models**: On this, you can check the very cool [Cosmopedia](https://huggingface.co/blog/cosmopedia) blog by cool HF colleagues! It's mostly studying how to create a synthetic training dataset, but similar techniques can be used for evaluation. Make sure to manually check/filter/inspect your dataset afterwards (following the above steps).
48
  - **Using rule-based techniques**: If your task allows, this is a very good way to get a virtually infinite supply of samples and avoid contamination! For some examples, you can look at [NPHardEval](https://arxiv.org/abs/2312.14890), [DyVal](https://arxiv.org/abs/2309.17167), [MuSR](https://arxiv.org/abs/2310.16049), [BabiQA](https://arxiv.org/abs/1502.05698), etc.
49
 
@@ -56,7 +32,7 @@ Using log-probabilities (MCQA, multi-choice question answer) is very good for mu
56
  - Provides a proxy for model "confidence" (and calibration)
57
  - Fast to evaluate, especially when we ask the model to predict only one token (A/B/C/D the indices of the choices, or Yes/No, etc).
58
  - Allow to get signal on small models' task performance
59
- - Cons:
60
  - Slightly over-scores small models which would have generated something outside of the range of available choices if given free rein.
61
  - Some models [favor specific choices based on the order in which they have been presented](https://arxiv.org/abs/2309.03882), which could lead to unrepresentative evaluations
62
 
@@ -86,19 +62,29 @@ When defining your prompt, you need to be aware that:
86
  - A costly way is to re-run the evaluation several times with prompt variations
87
  - A less costly way is to run your evaluation once using a range of prompt formats allocated to different samples of equivalent difficulty
88
  - you can provide examples to your model to help it follow the expected format (using few-shot examples), and adding connector words helps this overall
89
- - but models now tend to overfit specific prompt formats.
90
  - [This paper](https://arxiv.org/abs/2407.07890) is great on the topic, showing notably how some models can be over-evaluated because they have overfitted the test set **format**
91
  - On the Open LLM Leaderboard 2, we've notably observed that Llama 3.2 and Qwen 2.5 are no longer following the format of the prompt provided in a few-shot setup for this reason.
 
 
 
 
 
92
  - for a number of metrics, you want a very constrained generation or output.
93
  *You can learn more about this in the `Constraining model outputs` section of the [Model inference and evaluation](https://github.com/huggingface/evaluation-guidebook/blob/main/contents/general-knowledge/model-inference-and-evaluation.md) page.*
94
 
95
  #### Choosing a metric
96
  If you are looking at **log-probabilities**, your metrics are going to be easy: you'll want to look at accuracy (how often the most likely choice is the best choice). It's important to normalize it by length (either character, token, or pmi). You could also look at perplexity, recall, or f1 score.
97
 
98
- For **generative** evaluations, your range of metrics is going to be wider.
99
- You'll need to
100
- 1. decide if you compare generations as they are, or first normalize them with something.
101
  - Normalizations can easily [be unfair if not designed well](https://huggingface.co/blog/open-llm-leaderboard-drop), but overall they still provide signal at the task level.
 
 
 
 
 
102
  - They are very important for specific tasks, such as math evaluations, where you might want to extract your result from formatted outputs.
103
  - They will also be important if you want to evaluate with added mechanisms for accuracy, such as Chain of Thought, as you'll need to remove the reasoning trace from the actual result
104
  2. decide how you compare the generation with the reference.
@@ -109,11 +95,20 @@ More generally, when picking your metric, you need to keep in mind what your tas
109
  #### Smart new tasks: what about functional testing?
110
  In the field of code, you want to evaluate generated programs not only on their semantics, but on their actual function. A good way to do so is therefore to check if code generated to follow a prompt passes correctly a suite of unit-tests designed to fit the task.
111
 
112
- This functionality approach is extremely promising, as it
113
  - allows to generate test cases more easily (in many cases, you can generate rule-based test cases)
114
  - therefore reducing overfitting
115
  - tests models on specific active capabilities
116
117
  It's however an approach which requires creativity to be translated to text!
118
 
119
  A good example of this is IFEval, an evaluation benchmark which tests if models can follow instructions. It works by creating a number of formatting instructions (*Add this number of bullet points. Capitalize only one sentence.* etc), and strictly testing if the format is followed. More work is clearly needed to extend this idea to other features of text to analyze!
 
2
  title: "Designing your automatic evaluation"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+
8
  ### Designing your automatic evaluation
9
 
10
 
11
  #### Selecting or creating a dataset
12
  For your evaluation, you can either select an existing dataset or design your own. Through this process, it's very important to keep in mind that **your evaluation result will only be as good as your evaluation dataset**.
13
14
  ##### Designing your own
15
+ You can go 3 ways when designing your own dataset.
16
  - **Aggregating existing data**: You can aggregate existing data from different sources, evaluating a relevant capability for your task. A number of evaluation datasets are for example constructed from aggregating human evaluation datasets (such as MATH, LSAT, etc). In this case, follow the steps above.
17
  - **Using human annotators**: There's a whole section on using human annotators in `Human evaluation`, see [Using human annotators](https://github.com/huggingface/evaluation-guidebook/blob/main/contents/human-evaluation/using-human-annotators.md).
18
+
19
+ <Note title="See also" emoji="👥" variant="info">
20
+
21
+ For detailed guidance on using human annotators to create evaluation datasets, see the [Using human annotators](/human-evaluation/using-human-annotators) section.
22
+ </Note>
23
  - **Using synthetic data from models**: On this, you can check the very cool [Cosmopedia](https://huggingface.co/blog/cosmopedia) blog by cool HF colleagues! It's mostly studying how to create a synthetic training dataset, but similar techniques can be used for evaluation. Make sure to manually check/filter/inspect your dataset afterwards (following the above steps).
24
  - **Using rule-based techniques**: If your task allows, this is a very good way to get a virtually infinite supply of samples and avoid contamination! For some examples, you can look at [NPHardEval](https://arxiv.org/abs/2312.14890), [DyVal](https://arxiv.org/abs/2309.17167), [MuSR](https://arxiv.org/abs/2310.16049), [BabiQA](https://arxiv.org/abs/1502.05698), etc.
25
 
 
32
  - Provides a proxy for model "confidence" (and calibration)
33
  - Fast to evaluate, especially when we ask the model to predict only one token (A/B/C/D the indices of the choices, or Yes/No, etc).
34
  - Allow to get signal on small models' task performance
35
+ - Cons:
36
  - Slightly over-scores small models which would have generated something outside of the range of available choices if given free rein.
37
  - Some models [favor specific choices based on the order in which they have been presented](https://arxiv.org/abs/2309.03882), which could lead to unrepresentative evaluations
38
 
 
62
  - A costly way is to re-run the evaluation several times with prompt variations
63
  - A less costly way is to run your evaluation once using a range of prompt formats allocated to different samples of equivalent difficulty
64
  - you can provide examples to your model to help it follow the expected format (using few-shot examples), and adding connector words helps this overall
65
+ - but models now tend to overfit specific prompt formats.
66
  - [This paper](https://arxiv.org/abs/2407.07890) is great on the topic, showing notably how some models can be over-evaluated because they have overfitted the test set **format**
67
  - On the Open LLM Leaderboard 2, we've notably observed that Llama 3.2 and Qwen 2.5 are no longer following the format of the prompt provided in a few-shot setup for this reason.
68
+
69
+ <Note title="Models can overfit prompt formats" emoji="⚠️" variant="warning">
70
+
71
+ Recent research shows models can overfit specific prompt formats rather than learning the underlying task. [This paper](https://arxiv.org/abs/2407.07890) demonstrates how some models are over-evaluated because they've memorized test set formats. We've observed Llama 3.2 and Qwen 2.5 no longer following few-shot prompt formats for this reason.
72
+ </Note>
73
  - for a number of metrics, you want a very constrained generation or output.
74
  *You can learn more about this in the `Constraining model outputs` section of the [Model inference and evaluation](https://github.com/huggingface/evaluation-guidebook/blob/main/contents/general-knowledge/model-inference-and-evaluation.md) page.*
75
 
76
  #### Choosing a metric
77
  If you are looking at **log-probabilities**, your metrics are going to be easy: you'll want to look at accuracy (how often the most likely choice is the correct one). It's important to normalize it by length (character or token count) or by PMI. You could also look at perplexity, recall, or F1 score.
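As a toy illustration (with made-up log-probabilities, not tied to any particular harness), here is how character-length normalization can change which choice is selected:

```python
import numpy as np

choices = [" Paris", " Marseille", " Lyon"]
logprobs = [-6.0, -7.0, -6.5]  # hypothetical summed log-probabilities of each choice given the prompt

best_raw = int(np.argmax(logprobs))                  # plain accuracy: index 0
normed = [lp / len(c) for lp, c in zip(logprobs, choices)]
best_norm = int(np.argmax(normed))                   # character-length normalised: index 1
print(choices[best_raw], choices[best_norm])
```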
78
 
79
+ For **generative** evaluations, your range of metrics is going to be wider.
80
+ You'll need to
81
+ 1. decide if you compare generations as they are, or first normalize them (lowercasing, stripping punctuation or markup, extracting the final answer, etc.).
82
  - Normalizations can easily [be unfair if not designed well](https://huggingface.co/blog/open-llm-leaderboard-drop), but overall they still provide signal at the task level.
83
+
84
+ <Sidenote>
85
+
86
+ Normalizations can [be unfair if not designed well](https://huggingface.co/blog/open-llm-leaderboard-drop), though they generally provide useful signal. Design normalization rules carefully and test them across diverse model outputs.
87
+ </Sidenote>
88
  - They are very important for specific tasks, such as math evaluations, where you might want to extract your result from formatted outputs.
89
  - They will also be important if you want to evaluate with added mechanisms for accuracy, such as Chain of Thought, as you'll need to remove the reasoning trace before scoring the actual result (see the sketch after this list)
90
  2. decide how you compare the generation with the reference.
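As a rough sketch of what both steps can look like in practice (the `answer is` pattern and the `<think>` tags are assumptions that depend on your prompt and model family):

```python
import re

def normalize(generation: str) -> str:
    """Strip an (assumed) reasoning trace, extract the final answer, and canonicalize it."""
    generation = re.sub(r"<think>.*?</think>", "", generation, flags=re.DOTALL)
    match = re.search(r"answer is\s*(.+)", generation, flags=re.IGNORECASE)
    answer = match.group(1) if match else generation
    return answer.strip().strip(".").lower()

def exact_match(generation: str, reference: str) -> bool:
    """Step 2: compare the normalized generation with the (normalized) reference."""
    return normalize(generation) == normalize(reference)

print(exact_match("<think>6 * 7 = 42</think> The answer is 42.", "42"))  # True
```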
 
95
  #### Smart new tasks: what about functional testing?
96
  In the field of code, you want to evaluate generated programs not only on their semantics, but on their actual function. A good way to do so is therefore to check whether the code generated from a prompt correctly passes a suite of unit tests designed for the task.
97
 
98
+ This functional-testing approach is extremely promising, as it
99
  - allows generating test cases more easily (in many cases, you can generate rule-based test cases)
100
  - therefore reduces overfitting
101
  - tests models on specific active capabilities
102
 
103
+ <Note title="The promise of functional testing" emoji="✨" variant="success">
104
+
105
+ Functional testing (like unit tests for code) offers major advantages:
106
+ - Easier test case generation (often rule-based)
107
+ - Reduces overfitting risk
108
+ - Tests specific active capabilities
109
+ - Extends beyond code to other domains (e.g., IFEval for instruction following)
110
+ </Note>
111
+
112
  It's however an approach which requires creativity to be translated to text!
113
 
114
  A good example of this is IFEval, an evaluation benchmark which tests whether models can follow instructions. It works by creating a number of formatting instructions (*Add this number of bullet points. Capitalize only one sentence.*, etc.), and strictly testing whether the format is followed. More work is clearly needed to extend this idea to other features of text!
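To give a flavour of what such rule-based format checks can look like (simplified illustrations, not the actual IFEval implementation):

```python
def has_n_bullet_points(text: str, n: int) -> bool:
    """Check an 'Add this number of bullet points' style instruction."""
    return sum(line.lstrip().startswith(("-", "*")) for line in text.splitlines()) == n

def is_all_lowercase(text: str) -> bool:
    """Check an 'answer in lowercase only' style instruction."""
    return text == text.lower()

answer = "- first point\n- second point"
print(has_n_bullet_points(answer, 2), is_all_lowercase(answer))  # True True
```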
app/src/content/chapters/automated-benchmarks/some-evaluation-datasets.mdx CHANGED
@@ -2,17 +2,22 @@
2
  title: "Some evaluation datasets"
3
  ---
4
 
 
 
5
  ### Some evaluation datasets
6
 
7
  If the task you are interested is already well studied, chances are that a dataset exists for it.
8
 
9
- Below are a number of evaluation datasets which were developed in the last few years.
 
 
 
 
 
 
10
 
11
- However, careful:
12
- - Some of them can be obsolete, as they were designed pre-LLM and are now easily solved: they aimed to investigate one specific property of text (translation, summarization) which is no longer really how we evaluate models (evaluations are now more general/holistic).
13
- (*If you've got some bandwidth, this could really benefit from adding the publication dates!*)
14
- (*This will also be updated with post LLM evals at some point*)
15
- - They are likely contaminated, as they have been publicly on the web for a number of years. However, it doesn't mean they won't hold signal for your task!
16
 
17
  ### Math specific datasets
18
 
 
2
  title: "Some evaluation datasets"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+
7
  ### Some evaluation datasets
8
 
9
  If the task you are interested is already well studied, chances are that a dataset exists for it.
10
 
11
+ Below are a number of evaluation datasets which were developed in the last few years.
12
+
13
+ <Note title="Caveat: Dataset age and contamination" emoji="⚠️" variant="warning">
14
+
15
+ Many datasets listed here may be:
16
+ - **Obsolete**: Designed pre-LLM for specific properties (translation, summarization) no longer central to model evaluation
17
+ - **Contaminated**: Publicly available for years, likely in training data
18
 
19
+ However, contamination doesn't mean these datasets have no signal for your task!
20
+ </Note>
 
 
 
21
 
22
  ### Math specific datasets
23
 
app/src/content/chapters/automated-benchmarks/tips-and-tricks.mdx CHANGED
@@ -2,6 +2,9 @@
2
  title: "Automated Benchmarks: Tips and tricks"
3
  ---
4
 
 
 
 
5
 
6
  ### Pros and cons of using automated benchmarks
7
  Automated benchmarks have the following advantages:
@@ -11,6 +14,11 @@ Automated benchmarks have the following advantages:
11
  *Eg: an exact match will tell you if the generated text matches perfectly with the reference, and an accuracy score will tell you in how many cases the selected choice was the correct one (this will be a bit less the case for metrics such as `BLEU` or `ROUGE` for example).*
12
  - **Dataset quality**: A number of automated benchmarks are using expert generated datasets or pre-existing high quality data (like MMLU or MATH). However, this does not mean these datasets are perfect: for MMLU, several errors have been identified in samples afterwards, from parsing issues to actually non-sensical questions, leading to the creation of several follow-up datasets, like MMLU-Pro and MMLU-Redux.
13
 
 
 
 
 
 
14
  However, they also present the following limitations:
15
  - **Reduced use on more complex tasks**: Automated benchmarks work well for tasks where performance is easy to define and assess (for example, classification). More complex capabilities, on the other hand, are harder to decompose into well-defined and precise tasks.
16
  *Eg: what does "good at math" mean? Is it being good at arithmetic? - at logic? - able to reason on new mathematical concepts?*
@@ -21,7 +29,12 @@ However, they also present the following limitations:
21
  ### Tips and tricks
22
 
23
  #### Managing contamination
24
- In general, you should assume that a dataset publicly available on the internet is or will be contaminated.
 
 
 
 
 
25
 
26
  Solutions to mitigate this include:
27
  - providing a **canary string** in the evaluation set (like in [BigBench](https://github.com/google/BIG-bench)): it is a specific character combination that model creators can look for in their training sets, which would indicate that it contains an evaluation
@@ -29,48 +42,22 @@ Solutions to mitigate this include:
29
  - running [dynamic benchmarks](https://arxiv.org/abs/2104.14337): benchmarks regularly updated through time so that models can't "learn the answers by heart" (but it makes datasets more costly)
30
  - if you are running a benchmark, trying to [detect contamination](https://arxiv.org/abs/2311.06233) post-hoc (for example, by looking at the generation perplexity or designing adversarial versions of the prompts - however, no method is a foolproof contamination detection method)
31
 
32
- However, it's not because a dataset is contaminated that it won't still be interesting and have signal during training.
33
-
34
- #### Managing fine-tuned models, system prompts and chat templates
35
- Pre-2022, models used to simply be pretrained: text in, text out, nothing else. Then, we got instruction tuning and chat models in 2023, and in 2025 reasoning models. This means that we went from using text "as is" to using chat templates (= providing models with json) to using reasoning tags (= mixing up the json chat template with xml tags for reasoning).
36
-
37
- This means a number of models are going to perform terribly if you do not make sure to:
38
- - add their system prompt at the very beginning of inference
39
- - prompt them using a chat template if they require it (usually adding `Assistant` and `User` prefixes to the dialogue turns - learn more about this in [this cool guide](https://huggingface.co/docs/transformers/main/en/chat_templating))
40
- - remove the thinking trace from the model answer before processing it (you can usually regex to remove what's between the `<think>` tags)
41
-
42
- It's also very important to not assume that different tokenizers will behave the same,as you can see in this cool picture about tokenization spacing and chat templates, from [this tweet](https://x.com/danielhanchen/status/1796952220619157694).
43
-
44
- ![Spacing, tokenization and template](https://pbs.twimg.com/media/GPANfpiasAA9b6F?format=png&name=medium)
45
-
46
- #### Beware of tokenization
47
-
48
- 1. **Tokenizing the context and choices together or separately**
49
 
50
- When looking at an MCQA evaluation, in general, you want to tokenize the context together with the choices, as it creates a succession of tokens which is likely/natural for the model.
 
51
 
52
- However, some tokenizers (like the [Llama one](https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257)) do not satisfy `enc(context + choice) = enc(context) + enc(choice)` (and add or remove spacing). This means that comparing the logprobabilities of the choices is not easy, as the context tokens can "bleed out" into them, messing up the comparison.
53
-
54
- So if this is the case for your model, you might want to compute the tokens of context and choice separately and then concatenate them after removing the special start/end of sentence tokens which might have been added.
55
-
56
- 2. **Paying attention to start and end of sentence tokens**
57
-
58
- Some models, like the `Gemma` ones, are extremely sensitive to the [inclusion of start of sentence tokens](https://github.com/EleutherAI/lm-evaluation-harness/pull/1465) at inference. You might need to do a couple of experiments to see if that happens for you, and add these tokens manually when evaluating.
59
-
60
- You can also encounter some issues where your model won't stop on an end of sentence token like you would expect (for example, on `\n`), because your model will not predict this token alone but included in an higher level token (for example, `\n\n`, which can be a single token, especially for code models). In this case, you might need to add a specific check to "backtrack" on generated text to make sure you're cutting your generated sentence at the proper spot before computing metrics.
61
-
62
- 3. **Multilinguality and tokenization**
63
-
64
- When looking at multilingual evaluations, you'll also need to see how to tokenize your text, depending on your evaluation task and metrics. As some languages do not always use spacing as a word separator (Korean, Thai, Japanese, Chinese, to cite a few), they will require language specific tokenizers to be split properly, else it will affect their scores on metrics such as [BLEU](https://github.com/EleutherAI/lm-evaluation-harness/issues/212), F1 scores, etc.
65
-
66
- 4. **Code evaluations and end of sentence tokens**
67
-
68
- Code models usually have been trained with `\n\t` as a single token. This means that when generating text, they will often generate `\n\t` in one step. A task which defines `\n` as an end of sentence token (= to stop the generation) will let the model continue generating after a `\n\t`, if predicted as one token, since it's not the same as `\n`. But you would actually still want the model to stop. In these cases, you either need to update your end of sentence tokens, or define a mechanism to backtrack on the character representation of the latest tokens to stop (and cut) the generation a posteriori.
69
 
70
  #### Tip: an easy speed up for MCQA evaluations
71
  You can speed up your MCQA predictions by a lot if you make sure your model needs to predict only one token for the task.
72
 
73
- This way, instead of running your `number_of_choices` predictions (`context + choice 1`, `context + choice 2`, etc), you can simply run inference on `context` and compute the probability distribution on the full vocabulary (which will include all your one token choices) to get your logprobabilities of interest, and do this step in one pass.
 
 
 
 
 
74
 
75
  (That's how we do it in `lighteval`).
76
 
 
2
  title: "Automated Benchmarks: Tips and tricks"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+
8
 
9
  ### Pros and cons of using automated benchmarks
10
  Automated benchmarks have the following advantages:
 
14
  *Eg: an exact match will tell you if the generated text matches perfectly with the reference, and an accuracy score will tell you in how many cases the selected choice was the correct one (this will be a bit less the case for metrics such as `BLEU` or `ROUGE` for example).*
15
  - **Dataset quality**: A number of automated benchmarks are using expert generated datasets or pre-existing high quality data (like MMLU or MATH). However, this does not mean these datasets are perfect: for MMLU, several errors have been identified in samples afterwards, from parsing issues to actually non-sensical questions, leading to the creation of several follow-up datasets, like MMLU-Pro and MMLU-Redux.
16
 
17
+ <Sidenote>
18
+
19
+ Several errors in MMLU (parsing issues, nonsensical questions) led to improved versions like MMLU-Pro and MMLU-Redux. Always inspect benchmark samples manually before relying on them for evaluation.
20
+ </Sidenote>
21
+
22
  However, they also present the following limitations:
23
  - **Reduced use on more complex tasks**: Automated benchmarks are working well for tasks where performance is easy to define and assess (for example, classification). More complex capabilities, on the other hand, are harder to decompose into well-defined and precise tasks.
24
  *Eg: what does "good at math" mean? Is it being good at arithmetic? - at logic? - able to reason on new mathematical concepts?*
 
29
  ### Tips and tricks
30
 
31
  #### Managing contamination
32
+ In general, you should assume that a dataset publicly available on the internet is or will be contaminated.
33
+
34
+ <Note title="Assume contamination" emoji="🔍" variant="warning">
35
+
36
+ You should assume that any dataset publicly available on the internet is or will be contaminated in model training data. Design your evaluation strategy with this assumption in mind.
37
+ </Note>
38
 
39
  Solutions to mitigate this include:
40
  - providing a **canary string** in the evaluation set (like in [BigBench](https://github.com/google/BIG-bench)): it is a specific character combination that model creators can look for in their training sets, which would indicate that it contains an evaluation
 
42
  - running [dynamic benchmarks](https://arxiv.org/abs/2104.14337): benchmarks regularly updated through time so that models can't "learn the answers by heart" (but it makes datasets more costly)
43
  - if you are running a benchmark, trying to [detect contamination](https://arxiv.org/abs/2311.06233) post-hoc (for example, by looking at the generation perplexity or designing adversarial versions of the prompts - however, no method is a foolproof contamination detection method)
44
 
45
+ <Sidenote>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
+ Even contaminated datasets can provide useful signal during training. Performance improvements on contaminated benchmarks often correlate with genuine capability improvements, though the absolute scores may be inflated.
48
+ </Sidenote>
49
 
50
+ However, a dataset being contaminated doesn't mean it can't still be interesting and provide signal during training.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  #### Tip: an easy speed up for MCQA evaluations
53
  You can speed up your MCQA predictions by a lot if you make sure your model needs to predict only one token for the task.
54
 
55
+ This way, instead of running your `number_of_choices` predictions (`context + choice 1`, `context + choice 2`, etc.), you can simply run inference on `context` once and read the probability distribution over the full vocabulary (which includes all your one-token choices) to get the logprobabilities of interest in a single pass.
56
+
57
+ <Note title="Speed optimization for MCQA" emoji="⚡" variant="success">
58
+
59
+ Speed up MCQA evaluations by using single-token choices. Instead of running N predictions for N choices, run inference once on the context and examine the probability distribution over all vocabulary tokens (which includes your choices). This is how `lighteval` achieves fast MCQA evaluation.
60
+ </Note>
61
 
62
  (That's how we do it in `lighteval`).
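A minimal sketch of the idea with 🤗 `transformers`, using GPT-2 purely as a stand-in model (in practice you would also check that each choice letter really maps to a single token for your tokenizer):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

context = "Question: What is the capital of France?\nA. London\nB. Paris\nC. Rome\nD. Madrid\nAnswer:"
choice_ids = [tokenizer.encode(f" {letter}")[0] for letter in "ABCD"]  # one token per choice here

with torch.no_grad():
    logits = model(**tokenizer(context, return_tensors="pt")).logits  # a single forward pass

logprobs = torch.log_softmax(logits[0, -1], dim=-1)  # distribution over the full vocabulary at the last position
print("ABCD"[int(torch.argmax(logprobs[choice_ids]))])
```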
63
 
app/src/content/chapters/general-knowledge/model-inference-and-evaluation.mdx CHANGED
@@ -6,27 +6,150 @@ import llmTk1 from '../../assets/image/llm_tk_1.png';
6
  import llmLogprob from '../../assets/image/llm_logprob.png';
7
  import llmGen from '../../assets/image/llm_gen.png';
8
  import Image from '../../../components/Image.astro';
 
 
 
9
 
10
- ### Model inference and evaluation
11
-
12
- ### Introduction
13
  Current large language models work in a simple way: given some text as input, they have learned to predict a plausible follow-up.
14
 
15
  This is done in two steps.
16
  ### Tokenization
17
- The input text (called a *prompt* at inference) is first split into *tokens*, small units of texts (which can be one or several characters, up to the word level) each associated with a number. The whole range of tokens a model can parse is called its *vocabulary*. *(To understand this more in depth, go read the [Tokenization](https://github.com/huggingface/evaluation-guidebook/blob/main/contents/general-knowledge/tokenization.md) page)*.
18
 
19
- ### Prediction
 
20
 
21
- <Image src={llmTk1} alt="LLM tokenization and prediction process" />
 
 
22
 
23
  From this input text, the LLM generates a probability distribution over its whole vocabulary for the most likely next token. To get a continued generation, we can take the most probable token (give or take some added randomness to get more interesting outputs) as the next one, then repeat the operation, appending the new token to the end of the prompt, etc.
24
 
25
- ### What do you want to predict?
26
- LLM evaluations mostly fall into 2 categories:
27
- - Given a prompt and one (or several) answers, what is probability of said answer(s) for my model?
28
- - Given a prompt, what text does my model generate?
29
- ### Log-likelihood evaluations
 
 
 
 
 
 
 
 
 
30
  For log-likelihood evaluations, we want the conditional probability of one or several choices given a prompt - in other words, what is the likelihood of getting a specific continuation given an input?
31
  So:
32
  - we concatenate each choice with the prompt, and pass them to our LLM, which outputs the logits of each token depending on the previous ones
@@ -40,41 +163,71 @@ This allows us to apply one of the following metrics:
40
  - get the preferred answer of a model among several choices, like in the above picture. (*However, this can inflate the scores of models which, given free rein, would have generated something else, like `Zygote` in the picture.*)
41
  - test if a single choice has a probability above 0.5
42
  - study model calibration. A well calibrated model is a model for which the correct answers have the highest probabilities.
43
- *(To learn more about calibration, you can check [this paper](https://arxiv.org/abs/2207.05221) from Anthropic, on what it is, how to detect it, and how to train models to be well calibrated, and [this paper](https://arxiv.org/abs/2311.14648) on some possible limits of calibration).*
 
 
 
 
 
 
 
 
 
 
44
 
45
- ### Generative evaluations
 
 
 
 
 
 
 
 
 
 
46
  For a generative evaluation, we want the text generated by the model given an input prompt.
47
 
48
  It is obtained in an auto-regressive way: we pass the prompt to the model, look at the most likely next token, select it as the first token of the model's answer, then repeat until we reach an end-of-generation condition (maximum length, a special token that stops the generation, etc.). All the tokens generated by the model are considered its answer to the prompt.
49
 
50
  <Image src={llmGen} alt="LLM generative evaluation process" />
51
 
52
-
53
-
54
  We can then compare this generation with references and score the distance between both (using either simple metrics like exact match, more complex metrics like BLEU, or models as judges).
55
 
56
- ### Going further
57
- - ⭐ [Blog on several ways to evaluate MMLU](https://huggingface.co/blog/open-llm-leaderboard-mmlu) , by my team at Hugging Face. I recommend reading it if you want to delve deeper into the differences between multi choice log-likelihood evaluations and generative ones, including what it can mean with respect to score changes
58
- - The above illustrations come from the blog and have been made by Thom Wolf
59
  - ⭐ [A beautiful mathematical formalization of the above inference methods](https://arxiv.org/abs/2405.14782v2), from EleutherAI. Go to the Appendix directly.
 
 
 
60
  ### Constraining model outputs
61
  In a number of cases, we want the model output to follow a specific format, for example to compare it to a reference.
62
- ### Using a prompt
 
63
  The easiest way to do this is to add a task prompt which contains very specific instructions as to how the model should answer (`Provide numerical answers in digits.`, `Use no abbreviation.`, etc.).
64
 
65
- It won't necessarily work all the time but should be good enough for high capability models. That's the approach we followed in the [GAIA](https://huggingface.co/papers/2311.12983) paper, and you can find our task prompt in the Submission tab of the [leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard) if you want some inspiration.
66
- ### Few shots and in context learning
 
67
  The next way to do so is to constrain the model through what is called "in context learning". By providing examples in the prompt (what is called `few-shot prompting`), the model is implicitly biased towards following the repeated prompt shape for the actual sample.
68
 
69
- It's a method which was overall working quite well until end of 2023! However, the widespread adoption of instruction-tuning methods and the addition of instruction data in later stages of model pre-training (continuous pre-training) seem to have biased more recent models towards specific output formats (what is being called [here](https://arxiv.org/abs/2407.07890) `Training on the test task`, and what I would call `overfitting the prompt format`). It's also a method which can be limited for older models with smaller context sizes, as some few-shot examples can not fit into the context window.
70
- ### Structured text generation
 
 
 
 
 
 
 
71
  Structured text generation constrains the outputs to follow a given path, defined by a grammar or by regular expressions, for example. The `outlines` library implements this using finite state machines, which is very neat. (Other approaches exist, such as using interleaved generation for json generation, but the FSM one is my favorite).
72
 
73
- To understand more about what happens when using structured generation, you can check the [blog](https://huggingface.co/blog/evaluation-structured-outputs) we wrote together: structured generation reduce prompt variance in evaluation, and make results and rankings more stable. You can also check the overall `outlines` [blog](https://blog.dottxt.co/) for interesting implementations and observations linked to structured generation.
74
 
75
  However, some recent [research](https://arxiv.org/abs/2408.02442) seems to show that structured generation can lower model performance on some tasks (like reasoning), by moving the prior too far away from the expected probability distribution.
76
 
77
- ### Going further
78
  - ⭐ [Understanding how Finite State Machines work when using structured generation](https://blog.dottxt.co/coalescence.html), by Outlines. Super clear guide on how their method works!
79
  - [The outlines method paper](https://arxiv.org/abs/2307.09702), a more academic explanation of the above
80
  - [Interleaved generation](https://github.com/guidance-ai/guidance?tab=readme-ov-file#guidance-acceleration), another method to constrain generations for some specific output formats
 
 
6
  import llmLogprob from '../../assets/image/llm_logprob.png';
7
  import llmGen from '../../assets/image/llm_gen.png';
8
  import Image from '../../../components/Image.astro';
9
+ import Note from "../../../components/Note.astro";
10
+ import Sidenote from "../../../components/Sidenote.astro";
11
+ import Accordion from "../../../components/Accordion.astro";
12
 
 
 
 
13
  Current large language models work in a simple way: given some text as input, they have learned to predict a plausible follow-up.
14
 
15
  This is done in two steps.
16
  ### Tokenization
17
+ The input text (called a *prompt* at inference) is first split into *tokens*, small units of text (which can be one or several characters, up to the word level), each associated with a number. The whole range of tokens a model can parse is called its *vocabulary*. If any of these concepts is unclear, please open the relevant accordion; otherwise, skip to the next section.
18
 
19
+ <Accordion title="Basics of tokenization: Why and how do we tokenize text?">
20
+ Since large language models are actually big mathematical functions, they eat numbers, not text.
21
 
22
+ Say you want to transform a sentence to numbers. You first need to decide how to cut your sentence into small pieces, then map every small piece to a number; this is *tokenization*.
23
+
24
+ In the past, people would try to map each character of a text with its index in an alphabet (`a` -> 1, `b` -> 2, etc.) which is called *character based tokenization* (you split between characters). On the other end of the spectrum, people also tried to map each word with its index in a dictionary (`a` -> 1, `aardvark` -> 2, `ab` -> 3, etc.) which is called *word based tokenization* (you split on spaces, if your language has spaces - if not, it's a bit harder).
25
+
26
+ Both these methods share a strong limitation: they remove information from the input text. They erase semantic connections that you can see from word shape (ex: `dis similar`, `similar`, `similar ity`, `similar ly`), information we would like our model to retain, so it connects related words together.
27
+ (Plus, what happens if you suddenly have a completely new word in input? It gets no number, and your model can't process it 😔 )
28
+
29
+ Some people therefore had the idea to cut words into sub-words, and assign an index to each of these sub-words (`dis`, `similar`, `ity`, `ly`)!
30
+
31
+ This was initially done using morpho-syntactic rules (*morpho-syntax* is like the grammar of word creation). Now most people use byte pair encoding (BPE), a smart statistical method to create the sub-words automatically depending on their frequency in a reference text.
32
+
33
+ So as a summary: tokenization is a way to map small units of text (which can be one or several characters, up to the word level) to numbers (similar to an index). When you want to process text, your input text (called a *prompt* at inference) is split into these *tokens* by a tokenizer. The whole range of tokens a model or tokenizer can parse is called its *vocabulary*.
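You can see sub-word tokenization in action with any tokenizer from the Hub (GPT-2 here, purely as an example; the exact splits and indices will differ between tokenizers):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

for word in ["similar", "similarly", "dissimilar"]:
    print(word, tokenizer.tokenize(word), tokenizer.encode(word))
# e.g. "similarly" should come out as sub-word tokens (something like ['similar', 'ly']), each mapped to an id
```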
34
+
35
+ <Note title="Going further: Understanding tokenization" emoji="📚" variant="warning">
36
+ - ⭐ [Explanation of different tokenization methods in the 🤗 NLP Course](https://huggingface.co/learn/nlp-course/en/chapter2/4)
37
+ - ⭐ [Conceptual guide about tokenization in the 🤗 doc](https://huggingface.co/docs/transformers/en/tokenizer_summary)
38
+ - [Course by Jurafsky on tokenization (and other things)](https://web.stanford.edu/~jurafsky/slp3/2.pdf) - academic approach, skip to 2.5 and 2.6 (the rest is interesting too but too broad)
39
+ </Note>
40
+
41
+ <Note title="Going further: Byte Pair Encoding" emoji="📚" variant="warning">
42
+ - ⭐ [Explanation of BPE in the 🤗 NLP Course](https://huggingface.co/learn/nlp-course/en/chapter6/5)
43
+ - [Paper introducing BPE to NLP](https://aclanthology.org/P16-1162/)
44
+ </Note>
45
+ </Accordion>
46
+
47
+ <Accordion title="Using your own tokenizer? Don't forget to consider the following">
48
+ I recommend making sure you understand BPE before this section, see above for some references!
49
+
50
+ **Choosing the correct vocabulary size**
51
+
52
+ The size of the vocabulary indicates how many individual tokens (for example, sub-words) the model will have to learn. A vocabulary which is **too big** might contain some very rare words as full tokens (for example: `aardvark`), which can lead to 2 problems. If such a rare word almost never appears in the training data, it can be hard to connect to other concepts, and the model might be unable to infer what it is about. On the other hand, if it appears rarely and only in specific contexts, it can be linked to some very specific other words: for example, if you train on forum data, and your tokenizer mapped a username as one single token in its vocabulary, your model might then associate this token to the specific user's content.
53
+
54
+ A vocabulary which is **too small** will present 2 other problems: worse representation capabilities, and increased cost at inference.
55
+
56
+ Let's go back to our above example, where we tokenized words derived from `similar`. Using a pseudo BPE approach (large vocabulary) to tokenize `similarly` has split the word into 2 tokens (`similar`, `ly`). If we had used instead character level tokenization (therefore with a very small vocabulary, the size of an alphabet), the same word would be cut into 9 tokens (`s`, `i`, `m`, `i`, `l`, `a`, `r`, `l`, `y`). Where the first method splits `similarly` into tokens which have an individual semantic meaning, it's not the case in the second method: with too small a vocabulary, we lost some semantic representation. The difference in representations length also means that it's many times as costly to generate our word with a smaller vocabulary (takes 9 tokens instead of 2, so 5 times more costly!).
57
+
58
+ At the moment, most people seem to use heuristics for vocabulary size, which seems correlated to number of languages covered and model size, so it's likely that using a number of tokens close to the reference models of a similar size could work for you.
59
+
60
+ <Note title="Going further: Rare tokens effect" emoji="📚">
61
+ - [SolidGoldMagikarp post on Less Wrong](https://www.lesswrong.com/posts/aPeJE8bSo6rAFoLqg/solidgoldmagikarp-plus-prompt-generation): Very interesting read on how some people identified very rare tokens in Open AI's vocabulary - this is quite cool because it's done without access to the model's internals (we don't know what the training data contains for example)
62
+ - [Fishing for Magikarp, paper by Cohere](https://arxiv.org/abs/2405.05417): Follow up work on to detect these tokens
63
+ </Note>
64
+
65
+ **Managing several languages**
66
+
67
+ When building or choosing your tokenizer, you construct your vocabulary from reference text. This means that your tokenizer will know vocabulary words and characters from this reference text. Usually, it means using data in English, with a Latin script.
68
+
69
+ If you want to add new language, and your new language uses the same script and share some roots, you could theoretically hope that some of your original language semantics transfer to the new language.
70
+
71
+ However, if you want to allow your tokenizer to correctly split text in other languages (especially languages written in other scripts) you'd better include data from these languages when building said tokenizer. Most of the time, though, this data will contain an unbalanced proportion of the initial language (ex: English) to the new language (ex: Thai, or Burmese), the initial language being much more present. Since most efficient tokenizer methods used nowadays (like BPE) create their complex vocabulary tokens based on the most frequent words seen, most of the long tokens will be English words - and most of the words from the less frequent languages will only be split at the character level.
72
+
73
+ This effect leads to an unfairness in multilingual tokenization: some (less frequent, or *lower-resourced*) languages require orders of magnitude more tokens than English to generate a sentence of equivalent length.
74
+
75
+ <Note title="Going further: Language and tokenization" emoji="📚" variant="warning">
76
+ - ⭐ [A beautiful breakdown and demo by Yennie Jun on tokenization issues across languages](https://www.artfish.ai/p/all-languages-are-not-created-tokenized): The breakdown in itself is very clear, and it's worth playing around with the [demo space](https://huggingface.co/spaces/yenniejun/tokenizers-languages)
77
+ - ⭐ [A demo by Aleksandar Petrov on unfairness of tokenization](https://aleksandarpetrov.github.io/tokenization-fairness/): I recommend looking at `Compare tokenization of sentences` to get a feel for the differences in cost of inference depending on languages
78
+ </Note>
79
+
80
+ **What about numbers?**
81
+
82
+ When building your tokenizer, you need to decide what to do about numbers. Do you only index 0 to 9, and assume all other numbers will be compositions of digits, or do you want to store numbers up to, say, one billion, individually? Current well known models display a range of approaches to this, but it's unclear what works better to allow mathematical reasoning. Maybe new approaches to tokenization, such as hierarchical tokenization, might be needed for this.
83
+ <Note title="Going further: Number tokenization" emoji="📚" variant="warning">
84
+ - ⭐ [A nice visual demo by Yennie Jun of how tokenizers of Anthropic, Meta, OpenAI, and Mistral models split numbers](https://www.artfish.ai/p/how-would-you-tokenize-or-break-down)
85
+ - [Small history by Beren Millidge of the evolution of number tokenization through the years](https://www.beren.io/2024-05-11-Integer-tokenization-is-now-much-less-insane/)
86
+ </Note>
87
+ </Accordion>
88
+
89
+ <Accordion title="How tokenization can mess up your evaluation">
90
+ **Managing fine-tuned models, system prompts and chat templates**
91
+
92
+ Pre-2022, models used to simply be pretrained: text in, text out, nothing else. Then, we got instruction tuning and chat models in 2023, and in 2025 reasoning models. This means that we went from using text "as is" to using chat templates (= providing models with json) to using reasoning tags (= mixing up the json chat template with xml tags for reasoning).
93
+
94
+ This means a number of models are going to perform terribly if you do not make sure to:
95
+ 1. add their system prompt at the very beginning of inference
96
+ 2. prompt them using a chat template if they require it (usually adding `Assistant` and `User` prefixes to the dialogue turns - learn more about this in [this cool guide](https://huggingface.co/docs/transformers/main/en/chat_templating))
97
+ 3. remove the thinking trace from the model answer before processing it (you can usually regex to remove what's between the `<think>` tags)
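A small sketch of points 2 and 3 with 🤗 `transformers` (SmolLM2 is just an example of a chat model; any tokenizer that ships a chat template works the same way, and the `<think>` tags are an assumption about your model's reasoning format):

```python
import re
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]
# 2. let the tokenizer build the prompt with the model's own chat template
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# 3. strip a (hypothetical) reasoning trace from the answer before scoring it
raw_answer = "<think>The user wants a sum: 2 + 2 = 4.</think>\nThe answer is 4."
clean_answer = re.sub(r"<think>.*?</think>", "", raw_answer, flags=re.DOTALL).strip()
print(prompt, clean_answer, sep="\n---\n")
```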
98
+
99
+
100
+ <Note title="Critical: Chat templates and tokenization" emoji="⚡" variant="danger">
101
+
102
+ ![Spacing, tokenization and template](https://pbs.twimg.com/media/GPANfpiasAA9b6F?format=png&name=medium)
103
+
104
+ Different tokenizers behave differently with spacing and special tokens. See this [visualization](https://x.com/danielhanchen/status/1796952220619157694) showing how spacing, tokenization, and templates interact. Never assume tokenizers behave identically!
105
+ </Note>
106
+
107
+ **Tokenizing the context and choices together or separately**
108
+
109
+ When looking at an MCQA evaluation, in general, you want to tokenize the context together with the choices, as it creates a succession of tokens which is likely/natural for the model.
110
+
111
+ However, some tokenizers (like the [Llama one](https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257)) do not satisfy `enc(context + choice) = enc(context) + enc(choice)` (and add or remove spacing). This means that comparing the logprobabilities of the choices is not easy, as the context tokens can "bleed out" into them, messing up the comparison.
112
+
113
+ <Sidenote>
114
+
115
+ The [Llama tokenizer](https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257) doesn't satisfy `enc(context + choice) = enc(context) + enc(choice)`, making log probability comparisons tricky. Tokenize separately and concatenate, removing special tokens.
116
+ </Sidenote>
117
+
118
+ So if this is the case for your model, you might want to compute the tokens of context and choice separately and then concatenate them after removing the special start/end of sentence tokens which might have been added.
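A quick way to check whether your tokenizer has this property (GPT-2 shown as an example; Llama-style tokenizers are the ones known to break it):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # swap in your model's tokenizer

context, choice = "The capital of France is", " Paris"
together = tokenizer(context + choice, add_special_tokens=False).input_ids
separate = (
    tokenizer(context, add_special_tokens=False).input_ids
    + tokenizer(choice, add_special_tokens=False).input_ids
)
print(together == separate)  # if False, tokenize context and choice separately and concatenate them yourself
```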
119
+
120
+ **Paying attention to start and end of sentence tokens**
121
+
122
+ Some models, like the `Gemma` ones, are extremely sensitive to the [inclusion of start of sentence tokens](https://github.com/EleutherAI/lm-evaluation-harness/pull/1465) at inference. You might need to do a couple of experiments to see if that happens for you, and add these tokens manually when evaluating.
123
+
124
+ You can also encounter some issues where your model won't stop on an end of sentence token like you would expect (for example, on `\n`), because your model will not predict this token alone but included in an higher level token (for example, `\n\n`, which can be a single token, especially for code models). In this case, you might need to add a specific check to "backtrack" on generated text to make sure you're cutting your generated sentence at the proper spot before computing metrics.
125
+
126
+ **Multilinguality and tokenization**
127
+
128
+ When looking at multilingual evaluations, you'll also need to see how to tokenize your text, depending on your evaluation task and metrics. As some languages do not always use spacing as a word separator (Korean, Thai, Japanese, Chinese, to cite a few), they will require language specific tokenizers to be split properly, else it will affect their scores on metrics such as [BLEU](https://github.com/EleutherAI/lm-evaluation-harness/issues/212), F1 scores, etc.
129
+
130
+ **Code evaluations and end of sentence tokens**
131
+
132
+ Code models usually have been trained with `\n\t` as a single token. This means that when generating text, they will often generate `\n\t` in one step. A task which defines `\n` as an end of sentence token (= to stop the generation) will let the model continue generating after a `\n\t`, if predicted as one token, since it's not the same as `\n`. But you would actually still want the model to stop. In these cases, you either need to update your end of sentence tokens, or define a mechanism to backtrack on the character representation of the latest tokens to stop (and cut) the generation a posteriori.
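A minimal sketch of such an a posteriori cut, working on the decoded string rather than on token ids:

```python
def cut_at_stop_sequence(text: str, stop_sequences: list[str]) -> str:
    """Truncate generated text at the earliest stop sequence, even when it was hidden
    inside a larger token (e.g. a newline emitted as part of a newline+tab token)."""
    cut = len(text)
    for stop in stop_sequences:
        index = text.find(stop)
        if index != -1:
            cut = min(cut, index)
    return text[:cut]

print(repr(cut_at_stop_sequence("def add(a, b):\n\treturn a + b", ["\n"])))  # 'def add(a, b):'
```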
133
+ </Accordion>
134
+
135
+ ### Inference
136
 
137
  From this input text, the LLM generates a probability distribution over its whole vocabulary for the most likely next token. To get a continued generation, we can take the most probable token (give or take some added randomness to get more interesting outputs) as the next one, then repeat the operation, appending the new token to the end of the prompt, etc.
138
 
139
+ <Image src={llmTk1} alt="LLM tokenization and prediction process" />
140
+
141
+
142
+ <Note title="Two main evaluation approaches" emoji="🎯" variant="info">
143
+
144
+ **Log-likelihood evaluations**: Given a prompt and one (or several) answers, what is the probability of said answer(s) for my model?
145
+
146
+ **Generative evaluations**: Given a prompt, what text does my model generate?
147
+
148
+ The choice depends on your task: multiple-choice questions use log-likelihood, while open-ended tasks require generative evaluation.
149
+
150
+ </Note>
151
+
152
+ #### Log-likelihood evaluations
153
  For log-likelihood evaluations, we want the conditional probability of one or several choices given a prompt - in other words, what is the likelihood of getting a specific continuation given an input?
154
  So:
155
  - we concatenate each choice with the prompt, and pass them to our LLM, which outputs the logits of each token depending on the previous ones
 
163
  - get the preferred answer of a model among several choices, like in the above picture. (*However, this can inflate the scores of models which, given free rein, would have generated something else, like `Zygote` in the picture.*)
164
  - test if a single choice has a probability above 0.5
165
  - study model calibration. A well calibrated model is a model for which the correct answers have the highest probabilities.
166
+ <Sidenote>
167
+ To learn more about calibration, you can check [this paper](https://arxiv.org/abs/2207.05221) from Anthropic, on what it is, how to detect it, and how to train models to be well calibrated, and [this paper](https://arxiv.org/abs/2311.14648) on some possible limits of calibration.
168
+ </Sidenote>
169
+
170
+ <Note>
171
+ A multiple choice question can also be expressed as a free-form generative evaluation! For this reason, you'll sometimes see a mention of the task **formulation**.
172
+
173
+ There are three common task formulations:
174
+ - **Multiple choice format (MCF)**: we compare the likelihood of choice indices, where choices are explicitly presented in the prompt and prefixed with A/B/C/D (as in MMLU)
175
+ - **Cloze formulation (CF)**: we compare the likelihood of different choices without providing them in the prompt
176
+ - **Freeform generation (FG)**: we evaluate the accuracy of greedy generation for a given prompt
177
 
178
+ FG requires substantial latent knowledge and is usually too difficult for models during short pre-training ablations. For this reason, we typically focus on multiple choice formulations (MCF or CF) when running small-scale ablations. However, for post-trained models, FG becomes the primary formulation since we're evaluating whether the model can actually generate useful responses.
179
+ However, research has also shown that models struggle with MCF early in training, only learning this skill after extensive training, which makes CF better for early signal. We thus recommend using CF for small ablations, and integrating MCF in the main run, as it gives better mid-training signal once a model has passed the threshold needed for a sufficiently high signal-to-noise ratio on MCF.
180
+ A quick note: to score a model's answer in sequence likelihood evaluations like CF, we compute accuracy as the percentage of questions where the correct answer has the highest log probability, normalised by character/token count. This normalisation prevents a bias toward shorter answers.
181
+
182
+ <Sidenote>
183
+ The point at which MMLU MCF becomes non-random depends on the model size and training data. For a 7B transformer, the OLMES paper found the model starts showing non-random performance after 500B tokens. For a 1.7B model, we found this happens after 6T tokens in SmolLM2.
184
+ </Sidenote>
185
+
186
+ </Note>
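Putting this together, here is a rough sketch of CF-style scoring with 🤗 `transformers` (GPT-2 as a stand-in model; a real harness also has to worry about tokenizer edge cases, batching and padding):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

def choice_logprob(context: str, choice: str) -> float:
    """Sum of the log-probabilities of the choice tokens, conditioned on the context.
    Assumes enc(context + choice) = enc(context) + enc(choice); see the tokenization section above."""
    context_len = tokenizer(context, return_tensors="pt").input_ids.shape[1]
    full_ids = tokenizer(context + choice, return_tensors="pt").input_ids
    with torch.no_grad():
        logprobs = torch.log_softmax(model(full_ids).logits, dim=-1)
    total = 0.0
    for pos in range(context_len, full_ids.shape[1]):
        total += logprobs[0, pos - 1, full_ids[0, pos]].item()  # token at pos is predicted at position pos - 1
    return total

context = "Question: What is the capital of France?\nAnswer:"
choices = [" Paris", " London", " Rome"]
scores = [choice_logprob(context, c) / len(c) for c in choices]  # character-length normalisation
print(choices[max(range(len(choices)), key=lambda i: scores[i])])
```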
187
+
188
+ #### Generative evaluations
189
  For a generative evaluation, we want the text generated by the model given an input prompt.
190
 
191
  It is obtained in an auto-regressive way: we pass the prompt to the model, look at the most likely next token, select it as the first token of the model's answer, then repeat until we reach an end-of-generation condition (maximum length, a special token that stops the generation, etc.). All the tokens generated by the model are considered its answer to the prompt.
192
 
193
  <Image src={llmGen} alt="LLM generative evaluation process" />
194
 
 
 
195
  We can then compare this generation with references and score the distance between both (using either simple metrics like exact match, more complex metrics like BLEU, or models as judges).
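And a rough sketch of the generative path (GPT-2 as a stand-in model; a real setup would add a chat template, stop sequences and answer extraction as discussed elsewhere in this guide):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "Question: What is the capital of France?\nAnswer:"
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=5, do_sample=False)  # greedy decoding

generation = tokenizer.decode(output_ids[0, inputs.input_ids.shape[1]:], skip_special_tokens=True)
reference = "Paris"
print(generation, generation.strip().startswith(reference))  # a crude exact-match-style comparison
```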
196
 
197
+ <Note title="Going further" emoji="📚" variant="warning">
198
+ - ⭐ [Blog on several ways to evaluate MMLU](https://huggingface.co/blog/open-llm-leaderboard-mmlu), by my team at Hugging Face. I recommend reading it if you want to delve deeper into the differences between multiple-choice log-likelihood evaluations and generative ones, including what it can mean with respect to score changes. (The above illustrations come from the blog and have been made by Thom Wolf.)
 
199
  - ⭐ [A beautiful mathematical formalization of the above inference methods](https://arxiv.org/abs/2405.14782v2), from EleutherAI. Go to the Appendix directly.
200
+ </Note>
201
+
202
+
203
  ### Constraining model outputs
204
  In a number of cases, we want the model output to follow a specific format, for example to compare it to a reference.
205
+
206
+ #### Using a prompt
207
  The easiest way to do this is to add a task prompt which contains very specific instructions as to how the model should answer (`Provide numerical answers in digits.`, `Use no abbreviation.`, etc.).
208
 
209
+ It won't necessarily work all the time but should be good enough for high capability models. That's the approach we followed in the [GAIA](https://huggingface.co/papers/2311.12983) paper for example.
210
+
211
+ #### Few shots and in context learning
212
  The next way to do so is to constrain the model through what is called "in context learning". By providing examples in the prompt (what is called `few-shot prompting`), the model is implicitly biased towards following the repeated prompt shape for the actual sample.
213
 
214
+ <Note>
215
+ It's a method which was overall working quite well until end of 2023!
216
+
217
+ However, the widespread adoption of instruction-tuning methods and the addition of instruction data in later stages of model pre-training (continuous pre-training) have biased more recent models towards specific output formats (what is being called [here](https://arxiv.org/abs/2407.07890) *Training on the test task*, and what I would call *overfitting the prompt format*). Reasoning models also don't play that well with few-shot examples because of the reasoning trace.
218
+
219
+ It's also a method which can be limited for older models with smaller context sizes, as some few-shot examples can not fit into the context window.
220
+ </Note>
221
+
222
+ #### Structured text generation
223
  Structured text generation constrains the outputs to follow a given path, defined by a grammar or by regular expressions, for example. The `outlines` library implements this using finite state machines, which is very neat. (Other approaches exist, such as using interleaved generation for json generation, but the FSM one is my favorite).
224
 
225
+ To understand more about what happens when using structured generation, you can check the [blog](https://huggingface.co/blog/evaluation-structured-outputs) we wrote together: structured generation reduces prompt variance in evaluation, and makes results and rankings more stable. You can also check the overall `outlines` [blog](https://blog.dottxt.co/) for interesting implementations and observations linked to structured generation.
226
 
227
  However, some recent [research](https://arxiv.org/abs/2408.02442) seems to show that structured generation can lower model performance on some tasks (like reasoning), by moving the prior too far away from the expected probability distribution.
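To build an intuition for what "constraining the output path" means, here is a toy sketch using the `prefix_allowed_tokens_fn` hook of `transformers`' `generate` to only allow digit tokens (libraries like `outlines` implement the general, grammar- or regex-driven version of this idea much more efficiently with finite state machines):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# toy "grammar": the model may only emit tokens that decode to digits
digit_ids = [i for i in range(len(tokenizer)) if tokenizer.decode([i]).strip().isdigit()]

inputs = tokenizer("2 + 2 =", return_tensors="pt")
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=2,
        do_sample=False,
        prefix_allowed_tokens_fn=lambda batch_id, input_ids: digit_ids,
    )
print(tokenizer.decode(output[0, inputs.input_ids.shape[1]:]))
```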
228
 
229
+ <Note title="Going further" emoji="📚" variant="warning">
230
  - ⭐ [Understanding how Finite State Machines work when using structured generation](https://blog.dottxt.co/coalescence.html), by Outlines. Super clear guide on how their method works!
231
  - [The outlines method paper](https://arxiv.org/abs/2307.09702), a more academic explanation of the above
232
  - [Interleaved generation](https://github.com/guidance-ai/guidance?tab=readme-ov-file#guidance-acceleration), another method to constrain generations for some specific output formats
233
+ </Note>
app/src/content/chapters/general-knowledge/tokenization.mdx DELETED
@@ -1,76 +0,0 @@
1
- ---
2
- title: "Tokenization"
3
- ---
4
-
5
- ### Tokenization
6
-
7
- ### Why and how do we tokenize text?
8
- Since large language models are actually big mathematical functions, they eat numbers, not text.
9
-
10
- Say you want to transform a sentence to numbers. You first need to decide how to cut your sentence into small pieces, then map every small piece to a number; this is *tokenization*.
11
-
12
- In the past, people would try to map each character of a text with its index in a alphabet (`a` -> 1, `b` -> 2, etc) which is called *character based tokenization* (you split between characters). On the other end of the spectrum, people also tried to map each word with its index in a dictionary (`a` -> 1, `aardvark` -> 2, `ab` -> 3, etc) which is called *word based tokenization* (you split on spaces, if your language has spaces - if not, it's a bit harder).
13
-
14
- Both these methods share a strong limitation: they remove information from the input text. They erase semantic connections that you can see from word shape (ex: `dis similar`, `similar`, `similar ity`, `similar ly`), information we would like our model to retain, so it connects related words together.
15
- (Plus, what happens if you suddenly have a completely new word in input? It gets no number, and your model can't process it 😔 )
16
-
17
- Some people therefore had the idea to cut words into sub-words, and assign index to these sub-words (`dis`, `similar`, `ity`, `ly`)!
18
-
19
- This was initially done using morpho-syntactic rules ("morpho-syntax" is like the grammar of word creation). Now most people use byte pair encoding (BPE), a smart statistical method to create the sub-words automatically depending on their frequency in a reference text.
20
-
21
- So as a summary: tokenization is a way to map small units of texts (which can be one or several characters, up to the word level) to numbers (similar to an index). When you want to process text, your input text (called a *prompt* at inference) is split into these *tokens* by a tokenizer. The whole range of tokens a model or tokenizer can parse is called its *vocabulary*.
22
- ### Going further: Understanding tokenization
23
- I advise reading one of the first 2 links in depth.
24
- - ⭐ [Explanation of different tokenization methods in the 🤗 NLP Course](https://huggingface.co/learn/nlp-course/en/chapter2/4)
25
- - ⭐ [Conceptual guide about tokenization in the 🤗 doc](https://huggingface.co/docs/transformers/en/tokenizer_summary)
26
- - [Course by Jurafsky on tokenization (and other things)](https://web.stanford.edu/~jurafsky/slp3/2.pdf) - more academical in its approach, skip to 2.5 and 2.6 (the rest is interesting too but too broad)
27
-
28
- ### Going further: Byte Pair Encoding
29
- - ⭐ [Explanation of BPE in the 🤗 NLP Course](https://huggingface.co/learn/nlp-course/en/chapter6/5)
30
- - [Paper introducing BPE to NLP](https://aclanthology.org/P16-1162/)
31
-
32
-
33
- ### Some of the many problems of tokenizations
34
- ### Choosing the correct vocabulary size
35
- The size of the vocabulary indicates how many individual tokens (for example, sub-words) the model will have to learn.
36
-
37
- A vocabulary which is **too big** might contain some very rare words as full tokens (for example: `aardvark`), which can lead to 2 problems.
38
-
39
- If such a rare word almost never appears in the training data, it can be hard to connect to other concepts, and the model might be unable to infer what it is about.
40
-
41
- On the other hand, if it appears rarely and only in specific contexts, it can be linked to some very specific other words: for example, if you train on forum data, and your tokenizer mapped a username as one single token in its vocabulary, your model might then associate this token to the specific user's content.
42
-
43
- A vocabulary which is **too small** will present 2 other problems: worst representation capabilities, and increased cost at inference.
44
-
45
- Let's go back to our above example, where we tokenized words derived from `similar`. Using a pseudo BPE approach (large vocabulary) to tokenize `similarly` has split the word into 2 tokens (`similar`, `ly`). If we had used instead character level tokenization (therefore with a very small vocabulary, the size of an alphabet), the same word would be cut into 9 tokens (`s`, `i`, `m`, `i`, `l`, `a`, `r`, `l`, `y`).
46
-
47
- Where the first method splits `similarly` into tokens which have an individual semantic meaning, it's not the case in the second method: with too small a vocabulary, we lost some semantic representation. The difference in representations length also means that it's many times as costly to generate our word with a smaller vocabulary (takes 9 tokens instead of 2, so 5 times more costly!).
48
-
49
- At the moment, most people seem to use heuristics for vocabulary size, which seems correlated to number of languages covered and model size, so it's likely that using a number of tokens close to the reference models of a similar size could work for you.
50
- ### Going further: Rare tokens effect
51
- - [SolidGoldMagikarp post on Less Wrong](https://www.lesswrong.com/posts/aPeJE8bSo6rAFoLqg/solidgoldmagikarp-plus-prompt-generation)
52
- - Very interesting read on how some people identified very rare tokens in Open AI's vocabulary - this is quite cool because it's done without access to the model's internals (we don't know what the training data contains for example)
53
- - [Fishing for Magikarp, paper by Cohere](https://arxiv.org/abs/2405.05417)
54
- - Follow up work on to detect these tokens
55
-
56
- ### Managing several languages
57
- (Recommended: read an explanation of BPE before this section)
58
- When building or choosing your tokenizer, you construct your vocabulary from reference text. This means that your tokenizer will know vocabulary words and characters from this reference text. Usually, it means using data in English, with a Latin script.
59
-
60
- If you want to add new language, and your new language uses the same script and share some roots, you could theoretically hope that some of your original language semantics transfer to the new language.
61
-
62
- However, if you want to allow your tokenizer to correctly split text in other languages (especially languages written in other scripts), you should include data from these languages when building said tokenizer. Most of the time, though, this data will contain an unbalanced proportion of the initial language (ex: English) to the new language (ex: Thai, or Burmese), the initial language being much more present. Since the most efficient tokenization methods used nowadays (like BPE) create their complex vocabulary tokens based on the most frequent words seen, most of the long tokens will be English words - and most of the words from the less frequent languages will only be split at the character level.
63
-
64
- This effect leads to an unfairness in multilingual tokenization: some (less frequent, or *lower-resourced*) languages require orders of magnitude more tokens than English to generate a sentence of equivalent length.
65
-
66
- ### Going further: Language and tokenization
67
- - ⭐ [A beautiful breakdown and demo by Yennie Jun on tokenization issues across languages](https://www.artfish.ai/p/all-languages-are-not-created-tokenized)
68
- - The breakdown in itself is very clear, and it's worth playing around with the [demo space](https://huggingface.co/spaces/yenniejun/tokenizers-languages)
69
- - ⭐ [A demo by Aleksandar Petrov on unfairness of tokenization](https://aleksandarpetrov.github.io/tokenization-fairness/)
70
- - I recommend looking at `Compare tokenization of sentences` to get a feel for the differences in cost of inference depending on the language
71
-
72
- ### What about numbers?
73
- When building your tokenizer, you need to decide what to do about numbers. Do you only index 0 to 9, and assume all other numbers will be compositions of digits, or do you want to store numbers up to, say, one billion, individually? Current well-known models display a range of approaches to this, but it's unclear what works best to enable mathematical reasoning. Maybe new approaches to tokenization, such as hierarchical tokenization, might be needed for this.
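A quick way to see which approach a given tokenizer takes is simply to tokenize a few numbers yourself; here is a minimal sketch, again using the GPT-2 tokenizer from `transformers` as an arbitrary example:

```python
# Inspect how a BPE tokenizer splits numbers of increasing length.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
for number in ["7", "42", "1234", "1000000000"]:
    print(number, "->", tok.tokenize(number))
```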
74
- ### Going further: Number tokenization
75
- - ⭐ [A nice visual demo by Yennie Jun of how tokenizers of Anthropic, Meta, OpenAI, and Mistral models split numbers](https://www.artfish.ai/p/how-would-you-tokenize-or-break-down)
76
- - [Small history by Beren Millidge of the evolution of number tokenization through the years](https://www.beren.io/2024-05-11-Integer-tokenization-is-now-much-less-insane/)
app/src/content/chapters/human-evaluation/basics.mdx CHANGED
@@ -2,6 +2,9 @@
2
  title: "Human Evaluation: Basics"
3
  ---
4
 
 
 
 
5
  Human evaluation is simply asking humans to evaluate models. In this document, we'll look at post-hoc evaluation: your model has been trained, you have a given task in mind, and humans are providing scores.
6
 
7
  ### Systematic evaluation
@@ -26,6 +29,11 @@ Two other approaches exist to do human-based evaluation, in a more casual way.
26
 
27
  **Vibes-checks** are manual evaluations done by individuals, usually on undisclosed prompts, to get an overall feeling of how well models perform on many use cases (from coding to quality of smut written). Often shared on Twitter and Reddit, results mostly constitute anecdotal evidence, and tend to be highly sensitive to confirmation bias (in other words, people tend to find what they look for). However, they can be [good starting point for your own use cases](https://olshansky.substack.com/p/vibe-checks-are-all-you-need).
28
 
 
 
 
 
 
29
  **Arenas** are crowdsourced human evaluations used to rank models.
30
  A well known example of this is the [LMSYS chatbot arena](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard), where community users are asked to chat with models until they find one is better than the other. Votes are then aggregated in an Elo ranking (a ranking of matches) to select which model is "the best".
31
  ### Pros and cons of human evaluation
@@ -37,10 +45,21 @@ Human evaluation is very interesting for the following reasons:
37
  *Note: However, when doing evaluation with humans, you need to make sure your annotators are diverse enough that your results generalize.*
38
 
39
  However, it also presents a number of limitations:
40
- - **First impressions bias**: Human evaluators tend to estimate the quality of answers [based on first impressions](https://arxiv.org/abs/2309.16349), instead of actual factuality or faithfulness.
41
- - **Tone bias**: Crowdsourced annotators are notably very sensitive to tone, and underestimate the number of factual or logical errors in an assertive answer. In other terms, if a model says wrong things in a confident tone, human evaluators are much less likely to notice it, which could skew ratings towards the more assertive models. (Expert annotators are less likely to fall prey to these biases.)
42
  - **Self-preference bias**: Humans are [most likely to prefer answers which appeal to their views or align with their opinions or errors](https://arxiv.org/abs/2310.13548), rather than answers which are factually correct.
43
  - **Identity bias**: People with different identities tend to have different values, and rate model answers very differently (for example on [toxicity](https://arxiv.org/abs/2205.00501))
 
 
 
 
 
 
 
 
 
 
 
44
  ### Systematic human evaluation
45
  Pros of systematic human evaluations, especially with paid annotators, are
46
  - **Getting high quality data** adapted to your use case, that you will be able to build on later (if you need to develop preference models for example)
@@ -60,5 +79,10 @@ Pros of casual human evaluations are:
60
 
61
  The obvious problems of casual approaches (without annotator selection) are:
62
  - **High subjectivity**: it's hard to enforce a consistent grading from many community members using broad guidelines, especially since annotators preferences tend to be [culturally bound](https://arxiv.org/abs/2404.16019v1). One can hope that these effect is smoothed over by the sheer scale of the votes, through a "wisdom of the crowd" effect (see Galton's wikipedia page).
 
 
 
 
 
63
  - **Unrepresentative preference ranking**: since young western men are over re-represented on tech-sides of the internet, it can lead to very skewed preferences, mismatched to those of the general population, both in terms of topics explored and overall rankings.
64
  - **Easy to game**: if you're using unfiltered crowdsourced annotators, it's quite easy for a 3rd party to game your evaluation, for example to raise the score of a given model (since a number of models have a distinctive writing style)
 
2
  title: "Human Evaluation: Basics"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+
8
  Human evaluation is simply asking humans to evaluate models. In this document, we'll look at post-hoc evaluation: your model has been trained, you have a given task in mind, and humans are providing scores.
9
 
10
  ### Systematic evaluation
 
29
 
30
  **Vibes-checks** are manual evaluations done by individuals, usually on undisclosed prompts, to get an overall feeling of how well models perform on many use cases (from coding to quality of smut written). Often shared on Twitter and Reddit, results mostly constitute anecdotal evidence, and tend to be highly sensitive to confirmation bias (in other words, people tend to find what they look for). However, they can be [good starting point for your own use cases](https://olshansky.substack.com/p/vibe-checks-are-all-you-need).
31
 
32
+ <Sidenote>
33
+
34
+ While vibe-checks are anecdotal and subject to confirmation bias, systematic approaches like [Wolfram Ravenwolf's comparisons](https://huggingface.co/blog/wolfram/llm-comparison-test-llama-3) can provide useful starting points for identifying use cases to evaluate formally.
35
+ </Sidenote>
36
+
37
  **Arenas** are crowdsourced human evaluations used to rank models.
38
  A well known example of this is the [LMSYS chatbot arena](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard), where community users are asked to chat with models until they find one is better than the other. Votes are then aggregated in an Elo ranking (a ranking of matches) to select which model is "the best".
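To make the Elo aggregation concrete, here is a minimal illustrative sketch of how a single pairwise vote updates two models' ratings (the K-factor of 32 is a common but arbitrary choice, and this is not a description of the arena's exact implementation):

```python
# One Elo update for a single arena "match" between model A and model B.
def elo_update(rating_a: float, rating_b: float, a_wins: bool, k: float = 32.0) -> tuple[float, float]:
    expected_a = 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400.0))  # win probability implied by current ratings
    score_a = 1.0 if a_wins else 0.0
    new_a = rating_a + k * (score_a - expected_a)
    new_b = rating_b + k * ((1.0 - score_a) - (1.0 - expected_a))
    return new_a, new_b

# Both models start at 1000; a user prefers model A's answer.
print(elo_update(1000.0, 1000.0, a_wins=True))  # -> (1016.0, 984.0)
```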
39
  ### Pros and cons of human evaluation
 
45
  *Note: However, when doing evaluation with humans, you need to make sure your annotators are diverse enough that your results generalize.*
46
 
47
  However, it also presents a number of limitations:
48
+ - **First impressions bias**: Human evaluators tend to estimate the quality of answers [based on first impressions](https://arxiv.org/abs/2309.16349), instead of actual factuality or faithfulness.
49
+ - **Tone bias**: Crowdsourced annotators are notably very sensitive to tone, and underestimate the number of factual or logical errors in an assertive answer. In other terms, if a model says wrong things in a confident tone, human evaluators are much less likely to notice it, which could skew ratings towards the more assertive models. (Expert annotators are less likely to fall prey to these biases.)
50
  - **Self-preference bias**: Humans are [most likely to prefer answers which appeal to their views or align with their opinions or errors](https://arxiv.org/abs/2310.13548), rather than answers which are factually correct.
51
  - **Identity bias**: People with different identities tend to have different values, and rate model answers very differently (for example on [toxicity](https://arxiv.org/abs/2205.00501))
52
+
53
+ <Note title="Critical human evaluation biases" emoji="⚠️" variant="warning">
54
+
55
+ Human evaluators have significant biases:
56
+ - **First impressions**: Judge on presentation over factuality
57
+ - **Tone**: Confident incorrect answers score higher than hesitant correct ones
58
+ - **Self-preference**: Prefer answers aligning with their views over factually correct ones
59
+ - **Identity**: Different demographics rate identical content very differently
60
+
61
+ Expert annotators are less susceptible, but these biases affect crowdsourced evaluation significantly.
62
+ </Note>
63
  ### Systematic human evaluation
64
  Pros of systematic human evaluations, especially with paid annotators, are
65
  - **Getting high quality data** adapted to your use case, which you will be able to build on later (if you need to develop preference models, for example)
 
79
 
80
  The obvious problems of casual approaches (without annotator selection) are:
81
  - **High subjectivity**: it's hard to enforce a consistent grading from many community members using broad guidelines, especially since annotators' preferences tend to be [culturally bound](https://arxiv.org/abs/2404.16019v1). One can hope that this effect is smoothed over by the sheer scale of the votes, through a "wisdom of the crowd" effect (see Galton's Wikipedia page).
82
+
83
+ <Sidenote>
84
+
85
+ The "wisdom of the crowd" effect (discovered by statistician Galton) suggests individual biases cancel out at scale. However, this requires truly diverse crowds—tech forums skew heavily toward young western men, potentially undermining this effect.
86
+ </Sidenote>
87
  - **Unrepresentative preference ranking**: since young western men are over-represented on the tech side of the internet, it can lead to very skewed preferences, mismatched to those of the general population, both in terms of topics explored and overall rankings.
88
  - **Easy to game**: if you're using unfiltered crowdsourced annotators, it's quite easy for a 3rd party to game your evaluation, for example to raise the score of a given model (since a number of models have a distinctive writing style)
app/src/content/chapters/human-evaluation/tips-and-tricks.mdx CHANGED
@@ -2,6 +2,9 @@
2
  title: "Human Evaluation: Tips and tricks"
3
  ---
4
 
 
 
 
5
  ### Tips and tricks
6
  Here are a few practical tips you might want to consider when using human annotators to build an evaluation dataset. If you haven't done so yet, we recommend reading the page on "Using human annotators" first, then coming back to this page.
7
 
@@ -19,6 +22,11 @@ Here are a few practical tips you might want consider when using human annotator
19
 
20
  - **Annotators should work independently**: It's better if annotators don't help each other or see each other's work during the task, as they can propagate their own biases and cause annotation drift. Alignment should always happen through comprehensive guidelines. You may want to train any new team members first on a separate dataset and/or use inter-annotator agreement metrics to make sure the team is aligned.
21
 
 
 
 
 
 
22
  - **Consistency is key**: If you make important changes to your guidelines (e.g., changed a definition or instruction, or have added/removed labels), consider if you need to iterate over the annotated data. At least, you should track the changes in your dataset through a metadata value like `guidelines-v1`.
23
 
24
  ### Hybrid human-machine annotation
@@ -27,6 +35,11 @@ Sometimes teams face contraints on time and resources but don't want to sacrific
27
 
28
  - **Model-aided annotation**: You may use the predictions or generations of a model as pre-annotations, so that the annotation team doesn't need to start from scratch. Just note that this could introduce the model's biases into human annotations, and that if the model's accuracy is poor it may increase work for annotators.
29
 
 
 
 
 
 
30
  - **Supervise model as a judge**: You can combine the power of the model as a judge methodology (see the section on "Model as a judge") and human supervisors who validate or discard the results. Note that the biases discussed in the "Pros and cons of human evaluation" will apply here.
31
 
32
  - **Identify edge cases**: For an even faster task, use a jury of models and then have your human supervisor(s) step in where models disagree or there's a tie to break. Again, be aware of the biases discussed in the "Pros and cons of human evaluation".
 
2
  title: "Human Evaluation: Tips and tricks"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+
8
  ### Tips and tricks
9
  Here are a few practical tips you might want to consider when using human annotators to build an evaluation dataset. If you haven't done so yet, we recommend reading the page on "Using human annotators" first, then coming back to this page.
10
 
 
22
 
23
  - **Annotators should work independently**: It's better if annotators don't help each other or see each other's work during the task, as they can propagate their own biases and cause annotation drift. Alignment should always happen through comprehensive guidelines. You may want to train any new team members first on a separate dataset and/or use inter-annotator agreement metrics to make sure the team is aligned.
24
 
25
+ <Note title="Prevent annotation drift" emoji="🎯" variant="info">
26
+
27
+ Annotators must work independently. Collaboration can propagate individual biases and cause "annotation drift" where the team gradually diverges from guidelines. Alignment should happen only through comprehensive written guidelines.
28
+ </Note>
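To make the inter-annotator agreement metrics mentioned above concrete, here is a minimal sketch of Cohen's kappa computed for two annotators on a shared calibration set (the labels below are made up for illustration):

```python
# Cohen's kappa: agreement between two annotators, corrected for chance.
# Values near 1 mean strong agreement; values near 0 mean agreement at chance level.
from collections import Counter

def cohens_kappa(labels_a: list[str], labels_b: list[str]) -> float:
    assert len(labels_a) == len(labels_b)
    n = len(labels_a)
    observed = sum(a == b for a, b in zip(labels_a, labels_b)) / n
    counts_a, counts_b = Counter(labels_a), Counter(labels_b)
    expected = sum((counts_a[c] / n) * (counts_b[c] / n) for c in set(labels_a) | set(labels_b))
    return (observed - expected) / (1 - expected)

# Hypothetical labels from two annotators on 8 shared samples.
annotator_1 = ["ok", "ok", "toxic", "ok", "toxic", "ok", "ok", "toxic"]
annotator_2 = ["ok", "ok", "toxic", "ok", "ok", "ok", "ok", "toxic"]
print(f"kappa = {cohens_kappa(annotator_1, annotator_2):.2f}")  # ~0.71 for these labels
```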
29
+
30
  - **Consistency is key**: If you make important changes to your guidelines (e.g., changed a definition or instruction, or have added/removed labels), consider if you need to iterate over the annotated data. At least, you should track the changes in your dataset through a metadata value like `guidelines-v1`.
31
 
32
  ### Hybrid human-machine annotation
 
35
 
36
  - **Model-aided annotation**: You may use the predictions or generations of a model as pre-annotations, so that the annotation team doesn't need to start from scratch. Just note that this could introduce the model's biases into human annotations, and that if the model's accuracy is poor it may increase work for annotators.
37
 
38
+ <Sidenote>
39
+
40
+ Model-aided annotation (using predictions as pre-annotations) can speed up work but introduces model biases into human annotations. If model accuracy is poor, fixing errors may take longer than annotating from scratch.
41
+ </Sidenote>
42
+
43
  - **Supervise model as a judge**: You can combine the power of the model as a judge methodology (see the section on "Model as a judge") and human supervisors who validate or discard the results. Note that the biases discussed in the "Pros and cons of human evaluation" will apply here.
44
 
45
  - **Identify edge cases**: For an even faster task, use a jury of models and then have your human supervisor(s) step in where models disagree or there's a tie to break. Again, be aware of the biases discussed in the "Pros and cons of human evaluation".
app/src/content/chapters/human-evaluation/using-human-annotators.mdx CHANGED
@@ -4,11 +4,18 @@ title: "Using human annotators"
4
 
5
  import bestAnnotationPractices from '../../assets/image/best_annotation_practices.png';
6
  import Image from '../../../components/Image.astro';
 
 
7
 
8
  ### Using human annotators
9
 
10
  I suggest reading Section 3 of this [review](https://aclanthology.org/2024.cl-3.1/) of good practices in data annotation quality. If you want production level quality and have the means to implement all of these methods, go ahead!
11
 
 
 
 
 
 
12
  <Image src={bestAnnotationPractices} alt="Best annotation practices diagram" />
13
 
14
  However, important guidelines (no matter your project size) are the following, once you have defined your task and scoring guidelines.
@@ -18,13 +25,23 @@ You likely want the people working on your task to:
18
  1) fit certain demographics.
19
  Some examples: be native speakers of the target language, have a higher education level, be experts in a specific domain, be diverse in their geographical origins, etc.
20
  Your needs will vary depending on your task.
21
- 1) produce high quality work.
22
  It's notably important now to add a way to check if answers are LLM-generated, and you'll need to filter some annotators out of your pool.
23
  *Imo, unless you're counting on highly motivated crowdsourced annotators, it's always better to pay your annotators correctly.*
24
 
25
- - **Guideline design**
 
 
 
 
 
26
  Make sure to spend a lot of time really brainstorming your guidelines! That's one of the points on which we spent the most time for the [GAIA](https://huggingface.co/gaia-benchmark) dataset.
27
 
 
 
 
 
 
28
  - **Iterative annotation**
29
  Be ready to try several rounds of annotations, as your annotators will misunderstand your guidelines (they are more ambiguous than you think)! Generating samples several times will allow your annotators to really converge on what you need.
30
 
 
4
 
5
  import bestAnnotationPractices from '../../assets/image/best_annotation_practices.png';
6
  import Image from '../../../components/Image.astro';
7
+ import Note from "../../../components/Note.astro";
8
+ import Sidenote from "../../../components/Sidenote.astro";
9
 
10
  ### Using human annotators
11
 
12
  I suggest reading Section 3 of this [review](https://aclanthology.org/2024.cl-3.1/) of good practices in data annotation quality. If you want production level quality and have the means to implement all of these methods, go ahead!
13
 
14
+ <Note title="Comprehensive annotation guide" emoji="📚" variant="info">
15
+
16
+ For production-level quality, read Section 3 of this [comprehensive review](https://aclanthology.org/2024.cl-3.1/) of data annotation best practices. The diagram below summarizes key principles.
17
+ </Note>
18
+
19
  <Image src={bestAnnotationPractices} alt="Best annotation practices diagram" />
20
 
21
  However, important guidelines (no matter your project size) are the following, once you have defined your task and scoring guidelines.
 
25
  1) fit certain demographics.
26
  Some examples: be native speakers of the target language, have a higher education level, be experts in a specific domain, be diverse in their geographical origins, etc.
27
  Your needs will vary depending on your task.
28
+ 1) produce high quality work.
29
  It's notably important now to add a way to check if answers are LLM-generated, and you'll need to filter some annotators out of your pool.
30
  *Imo, unless you're counting on highly motivated crowdsourced annotators, it's always better to pay your annotators correctly.*
31
 
32
+ <Sidenote>
33
+
34
+ Unless you have highly motivated crowdsourced annotators, always pay fairly. Underpaid annotators produce lower quality work, introduce more errors, and may use LLMs to complete tasks quickly.
35
+ </Sidenote>
36
+
37
+ - **Guideline design**
38
  Make sure to spend a lot of time really brainstorming your guidelines! That's one of the points on which we spent the most time for the [GAIA](https://huggingface.co/gaia-benchmark) dataset.
39
 
40
+ <Sidenote>
41
+
42
+ When creating the [GAIA benchmark](https://huggingface.co/gaia-benchmark), guideline design consumed more time than any other phase. Clear, unambiguous guidelines are worth the investment—they prevent costly re-annotation rounds.
43
+ </Sidenote>
44
+
45
  - **Iterative annotation**
46
  Be ready to try several rounds of annotations, as your annotators will misunderstand your guidelines (they are more ambiguous than you think)! Generating samples several times will allow your annotators to really converge on what you need.
47
 
app/src/content/chapters/intro.mdx CHANGED
@@ -3,112 +3,126 @@ title: "Intro"
3
  ---
4
 
5
  import HtmlEmbed from "../../components/HtmlEmbed.astro";
 
 
6
 
 
7
 
8
- ### How do we do LLM evaluation?
9
 
10
- First, let's align on a couple definitions. There are, to my knowledge, at the moment, 3 main ways to do evaluation: automated benchmarking, using humans as judges, and using models as judges. Each approach has its own reason for existing, uses, and limitations.
11
 
12
- #### Benchmarks
13
 
14
- Automated benchmarking usually works the following way: you'd like to know how well your model performs on something. This something can be a well-defined concrete **task**, such as *How well can my model classify spam from non spam emails?*, or a more abstract and general **capability**, such as *How good is my model at math?*.
15
 
16
- From this, you construct an evaluation, usually made of two things:
17
- - a collection of *samples*, given as input to the model to see what comes out as output, sometimes coupled with a reference (called gold) to compare with. Samples are usually designed to try to emulate what you want to test the model on: for example, if you are looking at email classification, you create a dataset of spam and non spam emails, try to include some hard edge cases, etc. For LLMs, the two main tasks are generation evaluation (comparing generated text with a reference after normalization), or multi-choice (compare the relative log-probabilities of possible continuations after a prompt).
18
- - a *metric*, which is a way to compute a score for the model. For example, how accurately can your model classify spam (score of well classified sample = 1, badly classified = 0).
19
 
20
- This is more interesting to do on data that was not included in the model training set, because you want to test if it **generalizes** well. You don't want a model which can only classify emails it has already "seen", that would not be very useful!
21
 
22
- Note: A model which can only predict well on its training data (and has not latently learnt more high-level general patterns) is said to be **overfitting**. In less extreme cases, you still want to test if your model is able to generalize to data patterns which were not in the training set's distribution (for example, classify spam emails about 'health' products after having seen only spam emails about fake banks).
23
 
24
- This works quite well for very well-defined tasks, where performance is "easy" to assess and measure: when you are literally testing your model on spam classification, you can say "the model classified correctly n% of these samples". For LLMs benchmarks, some issues can arise, such as models [favoring specific choices based on the order in which they have been presented for multi-choice evaluations](https://arxiv.org/abs/2309.03882), and generative evaluations relying on normalisations which can easily [be unfair if not designed well](https://huggingface.co/blog/open-llm-leaderboard-drop), but overall they still provide signal at the task level.
25
 
26
- For capabilities however, it's hard to decompose them into well-defined and precise tasks: what does "good at math" mean? good at arithmetic? at logic? able to reason on mathematical concepts?
27
 
28
- In this case, people tend to do more "holistic" evaluations, by not decomposing the capability in actual tasks, but assuming that performance on general samples will be a **good proxy** for what we aim to measure. For example, GSM8K is made of actual high school math problems, which require a whole set of capabilities to solve. It also means that both failure and success are very hard to interpret. Some capabilities or topics, such as "is this model good at writing poetry?" or "are the model outputs helpful?" are even harder to evaluate with automatic metrics - and at the same time, models now seem to have more and more **generalist** capabilities, so we need to evaluate their abilities in a broader manner. (For example, there was a debate in the scientific community as to whether LLMs [can draw](https://arxiv.org/abs/2303.12712) unicorns [or not](https://twitter.com/DimitrisPapail/status/1719119242186871275). Most likely not at this point, but clearly an important point to investigate.)
29
 
30
- Automatic benchmarks also tend to have another problem: once they are published publicly in plain text, they are very likely to end up (often accidentally) in the training datasets of models. Some benchmarks creators, like the authors of BigBench, have tried to mitigate this by adding a "canary string" (a very specific combination of characters) for people to look for, and remove from training sets, but not everybody is aware of the mechanism nor trying to do this removal. There is also a non negligible quantity of benchmarks, so looking for accidental copies of absolutely all of them in data is costly. Other options include providing benchmarks in an [encrypted form](https://arxiv.org/pdf/2309.16575), or behind a [gating system](https://huggingface.co/datasets/Idavidrein/gpqa). However, when evaluating closed models behind black box APIs, there is no guarantee that the provided data won’t be later used internally for training or fine-tuning.
31
 
32
- The case were an evaluation dataset ends up in the training set is called **contamination**, and a model which was contaminated will have a high benchmark performance that does not generalize well to the underlying task (an extensive description of contamination can be found [here](https://aclanthology.org/2023.findings-emnlp.722/), and here is a fun way to [detect it](https://arxiv.org/abs/2311.06233)). A way to address contamination is to run [**dynamic benchmarks**](https://arxiv.org/abs/2104.14337) (evaluations on datasets which are regularly refreshed to provide scores on systematically unseen new data), but this approach is costly in the long term.
33
 
34
- #### Human as a judge
35
 
36
- A solution to both contamination and more open-ended evaluation is asking humans to evaluate model outputs.
 
 
37
 
38
- This is usually done by tasking humans with first, prompting models, then, grading a model answer or ranking several outputs according to guidelines. Using humans as judges allows to study more complex tasks, with more flexibility than automated metrics. It also prevents most contamination cases, since the written prompts are (hopefully) new. Lastly, it correlates well with human preference, since this is literally what is evaluated!
39
 
40
- Different approaches exist to evaluate models with humans in the loop.
41
 
42
- **Vibes-checks** is the name given to manual evaluations done individually by some members of the community, usually on undisclosed prompts, to get an overall "feeling" of how well models perform on many use cases, which range from coding to quality of smut written. (I've also seen the term "canary-testing" used for this, in reference to high signal canary in a coalmine approach). Often shared on Twitter and Reddit, they mostly constitute anecdotal evidence, and tend to be highly sensitive to confirmation bias (in other words, people tend to find what they look for). However, some people have been trying to do more methodical vibe-checks evaluations; for example, the user *Wolfram Ravenwolf* shares his model comparisons findings in a very systematic way through blogs (see [here](https://huggingface.co/blog/wolfram/llm-comparison-test-llama-3) for an example).
43
 
44
- Using community feedback to establish massive model rankings is what we call an **arena**. A well known example of this is the [LMSYS chatbot arena](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard), where community users are asked to chat with models until they find one is better than the other. Votes are then aggregated in an Elo ranking (a ranking of matches) to select which model is "the best". The obvious problem of such an approach is the high subjectivity - it's hard to enforce a consistent grading from many community members using broad guidelines, especially since annotators preferences tend to be [culturally bound](https://arxiv.org/abs/2404.16019v1) (with different people favoring different discussion topics, for example). One can hope that this effect is smoothed over by the sheer scale of the votes, through a "wisdom of the crowd" effect (this effect was found by a statistician named Galton, who observed that individual answers trying to estimate a numerical value, like the weight of a hog, could be modeled as a probability distribution centered around the actual answer).
 
 
 
45
 
46
- The last approach is **systematic annotations**, where you provide extremely specific guidelines to paid selected annotators, in order to remove as much as the subjectivity bias as possible (this is the approach used by [ScaleAI](https://scale.com/guides/data-labeling-annotation-guide#hight-quality-data-annotations), and most data annotation companies). However, it can get extremely expensive fast, as you have to keep on doing evaluations in a continuous and non automatic manner for every new model you want to evaluate, and it can still fall prey to human bias (this [study](https://arxiv.org/abs/2205.00501) showed that people with different identities tend to rate model answer toxicity very differently).
47
 
48
- Recent [work](https://arxiv.org/pdf/2309.16349) has also shown that human evaluators tend to estimate the quality of answers based on first impressions, instead of actual factuality or faithfulness. Crowdsourced annotators are notably very sensitive to tone, and underestimate the number of factual or logical errors in an assertive answer. In other terms, if a model says wrong things in a confident tone, human evaluators are much less likely to notice it, which could skew ratings towards the more assertive models. (Expert annotators are less likely to fall prey to these biases.) This kind of human bias was confirmed in another [paper](https://arxiv.org/pdf/2310.13548) : humans are most likely to prefer answers which appeal to their views or align with their opinions or errors, rather than answers which are factually correct.
49
 
50
- These biases are not unexpected, but they must be taken into account: not all use cases should rely on using human annotators, especially crowdsourced, unexpert ones - any task requiring factuality (such as code writing, evaluation of model knowledge, etc) should include another, more robust, type of evaluation to complete the benchmark.
 
 
51
 
52
- #### Model as a judge
53
 
54
- To mitigate the cost of human annotators, some people have looked into using models or derived artifacts (preferably aligned with human preferences) to evaluate models' outputs. This approach is not new, as you can find techniques to measure summarization quality from [model embeddings](https://arxiv.org/abs/1904.09675) in 2019.
55
 
56
- Two approach exist for grading: using [generalist, high capability models](https://arxiv.org/abs/2306.05685v4) or using [small specialist models](https://arxiv.org/pdf/2405.01535) trained specifically to discriminate from preference data. The former approach gives results well correlated with human preference, but most strong enough models tend to be closed source, therefore subject to change behind APIs, and uninterpretable.
57
 
58
- LLM as judges have several strong limitations: they tend to [favor their own outputs](https://arxiv.org/abs/2404.13076) when scoring answers, are [bad at providing consistent score ranges](https://twitter.com/aparnadhinak/status/1748368364395721128) (though you can improve this with asking the model to explain its reasoning [before providing a score](https://twitter.com/seungonekim/status/1749289437165769177)), and are actually not that consistent [with human rankings](https://arxiv.org/pdf/2308.15812).
59
 
60
- My main personal gripe with using models as judges is that they introduce very subtle and un-interpretable bias in the answer selection. I feel that, much like when crossbreeding too much in genetics studies, you end up with dysfunctional animals or plants, by using LLMs to select and train LLMs, we are just as likely to introduce minute changes that will have bigger repercussions a couple generations down the line. I believe this type of bias is less likely to occur in smaller and more specialized models as judges (such as toxicity classifiers), but this remains to be rigorously tested and proven.
61
 
62
- ### Why do we do LLM evaluation?
 
 
63
 
64
- Now that we’ve seen how we do evaluation, what is it actually useful for?
65
 
66
- I strongly believe that there are 3 main reasons for which people do evaluation, which tend to be conflated together, but are actually **very different**, and each answer a separate question.
67
 
68
- <HtmlEmbed src="d3-intro-boxes.html" title="Evaluation purposes" />
69
 
 
70
 
71
- #### 1) Is my model training well? Is my training method sound? - Non-regression testing
 
 
72
 
73
- **Non-regression testing** is a concept which comes from the software industry, to make sure small changes have not broken the overall approach.
74
 
75
- The idea is the following: when you add a new feature to your software, or fix a problem in the code base, have you broken something else? That's what non-regression tests are for: making sure the expected, high-level behavior of your software is not suddenly broken by a (seemingly unrelated) change.
 
 
76
 
77
- When you select a setup to train models, you want to test something very similar, and make sure that your changes (choosing different training data, architecture, parameters, etc) have not "broken" the expected performance for a model of these properties.
78
 
79
- To give a concrete example, you would expect a 7B base LLM to get between 50 and 65 on (multiple choice) MMLU after training, and on the other hand, know that performance fluctuating between 20 and 30 indicates that no learning occurred.
80
 
81
- For "non-regression" evaluation, you need to look at 1) evaluation scores **trajectories** (is the performance better now that when starting training), 2) evaluation scores **ranges** (is the performance within what's expected). You actually... don't care about the precise score themselves!
82
 
83
- This evaluation is therefore not here to tell you anything about actual model capabilities, but instead just here to confirm that your training approach is "as sound" as the other training approach, and that your model behaves in similar ways. I believe that even some evaluations simply looking at changes in the perplexity (probabilities) of text could be sufficient for this step, but you usually want benchmarks which have a high "signal to noise" ratio, or in other words, you want to make sure that a big change in the score reflects a big shift in your model.
84
 
85
- #### 2) Which model is the best? Is my model better than your model? - Leaderboards and rankings
 
 
86
 
87
- The next role of evaluation is simply to sort models to find and select the best architectures and approaches overall. If you have a leaderboard, take the best model, and it's not working on your use case, it's unlikely the next best model will work. In [their paper](https://arxiv.org/pdf/2404.02112) about lessons learned on benchmarking and dataset design from the ImageNet era, the authors argue that, since scores are susceptible to instability, the only robust way to evaluate models is through rankings, and more specifically by finding broad groups of evaluations which provide consistent and stable rankings.
88
 
89
- I believe looking for ranking stability is indeed an extremely interesting approach to model benchmarking, as we have shown that LLMs *scores* on automated benchmarks are extremely susceptible to [minute changes in prompting](https://huggingface.co/blog/evaluation-structured-outputs), and that human evaluations are not more consistent - where *rankings* are actually more stable when using robust evaluation methods.
90
 
91
- If scores, by themselves, are not that relevant, could using the relative ordering of models tell us something of value instead?
92
 
93
- In the related ICLR 2024 plenary of evaluation, Moritz Hardt compared adding perturbations to the Open LLM Leaderboard (through minuscule score modification, well within score ranges) and on the Chatbot Arena (through adding a bad contender to the arena to see how it affected the Elo rankings). Neither these benchmarks provide stable and consistent rankings at the moment. We'll be sure to explore this aspect with future versions of the Open LLM Leaderboard!
 
 
94
 
95
- #### 3) Where are we, as a field, in terms of model capabilities? Can my model do X?
 
 
96
 
97
- "How do you know if models can do X?" is a question which comes up a lot, and I think it is a very valid one.
98
 
99
- However, for any complex capability, **we cannot at the moment just say "this model is the best at this", but instead "this model is the best on this task that we hope is a good proxy for this capability, without any guarantee"**.
100
 
101
- We are strongly missing any kind of good definitions and framework on what a capability is for a machine learning model, especially for those surrounding reasoning and mind theory. However, this is not specific to machine learning! In human and animal studies, it is also quite hard to define what constitutes a "capability", and metrics which try to provide precise scores (IQ and EQ for example) are hotly debated and controversial, with reason.
102
 
103
- We might want to look at social sciences to think about evaluation of capabilities, as in these fields, people are used to thinking seriously about confounding factors in data gathering and analysis. However, I also believe it likely that 1) we cannot define these broad capabilities at all, since we cannot define them in humans and animals at the moment, 2) frameworks made with the human (or animal) in mind will not transfer well to models, as the underlying behaviors and assumptions are not the same.
104
 
105
- ### Conclusion
106
 
107
- LLM evaluation is nowadays done in the following manner:
108
- Using automatic benchmarks, affected by contamination and lack of “generalness” (the latter not necessarily being a bad thing, as specialized evaluations are interesting)
109
- Using human evaluations, which tends to suffer from lack of reproducibility at a small scale, and psychological biases overall (such as preference for sycophantic answers), though one can hope some of the biases get smoothed over at a high scale
110
- Using models as judges, which has very subtle biases when evaluating, likely to be unnoticed but introduce perturbations downstream.
111
-
112
- However, all is not lost: evaluation, within its limits, is still able to provide some signal on which new training methods or datasets sound promising or not, both from looking at how performance falls within expected ranges (non-regression testing), and at how models are ranked overall (with stable enough evaluations). We can also hope that combining enough data points across topics and tasks will provide us with enough signal to get an idea of overall model performance, without however assuming anything about more “general” capabilities.
113
 
114
- Contrary to hype, we cannot really evaluate “general model capabilities” at the moment, first and foremost because we have not defined what that means. However, LLM evaluation, as a research field, is very much in its infancy at the moment, and there is a lot to be done, which is very exciting! Inspiration can be grabbed from many fields, from machine learning [interpretability](https://transformer-circuits.pub/2024/scaling-monosemanticity/index.html) to sociology, in order to define new metrics and tasks. Interdisciplinary work will likely open very new cool directions for the field!
 
3
  ---
4
 
5
  import HtmlEmbed from "../../components/HtmlEmbed.astro";
6
+ import Note from "../../components/Note.astro";
7
+ import Sidenote from "../../components/Sidenote.astro";
8
 
9
+ ## Intro
10
 
11
+ ### Why should you even care about evaluation?
12
 
13
+ Evaluation, in short, is how you know a model is "good at" something (though we'll see the reality is more complex than this).
14
 
15
+ As you navigate the world of LLMs—whether you're training or fine-tuning your own models, selecting one for your application, or trying to understand the state of the field—you'll inevitably encounter evaluation. It's everywhere: leaderboards ranking models, benchmarks claiming to measure "reasoning" or "knowledge," papers announcing new state-of-the-art results.
16
 
17
+ But what does it all actually mean? And more importantly, what can evaluation really tell you?
18
 
19
+ This guide is here to help you understand evaluation in practice: what it can and cannot do, when to trust different approaches (what their limitations and biases are too!), and how to think critically about the claims made from evaluation results.
 
 
20
 
21
+ Before we dive into the details, let's quickly look at why people do evaluation, concretely, and how.
22
 
23
+ ### Why do we do LLM evaluation?
24
 
25
+ There are 3 main reasons for which people do evaluation, which tend to be conflated together, but are actually **very different**, and each answers a separate question.
26
 
27
+ <HtmlEmbed src="d3-intro-boxes.html" title="Evaluation purposes" />
28
 
29
+ #### Is this model training correctly?
30
 
31
+ **Non-regression testing** is a concept which comes from the software industry, to make sure small changes have not broken the overall approach. The idea is the following: when you add a new feature to your software, or fix a problem in the code base, have you broken something else? That's what non-regression tests are for: making sure the expected, high-level behavior of your software is not suddenly broken by a (seemingly unrelated) change.
32
 
33
+ When you select a setup to train models, you want to test something very similar, and make sure that your changes (choosing different training data, architecture, parameters, etc) have not "broken" the expected performance for a model of these properties.
34
 
35
+ In ML, these experiments are often referred to as ablations, and at their core they rely on having a good set of evaluations (looking at the loss will only get you so far!)
36
 
37
+ For these evaluations, you need to select evaluations which
38
+ - give you a strong enough signal (see section TODO on how to select your evals)
39
+ - while being relatively cheap to run as you'll be running them **a lot**.
40
 
41
+ You'll also need to look at both **trajectories** (is the performance better now than when training started) and score **ranges** (is the performance within what's expected). You actually... don't really care about the precise scores themselves! This evaluation is therefore not here to tell you anything about actual model capabilities, but instead just here to confirm that your training approach is "as sound" as the other training approaches, and that your model behaves in similar ways.
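As an illustration of the trajectory-and-range check described above, here is a minimal sketch with hypothetical checkpoint scores and an expected band (roughly the kind of range you might expect for a mid-sized base model on a multi-choice benchmark):

```python
# Non-regression check over training checkpoints: we only care that the score
# trends upward and lands in the expected band, not about its exact value.
expected_range = (0.50, 0.65)       # illustrative band for this model size on this benchmark
checkpoint_scores = {               # benchmark score per training step (made-up numbers)
    10_000: 0.31,
    50_000: 0.47,
    100_000: 0.55,
    150_000: 0.58,
}

steps = sorted(checkpoint_scores)
trending_up = all(checkpoint_scores[a] <= checkpoint_scores[b] for a, b in zip(steps, steps[1:]))
final_in_range = expected_range[0] <= checkpoint_scores[steps[-1]] <= expected_range[1]

print(f"score trajectory is non-decreasing: {trending_up}")
print(f"final score within expected range {expected_range}: {final_in_range}")
```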
42
 
43
+ #### Which model is the best on X?
44
 
45
+ The next role of evaluation is simply to sort models to find and select the best architectures and approaches for use case X.
46
 
47
+ If there is a leaderboard for your domain and task and you take its best model but it's not working for you, it's unlikely the next best model will work.
48
+ <Sidenote>
49
+ In [their paper](https://arxiv.org/pdf/2404.02112) about lessons learned on benchmarking and dataset design from the ImageNet era, the authors argue that, since scores are susceptible to instability, the only robust way to evaluate models is through rankings, and more specifically by finding broad groups of evaluations which provide consistent and stable rankings. I believe looking for ranking stability is indeed an extremely interesting approach to model benchmarking, as we have shown that LLM *scores* on automated benchmarks are extremely susceptible to [minute changes in prompting](https://huggingface.co/blog/evaluation-structured-outputs), and that human evaluations are not more consistent, whereas *rankings* are actually more stable when using robust evaluation methods.
50
+ </Sidenote>
51
 
 
52
 
53
+ If you don't... that's where you need to think about designing your own evaluations, which we will cover below in section TODO.
54
 
55
+ <Note>
56
+ "How do you know for sure if models can do X?" is a question which comes up a lot, and it is a very valid one. However, for any complex capability, **we cannot at the moment just say "this model is the best at this", but instead "this model is the best on this task that we hope is a good proxy for this capability, without any guarantee"**.
57
+ </Note>
58
 
 
59
 
60
+ #### When will we finally reach AGI?
61
 
62
+ We are sorely lacking good definitions of, and frameworks for, what intelligence is for machine learning models (though some people have tried, for example [Chollet](https://arxiv.org/abs/1911.01547) in 2019 and [Hendrycks et al](https://www.agidefinition.ai/paper.pdf) this year). However, this problem is not specific to machine learning! In human and animal studies, it is also quite hard to define what constitutes intelligence, and metrics which try to provide precise scores (IQ and EQ for example) are hotly debated and controversial, with reason.
63
 
64
+ To address this, we should look at the social sciences, where people are used to thinking seriously about confounding factors in data gathering and results analysis, something I'm not seeing much of in "intelligence evaluation" in ML for now.
65
 
66
+ However, I also don't think we'll be able to define these broad capabilities at all (we'll just end up with moving targets) since we cannot define them in humans and animals at the moment, and frameworks made with the human (or animal) in mind will most likely not transfer well to models, as the underlying behaviors and assumptions are not the same.
67
 
68
+ <Sidenote>
69
+ I also believe that this question is a bad one, as targeting "general intelligence" is much blurrier, riskier, and less useful than targeting good tools with specific capabilities for actual problems that humans encounter at their jobs.
70
+ </Sidenote>
71
 
72
+ ### So how do people evaluate models, then?
73
 
74
+ To my knowledge, at the moment, people use 3 main ways to do evaluation: automated benchmarking, using humans as judges, and using models as judges. Each approach has its own reason for existing, uses, and limitations.
75
 
76
+ #### Automated benchmarks
77
 
78
+ Automated benchmarking usually works the following way: you'd like to know how well your model performs on something. This something can be a well-defined concrete **task**, such as *How well can my model classify spam from non spam emails?*, or a more abstract and general **capability**, such as *How good is my model at math?*.
79
 
80
+ From this, you construct an evaluation, usually made of two things:
81
+ - a collection of *samples*, given as input to the model to see what comes out as output, sometimes coupled with a reference (called gold) to compare with. Samples are usually designed to try to emulate what you want to test the model on: for example, if you are looking at email classification, you create a dataset of spam and non spam emails, try to include some hard edge cases, etc. For LLMs, the two main tasks are generation evaluation (comparing generated text with a reference after normalization), or multi-choice (compare the relative log-probabilities of possible continuations after a prompt).
82
+ - a *metric*, which is a way to compute a score for the model. For example, how accurately can your model classify spam (score of well classified sample = 1, badly classified = 0).
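To make the multi-choice setup from the first bullet concrete, here is a minimal sketch that scores each candidate answer by the log-probability a model assigns to it after the prompt (it assumes the `transformers` and `torch` libraries, and uses GPT-2 purely as a small example, not as a model you would actually benchmark this way):

```python
# Score each candidate continuation by the sum of log-probabilities of its tokens
# given the prompt, then pick the highest-scoring choice.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # arbitrary small model for illustration
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

def continuation_logprob(prompt: str, continuation: str) -> float:
    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
    full_ids = tokenizer(prompt + continuation, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = model(full_ids).logits            # (1, seq_len, vocab_size)
    log_probs = torch.log_softmax(logits, dim=-1)
    # The token at position i is predicted by the logits at position i - 1.
    total = 0.0
    for pos in range(prompt_ids.shape[1], full_ids.shape[1]):
        total += log_probs[0, pos - 1, full_ids[0, pos]].item()
    return total

prompt = "Question: What is the capital of France?\nAnswer:"
choices = [" Paris", " London", " Berlin"]
scores = {choice: continuation_logprob(prompt, choice) for choice in choices}
print(max(scores, key=scores.get), scores)
```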
83
 
84
+ This is more interesting to do on data that was not included in the model training set, because you want to test if it **generalizes** well. You don't want a model which can only classify emails it has already "seen", that would not be very useful!
85
 
86
+ <Note>
87
+ A model which can only predict well on its training data (and has not latently learnt more high-level general patterns) is said to be **overfitting**. In less extreme cases, you still want to test if your model is able to generalize to data patterns which were not in the training set's distribution (for example, classify spam emails about 'health' products after having seen only spam emails about fake banks).
88
+ </Note>
89
 
90
+ This works quite well for very well-defined tasks, where performance is "easy" to assess and measure: when you are literally testing your model on spam classification, you can say "the model classified correctly n% of these samples". For LLMs benchmarks, some issues can arise, such as models [favoring specific choices based on the order in which they have been presented for multi-choice evaluations](https://arxiv.org/abs/2309.03882), and generative evaluations relying on normalisations which can easily [be unfair if not designed well](https://huggingface.co/blog/open-llm-leaderboard-drop), but overall they still provide signal at the task level.
91
 
92
+ For capabilities however, it's hard to decompose them into well-defined and precise tasks: what does "good at math" mean? good at arithmetic? at logic? able to reason on mathematical concepts?
93
 
94
+ In this case, people tend to do more "holistic" evaluations, by not decomposing the capability in actual tasks, but assuming that performance on general samples will be a **good proxy** for what we aim to measure. For example, GSM8K is made of actual high school math problems, which require a whole set of capabilities to solve. It also means that both failure and success are very hard to interpret. Some capabilities or topics, such as "is this model good at writing poetry?" or "are the model outputs helpful?" are even harder to evaluate with automatic metrics - and at the same time, models now seem to have more and more **generalist** capabilities, so we need to evaluate their abilities in a broader manner. (For example, there was a debate in the scientific community as to whether LLMs [can draw](https://arxiv.org/abs/2303.12712) unicorns [or not](https://twitter.com/DimitrisPapail/status/1719119242186871275). A year later, seems like most can!)
95
 
96
+ Automatic benchmarks also tend to have another problem: once they are published publicly in plain text, they are very likely to end up (often accidentally) in the training datasets of models. Some benchmarks creators, like the authors of BigBench, have tried to mitigate this by adding a "canary string" (a very specific combination of characters) for people to look for, and remove from training sets, but not everybody is aware of the mechanism nor trying to do this removal. There is also a non negligible quantity of benchmarks, so looking for accidental copies of absolutely all of them in data is costly. Other options include providing benchmarks in an [encrypted form](https://arxiv.org/pdf/2309.16575), or behind a [gating system](https://huggingface.co/datasets/Idavidrein/gpqa). However, when evaluating closed models behind black box APIs, there is no guarantee that the provided data won’t be later used internally for training or fine-tuning.
97
 
98
+ <Note>
99
+ The case where an evaluation dataset ends up in the training set is called **contamination**, and a model which was contaminated will have a high benchmark performance that does not generalize well to the underlying task (an extensive description of contamination can be found [here](https://aclanthology.org/2023.findings-emnlp.722/), and here is a fun way to [detect it](https://arxiv.org/abs/2311.06233)). A way to address contamination is to run [**dynamic benchmarks**](https://arxiv.org/abs/2104.14337) (evaluations on datasets which are regularly refreshed to provide scores on systematically unseen new data), but this approach is costly in the long term.
100
+ </Note>
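As a concrete illustration of the canary-string mechanism mentioned above, here is a minimal sketch of filtering training documents that contain a benchmark's canary (the canary value and documents are placeholders, not the real BigBench string):

```python
# Drop any training document containing a benchmark's canary string,
# so that the benchmark does not leak into the training set.
CANARY = "BENCHMARK-CANARY-GUID-PLACEHOLDER"  # real benchmarks publish their own unique string

def filter_contaminated(documents: list[str], canary: str = CANARY) -> list[str]:
    return [doc for doc in documents if canary not in doc]

corpus = [
    "A normal web page about cooking pasta.",
    f"A scraped copy of a benchmark file... {CANARY} ...questions and answers follow.",
]
print(len(filter_contaminated(corpus)))  # -> 1
```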
101
 
102
+ #### Human as a judge
103
 
104
+ A solution to both contamination and more open-ended evaluation is asking humans to evaluate model outputs.
105
 
106
+ This is usually done by tasking humans with first prompting models, then grading a model answer or ranking several outputs according to guidelines. Using humans as judges allows you to study more complex tasks, with more flexibility than automated metrics. It also prevents most contamination cases, since the written prompts are (hopefully) new. Lastly, it correlates well with human preference, since this is literally what is evaluated!
107
 
108
+ Different approaches exist to evaluate models with humans in the loop.
109
+
110
+ **Vibes-checks** is the name given to manual evaluations done individually by some members of the community, usually on undisclosed prompts, to get an overall "feeling" of how well models perform on many use cases, which range from coding to quality of smut written. (I've also seen the term "canary-testing" used for this, in reference to high signal canary in a coalmine approach). Often shared on Twitter and Reddit, they mostly constitute anecdotal evidence, and tend to be highly sensitive to confirmation bias (in other words, people tend to find what they look for). However, some people have been trying to do more methodical vibe-checks evaluations; for example, the user *Wolfram Ravenwolf* shares his model comparisons findings in a very systematic way through blogs (see [here](https://huggingface.co/blog/wolfram/llm-comparison-test-llama-3) for an example).
111
 
112
+ Using community feedback to establish massive model rankings is what we call an **arena**. A well known example of this is the [LMSYS chatbot arena](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard), where community users are asked to chat with models until they find one is better than the other. Votes are then aggregated in an Elo ranking (a ranking of matches) to select which model is "the best". The obvious problem of such an approach is the high subjectivity - it's hard to enforce a consistent grading from many community members using broad guidelines, especially since annotators preferences tend to be [culturally bound](https://arxiv.org/abs/2404.16019v1) (with different people favoring different discussion topics, for example). One can hope that this effect is smoothed over by the sheer scale of the votes, through a "wisdom of the crowd" effect (this effect was found by a statistician named Galton, who observed that individual answers trying to estimate a numerical value, like the weight of a hog, could be modeled as a probability distribution centered around the actual answer).
113
+
114
+ The last approach is **systematic annotations**, where you provide extremely specific guidelines to paid selected annotators, in order to remove as much as the subjectivity bias as possible (this is the approach used by most data annotation companies). However, it can get extremely expensive fast, as you have to keep on doing evaluations in a continuous and non automatic manner for every new model you want to evaluate, and it can still fall prey to human bias (this [study](https://arxiv.org/abs/2205.00501) showed that people with different identities tend to rate model answer toxicity very differently).
115
 
116
+ Recent [work](https://arxiv.org/pdf/2309.16349) has also shown that human evaluators tend to estimate the quality of answers based on first impressions, instead of actual factuality or faithfulness. Crowdsourced annotators are notably very sensitive to tone, and underestimate the number of factual or logical errors in an assertive answer. In other terms, if a model says wrong things in a confident tone, human evaluators are much less likely to notice it, which could skew ratings towards the more assertive models. (Expert annotators are less likely to fall prey to these biases.) This kind of human bias was confirmed in another [paper](https://arxiv.org/pdf/2310.13548): humans are more likely to prefer answers which appeal to their views or align with their opinions or errors, rather than answers which are factually correct.
117
 
118
+ These biases are not unexpected, but they must be taken into account: not all use cases should rely on human annotators, especially crowdsourced, non-expert ones - any task requiring factuality (such as code writing, evaluation of model knowledge, etc.) should include another, more robust type of evaluation to complete the benchmark.
119
 
120
+ #### Model as a judge
121
 
122
+ To mitigate the cost of human annotators, some people have looked into using models or derived artifacts (preferably aligned with human preferences) to evaluate models' outputs. This approach is not new: you can find techniques to measure summarization quality from [model embeddings](https://arxiv.org/abs/1904.09675) as far back as 2019.
123
 
124
+ Two approaches exist for grading: using [generalist, high capability models](https://arxiv.org/abs/2306.05685v4) or using [small specialist models](https://arxiv.org/pdf/2405.01535) trained specifically to discriminate from preference data. The former approach gives results well correlated with human preference, but most models strong enough for the job tend to be closed source, therefore subject to change behind APIs, and uninterpretable.
125
 
126
+ LLMs used as judges have several strong limitations: they tend to [favor their own outputs](https://arxiv.org/abs/2404.13076) when scoring answers, are [bad at providing consistent score ranges](https://twitter.com/aparnadhinak/status/1748368364395721128) (though you can improve this by asking the model to explain its reasoning [before providing a score](https://twitter.com/seungonekim/status/1749289437165769177)), and are actually not that consistent [with human rankings](https://arxiv.org/pdf/2308.15812).
 
 
 
 
 
127
 
128
+ My main personal gripe with using models as judges is that they introduce very subtle and un-interpretable bias into the answer selection. I feel that, much like excessive crossbreeding in genetics studies ends up producing dysfunctional animals or plants, using LLMs to select and train LLMs is just as likely to introduce minute changes that will have bigger repercussions a couple of generations down the line. I believe this type of bias is less likely to occur in smaller and more specialized models used as judges (such as toxicity classifiers), but this remains to be rigorously tested and proven.
app/src/content/chapters/model-as-a-judge/basics.mdx CHANGED
@@ -2,6 +2,9 @@
2
  title: "Model as a Judge: Basics"
3
  ---
4
 
 
 
 
5
 Judge models are simply **neural networks used to evaluate the output of other neural networks**. In most cases, they evaluate text generations.
6
 
7
  Judge models range from small specialized classifiers (think "spam filter", but for toxicity for example) to LLMs, either large and generalist or small and specialized. In the latter case, when using an LLM as a judge, you give it a prompt to explain how to score models (ex: `Score the fluency from 0 to 5, 0 being completely un-understandable, ...`).
@@ -30,6 +33,17 @@ In my opinion, using LLM judges correctly is extremely tricky, and it's easy to
30
 - They are indeed scalable, but contribute to creating massive amounts of data which themselves need to be examined to ensure their quality (for example, you can improve the quality of LLM judges by asking them to generate a thinking trace, or reasoning around their data, which creates even more artificial data to analyse)
31
  - They are indeed cheap to instantiate, but paying actual expert human annotators is likely to give you qualitatively better results for your specific use cases.
32
 
 
 
 
 
 
 
 
 
 
 
 
33
 This section is a bit long, because you need to be well aware of their limitations: a lot of people blindly jump into using model judges because they seem easier, but then end up with uninterpretable data and tricky biases to extract.
34
 
35
  If you want to give it a go, I suggest first reading this [very good guide](https://huggingface.co/learn/cookbook/en/llm_judge) (⭐) by Aymeric Roucher on how to setup your first LLM as judge!
 
2
  title: "Model as a Judge: Basics"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+
8
 Judge models are simply **neural networks used to evaluate the output of other neural networks**. In most cases, they evaluate text generations.
9
 
10
  Judge models range from small specialized classifiers (think "spam filter", but for toxicity for example) to LLMs, either large and generalist or small and specialized. In the latter case, when using an LLM as a judge, you give it a prompt to explain how to score models (ex: `Score the fluency from 0 to 5, 0 being completely un-understandable, ...`).
 
33
 - They are indeed scalable, but contribute to creating massive amounts of data which themselves need to be examined to ensure their quality (for example, you can improve the quality of LLM judges by asking them to generate a thinking trace, or reasoning around their data, which creates even more artificial data to analyse)
34
  - They are indeed cheap to instantiate, but paying actual expert human annotators is likely to give you qualitatively better results for your specific use cases.
35
 
36
+ <Note title="Critical limitations of LLM judges" emoji="⚠️" variant="warning">
37
+
38
+ Using LLM judges is extremely tricky:
39
+ - **Hidden biases**: Harder to detect than human biases; creates echo-chamber effects
40
+ - **Data overload**: Generates massive synthetic data needing quality examination
41
+ - **False objectivity**: Seems objective but reinforces subtle biases
42
+ - **Expert humans better**: For critical use cases, expert annotators provide higher quality
43
+
44
+ See [Tips and tricks](./tips-and-tricks) for bias mitigation strategies.
45
+ </Note>
46
+
47
 This section is a bit long, because you need to be well aware of their limitations: a lot of people blindly jump into using model judges because they seem easier, but then end up with uninterpretable data and tricky biases to extract.
48
 
49
  If you want to give it a go, I suggest first reading this [very good guide](https://huggingface.co/learn/cookbook/en/llm_judge) (⭐) by Aymeric Roucher on how to setup your first LLM as judge!
app/src/content/chapters/model-as-a-judge/designing-your-evaluation-prompt.mdx CHANGED
@@ -2,13 +2,16 @@
2
  title: "Designing your evaluation prompt"
3
  ---
4
 
 
 
 
5
  ### Designing your evaluation prompt
6
 
7
  Once you've selected your model, you need to define what is the best possible prompt for your task.
8
 
9
  Some general guidelines I've come across online when designing the prompt itself are:
10
  - Provide a clear description of the task at hand:
11
- - `Your task is to do X`.
12
  - `You will be provided with Y`.
13
  - Provide clear instructions on the evaluation criteria, including a detailed scoring system if needed:
14
  - `You should evaluate property Z on a scale of 1 - 5, where 1 means ...`
@@ -18,6 +21,16 @@ Some general guidelines I've come across online when designing the prompt itself
18
  - Specify the desired output format (adding fields will help consistency)
19
  - `Your answer should be provided in JSON, with the following format {"Score": Your score, "Reasoning": The reasoning which led you to this score}`
20
 
 
 
 
 
 
 
 
 
 
 
21
  You can and should take inspiration from [MixEval](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/extended/mix_eval/judge_prompts.pyy) or [MTBench](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/extended/mt_bench/judge_prompt_templates.py) prompt templates.
22
 
23
  Other tidbits:
@@ -25,16 +38,44 @@ Other tidbits:
25
 - If you really want a score, use an integer scale and make sure you provide a detailed explanation for what [each score represents](https://x.com/seungonekim/status/1749289437165769177), or an additive prompt (`provide 1 point for this characteristic of the answer, 1 additional point if ...` etc)
26
  - Using one prompt per capability to score tends to give better and more robust results
27
 
 
 
 
 
 
 
28
  You can also improve accuracy using the following, possibly more costly, techniques:
29
  - **Few shot examples**: like in many other tasks, if you provide examples it can help its reasoning. However, this adds to your context length.
30
- - **Reference**: you can also enhance your prompt with a reference if present, which increases accuracy
31
  - **CoT**: [improves accuracy](https://arxiv.org/abs/2212.08073), if you ask the model to output its chain of thought **before** the score (also observed [here](https://x.com/seungonekim/status/1749289437165769177))
32
  - **Multiturn analysis**: can improve [factual error detection](https://arxiv.org/abs/2305.13281)
33
- - Using **a jury** (many judges, where you pick an aggregate of the answers): [gives better results](https://arxiv.org/abs/2404.18796) than using a single model.
34
- - It can be made considerably less costly by leveraging many smaller models instead of one big expensive model.
35
  - You can also experiment with using one model with variations on temperature
36
  - Surprisingly, the community has found that adding stakes to the prompts (`answer correctly and you'll get a kitten`) can increase correctness. Your mileage may vary on this one, adapt to your needs.
37
 
 
 
 
 
 
 
 
 
 
 
 
38
  Note on prompting: Depending on the stakes of your use case, to remove as much bias as possible, you would want to look at work done in sociology on how to design good surveys. If you treat your evaluator as a replacement for a human annotator, then you need to look at similar metrics: computing inter-annotator agreement, using correct survey design methodology to mitigate bias, etc.
39
 
 
 
 
 
 
 
 
 
 
 
 
40
  However, most people don't really want a reproducible and high quality unbiased eval, and will be happy with quick and dirty evaluation through OK-ish prompts. (Which is an OK situation to be in! Just depends on the consequences attached).
 
2
  title: "Designing your evaluation prompt"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+
8
  ### Designing your evaluation prompt
9
 
10
  Once you've selected your model, you need to define what is the best possible prompt for your task.
11
 
12
  Some general guidelines I've come across online when designing the prompt itself are:
13
  - Provide a clear description of the task at hand:
14
+ - `Your task is to do X`.
15
  - `You will be provided with Y`.
16
  - Provide clear instructions on the evaluation criteria, including a detailed scoring system if needed:
17
  - `You should evaluate property Z on a scale of 1 - 5, where 1 means ...`
 
21
  - Specify the desired output format (adding fields will help consistency)
22
  - `Your answer should be provided in JSON, with the following format {"Score": Your score, "Reasoning": The reasoning which led you to this score}`
23
 
24
+ <Note title="Core prompt design principles" emoji="📝" variant="info">
25
+
26
+ **Essential elements for effective judge prompts:**
27
+ - **Clear task description**: Specify exactly what the judge needs to do
28
+ - **Detailed criteria**: Provide explicit scoring scales with clear definitions
29
+ - **Reasoning steps**: Guide the judge through the evaluation process
30
+ - **Structured output**: Use JSON format for consistency and parsability
31
+
32
+ </Note>
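Putting these guidelines together, a judge prompt could look like the sketch below; the wording, the scale and the JSON fields are purely illustrative, not a canonical template.

```python
# Example judge prompt assembled from the guidelines above.
# Wording, scale and JSON fields are illustrative, not a recommended canonical template.
JUDGE_PROMPT = """You are an evaluator. Your task is to rate the fluency of an answer.
You will be provided with a user question and a model answer.

Evaluate fluency on a scale of 1 to 5, where:
1 = incomprehensible, 3 = understandable but awkward, 5 = perfectly fluent.

Think step by step about your reasoning before giving the score.
Your answer should be provided in JSON, with the following format:
{{"Reasoning": "your reasoning", "Score": your score}}

Question: {question}
Answer: {answer}
"""

prompt = JUDGE_PROMPT.format(question="What is an LLM?", answer="A large language model is ...")
```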
33
+
34
  You can and should take inspiration from [MixEval](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/extended/mix_eval/judge_prompts.pyy) or [MTBench](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/extended/mt_bench/judge_prompt_templates.py) prompt templates.
35
 
36
  Other tidbits:
 
38
 - If you really want a score, use an integer scale and make sure you provide a detailed explanation for what [each score represents](https://x.com/seungonekim/status/1749289437165769177), or an additive prompt (`provide 1 point for this characteristic of the answer, 1 additional point if ...` etc)
39
  - Using one prompt per capability to score tends to give better and more robust results
40
 
41
+ <Sidenote>
42
+
43
+ Pairwise comparison consistently outperforms absolute scoring for judging model outputs. It correlates better with human preferences and is less sensitive to judge biases and scale interpretation issues.
44
+
45
+ </Sidenote>
46
+
47
  You can also improve accuracy using the following, possibly more costly, techniques:
48
  - **Few shot examples**: like in many other tasks, if you provide examples it can help its reasoning. However, this adds to your context length.
49
+ - **Reference**: you can also enhance your prompt with a reference if present, which increases accuracy
50
  - **CoT**: [improves accuracy](https://arxiv.org/abs/2212.08073), if you ask the model to output its chain of thought **before** the score (also observed [here](https://x.com/seungonekim/status/1749289437165769177))
51
  - **Multiturn analysis**: can improve [factual error detection](https://arxiv.org/abs/2305.13281)
52
+ - Using **a jury** (many judges, where you pick an aggregate of the answers): [gives better results](https://arxiv.org/abs/2404.18796) than using a single model.
53
+ - It can be made considerably less costly by leveraging many smaller models instead of one big expensive model.
54
  - You can also experiment with using one model with variations on temperature
55
  - Surprisingly, the community has found that adding stakes to the prompts (`answer correctly and you'll get a kitten`) can increase correctness. Your mileage may vary on this one, adapt to your needs.
56
 
57
+ <Note title="Advanced techniques to improve accuracy" emoji="⚡" variant="success">
58
+
59
+ **More sophisticated but effective approaches:**
60
+ - **Chain-of-Thought (CoT)**: Ask for reasoning BEFORE the score
61
+ - **Judge jury**: Multiple judges with aggregated results (can use smaller models to reduce cost)
62
+ - **Few-shot examples**: Provide examples, though this increases context length
63
+ - **Reference answers**: Include reference material to improve accuracy
64
+ - **Multi-turn analysis**: Better for detecting factual errors
65
+
66
+ </Note>
67
+
68
  Note on prompting: Depending on the stakes of your use case, to remove as much bias as possible, you would want to look at work done in sociology on how to design good surveys. If you treat your evaluator as a replacement for a human annotator, then you need to look at similar metrics: computing inter-annotator agreement, using correct survey design methodology to mitigate bias, etc.
69
 
70
+ <Note title="High-stakes evaluation requires rigor" emoji="⚠️" variant="warning">
71
+
72
+ For production or critical use cases, apply rigorous methodologies from sociology:
73
+ - Compute inter-annotator agreement metrics
74
+ - Use proper survey design methodology to mitigate bias
75
+ - Treat the evaluator like a human annotator with similar quality standards
76
+
77
+ Quick evaluations with "OK-ish prompts" may suffice for low-stakes exploration, but don't mistake convenience for quality when decisions matter.
78
+
79
+ </Note>
80
+
81
  However, most people don't really want a reproducible and high quality unbiased eval, and will be happy with quick and dirty evaluation through OK-ish prompts. (Which is an OK situation to be in! Just depends on the consequences attached).
app/src/content/chapters/model-as-a-judge/evaluating-your-evaluator.mdx CHANGED
@@ -2,19 +2,37 @@
2
  title: "Evaluating your evaluator"
3
  ---
4
 
 
 
 
5
  ### Evaluating your evaluator
6
 
7
- Before using a judge-LLM in production or at scale, you want to first evaluate its quality for your task, to make sure its scores are actually relevant and useful for you.
 
 
 
 
8
 
9
- Note: *This will be easier to do if it predicts binary outputs, because you'll be able to use interpretable classification metrics (accuracy/recall/precision). If it predicts scores on a scale, it will be much harder to estimate the quality of the correlation with a reference.*
 
 
10
 
11
  So, once you have selected your model judge and its prompt, you'll need to do the following.
12
 
13
  1. **Pick your baseline**
14
- You'll need to compare your evaluator's judgments to a baseline: it can be human annotations, the output of another judge model that you know performs well on your task, a gold truth, the same judge with another prompt, etc.
15
 
16
  You don't necessarily need a lot of examples (50 can be enough), but you need them to be extremely representative of your task, discriminative (representative of edge cases notably), and of as high quality as you can manage.
17
 
 
 
 
 
 
 
 
 
 
18
  2. **Pick your metric**
19
  Your metric will be used to compare your judge's evaluations with your reference.
20
 
@@ -29,5 +47,15 @@ For this step, you simply need to use your model and its prompt to evaluate your
29
 
30
  You need to decide what your threshold for acceptance is. Depending on how hard your task is, you can aim for 80% to 95% accuracy, if you're doing pairwise comparison. Regarding correlations (if you're using scores), people in the literature tend to seem happy with 0.8 Pearson correlation with a reference. However, I've seen some papers declare that 0.3 indicates a good correlation with human annotators (^^") so ymmv.
31
 
 
 
 
 
 
 
 
 
 
 
32
 
33
 
 
2
  title: "Evaluating your evaluator"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+
8
  ### Evaluating your evaluator
9
 
10
+ Before using a judge-LLM in production or at scale, you want to first evaluate its quality for your task, to make sure its scores are actually relevant and useful for you.
11
+
12
+ Note: *This will be easier to do if it predicts binary outputs, because you'll be able to use interpretable classification metrics (accuracy/recall/precision). If it predicts scores on a scale, it will be much harder to estimate the quality of the correlation with a reference.*
13
+
14
+ <Sidenote>
15
 
16
+ Binary outputs (yes/no, pass/fail) are much easier to evaluate than continuous scores. You can use clear metrics like accuracy, precision, and recall. Continuous scores require correlation analysis which is harder to interpret.
17
+
18
+ </Sidenote>
19
 
20
  So, once you have selected your model judge and its prompt, you'll need to do the following.
21
 
22
  1. **Pick your baseline**
23
+ You'll need to compare your evaluator's judgments to a baseline: it can be human annotations, the output of another judge model that you know performs well on your task, a gold truth, the same judge with another prompt, etc.
24
 
25
  You don't necessarily need a lot of examples (50 can be enough), but you need them to be extremely representative of your task, discriminative (representative of edge cases notably), and of as high quality as you can manage.
26
 
27
+ <Note title="Quality over quantity for baseline" emoji="🎯" variant="info">
28
+
29
+ You don't need many baseline examples (50 can suffice), but they must be:
30
+ - **Representative**: Cover the full range of your task
31
+ - **Discriminative**: Include edge cases and challenging examples
32
+ - **High quality**: Use the best reference data you can obtain
33
+
34
+ </Note>
35
+
36
  2. **Pick your metric**
37
  Your metric will be used to compare your judge's evaluations with your reference.
38
 
 
47
 
48
  You need to decide what your threshold for acceptance is. Depending on how hard your task is, you can aim for 80% to 95% accuracy, if you're doing pairwise comparison. Regarding correlations (if you're using scores), people in the literature tend to seem happy with 0.8 Pearson correlation with a reference. However, I've seen some papers declare that 0.3 indicates a good correlation with human annotators (^^") so ymmv.
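As a concrete illustration, here is a minimal sketch of how you could compare your judge's outputs against your baseline, assuming you already have paired lists of labels and scores; the use of scikit-learn and scipy here is just one possible choice.

```python
# Compare judge outputs against a baseline (human labels or a trusted reference).
# Assumes you already have paired lists; the toy data below is only for illustration.
from sklearn.metrics import accuracy_score, precision_score, recall_score
from scipy.stats import pearsonr

# Binary case (e.g. "is this answer acceptable?"): classification metrics.
baseline_labels = [1, 0, 1, 1, 0]
judge_labels    = [1, 0, 1, 0, 0]
print("accuracy :", accuracy_score(baseline_labels, judge_labels))
print("precision:", precision_score(baseline_labels, judge_labels))
print("recall   :", recall_score(baseline_labels, judge_labels))

# Scalar case (e.g. scores on a 1-5 scale): correlation with the reference.
baseline_scores = [5, 2, 4, 3, 1]
judge_scores    = [4, 2, 5, 3, 2]
corr, p_value = pearsonr(baseline_scores, judge_scores)
print("pearson r:", corr)
```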
49
 
50
+ <Note title="Acceptance thresholds vary widely" emoji="📊" variant="warning">
51
+
52
+ **Realistic thresholds for judge quality:**
53
+ - **Pairwise comparison**: Aim for 80-95% accuracy depending on task difficulty
54
+ - **Score correlation**: 0.8 Pearson correlation is considered good, but some papers claim 0.3 is acceptable
55
+
56
+ The wide range in reported "acceptable" correlations (0.3 to 0.8) suggests you should carefully set your own thresholds based on your specific use case requirements.
57
+
58
+ </Note>
59
+
60
 
61
 
app/src/content/chapters/model-as-a-judge/getting-a-judge-llm.mdx CHANGED
@@ -2,19 +2,38 @@
2
  title: "Getting a Judge-LLM"
3
  ---
4
 
 
 
 
5
  ### Getting a Judge-Model
6
 
7
 When using an existing LLM, you can go for [generalist, high capability models](https://arxiv.org/abs/2306.05685v4), use [small specialist models](https://arxiv.org/abs/2405.01535) trained specifically to discriminate from preference data, or train your own.
8
 
9
  #### Using a generalist LLM
10
 
11
- With the introduction of more capable LLMs (such as ChatGPT), some researchers started exploring using big models as judges. The best current big model judges tend to be closed source models (like Claude or gpt-o models) though the gap with open source is closing very fast thanks to high quality models such as [Qwen 2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e), [Command R+](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) or [Llama 3.1-405-Instruct](meta-llama/Llama-3.1-405B-Instruct).
12
 
13
  Closed source models, despite their performance, present the multiple disadvantages of being:
14
 - under APIs, which means that models (and therefore results) can change with no notice, hurting the reproducibility of evals
15
  - black boxes, which makes them un-interpretable
16
  - possible sources of data leakage/lack of data privacy, as you send your data to a third party through the internet (which tends to be less safe than locally managed data), and you don't know for certain what is done with it (you often need to opt out of it being used in training sets).
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 However, they also allow anyone to have access to a high quality model without needing to set things up locally or requiring access to hardware. These pros are now also present for most high quality open models, which are accessible through model providers, and solve the first 2 problems above.
19
 
20
  You'll find a good cost analysis of model providers [here](https://huggingface.co/spaces/ArtificialAnalysis/LLM-Performance-Leaderboard) if you need help picking one.
@@ -23,6 +42,12 @@ You'll find a good cost analysis of model providers [here](https://huggingface.c
23
 
24
  You can also make the choice to use tiny specialized LLM judges. With often a couple billion parameters, they can run locally on most recent consumer hardware, while being trained from scratch or fine-tuned using instruction data. You often need to follow their specific prompt formats.
25
 
 
 
 
 
 
 
26
  Some existing models:
27
  - Flow-Judge-v0.1 ([weights](https://huggingface.co/collections/flowaicom/flow-judge-v01-66e6af5fc3b3a128bde07dec)), 3.8B parameters, a Phi-3.5-mini-instruct fine-tuned on a synthetic preference dataset
28
  - Prometheus ([weights](https://huggingface.co/prometheus-eval/prometheus-13b-v1.0), [paper](https://arxiv.org/abs/2310.08491)), 13B parameters, a model trained from scratch on synthetic preference dataset. A 7B parameter [v2](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0) also exists, a Mistral-7B-Instruct-v0.2 fine-tune on a bigger synthetic preference dataset, with added weight merging
@@ -35,8 +60,19 @@ You first need to gather preference data for your task of interest, which can co
35
  - From existing [human preference datasets](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
36
  - From model generated preference data (which you can generate following the above tiny-model judges papers data sections, or get directly, for example from the Prometheus [preference](https://huggingface.co/datasets/prometheus-eval/Preference-Collection) and [feedback](https://huggingface.co/datasets/prometheus-eval/Feedback-Collection) collections).
37
 
38
- Then you need to decide whether to start from a small model to train from scratch, or from an existing model, that you can
39
  - distill into a new smaller model
40
  - quantize.
41
  - then fine-tune (using peft or adapter weights if the model is big and your training compute low) using the above data
42
  - apparently [starting from a reward model works better than from an instruct model](https://x.com/dk21/status/1826292289930674590)
 
 
 
 
 
 
 
 
 
 
 
 
2
  title: "Getting a Judge-LLM"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+
8
  ### Getting a Judge-Model
9
 
10
 When using an existing LLM, you can go for [generalist, high capability models](https://arxiv.org/abs/2306.05685v4), use [small specialist models](https://arxiv.org/abs/2405.01535) trained specifically to discriminate from preference data, or train your own.
11
 
12
  #### Using a generalist LLM
13
 
14
+ With the introduction of more capable LLMs (such as ChatGPT), some researchers started exploring using big models as judges. The best current big model judges tend to be closed source models (like Claude or gpt-o models) though the gap with open source is closing very fast thanks to high quality models such as [Qwen 2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e), [Command R+](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) or [Llama 3.1-405-Instruct](meta-llama/Llama-3.1-405B-Instruct).
15
 
16
  Closed source models, despite their performance, present the multiple disadvantages of being:
17
 - under APIs, which means that models (and therefore results) can change with no notice, hurting the reproducibility of evals
18
  - black boxes, which makes them un-interpretable
19
  - possible sources of data leakage/lack of data privacy, as you send your data to a third party through the internet (which tends to be less safe than locally managed data), and you don't know for certain what is done with it (you often need to opt out of it being used in training sets).
20
 
21
+ <Note title="Closed vs open source judge models" emoji="⚖️" variant="warning">
22
+
23
+ **Closed source models (Claude, GPT-o) tradeoffs:**
24
+
25
+ Disadvantages:
26
+ - **Non-reproducible**: Models can change without notice via API updates
27
+ - **Black box**: Un-interpretable decision-making
28
+ - **Privacy risks**: Data sent to third parties, potential leakage
29
+
30
+ Advantages:
31
+ - Easy access without local setup or hardware requirements
32
+
33
+ **Open source models are closing the gap** while solving reproducibility and interpretability issues. Models like Qwen 2.5, Command R+, and Llama 3.1-405-Instruct are now competitive alternatives.
34
+
35
+ </Note>
36
+
37
 However, they also allow anyone to have access to a high quality model without needing to set things up locally or requiring access to hardware. These pros are now also present for most high quality open models, which are accessible through model providers, and solve the first 2 problems above.
38
 
39
  You'll find a good cost analysis of model providers [here](https://huggingface.co/spaces/ArtificialAnalysis/LLM-Performance-Leaderboard) if you need help picking one.
 
42
 
43
  You can also make the choice to use tiny specialized LLM judges. With often a couple billion parameters, they can run locally on most recent consumer hardware, while being trained from scratch or fine-tuned using instruction data. You often need to follow their specific prompt formats.
44
 
45
+ <Sidenote>
46
+
47
+ Tiny specialized judge models (3-13B parameters) can run on consumer hardware while being trained specifically for evaluation tasks. They require following specific prompt formats but offer local deployment and fast inference.
48
+
49
+ </Sidenote>
50
+
51
  Some existing models:
52
  - Flow-Judge-v0.1 ([weights](https://huggingface.co/collections/flowaicom/flow-judge-v01-66e6af5fc3b3a128bde07dec)), 3.8B parameters, a Phi-3.5-mini-instruct fine-tuned on a synthetic preference dataset
53
  - Prometheus ([weights](https://huggingface.co/prometheus-eval/prometheus-13b-v1.0), [paper](https://arxiv.org/abs/2310.08491)), 13B parameters, a model trained from scratch on synthetic preference dataset. A 7B parameter [v2](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0) also exists, a Mistral-7B-Instruct-v0.2 fine-tune on a bigger synthetic preference dataset, with added weight merging
 
60
  - From existing [human preference datasets](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
61
  - From model generated preference data (which you can generate following the above tiny-model judges papers data sections, or get directly, for example from the Prometheus [preference](https://huggingface.co/datasets/prometheus-eval/Preference-Collection) and [feedback](https://huggingface.co/datasets/prometheus-eval/Feedback-Collection) collections).
62
 
63
+ Then you need to decide whether to start from a small model to train from scratch, or from an existing model, that you can
64
  - distill into a new smaller model
65
  - quantize.
66
  - then fine-tune (using peft or adapter weights if the model is big and your training compute low) using the above data
67
  - apparently [starting from a reward model works better than from an instruct model](https://x.com/dk21/status/1826292289930674590)
68
+
69
+ <Note title="Training your own judge model" emoji="🔧" variant="info">
70
+
71
+ **Key steps for custom judge training:**
72
+
73
+ 1. **Gather preference data**: Use human preference datasets or synthetic data from other models
74
+ 2. **Choose starting point**: Train from scratch, distill from larger model, or fine-tune existing model
75
+ 3. **Optimize for compute**: Use PEFT/adapter weights for efficient training on limited hardware
76
+ 4. **Pro tip**: Starting from a reward model reportedly works better than starting from an instruct model
77
+
78
+ </Note>
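To make the fine-tuning setup a bit more concrete, here is a minimal sketch of wrapping a base model with LoRA adapters for judge/reward training; the model name, target modules and hyperparameters are placeholders, and the training loop itself (e.g. with TRL) is deliberately omitted.

```python
# Sketch: wrap a base model with LoRA adapters for judge/reward fine-tuning.
# Model name, LoRA hyperparameters and target modules are placeholders;
# the training loop (e.g. TRL's RewardTrainer on your preference data) is omitted.
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import LoraConfig, get_peft_model

base = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder: any small instruct or reward model
tokenizer = AutoTokenizer.from_pretrained(base)  # used later to tokenize prompt/completion pairs
model = AutoModelForSequenceClassification.from_pretrained(base, num_labels=1)  # scalar score head

lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # depends on the architecture
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```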
app/src/content/chapters/model-as-a-judge/tips-and-tricks.mdx CHANGED
@@ -2,20 +2,39 @@
2
  title: "Model as a Judge: Tips and tricks"
3
  ---
4
 
 
 
 
5
  ### Tips and tricks
6
 
7
 **Mitigating well-known biases of LLMs as judges**
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  - **Lack of internal consistency**: a judge might give you different judgments if you prompt it several times (if the temperature is not 0)
10
  - You can mitigate this by doing self-consistency prompting of your judge, prompting it multiple times and keeping the majority output
11
  - **Self-preference**: they tend to [favor their own outputs](https://arxiv.org/abs/2404.13076) when scoring answers
12
  - You can mitigate this by using a jury
13
- - **Blindness to input perturbation**: models are bad at identifying [perturbed input](https://arxiv.org/abs/2406.13439) and tangentially [bad at providing consistent score ranges](https://twitter.com/aparnadhinak/status/1748368364395721128) (extended experiments on this [here](https://github.com/LeonEricsson/llmjudge/blob/main/README.md)). For example, if asked to rank the quality of text to which noise has been added on a consistent scale, the grades predicted do not reflect this scale.
14
- - You can mitigate this by
15
  - asking the model to explain its reasoning [before providing a score](https://twitter.com/seungonekim/status/1749289437165769177)
16
  - providing a coherent grading scale in the prompt.
17
  - **Position-bias**: they tend to [favor specific answer positions](https://arxiv.org/abs/2306.05685). For example, when presented with pairwise comparisons, Claude and GPT3.5 tend to quite systematically prefer the first choice, or the second choice
18
- - You can mitigate this by
19
  - switching answer positions randomly
20
  - computing the log-probabilities of all possible choices to get a normalized answer
21
  - **Verbosity-bias** (or length-bias): they tend to like more verbose answers
 
2
  title: "Model as a Judge: Tips and tricks"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+
8
  ### Tips and tricks
9
 
10
 **Mitigating well-known biases of LLMs as judges**
11
 
12
+ <Note title="Known LLM judge biases and mitigations" emoji="⚠️" variant="warning">
13
+
14
+ - **Lack of internal consistency**: Different judgments at temperature > 0
15
+ - Mitigation: Self-consistency prompting (multiple runs, majority vote)
16
+ - **Self-preference**: [Favor own outputs](https://arxiv.org/abs/2404.13076)
17
+ - Mitigation: Use judge jury
18
+ - **Blindness to perturbation**: Can't identify [perturbed input](https://arxiv.org/abs/2406.13439)
19
+ - Mitigation: Chain-of-thought before scoring, coherent grading scale
20
+ - **Position bias**: [Favor specific positions](https://arxiv.org/abs/2306.05685)
21
+ - Mitigation: Random position switching, log-probability normalization
22
+ - **Verbosity bias**: Prefer verbose answers
23
+ - Mitigation: [Account for length differences](https://arxiv.org/abs/2404.04475)
24
+ - **Format bias**: Fail when format differs from training
25
+ - Mitigation: Match training prompt format
26
+ </Note>
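As a concrete illustration of two of these mitigations (random position switching and self-consistency through majority voting), here is a minimal sketch; `ask_judge` is a placeholder for however you actually call your judge model.

```python
# Sketch of two mitigations: random position swapping (position bias)
# and majority voting over repeated judge calls (internal consistency).
# `ask_judge(prompt) -> "A" or "B"` is a placeholder for your actual judge call.
import random
from collections import Counter

def judge_pair(question: str, answer_1: str, answer_2: str, ask_judge, n_votes: int = 5) -> str:
    votes = []
    for _ in range(n_votes):
        swapped = random.random() < 0.5
        a, b = (answer_2, answer_1) if swapped else (answer_1, answer_2)
        prompt = (f"Question: {question}\nAnswer A: {a}\nAnswer B: {b}\n"
                  "Which answer is better? Reply with A or B.")
        verdict = ask_judge(prompt)  # expected to return "A" or "B"
        # Map the verdict back to the original (unswapped) answers.
        picked_first = (verdict == "A") != swapped
        votes.append("answer_1" if picked_first else "answer_2")
    return Counter(votes).most_common(1)[0][0]
```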
27
+
28
  - **Lack of internal consistency**: a judge might give you different judgments if you prompt it several times (if the temperature is not 0)
29
  - You can mitigate this by doing self-consistency prompting of your judge, prompting it multiple times and keeping the majority output
30
  - **Self-preference**: they tend to [favor their own outputs](https://arxiv.org/abs/2404.13076) when scoring answers
31
  - You can mitigate this by using a jury
32
+ - **Blindness to input perturbation**: models are bad at identifying [perturbed input](https://arxiv.org/abs/2406.13439) and tangentially [bad at providing consistent score ranges](https://twitter.com/aparnadhinak/status/1748368364395721128) (extended experiments on this [here](https://github.com/LeonEricsson/llmjudge/blob/main/README.md)). For example, if asked to rank the quality of text to which noise has been added on a consistent scale, the grades predicted do not reflect this scale.
33
+ - You can mitigate this by
34
  - asking the model to explain its reasoning [before providing a score](https://twitter.com/seungonekim/status/1749289437165769177)
35
  - providing a coherent grading scale in the prompt.
36
  - **Position-bias**: they tend to [favor specific answer positions](https://arxiv.org/abs/2306.05685). For example, when presented with pairwise comparisons, Claude and GPT3.5 tend to quite systematically prefer the first choice, or the second choice
37
+ - You can mitigate this by
38
  - switching answer positions randomly
39
  - computing the log-probabilities of all possible choices to get a normalized answer
40
  - **Verbosity-bias** (or length-bias): they tend to like more verbose answers
app/src/content/chapters/model-as-a-judge/what-about-reward-models.mdx CHANGED
@@ -2,6 +2,9 @@
2
  title: "What about Reward Models?"
3
  ---
4
 
 
 
 
5
  ### What about Reward Models?
6
 
7
  Reward models learn to predict a score from human annotations for given prompt/completion pairs. The end goal is for them to do predictions aligned with human preference.
@@ -13,11 +16,21 @@ $$p(\text{completion b is better than completion a}) = \text{sigmoid}(\text{scor
13
 
14
  This model is trained using only pairwise comparisons of completions, which are easier to collect than scores, but can only compare several completions for one prompt, and not completions across prompts.
15
 
16
- Other models have expanded on this approach to predict a more nuanced probability that a completion is better than the other one ([example](https://huggingface.co/RLHFlow/pair-preference-model-LLaMA3-8B)).
17
 
18
  This allows them to (theoretically) judge subtle differences between completions, at the cost of not being able to easily save and compare many different scores across prompts for the same test set. In addition, context length and memory limits can become an issue when comparing too long completions.
19
 
20
- Some reward models such as [SteerLM](https://arxiv.org/abs/2311.09528) output **absolute scores**, which can be used to evaluate completions directly without the need for pairwise comparisons. These models can be easier to use for evaluation, but are also harder to collect data for, as absolute scores tend to be less stable than pairwise scores in human preferences.
 
 
 
 
 
 
 
 
 
 
21
 
22
  More recently, models have been proposed that output both absolute and relative scores, such as [HelpSteer2-Preference](https://arxiv.org/abs/2410.01257) and [ArmoRM](https://arxiv.org/abs/2406.12845).
23
 
@@ -29,8 +42,14 @@ For models that give absolute scores, the resulting scores can be averaged to ge
29
 
30
  However, in the more common case of relative scores, the average reward can be biased by outliers (a few very good or very bad completions) as different prompts may have inherently different reward scales (some prompts are way harder or easier than others).
31
 
32
- Instead, we can use
33
- - win rates: take a reference set of completions and calculate the percentage of completions from the model that are ranked higher than the reference completions. It is slightly more granular.
 
 
 
 
 
 
34
  - win probabilities: the mean probability of the completions being better than the reference completions, which can give a more fine-grained and smoothly changing signal.
35
 
36
  #### Pros and Cons of Reward Models
@@ -45,6 +64,20 @@ On the other hand they:
45
 - **Require specific fine-tuning**: This can be a relatively costly step, and although they inherit many capabilities from a base model, they may still perform poorly on tasks that are out of the training distribution.
46
 - **Lose efficiency when used both in reinforcement learning and evaluation** (or when using direct alignment algorithms on datasets that are similar to the training data of the reward model), as the language model may overfit to the reward model's preferences.
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  Some notes:
49
  - A good place to find high performing models is the [RewardBench Leaderboard](https://huggingface.co/spaces/allenai/reward-bench).
50
  - You can look at how reward models have been used in the [Nemotron](https://arxiv.org/abs/2406.11704) paper.
 
2
  title: "What about Reward Models?"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+
8
  ### What about Reward Models?
9
 
10
  Reward models learn to predict a score from human annotations for given prompt/completion pairs. The end goal is for them to do predictions aligned with human preference.
 
16
 
17
  This model is trained using only pairwise comparisons of completions, which are easier to collect than scores, but can only compare several completions for one prompt, and not completions across prompts.
18
 
19
+ Other models have expanded on this approach to predict a more nuanced probability that a completion is better than the other one ([example](https://huggingface.co/RLHFlow/pair-preference-model-LLaMA3-8B)).
20
 
21
  This allows them to (theoretically) judge subtle differences between completions, at the cost of not being able to easily save and compare many different scores across prompts for the same test set. In addition, context length and memory limits can become an issue when comparing too long completions.
22
 
23
+ <Note title="Types of reward models" emoji="📊" variant="info">
24
+
25
+ **Three main approaches:**
26
+
27
+ - **Pairwise (Bradley-Terry)**: Most common. Compares two completions for same prompt. Easier to train (pairwise comparisons) but can't compare across different prompts.
28
+ - **Absolute scores** (e.g., SteerLM): Direct evaluation without comparison. Easier to use but harder to collect training data (absolute scores less stable in human preferences).
29
+ - **Hybrid models** (HelpSteer2, ArmoRM): Output both absolute and relative scores for maximum flexibility.
30
+
31
+ </Note>
32
+
33
+ Some reward models such as [SteerLM](https://arxiv.org/abs/2311.09528) output **absolute scores**, which can be used to evaluate completions directly without the need for pairwise comparisons. These models can be easier to use for evaluation, but are also harder to collect data for, as absolute scores tend to be less stable than pairwise scores in human preferences.
34
 
35
  More recently, models have been proposed that output both absolute and relative scores, such as [HelpSteer2-Preference](https://arxiv.org/abs/2410.01257) and [ArmoRM](https://arxiv.org/abs/2406.12845).
36
 
 
42
 
43
  However, in the more common case of relative scores, the average reward can be biased by outliers (a few very good or very bad completions) as different prompts may have inherently different reward scales (some prompts are way harder or easier than others).
44
 
45
+ <Sidenote>
46
+
47
+ For relative scores, don't just average raw rewards—outliers and varying prompt difficulty scales will bias results. Use win rates or win probabilities against a reference instead.
48
+
49
+ </Sidenote>
50
+
51
+ Instead, we can use
52
+ - win rates: take a reference set of completions and calculate the percentage of completions from the model that are ranked higher than the reference completions. It is slightly more granular.
53
  - win probabilities: the mean probability of the completions being better than the reference completions, which can give a more fine-grained and smoothly changing signal.
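As a small illustration, here is a sketch of how win rates and win probabilities could be computed from raw reward scores under the Bradley-Terry assumption above; the score arrays are placeholders.

```python
# Sketch: turn raw reward scores into win rates / win probabilities
# against a reference set, under the Bradley-Terry assumption above.
import numpy as np

model_scores = np.array([1.8, 0.2, -0.5, 2.3])      # rewards of the model's completions
reference_scores = np.array([1.1, 0.9, -0.1, 1.7])  # rewards of the reference completions (same prompts)

win_rate = float(np.mean(model_scores > reference_scores))
# Mean sigmoid of score gaps = mean probability that the model's completion wins.
win_probability = float(np.mean(1 / (1 + np.exp(-(model_scores - reference_scores)))))

print(f"win rate: {win_rate:.2f}, mean win probability: {win_probability:.2f}")
```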
54
 
55
  #### Pros and Cons of Reward Models
 
64
 - **Require specific fine-tuning**: This can be a relatively costly step, and although they inherit many capabilities from a base model, they may still perform poorly on tasks that are out of the training distribution.
65
 - **Lose efficiency when used both in reinforcement learning and evaluation** (or when using direct alignment algorithms on datasets that are similar to the training data of the reward model), as the language model may overfit to the reward model's preferences.
66
 
67
+ <Note title="Reward models vs LLM judges" emoji="⚡" variant="success">
68
+
69
+ **Reward models excel at:**
70
+ - **Speed**: Single forward pass for a score (no text generation)
71
+ - **Determinism**: Reproducible scores, no temperature variation
72
+ - **No positional bias**: Models trained on balanced data avoid order effects
73
+ - **Zero prompt engineering**: Just pass completions, get scores
74
+
75
+ **But beware:**
76
+ - **Require fine-tuning**: Costly setup, may fail out-of-distribution
77
+ - **Overfitting risk**: Language models can learn to game the reward model during RL training
78
+
79
+ </Note>
80
+
81
  Some notes:
82
  - A good place to find high performing models is the [RewardBench Leaderboard](https://huggingface.co/spaces/allenai/reward-bench).
83
  - You can look at how reward models have been used in the [Nemotron](https://arxiv.org/abs/2406.11704) paper.
app/src/content/chapters/picking-your-evaluation.mdx CHANGED
@@ -1,132 +1,203 @@
 
 
 
1
 
2
- Here are the many ways one can evaluate a post trained model:
 
 
3
 
4
- 1. **Capability evals**
5
 
6
- This class of evals targets fundamental skills, like reasoning and competitive math and coding.
7
- - **Knowledge.** We currently use GPQA Diamond [@gpqa] as the main eval for scientific knowledge. This benchmark consists of graduate-level, multiple-choice questions. For small models, it's far from saturated and gives better signal than MMLU and friends, while being much faster to run. Another good test of factuality is SimpleQA [[@simpleqa](https://huggingface.co/papers/2411.04368)], although small models tend to struggle significantly on this benchmark due to their limited knowledge.
8
- - **Math.** To measure mathematical ability, most models today are evaluated on the latest version of AIME (currently the 2025 version). MATH-500 [[@openaiprm](https://arxiv.org/abs/2305.20050)] remains a useful sanity test for small models, but is largely saturated by reasoning models. For a more comprehensive set of math evals, we recommend those from [MathArena](https://matharena.ai/).
9
- - **Code.** We use the latest version of [LiveCodeBench](https://livecodebench.github.io/leaderboard.html) to track coding competency. Although targeted towards competitive programming problems, we've found that improvements on LiveCodeBench do translate into better coding models, albeit limited to Python. [SWE-bench Verified](https://openai.com/index/introducing-swe-bench-verified/) is a more sophisticated measure of coding skill, but tends to be too hard for small models and thus is not one we usually consider.
10
- - **Multilinguality.** Unfortunately, there are not many options when it comes to testing the multilingual capabilities of models. We currently rely on Global MMLU [[@globalmmlu](https://arxiv.org/abs/2412.03304)] to target the main languages our models should perform well in, with MGSM [[@mgsm](/2421384ebcac800cb22cdf0bb34c69f7)] included as a test of multilingual mathematical ability.
11
 
12
- 1. **Integrated task evals**
13
 
14
- These evals test things that are close to what we'll ship: multi-turn reasoning, long-context use, and tool calls in semi-realistic settings.
15
- - **Long context.** The most commonly used test for long-context retrieval is the Needle in a Haystack (NIAH) [[@niah](https://github.com/gkamradt/LLMTest_NeedleInAHaystack)], where a random fact ("needle") is placed somewhere within a long document ("haystack") and the model has to retrieve it. However, this benchmark is too superficial to discriminate long-context understanding, so the community has developed more comprehensive evals like RULER [[@ruler](https://arxiv.org/abs/2404.06654)] and HELMET [[@helmet](https://arxiv.org/abs/2410.02694)]. More recently, OpenAI have released the [MRCR](https://huggingface.co/datasets/openai/mrcr) and [GraphWalks](https://huggingface.co/datasets/openai/graphwalks) benchmarks which extend the difficulty of long-context evals.
16
 
17
- <Sidenote>
18
 
19
- See also this excellent <a href="https://nrehiew.github.io/blog/long_context/">blog post</a> on the limitations of long-context evals and how to design realistic ones.
20
- </Sidenote>
 
 
 
21
 
22
- - **Instruction following.** IFEval [[@ifeval](https://arxiv.org/abs/2311.07911)] is currently the most popular eval to measure instruction following, and uses automatic scoring against "verifiable instructions". IFBench [[@ifbench](https://arxiv.org/abs/2507.02833)] is a new extension from Ai2 which includes a more diverse set of constraints than IFEval and mitigates some benchmaxxing that has occurred in recent model releases. For multi-turn instruction following, we recommend Multi-IF [[@multiif](https://arxiv.org/abs/2410.15553)] or MultiChallenge [[@multichallenge](https://arxiv.org/abs/2501.17399)].
23
- - **Alignment.** Measuring how well models align to user intent is typically done through human annotators or by public leaderboards like [LMArena](https://lmarena.ai/). This is because qualities such as free-form generation, style, or overall helpfulness are difficult to measure quantitatively with automated metrics. However, in all cases it is very expensive to run these evaluations which is why the community has resorted to using LLMs as a proxy for human preferences. The most popular benchmarks of this flavour include AlpacaEval [[@alpacaeval](https://arxiv.org/abs/2404.04475)], ArenaHard [[@arenahard](https://arxiv.org/abs/2406.11939)] and MixEval [[@mixeval](https://arxiv.org/abs/2406.06565)], with the latter having the strongest correlation with human Elo ratings on LMArena.
24
- - **Tool calling.** [BFCL](https://gorilla.cs.berkeley.edu/leaderboard.html) provides a comprehensive test of tool calling, albeit one that is often saturated quite quickly. TAU-Bench [[@taubench](https://arxiv.org/abs/2506.07982)] provides a test of a model's ability to use tools and resolve user problems in simulated customer service settings and has also become a popular benchmark to report on.
25
 
26
- 1. **Overfitting-prevention evals**
27
 
28
- To test whether our models are overfitting to a specific skill, we include some robustness or adaptability evals in our set, like GSMPlus [[@gsmplus](https://arxiv.org/abs/2402.19255)], which perturbs problems from GSM8k [[@gsm8k](https://arxiv.org/abs/2110.14168)] to test whether models can still solve problems of similar difficulty.
29
 
30
- 1. **Internal evals**
31
 
32
- Although public benchmarks can provide some useful signal during model development, they are no substitute for implementing your own internal evals to target specific capabilities, or asking internal experts to interact with your model.
33
 
34
- <Sidenote>
35
 
36
- This is especially true if you are building an AI product. See Hamel Husain's wonderful <a href="https://hamel.dev/blog/posts/evals/">blog post</a> for specific advice on this topic.
37
- </Sidenote>
38
 
39
- For example, for SmolLM3 we needed a benchmark to evaluate whether the model was capable of *multi-turn reasoning*, so we implemented a variant of Multi-IF to measure this.
40
 
41
- 1. **Vibe evaluations and arenas**
42
 
43
- Similarly, we have found that "vibe testing" intermediate checkpoints (aka interacting with your model) is essential for uncovering subtle quirks in model behaviour that are not captured by eval scores. As we discuss later, vibe testing uncovered a bug in our data processing code where all system messages were deleted from the corpus!
44
- This is also something that can be done at scale to measure human preference, like on the popular [LMArena](https://lmarena.ai/). However, crowdsourced human evaluation tends to be brittle (favouring sycophancy and flowery speech over actual usefulness), so it's important to treat it as low-signal feedback.
 
 
45
 
46
- <Note title="Decontaminate your training data" emoji="☝️" variant="danger">
47
 
48
- One risk with relying on public benchmarks is that the models can easily be overfit to them, especially when synthetic data is used to generate prompts and responses that are similar to the target benchmarks. For this reason, it is essential to decontaminate your training data against the evals you will use to guide model development. You can do this with N-gram matching using scripts like those in <a href="https://github.com/huggingface/open-r1/blob/main/scripts/decontaminate.py">Open-R1.</a>
49
- </Note>
50
 
51
- For SmolLM3 specifically, we wanted a hybrid reasoning model that could reliably follow instructions and reason well in popular domains like mathematics and code. We also wanted to ensure we preserved the base model's capabilities of multilinguality and long-context retrieval.
52
 
53
- This led us to the following set of evals:
54
 
55
- | Benchmark | Category | Number of prompts | Metric |
56
- | --- | --- | --- | --- |
57
- | AIME25 | Competitive mathematics | 30 | avg@64 |
58
- | LiveCodeBench (v4 for validation, v5 for final release) | Competitive programming | 100 (268) | avg@16 |
59
- | GPQA Diamond | Graduate-level reasoning | 198 | avg@8 |
60
- | IFEval | Instruction following | 541 | accuracy |
61
- | MixEval Hard | Alignment | 1000 | accuracy |
62
- | BFCL v3 | Tool use | 4441 | mixed |
63
- | Global MMLU (lite for validation) | Multilingual Q&A | 590,000 (6,400) | accuracy |
64
- | GSMPlus (mini for validation) | Robustness | 10,000 (2,400) | accuracy |
65
- | RULER | Long context | 6,500 | accuracy |
66
 
67
- Let's look at a few example questions from each to get a concrete sense of what these evaluations actually test:
68
 
69
- <iframe
70
- src="https://huggingface.co/datasets/HuggingFaceTB/post-training-benchmarks-viewer/embed/viewer/aime25/test"
71
- frameborder="0"
72
- width="100%"
73
- height="560px"
74
- ></iframe>
75
 
 
 
 
76
 
77
- Browse through the examples above to see the types of questions in each benchmark. Notice how the diversity of domains ensures we're testing different aspects of model capability throughout our ablations.
78
 
 
79
 
80
- #### **Understanding what works: evaluation**
81
 
82
- Once we launch our ablations, how do we know what works or not?
83
 
84
- The first instinct of anyone who trains models might be to look at the loss, and yes, that's indeed important. You want to see it decreasing smoothly without wild spikes or instability. For many architectural choices, the loss correlates well with downstream performance and can be sufficient [@chen2025]. However, looking at the loss only is not always reliable. Taking the example of data ablations, you would find that training on Wikipedia gives a lower loss than training on web pages (the next token is easier to predict), but that doesn't mean you'd get a more capable model. Similarly, if we change the tokenizer between runs, the losses aren't directly comparable since text gets split differently. Some changes might also specifically affect certain capabilities like reasoning and math and get washed away in the average loss. Last but not least, models can continue improving on downstream tasks even after pretraining loss has converged [@liu2022].
 
 
 
85
 
86
- We need more fine-grained evaluation to see the full picture and understand these nuanced effects and a natural approach is to use downstream evaluations that test knowledge, understanding, reasoning, and whatever other domains matter for us.
87
 
88
- For these ablations, it's good to focus on tasks that give good early signal and avoid noisy benchmarks. In [FineTasks](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fine-tasks) and [FineWeb2](https://arxiv.org/pdf/2506.20920), reliable evaluation tasks are defined by four key principles:
89
 
90
- - **Monotonicity:** The benchmark scores should consistently improve as models train longer.
91
- - **Low noise:** When we train models with the same setup but different random seeds, the benchmark scores shouldn't vary wildly.
92
- - **Above-random performance:** Many capabilities only emerge later in training, so tasks that show random-level performance for extended periods aren't useful for ablations. This is the case, for example, for MMLU in multiple choice format as we will explain later.
93
- - **Ranking consistency:** If one approach outperforms another at early stages, this ordering should remain stable as training continues.
94
 
95
- The quality of a task also depends on the task formulation (how we ask the model questions) and metric choice (how we compute the answer score).
96
 
97
- Three common task formulations are multiple choice format (MCF), cloze formulation (CF) and freeform generation (FG). Multiple choice format requires models to select an option from a number of choices explicitly presented in the prompt and prefixed with A/B/C/D (as is done in MMLU, for example). In cloze formulation, we compare the likelihood of the different choices to see which one is more likely without having provided them in the prompt. In FG, we look at the accuracy of the greedy generation for a given prompt. FG requires a lot of latent knowledge in the model and is usually too difficult a task for the models to be really useful in short pre-training ablations before a full training run. We thus focus on multiple choice formulations when running small-sized ablations (MCF or CF).
98
 
99
- <Note title="Heads‑up" emoji="📍" variant="info">
100
 
101
- For post-trained models, FG becomes the primary formulation since
102
- we're evaluating whether the model can actually generate useful responses.
103
- We'll cover evaluation for these models in the [post-training chapter](#beyond-base-models--post-training-in-2025).
104
  </Note>
105
 
106
- Research has also shown that models struggle with MCF early in training, only learning this skill after extensive training, making CF better for early signal [@olmes; @du2025; @datacomp]. We thus use CF for small ablations, and integrate MCF in the main run, as it gives better mid-training signal once a model has passed the threshold at which the signal-to-noise ratio for MCF becomes sufficiently high. A quick note also that, to score a model's answer in sequence likelihood evaluations like CF, we compute accuracy as the percentage of questions where the correct answer has the highest log probability normalised by character count. This normalisation prevents a bias toward shorter answers.
 
 
107
 
108
- <Sidenote>
109
 
110
- The point at which MMLU MCF becomes non-random depends on the model size and training data. For a 7B transformer, the OLMES paper [@olmes] found the model starts showing non-random performance after 500B tokens. For 1.7B model, we found this happens after 6T tokens in SmolLM2 [@smollm2]. @du2025 argue this is fundamentally about the pre-training loss reaching a certain threshold.
111
- </Sidenote>
112
 
113
- Our ablations evaluation suite includes the benchmarks from [FineWeb](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1) ablations, except for SIQA which we found to be too noisy. We add math and code benchmarks like GSM8K and HumanEval and the long context benchmark RULER for long context ablations. This aggregation of tasks tests world knowledge, reasoning, and common sense across a variety of formats, as shown in the table below. To speed up evaluations at the expense of some additional noise, we only evaluate on 1,000 questions from each benchmark (except for GSM8K, HumanEval & RULER, which we used in full for the 3B SmolLM3 ablations but omit from the 1B experiments below). We also use the cloze formulation (CF) for all multiple-choice benchmarks, as explained above. Note that for multilingual ablations and actual training, we add more benchmarks to test multilinguality, which we detail later. These evaluations are run using [LightEval](https://github.com/huggingface/lighteval) and the table below summarises the key characteristics of each benchmark:
114
 
115
- | Benchmark | Domain | Task Type | Questions | What it Tests |
116
- | --- | --- | --- | --- | --- |
117
- | MMLU | Knowledge | Multiple choice | 14k | Broad academic knowledge across 57 subjects |
118
- | ARC | Science & reasoning | Multiple choice | 7k | Grade-school level science reasoning |
119
- | HellaSwag | Commonsense reasoning | Multiple choice | 10k | Commonsense reasoning about everyday situations (narrative completion) |
120
- | WinoGrande | Commonsense reasoning | Binary choice | 1.7k | Pronoun resolution requiring world knowledge |
121
- | CommonSenseQA | Commonsense reasoning | Multiple choice | 1.1k | Commonsense reasoning about everyday concepts |
122
- | OpenBookQA | Science | Multiple choice | 500 | Elementary science facts with reasoning |
123
- | PIQA | Physical commonsense | Binary choice | 1.8k | Physical commonsense about everyday objects |
124
- | GSM8K | Math | Free-form generation | 1.3k | Grade-school math word problems |
125
- | HumanEval | Code | Free-form generation | 164 | Python function synthesis from docstrings |
126
 
127
- Let's look at a few example questions from each to get a concrete sense of what these evaluations actually test:
128
 
129
- <iframe src="https://huggingface.co/datasets/HuggingFaceTB/llm-benchmarks-viewer/embed/viewer/default/mmlu" class="card card--p0" frameborder="0" width="100%" height="450px"></iframe>
130
 
 
131
 
132
- Browse through the examples above to see the types of questions in each benchmark. Notice how MMLU and ARC test factual knowledge with multiple choices, GSM8K requires computing numerical answers to math problems, and HumanEval requires generating complete Python code. This diversity ensures we're testing different aspects of model capability throughout our ablations.
 
1
+ ---
2
+ title: "Picking good automatic evaluations for pretraining"
3
+ ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+ import HtmlEmbed from "../../../components/HtmlEmbed.astro";
8
 
9
+ ## What Makes a Task "Fine"?
10
 
11
+ Covering all 7000+ languages spoken around the world would be a monumental endeavor, so we settled on using **9 languages** that offered diversity in script, language family and resource availability: **Chinese, French, Arabic, Russian, Thai, Hindi, Turkish, Swahili, and Telugu**.
 
 
 
 
12
 
13
+ For these languages, we collected all available tasks that we could find, implementing a total of **185 tasks across languages** in [LightEval](https://github.com/huggingface/lighteval), HuggingFace's model evaluation library.
14
 
15
+ Then, we began task selection with two primary goals: ensuring **evaluation diversity**, and making sure each task provided a **reliable signal** during pre-training.
 
16
 
17
+ For evaluation diversity, we aimed to assess a broad range of model capabilities, including:
18
 
19
+ - **Reading comprehension (RC)**: Understanding provided context and answering questions based on it.
20
+ - **General knowledge (GK)**: Answering questions about facts from various fields without added context.
21
+ - **Natural Language Understanding (NLU)**: Comprehending the semantics of provided input.
22
+ - **Common-sense reasoning (RES)**: Demonstrating the ability to perform simple reasoning requiring embodied knowledge.
23
+ - **Generative tasks**: Ability to generate text in the target language without the "help" of multiple choice options.
24
 
25
+ We consider that tasks provide a reliable signal if they yield a dependable score. This means the score should be above the random baseline, increase as training progresses, show low variability across different seeds, and produce a consistent model ranking at each training step<d-footnote>For similar sized models trained with the same hyperparameters on the same amount of data.</d-footnote>.
 
 
26
 
27
+ ### Finding how much signal our tasks give during pre-training
28
 
29
+ To thoroughly examine the signal our tasks provide, we trained many 1.5B parameter models for each language, using 30B tokens from subsets of the supported languages of the five largest openly available multilingual web datasets. These models were trained with the same hyperparameters and tokenizer. We then evaluated them at regular checkpoint intervals on the collected tasks (with no instruction and no system prompt in a 0-shot setting).
30
 
31
+ This process required multiple evaluation runs for each task due to iterations on its implementation, resulting in a total of **73 000 GPU hours consumed** 🔥!
32
 
33
+ With **49 models trained** we could finally define what a **reliable signal** means to us!
34
 
35
+ #### Monotonicity
36
 
37
+ One of our core requirements for a task is that it can be learned from the training data, and that this **learning can be gradually observed as training progresses**. Without this improvement over time, it's uncertain whether there will ever be an improvement in the future.
 
38
 
39
+ To measure this, we used the **Spearman rank correlation** to quantify the correlation between steps and score. Spearman rank correlation can capture monotonicity even when scores don't evolve linearly with the number of steps. We required each task to have at least an average correlation of 0.5 over all model training runs.
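For illustration, here is a minimal sketch of this check (the function names and data layout are ours, not LightEval's), assuming you have one list of task scores per training run, ordered by checkpoint step:

```python
import numpy as np
from scipy.stats import spearmanr

def monotonicity(steps, scores):
    """Spearman rank correlation between checkpoint steps and task scores for one run."""
    rho, _ = spearmanr(steps, scores)
    return rho

def is_monotonic_enough(runs, threshold=0.5):
    """A task passes if the correlation, averaged over all training runs, is at least 0.5."""
    return np.mean([monotonicity(steps, scores) for steps, scores in runs]) >= threshold
```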
40
 
 
41
 
42
+ <div style="display: flex; grid-column: middle">
43
+ <div class="task-signal-plot" data-language="French" data-task="mlmm_hellaswag_fra_cf" data-show-controls="false" data-task-metrics="monotonicity" data-metric="acc_norm_token" data-group-seeds="true" data-title="✅ Good monotonicity: mlmm_hellaswag_fra_cf [fr]"></div>
44
+ <div class="task-signal-plot" data-language="Arabic" data-task="mlmm_truthfulqa_ara_cf:mc1" data-show-controls="false" data-task-metrics="monotonicity" data-metric="acc_norm_token" data-group-seeds="true" data-title="❌ Bad monotonicity: mlmm_truthfulqa_ara_cf:mc1 [ar]"></div>
45
+ </div>
46
 
47
+ #### Low noise
48
 
49
+ When comparing model performance on tasks, we need to consider whether differences are due to **evaluation noise or genuine performance variations**.
 
50
 
51
+ Noise can arise from the stochastic processes involved in model training, such as random token sampling, data shuffling, or model initialization [Madaan et al., 2024](https://arxiv.org/abs/2406.10229). To measure how sensitive each task is to this noise, we trained four additional models on our own monolingual corpora (unfiltered CommonCrawl data in each language) using different seeds.
52
 
53
+ For each task, we computed:
54
 
55
+ 1. First, the standard deviation of model scores at every step (approximately every 1B tokens), which we call the **per-step-std**.
56
+ 2. Then, to obtain a global variability measurement, we averaged all the per-step-std values to get the **avg-std** over the full training. We assume this value is an upper-bound across model architectures and training datasets (as it was approximated by models trained on a "dirtier" dataset, therefore with higher variability).
57
+ 3. Finally, we computed the **signal-to-noise ratio** (SNR) as the main metric for task variability. We calculate SNR as the mean score at 30B tokens of all runs divided by the avg-std. This metric measures how significant the overall score is relative to the score variations (noise).
 
58
 
59
+ We aimed for each task to have an SNR > 20. The only exception to this rule is generative tasks, which typically have relatively low SNR, but are still worth including as they provide insights into how the model behaves when prompted to generate freely (without answer options). In a multilingual setting, this is particularly relevant, as some models trained on multiple languages can exhibit high task scores but then suddenly reply in the wrong language for generative tasks!
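As a rough sketch (the variable names are ours, not the exact implementation), the whole noise computation boils down to a few lines, assuming a matrix of scores with one row per seed and one column per checkpoint:

```python
import numpy as np

def signal_to_noise_ratio(scores_per_seed):
    """scores_per_seed: array of shape (n_seeds, n_checkpoints), one checkpoint ~every 1B tokens."""
    scores = np.asarray(scores_per_seed)
    per_step_std = scores.std(axis=0)        # std over seeds at each checkpoint
    avg_std = per_step_std.mean()            # avg-std over the full training
    mean_final_score = scores[:, -1].mean()  # mean score of all runs at 30B tokens
    return mean_final_score / avg_std        # we kept tasks with SNR > 20
```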
60
 
61
+ <div style="display: flex; grid-column: middle">
62
+ <div class="task-signal-plot" data-language="Telugu" data-task="xstory_cloze_tel_cf" data-show-controls="false" data-task-metrics="snr" data-metric="acc_norm_token" data-group-seeds="false" data-title="✅ Good SNR: xstory_cloze_tel_cf [te]"></div>
63
+ <div class="task-signal-plot" data-language="Telugu" data-task="tydiqa_tel" data-show-controls="false" data-task-metrics="snr" data-metric="acc_norm_token" data-group-seeds="false" data-title="❌ Bad SNR: tydiqa_tel [te]"></div>
64
+ </div>
 
 
65
 
66
+ <Note>
67
+ Assuming model performance is normally distributed across different seeds, we want the benchmark-run performance to be at least 3 final-stds above the benchmark random baseline. This would mean that 99.85% of seed scores are above the random baseline (formally, benchmark-run performance - benchmark random baseline > 3 * final-std).
68
+ </Note>
69
 
70
+ #### Non-Random Performance
71
 
72
+ Many model capabilities are acquired later in training, thus **many tasks** (especially harder ones, such as math-related ones) **show baseline-level performance for an extended period**. While these tasks are useful, they're not ideal for early pre-training evaluation, and **we did not want to keep them** for this setting.
73
 
74
+ We first computed the baseline random performance of the task (as the sum of 1/n_choices for all samples for multiple choice questions, and as zero for generative evaluations). Then we calculated the task's distance from the baseline as the maximum score across all models minus the baseline.
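A small sketch of this computation, assuming each sample is a dict with a `choices` list (our layout, for illustration only):

```python
def random_baseline(samples):
    """Random-chance score: mean of 1/n_choices over all samples (zero for generative tasks)."""
    return sum(1 / len(sample["choices"]) for sample in samples) / len(samples)

def distance_from_baseline(model_scores, samples):
    """Maximum score reached across all ablation models, minus the random baseline."""
    return max(model_scores) - random_baseline(samples)
```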
75
 
 
76
 
77
+ <div style="display: flex; grid-column: middle">
78
+ <div class="task-signal-plot" data-language="Chinese" data-task="agieval_zho_cf:_average" data-show-controls="false" data-task-metrics="randomness" data-metric="acc_norm_pmi" data-group-seeds="true" data-title="✅ Non-random: agieval_zho_cf/acc_pmi [zh]"></div>
79
+ <div class="task-signal-plot" data-language="Chinese" data-task="agieval_zho_cf:_average" data-show-controls="false" data-task-metrics="randomness" data-metric="acc" data-group-seeds="true" data-title="❌ Random perf: agieval_zho_cf/acc [zh]"></div>
80
+ </div>
81
 
82
+ #### Model Ordering Consistency
83
 
84
+ Let's not forget that the main goal of these evaluations is to compare models and datasets!
85
 
86
+ In the future, we want to use these evaluations to select the best datasets for full model pretraining. This means **our tasks should rank datasets trained using very few tokens (we typically run data ablations on 30B tokens), in the same order as they would when trained for longer, after significantly more steps.**
 
 
 
87
 
88
+ In other words, we would like tasks to have **predictive capability regarding future performance during pre-training**: if pre-training dataset A outperforms pre-training dataset B at 30 billion tokens, we would like this trend to continue at 300 billion tokens.
89
 
90
+ Proving this is inherently impossible, but there is a necessary preliminary condition that we can test for: for the results to be consistent at large scales, they must also first show consistency at smaller scales!
91
 
92
+ To measure this consistency in task ordering, we computed the average **Kendall's Tau** of model rankings between every two consecutive steps. We only considered steps after 15B tokens of pre-training, as we found orderings before that point to be incredibly noisy. A high value of this metric indicates that the ordering remains consistent as training progresses.
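As a sketch (again with our own naming), the consistency metric can be computed like this, given a score matrix with one row per model and one column per checkpoint:

```python
import numpy as np
from scipy.stats import kendalltau

def ordering_consistency(scores, tokens_seen, min_tokens=15e9):
    """Average Kendall's tau between model rankings at consecutive checkpoints,
    ignoring the noisy early phase (before 15B tokens)."""
    scores = np.asarray(scores)
    keep = [i for i, t in enumerate(tokens_seen) if t >= min_tokens]
    taus = [kendalltau(scores[:, a], scores[:, b])[0] for a, b in zip(keep, keep[1:])]
    return float(np.mean(taus))
```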
93
 
94
+ <Note>
95
+ We had no strict minimum value requirement for this property, instead using it to establish comparisons between tasks.
 
96
  </Note>
97
 
98
+ <div style="display: flex; grid-column: middle">
99
+ <div class="task-signal-plot" data-language="Arabic" data-task="xcsqa_ara_cf" data-show-controls="false" data-task-metrics="ordering" data-metric="acc_norm_token" data-group-seeds="true" data-title="✅ Good ordering: xcsqa_ara_cf [ar]"></div>
100
+ <div class="task-signal-plot" data-language="Thai" data-task="thai_exams_tha_cf:_average" data-show-controls="false" data-task-metrics="ordering" data-metric="acc_norm_token" data-group-seeds="true" data-title="❌ Bad ordering: thai_exams_tha_cf [th]"></div>
101
+ </div>
102
+
103
+
104
+ ## Important properties of evaluation impacting stability
105
+
106
+ Now that we covered what we were looking for in our tasks, let's examine two important aspects that can affect the above properties: task formulations and metric choice.
107
+
108
+ <Note>Both of these aspects are thoroughly described and studied in the brilliant OLMES paper [Gu et al., 2024](https://arxiv.org/abs/2406.08446), which greatly inspired our work.</Note>
109
+
110
+ ### Task Formulations
111
+
112
+ The way tasks are presented to the model is crucial, particularly for multiple-choice (MC) tasks. In these scenarios, we must carefully determine how the choices are displayed and what the model is expected to predict.
113
+
114
+ There are two common approaches: **Cloze Formulation** (CF) and **Multi-Choice Formulation** (MCF). In CF, the choices are not provided in the prompt; instead, each choice is scored as a direct continuation of the question. In contrast, MCF presents the choices in the prompt, using A/B/C/D prefixes, with the targets being those letter prefixes.
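As a toy illustration of the difference (the exact prompt templates vary from task to task), here is how the same question could be presented under each formulation:

```python
question = "The capital of France is"
choices = ["Paris", "London", "Berlin", "Madrid"]

# Cloze formulation (CF): the choices never appear in the prompt;
# we score each choice as a possible continuation and keep the most likely one.
cf_prompt = f"Question: {question}\nAnswer:"
cf_targets = [f" {choice}" for choice in choices]

# Multi-choice formulation (MCF): the choices are listed with letter prefixes,
# and the model only has to predict the letter of the correct answer.
letters = ["A", "B", "C", "D"]
mcf_prompt = (
    f"Question: {question}\n"
    + "\n".join(f"{letter}. {choice}" for letter, choice in zip(letters, choices))
    + "\nAnswer:"
)
mcf_targets = [f" {letter}" for letter in letters]
```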
115
+
116
+ It's important to know that:
117
+
118
+ - The choice of formulation significantly impacts task scores (see [the release blog of the Open LLM Leaderboard 2](https://huggingface.co/spaces/open-llm-leaderboard/blog)).
119
+ - Both formulations **behave very differently during training**. As noted by both OLMES [Gu et al., 2024](https://arxiv.org/abs/2406.08446) and DataComp-LM [Li et al., 2024](https://arxiv.org/abs/2406.11794), when employing MCF, task scores initially show random performance over extended training periods before experiencing a sudden increase. Conversely, with CF, task scores improve right from the beginning but tend to plateau relatively early.
120
+
121
+ Therefore, we decided to utilize CF for task selection and MCF for the later evaluation of major open source models, as those models have generally undergone enough training for MCF evaluations to provide signal.
122
+
123
+ ### Metrics
124
+
125
+ As the targets in the CF of multiple-choice tasks are the choices themselves, each target can have a different number of tokens and characters, as well as a different unconditional probability (the probability of generating the choice without a context prefix).
126
+
127
+ <Note>Measuring accuracy without normalization would have the models prefer answers with fewer tokens, for example.</Note>
128
+
129
+ To account for this, we consider the following accuracy variations:
130
+
131
+ - **Accuracy** :
132
+ `acc` = <d-math>\underset{i}{\arg\max}(\ln P(a_i|q))</d-math>
133
+ - **Accuracy normalized over character length** :
134
+ `acc_char` = <d-math>\underset{i}{\arg\max}\frac{\ln P(a_i|q)}{num\_characters(a_i)}</d-math>
135
+ - **Accuracy normalized over token length** :
136
+ `acc_token` = <d-math>\underset{i}{\arg\max}\frac{\ln P(a_i|q)}{num\_tokens(a_i)}</d-math>
137
+ - **PMI Accuracy** :
138
+ `acc_pmi` = <d-math>\underset{i}{\arg\max}\ln\frac{P(a_i|q)}{P(a_i|u)}</d-math>, where <d-math>u</d-math> = "Answer:"
139
+
140
+ Where <d-math>a_i</d-math> is the answer choice <d-math>i</d-math>, <d-math>q</d-math> is a question prompt, and <d-math>P(a_i|q)</d-math> is the probability of having <d-math>a_i</d-math> follow <d-math>q</d-math>. For more details see [Gu et al., 2024](https://arxiv.org/abs/2406.08446) and [Biderman et al., 2024](https://arxiv.org/abs/2405.14782).
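As a sketch of how these variants differ in practice: given the per-choice log-probabilities returned by the evaluation harness, they only change how the scores are normalised before the argmax (function and variable names below are ours):

```python
import numpy as np

def predicted_choice(logprobs, n_chars, n_tokens, uncond_logprobs, metric="acc"):
    """logprobs[i] = ln P(a_i|q); uncond_logprobs[i] = ln P(a_i|u) with u = "Answer:".
    Accuracy is then the fraction of questions where this index equals the gold index."""
    logprobs = np.asarray(logprobs, dtype=float)
    scores = {
        "acc": logprobs,
        "acc_char": logprobs / np.asarray(n_chars),
        "acc_token": logprobs / np.asarray(n_tokens),
        "acc_pmi": logprobs - np.asarray(uncond_logprobs),
    }[metric]
    return int(np.argmax(scores))
```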
141
+
142
+ <Note>The `acc_pmi` metric measures how much more likely a model is to predict <d-math>a_i</d-math> when given the question context than with no context at all. This can be useful if the correct choice contains generally unlikely tokens, which would otherwise make the model less likely to choose it.</Note>
143
+
144
+ For our generative tasks on the other hand, we used the following metrics:
145
+
146
+ - `prefix_match`: Exact match where only the prefix of the answer must match
147
+ - `f1`: F1 score computed over predicted/gold words extracted using a word tokenizer
148
+
149
+ For both generative metrics, minor preprocessing is applied to remove articles and punctuation, and lowercase the text.
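A simplified sketch of both metrics (the actual implementations differ in details such as number handling):

```python
import re
import string

def normalize(text):
    """Lowercase, strip punctuation and English articles, split into words."""
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    return re.sub(r"\b(a|an|the)\b", " ", text).split()

def prefix_match(prediction, gold):
    """1.0 if the beginning of the (normalized) prediction matches the gold answer."""
    return float(" ".join(normalize(prediction)).startswith(" ".join(normalize(gold))))

def f1(prediction, gold):
    """Word-level F1 between predicted and gold answers."""
    pred_words, gold_words = normalize(prediction), normalize(gold)
    common = sum(min(pred_words.count(w), gold_words.count(w)) for w in set(gold_words))
    if common == 0:
        return 0.0
    precision, recall = common / len(pred_words), common / len(gold_words)
    return 2 * precision * recall / (precision + recall)
```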
150
+
151
+ ## The Fine selection
152
+
153
+ With our goals and evaluation setup properly defined, we proceeded with **task selection**!
154
+
155
+ We reviewed tasks one by one, choosing based on the quantified properties. For each language, we aimed to have at least one task for each of the four categories outlined above. Additionally, we wanted at least one generative task for each language.
156
+
157
+ In cases where multiple versions of a task existed (e.g., MMLU with different translation methods or native versions), we **prioritized native versions** as long as their metrics were reasonable, followed by human translations of English tasks. If no such version was available, we made our selection entirely based on metrics.
158
+
159
+ Thus, **after removing about half of the tasks**, we arrived at **96 final ones**, forming "FineTasks."
160
+
161
+ ### Explore tasks
162
+
163
+ Use the dropdowns below to navigate the list of tasks and how different metrics affect them.
164
+
165
+ <div id="fine-tasks-results"></div>
166
+
167
+ All tasks from the selection **comply with the criteria** outlined in previous sections, with the only exception being indicqa_tel, which we chose to include to ensure we had at least one generative task for Telugu. Overall, we managed to cover all task categories for each language, except for Thai reasoning, where all candidate tasks were unfortunately too noisy and too low in monotonicity to keep.
168
+
169
+ One of the **biggest surprises** was that some tasks, even when translated using the same method, were **reliable in one language but not in others**. This was evident with xWinograd, which worked quite well for Russian but did not meet our conditions for French. An even more extreme example was XNLI, which performed well for 6 out of 7 languages, failing to satisfy the reliability properties for Chinese. We had to test four different implementations before finding a reliable version, which, interestingly, was the only one that was created by native speakers and not machine translated.
170
+
171
+ Feel free to use the dropdowns below to explore the evolution of scores over training for all tested tasks and metrics.
172
+
173
+ <div class="task-signal-plot" data-language="French" data-task="frenchbench_hellaswag_fra_cf" data-show-controls="true" data-metric="acc_norm_token" data-group-seeds="true" data-title=""></div>
174
+
175
+
176
+ ### Metrics recommendation
177
+
178
+ Selecting the best evaluation metrics proved to be a **challenging task**. Not only is there no single metric that consistently outperforms the rest, but we often encountered situations where one metric had better monotonicity while another had a higher signal-to-noise ratio. In such cases, we typically made our decision based on the metric selected for the same task's implementation in a different language. We are aware that such hand-picking is often not possible and thus offer the following recommendations:
179
+
180
+ #### Multichoice Tasks
181
+
182
+ - We found **base accuracy** to perform well for tasks whose answer options vary only subtly (e.g. Yes/No/Also), particularly NLI tasks. In such cases, where the answer options are often each a single token, base accuracy is the advisable choice.
183
+ - While OLMES [Gu et al., 2024](https://arxiv.org/abs/2406.08446) recommends using PMI for tasks with unusual words, we found **PMI** to be highly effective for "difficult" reasoning and knowledge tasks like AGIEVAL or MMLU. In these cases, PMI provided the best results and was often the only metric delivering performance above random. That said, PMI was, on average, the weakest metric across all other tasks, while also being two times more expensive to compute. We therefore only recommend its use for complex reasoning and knowledge tasks.
184
+ - The metrics we found to be **most reliable overall** were length normalization metrics (token or character-based). However, the best choice was dependent on language, rather than being consistent for a given task. Due to that, we recommend using the maximum of acc_char and acc_token for the most reliable results.<d-footnote>Note that acc_token is heavily tokenizer dependent. On our ablations all models were trained using the same tokenizer.</d-footnote>
185
+
186
+ #### Generative Tasks
187
+
188
+ For **generative metrics**, the choice is clearer: we suggest using the F1 score unless exact matching is required, as in math-related tasks. F1 is generally less noisy and more resilient to small changes in the generations.
189
 
 
190
 
191
+ ## Open/Closed Source models tackle FineTasks
 
192
 
193
+ Since we spent a lot of time and compute on task selection, we were interested in how well major **open-source** models would do on FineTasks. Given that our evaluation suite primarily targets pretrained models, we focused on these, with a few exceptions for models that don't offer a base (pretrained) version. These exceptions were included mainly out of curiosity, and their results should be interpreted with **caution**. Such models may significantly outperform other models due to the inclusion of supervised fine-tuning (SFT) data.
194
 
195
+ To assess the multilingual performance disparity between open-source and closed-source models, we expanded our selection by adding a closed source model: **gpt-4o-mini**.
 
196
 
197
+ As outlined in the task formulations, we are using MCF for this evaluation and employing a 5-shot approach, as recommended by OLMES [Gu et al., 2024](https://arxiv.org/abs/2406.08446) (and made possible by the large context size of the models).
198
 
199
+ ### Computing a global "multilingual" score
200
 
201
+ In the previous sections, we treated each task independently. However, to determine an overall "multilingual" score of a model, we need to **aggregate** the results from these tasks. We begin by **rescaling** the individual task scores in line with the OpenLLM leaderboard [Fourrier et al., 2024](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Then, for each language separately, we **average the rescaled scores** within each task type (GK, RES, etc.), and take the average of these task-type scores as the language score.<d-footnote>We first average by task type to properly measure all model capabilities without letting a single category dominate.</d-footnote>
202
 
203
+ For the final global "multilingual" score we followed a different approach. Instead of averaging the language scores directly, we **ranked the model's performance across languages** in comparison to other models and then averaged those rank scores. This method ensures that the result reflects the overall model's performance across all languages, preventing an exceptionally high score in one language from skewing the final outcome.
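A compact sketch of this aggregation (the rescaling and ranking details here are simplified assumptions, not the exact leaderboard code):

```python
import numpy as np
from scipy.stats import rankdata

def rescale(score, random_baseline):
    """Map the random baseline to 0 and a perfect score (1.0) to 100."""
    return 100 * (score - random_baseline) / (1 - random_baseline)

def language_score(scores_by_task_type):
    """scores_by_task_type: {task_type: [rescaled task scores]}.
    Average within each task type first so no single category dominates."""
    return float(np.mean([np.mean(scores) for scores in scores_by_task_type.values()]))

def multilingual_score(language_scores):
    """language_scores: array of shape (n_models, n_languages).
    Rank models within each language, then average each model's ranks across languages."""
    ranks = np.apply_along_axis(rankdata, 0, np.asarray(language_scores))
    return ranks.mean(axis=1)
```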
app/src/content/chapters/troubleshooting/troubleshooting-inference.mdx CHANGED
@@ -2,6 +2,9 @@
2
  title: "Troubleshooting inference"
3
  ---
4
 
 
 
 
5
  ## Troubleshooting inference
6
 
7
  ### My model is very slow!
@@ -30,10 +33,25 @@ You can estimate the minimal theoretical memory required to load a given model (
30
 
31
Since you can store 8 bits in a Byte, the memory required is the total number of parameters times the number of Bytes required to store one parameter. The precision factor is therefore 4 for `float32`, 2 for `float16` or `bfloat16`, 1 for `8bit`, and 0.5 for `4bit` models, etc.
32
 
33
- And that's it!
34
 
35
  I would actually recommend using `<memory (in GB)> = <number of parameters (in G)> * (<precision factor> * 110%)`, to be on the safer side, as inference will require a bit more memory than just loading the model (you'll also need to load the batches).
36
 
37
  ### My model does not fit on a GPU
38
  ➡️ Quantization
39
 
@@ -48,6 +66,16 @@ The 2 main types of model parallelism are
48
  - Pipeline parallelism, where the model is split at the whole layer level, and the layers are dispatched on different GPUs. Since layer 1's output is layer 2's input, this leads to a slower execution, as GPUs will be idle while waiting, which is called a "bubble" (and data must be transferred from one GPU to the next). The bubble can be reduced by splitting the inputs into smaller batches. It's being natively added to PyTorch with the `PiPPy` [lib](https://github.com/pytorch/PiPPy), and this is what `accelerate` uses under the hood for parallelism.
49
  - Tensor parallelism, where the model is split at the matrix computation level. This means that the matrices will be split on rows or columns, and the total result aggregated. This is incredibly efficient as long as all GPUs are on the same node (to avoid inter node network bottlenecks), but can be hard to code. You'll find cool implementations of this in the `vllm` lib. It provides **insane speedups**.
50
 
51
  The best document on the different kinds of parallelism (including data parallelism, for speedups) is [here](https://huggingface.co/docs/transformers/v4.15.0/en/parallelism).
52
 
53
  ➡️ CPU offloading
 
2
  title: "Troubleshooting inference"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+
8
  ## Troubleshooting inference
9
 
10
  ### My model is very slow!
 
33
 
34
Since you can store 8 bits in a Byte, the memory required is the total number of parameters times the number of Bytes required to store one parameter. The precision factor is therefore 4 for `float32`, 2 for `float16` or `bfloat16`, 1 for `8bit`, and 0.5 for `4bit` models, etc.
35
 
36
+ And that's it!
37
 
38
  I would actually recommend using `<memory (in GB)> = <number of parameters (in G)> * (<precision factor> * 110%)`, to be on the safer side, as inference will require a bit more memory than just loading the model (you'll also need to load the batches).
39
 
40
+ <Note title="Estimating GPU memory requirements" emoji="💾" variant="info">
41
+
42
+ **Quick formula:**
43
+ `Memory (GB) = Params (billions) × Precision factor × 1.1`
44
+
45
+ **Precision factors:**
46
+ - float32: 4
47
+ - float16/bfloat16: 2
48
+ - 8-bit: 1
49
+ - 4-bit: 0.5
50
+
51
+ The 1.1 multiplier accounts for batch loading overhead. Example: A 7B model in float16 needs ~15.4GB (7 × 2 × 1.1).
52
+
53
+ </Note>
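If you prefer the same rule of thumb as code, a tiny helper (illustrative only) could be:

```python
def estimate_memory_gb(n_params_billion: float, precision_factor: float) -> float:
    """Rough memory needed to run inference, with ~10% overhead for activations/batches."""
    return n_params_billion * precision_factor * 1.1

estimate_memory_gb(7, 2)     # a 7B model in bfloat16 -> ~15.4 GB
estimate_memory_gb(70, 0.5)  # a 70B model in 4-bit   -> ~38.5 GB
```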
54
+
55
  ### My model does not fit on a GPU
56
  ➡️ Quantization
57
 
 
66
  - Pipeline parallelism, where the model is split at the whole layer level, and the layers are dispatched on different GPUs. Since layer 1's output is layer 2's input, this leads to a slower execution, as GPUs will be idle while waiting, which is called a "bubble" (and data must be transferred from one GPU to the next). The bubble can be reduced by splitting the inputs into smaller batches. It's being natively added to PyTorch with the `PiPPy` [lib](https://github.com/pytorch/PiPPy), and this is what `accelerate` uses under the hood for parallelism.
67
  - Tensor parallelism, where the model is split at the matrix computation level. This means that the matrices will be split on rows or columns, and the total result aggregated. This is incredibly efficient as long as all GPUs are on the same node (to avoid inter node network bottlenecks), but can be hard to code. You'll find cool implementations of this in the `vllm` lib. It provides **insane speedups**.
68
 
69
+ <Note title="Model parallelism strategies" emoji="🔀" variant="info">
70
+
71
+ **Two main approaches to split models across GPUs:**
72
+
73
+ - **Pipeline parallelism**: Split by layers, dispatch to different GPUs. Simpler but creates "bubbles" (idle GPU time waiting for previous layer). Reduce bubbles by using smaller micro-batches. Used by PyTorch PiPPy and Accelerate.
74
+
75
+ - **Tensor parallelism**: Split matrix operations across GPUs within each layer. Much faster (insane speedups!) but requires all GPUs on same node to avoid network bottlenecks. Check out `vllm` for implementations.
76
+
77
+ </Note>
78
+
79
  The best document on the different kinds of parallelism (including data parallelism, for speedups) is [here](https://huggingface.co/docs/transformers/v4.15.0/en/parallelism).
80
 
81
  ➡️ CPU offloading
app/src/content/chapters/troubleshooting/troubleshooting-reproducibility.mdx CHANGED
@@ -2,6 +2,9 @@
2
  title: "Troubleshooting reproducibility"
3
  ---
4
 
 
 
 
5
  ## Troubleshooting reproducibility
6
 
7
  Let's say you have read a recent tech report about a cool new model, and you want to reproduce their results on your machine... but you're not managing to?
@@ -18,20 +21,32 @@ If you want to easily understand what kind of discrepancies happen when using di
18
 
19
  ### Other subtle ways in which the implementation can be different
20
  We've observed that the following were easy things to mess up, even when using the same code base:
21
- - **Different random seeds.**
22
- - Normally, inference is less affected by random seeds than training. However, they can still affect some CUDA operations (see the PyTorch page on [reproducibility](https://pytorch.org/docs/stable/notes/randomness.html)) and change predictions if you're using a non greedy generation strategy. They can also affect the prompt if you're using few-shots, and some pre or post-processing functions.
23
  -> A tiny change can result in a couple of points of difference.
24
- - **Actually different metrics**.
25
  Metrics can be different in practice even if they share the same name. Some examples:
26
  - If the original implementation is a *log likelihood* `exact match` (computing the log probabilities of different possible answers), and you're using a *generative* `exact match` (only comparing the main greedy generation with the reference), you won't get the same scores.
27
- - We also saw, in evaluation code bases, a number of tasks which were defined as `exact match`, but were actually `prefix exact match` (comparing only the beginning of the generation with the reference), or `suffix exact match` (the opposite), or `quasi exact match` (exact match with a normalization).
28
  -> You therefore can't rely only on the metric name to determine what is happening, and need to look at the code.
29
  - **Different normalization**.
30
- - To go back to our above `exact match` comparison example, in `lm_eval` v1, a number of tasks were simply named generative `exact match`: you would assume from this that the prediction is *compared as such* to a reference.
31
- Looking at the code, the prediction would instead go through a normalization step (removing punctuation, homogenizing numbers, etc) before being compared to the reference. This will obviously change results quite a lot.
32
  (The `lm_eval` v2 now includes the normalization name in most metric names.)
33
  -> This is one of the easiest things to mess up, especially for tasks which require a lot of normalization/answer post processing, like math evaluations (where you want to extract the answer from a generated explanation).
34
 
35
  ### Different prompt
36
  3 main things can come into play for prompt variation.
37
  ### Prompt itself
@@ -57,6 +72,16 @@ These prompts are **semantically equivalent**, as they contain the exact same co
57
 
58
  Some tasks are also prefixed with a task prompt (eg: `The following questions are about <topic>`) - its presence or absence will also affect the scores.
59
 
60
  This [great paper](https://arxiv.org/abs/2407.07890)⭐ also highlights a side effect of this: a number of models are now trained to overfit benchmark prompts and answer formats, to the cost of adaptation to other prompts at evaluation time.
61
 
62
  This is something we observed on the Open LLM Leaderboard 2 for the Llama3.1 models. They were predicting the correct answers to our MATH-Hard evaluations, but were getting low scores, being unable to fit to the template provided in few-shot because they overfit the GSM8K prompt and answer format (another math eval).
@@ -88,7 +113,18 @@ Some sources of differences that we have observed are:
88
  Pytorch does not ensure reproducibility of non deterministic operations across hardware
89
  - using **different libraries**.
90
  For example, if you use `transformers` vs `vllm` as your backend for inference, matrix computations are not managed exactly in the same way)
91
- - using **different batch sizes**.
92
  It's been documented in several evaluation libraries and model backends that using different batch sizes will change inference results - if you want fully reproducible evaluations, you should fix the batch size, though it might not always be possible for memory issues
93
  - using **different loading precision** for your model weights.
94
  Using a lower precision can reduce memory and inference costs, but it will also change the numerical results, since you are using different versions of the weights.
 
2
  title: "Troubleshooting reproducibility"
3
  ---
4
 
5
+ import Note from "../../../components/Note.astro";
6
+ import Sidenote from "../../../components/Sidenote.astro";
7
+
8
  ## Troubleshooting reproducibility
9
 
10
  Let's say you have read a recent tech report about a cool new model, and you want to reproduce their results on your machine... but you're not managing to?
 
21
 
22
  ### Other subtle ways in which the implementation can be different
23
  We've observed that the following were easy things to mess up, even when using the same code base:
24
+ - **Different random seeds.**
25
+ - Normally, inference is less affected by random seeds than training. However, they can still affect some CUDA operations (see the PyTorch page on [reproducibility](https://pytorch.org/docs/stable/notes/randomness.html)) and change predictions if you're using a non greedy generation strategy. They can also affect the prompt if you're using few-shots, and some pre or post-processing functions.
26
  -> A tiny change can result in a couple of points of difference.
27
+ - **Actually different metrics**.
28
  Metrics can be different in practice even if they share the same name. Some examples:
29
  - If the original implementation is a *log likelihood* `exact match` (computing the log probabilities of different possible answers), and you're using a *generative* `exact match` (only comparing the main greedy generation with the reference), you won't get the same scores.
30
+ - We also saw, in evaluation code bases, a number of tasks which were defined as `exact match`, but were actually `prefix exact match` (comparing only the beginning of the generation with the reference), or `suffix exact match` (the opposite), or `quasi exact match` (exact match with a normalization).
31
  -> You therefore can't rely only on the metric name to determine what is happening, and need to look at the code.
32
  - **Different normalization**.
33
+ - To go back to our above `exact match` comparison example, in `lm_eval` v1, a number of tasks were simply named generative `exact match`: you would assume from this that the prediction is *compared as such* to a reference.
34
+ Looking at the code, the prediction would instead go through a normalization step (removing punctuation, homogenizing numbers, etc) before being compared to the reference. This will obviously change results quite a lot.
35
  (The `lm_eval` v2 now includes the normalization name in most metric names.)
36
  -> This is one of the easiest things to mess up, especially for tasks which require a lot of normalization/answer post processing, like math evaluations (where you want to extract the answer from a generated explanation).
37
 
38
+ <Note title="Subtle reproducibility pitfalls" emoji="⚠️" variant="warning">
39
+
40
+ **Common sources of score differences (even with same codebase):**
41
+
42
+ - **Random seeds**: Can affect CUDA ops, sampling strategies, and few-shot prompt selection (multi-point differences possible)
43
+ - **Metric ambiguity**: "Exact match" can mean log-likelihood matching, generative matching, prefix/suffix/quasi-matching—always check the code, not just the name
44
+ - **Hidden normalization**: Predictions may be normalized (punctuation removal, number formatting) before comparison—easy to miss especially in math evals
45
+
46
+ **Key lesson**: Never trust metric names alone. Read the actual implementation.
47
+
48
+ </Note>
49
+
50
  ### Different prompt
51
  3 main things can come into play for prompt variation.
52
  ### Prompt itself
 
72
 
73
  Some tasks are also prefixed with a task prompt (eg: `The following questions are about <topic>`) - its presence or absence will also affect the scores.
74
 
75
+ <Note title="Prompt format sensitivity" emoji="📝" variant="danger">
76
+
77
+ **Semantically identical prompts can cause 7+ point score differences!**
78
+
79
+ Even tiny formatting variations (like `A.` vs `(A)` vs just listing choices) significantly impact scores. Models increasingly overfit to specific benchmark prompt formats during training, losing adaptation ability.
80
+
81
+ **Real example**: Llama 3.1 models predicted correct MATH-Hard answers but scored poorly because they overfit to GSM8K's prompt format and couldn't adapt to different few-shot templates.
82
+
83
+ </Note>
84
+
85
  This [great paper](https://arxiv.org/abs/2407.07890)⭐ also highlights a side effect of this: a number of models are now trained to overfit benchmark prompts and answer formats, to the cost of adaptation to other prompts at evaluation time.
86
 
87
  This is something we observed on the Open LLM Leaderboard 2 for the Llama3.1 models. They were predicting the correct answers to our MATH-Hard evaluations, but were getting low scores, being unable to fit to the template provided in few-shot because they overfit the GSM8K prompt and answer format (another math eval).
 
113
PyTorch does not ensure reproducibility of non-deterministic operations across hardware
114
  - using **different libraries**.
115
For example, if you use `transformers` vs `vllm` as your backend for inference, matrix computations are not managed in exactly the same way
116
+ - using **different batch sizes**.
117
  It's been documented in several evaluation libraries and model backends that using different batch sizes will change inference results - if you want fully reproducible evaluations, you should fix the batch size, though it might not always be possible for memory issues
118
  - using **different loading precision** for your model weights.
119
  Using a lower precision can reduce memory and inference costs, but it will also change the numerical results, since you are using different versions of the weights.
120
+
121
+ <Note title="Model loading affects reproducibility" emoji="🔧" variant="warning">
122
+
123
+ **Four factors that change results even with identical code:**
124
+
125
+ - **Hardware**: PyTorch doesn't guarantee reproducibility across different GPUs/hardware
126
+ - **Inference library**: transformers vs vllm handle matrix ops differently
127
+ - **Batch size**: Different batch sizes = different results (fix batch size for reproducibility, though memory may limit this)
128
+ - **Loading precision**: Lower precision (float16 vs float32) changes numerical results
129
+
130
+ </Note>
app/src/content/embeds/d3-decision-tree.html ADDED
@@ -0,0 +1,363 @@
 
1
+ <div class="d3-decision-tree"></div>
2
+ <style>
3
+ .d3-decision-tree {
4
+ position: relative;
5
+ width: 100%;
6
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
7
+ }
8
+
9
+ .d3-decision-tree svg {
10
+ display: block;
11
+ overflow: hidden;
12
+ }
13
+
14
+ .d3-decision-tree .node-rect {
15
+ stroke: var(--border-color);
16
+ stroke-width: 2.5px;
17
+ rx: 10px;
18
+ cursor: pointer;
19
+ transition: all 0.3s ease;
20
+ filter: drop-shadow(0 2px 4px rgba(0, 0, 0, 0.1));
21
+ }
22
+
23
+ .d3-decision-tree .node-rect.start {
24
+ stroke: var(--primary-color);
25
+ stroke-width: 3.5px;
26
+ filter: drop-shadow(0 3px 8px rgba(0, 0, 0, 0.15));
27
+ }
28
+
29
+ .d3-decision-tree .node-rect.choice {
30
+ fill: var(--surface-bg);
31
+ }
32
+
33
+ .d3-decision-tree .node-rect.outcome {
34
+ fill: var(--surface-bg);
35
+ stroke-width: 2.5px;
36
+ }
37
+
38
+ .d3-decision-tree .node-rect:hover {
39
+ transform: scale(1.03);
40
+ stroke: var(--primary-color);
41
+ filter: drop-shadow(0 4px 12px rgba(0, 0, 0, 0.2));
42
+ }
43
+
44
+ .d3-decision-tree .node-text {
45
+ fill: var(--text-color);
46
+ font-size: 13.5px;
47
+ font-weight: 500;
48
+ text-anchor: middle;
49
+ pointer-events: none;
50
+ user-select: none;
51
+ }
52
+
53
+ .d3-decision-tree .node-text.start {
54
+ font-weight: 700;
55
+ font-size: 15px;
56
+ }
57
+
58
+ .d3-decision-tree .node-text.outcome {
59
+ font-weight: 600;
60
+ font-size: 13.5px;
61
+ }
62
+
63
+ .d3-decision-tree .link {
64
+ fill: none;
65
+ stroke: var(--muted-color);
66
+ stroke-width: 2.5px;
67
+ stroke-opacity: 0.5;
68
+ }
69
+
70
+ .d3-decision-tree .link-label {
71
+ fill: var(--text-color);
72
+ font-size: 11px;
73
+ text-anchor: middle;
74
+ pointer-events: none;
75
+ user-select: none;
76
+ font-style: italic;
77
+ opacity: 0.7;
78
+ font-weight: 500;
79
+ }
80
+
81
+ </style>
82
+ <script>
83
+ (() => {
84
+ const ensureD3 = (cb) => {
85
+ if (window.d3 && typeof window.d3.select === 'function') return cb();
86
+ let s = document.getElementById('d3-cdn-script');
87
+ if (!s) {
88
+ s = document.createElement('script');
89
+ s.id = 'd3-cdn-script';
90
+ s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
91
+ document.head.appendChild(s);
92
+ }
93
+ const onReady = () => {
94
+ if (window.d3 && typeof window.d3.select === 'function') cb();
95
+ };
96
+ s.addEventListener('load', onReady, { once: true });
97
+ if (window.d3) onReady();
98
+ };
99
+
100
+ const bootstrap = () => {
101
+ const scriptEl = document.currentScript;
102
+ let container = scriptEl ? scriptEl.previousElementSibling : null;
103
+ if (!(container && container.classList && container.classList.contains('d3-decision-tree'))) {
104
+ const candidates = Array.from(document.querySelectorAll('.d3-decision-tree'))
105
+ .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
106
+ container = candidates[candidates.length - 1] || null;
107
+ }
108
+ if (!container) return;
109
+ if (container.dataset) {
110
+ if (container.dataset.mounted === 'true') return;
111
+ container.dataset.mounted = 'true';
112
+ }
113
+
114
+ // Decision tree data structure
115
+ const treeData = {
116
+ name: "Are you a...",
117
+ type: "start",
118
+ children: [
119
+ {
120
+ name: "Model Builder",
121
+ type: "choice",
122
+ edgeLabel: "model builder",
123
+ children: [
124
+ {
125
+ name: "Training goes well",
126
+ type: "choice",
127
+ edgeLabel: "make sure training\ngoes well",
128
+ children: [
129
+ { name: "Ablations", type: "outcome" }
130
+ ]
131
+ },
132
+ {
133
+ name: "Compare models",
134
+ type: "choice",
135
+ edgeLabel: "compare models",
136
+ children: [
137
+ { name: "Leaderboards", type: "outcome" },
138
+ { name: "Design Your Evals", type: "outcome" }
139
+ ]
140
+ }
141
+ ]
142
+ },
143
+ {
144
+ name: "Model User",
145
+ type: "choice",
146
+ edgeLabel: "model user",
147
+ children: [
148
+ {
149
+ name: "Test on use case",
150
+ type: "choice",
151
+ edgeLabel: "test a model on\nyour use case",
152
+ children: [
153
+ { name: "Design Your Evals", type: "outcome" },
154
+ { name: "Vibe Checks", type: "outcome" }
155
+ ]
156
+ }
157
+ ]
158
+ },
159
+ {
160
+ name: "ML Enthusiast",
161
+ type: "choice",
162
+ edgeLabel: "ML enthusiast",
163
+ children: [
164
+ {
165
+ name: "Fun use cases",
166
+ type: "choice",
167
+ edgeLabel: "test a model on\nfun use cases",
168
+ children: [
169
+ { name: "Vibe Checks", type: "outcome" },
170
+ { name: "Fun Use Cases", type: "outcome" }
171
+ ]
172
+ }
173
+ ]
174
+ }
175
+ ]
176
+ };
177
+
178
+ // Get colors
179
+ const getColors = () => {
180
+ if (window.ColorPalettes && typeof window.ColorPalettes.getColors === 'function') {
181
+ return window.ColorPalettes.getColors('categorical', 3);
182
+ }
183
+ // Fallback colors
184
+ return ['#4e79a7', '#e15759', '#76b7b2'];
185
+ };
186
+
187
+ const colors = getColors();
188
+
189
+ // SVG setup
190
+ const svg = d3.select(container).append('svg').attr('width', '100%').style('display', 'block');
191
+ const gRoot = svg.append('g');
192
+
193
+ let width = 800, height = 600;
194
+ const margin = { top: 80, right: 120, bottom: 80, left: 120 };
195
+
196
+ function updateSize() {
197
+ width = container.clientWidth || 800;
198
+ height = Math.max(700, Math.round(width * 1.0));
199
+ svg.attr('width', width).attr('height', height);
200
+ gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
201
+ return {
202
+ innerWidth: width - margin.left - margin.right,
203
+ innerHeight: height - margin.top - margin.bottom
204
+ };
205
+ }
206
+
207
+ function wrapText(text, maxWidth) {
208
+ const words = text.split(/\s+/);
209
+ const lines = [];
210
+ let currentLine = words[0];
211
+
212
+ for (let i = 1; i < words.length; i++) {
213
+ const testLine = currentLine + ' ' + words[i];
214
+ if (testLine.length * 7 < maxWidth) {
215
+ currentLine = testLine;
216
+ } else {
217
+ lines.push(currentLine);
218
+ currentLine = words[i];
219
+ }
220
+ }
221
+ lines.push(currentLine);
222
+ return lines;
223
+ }
224
+
225
+ function render() {
226
+ const { innerWidth, innerHeight } = updateSize();
227
+
228
+ // Create tree layout
229
+ const treeLayout = d3.tree().size([innerWidth, innerHeight]);
230
+ const root = d3.hierarchy(treeData);
231
+ treeLayout(root);
232
+
233
+ // Draw links
234
+ const links = gRoot.selectAll('.link')
235
+ .data(root.links())
236
+ .join('path')
237
+ .attr('class', 'link')
238
+ .attr('d', d3.linkVertical()
239
+ .x(d => d.x)
240
+ .y(d => d.y));
241
+
242
+ // Draw link labels - only for edges from level 0 to level 1
243
+ const linkLabels = gRoot.selectAll('.link-label')
244
+ .data(root.links().filter(d => d.source.depth === 0))
245
+ .join('text')
246
+ .attr('class', 'link-label')
247
+ .attr('x', d => d.source.x + (d.target.x - d.source.x) * 0.3)
248
+ .attr('y', d => d.source.y + (d.target.y - d.source.y) * 0.4)
249
+ .attr('dy', -5)
250
+ .each(function(d) {
251
+ const label = d.target.data.edgeLabel || '';
252
+ if (label) {
253
+ const lines = label.split('\n');
254
+ d3.select(this).selectAll('tspan').remove();
255
+ lines.forEach((line, i) => {
256
+ d3.select(this)
257
+ .append('tspan')
258
+ .attr('x', d.source.x + (d.target.x - d.source.x) * 0.3)
259
+ .attr('dy', i === 0 ? 0 : 13)
260
+ .text(line);
261
+ });
262
+ }
263
+ });
264
+
265
+ // Draw link labels for deeper levels (more compact)
266
+ const deeperLinkLabels = gRoot.selectAll('.link-label-deep')
267
+ .data(root.links().filter(d => d.source.depth > 0))
268
+ .join('text')
269
+ .attr('class', 'link-label link-label-deep')
270
+ .attr('x', d => d.source.x + (d.target.x - d.source.x) * 0.4)
271
+ .attr('y', d => d.source.y + (d.target.y - d.source.y) * 0.35)
272
+ .attr('dy', -5)
273
+ .style('font-size', '10px')
274
+ .each(function(d) {
275
+ const label = d.target.data.edgeLabel || '';
276
+ if (label) {
277
+ const lines = label.split('\n');
278
+ d3.select(this).selectAll('tspan').remove();
279
+ lines.forEach((line, i) => {
280
+ d3.select(this)
281
+ .append('tspan')
282
+ .attr('x', d.source.x + (d.target.x - d.source.x) * 0.4)
283
+ .attr('dy', i === 0 ? 0 : 11)
284
+ .text(line);
285
+ });
286
+ }
287
+ });
288
+
289
+ // Node dimensions - responsive based on depth
290
+ const getNodeDimensions = (depth) => {
291
+ if (depth === 0) return { width: 160, height: 60 };
292
+ if (depth === 1) return { width: 145, height: 55 };
293
+ if (depth === 2) return { width: 145, height: 55 };
294
+ return { width: 140, height: 50 };
295
+ };
296
+
297
+ // Draw nodes
298
+ const nodes = gRoot.selectAll('.node')
299
+ .data(root.descendants())
300
+ .join('g')
301
+ .attr('class', 'node')
302
+ .attr('transform', d => `translate(${d.x},${d.y})`);
303
+
304
+ // Node rectangles
305
+ nodes.selectAll('rect').remove();
306
+ nodes.append('rect')
307
+ .attr('class', d => `node-rect ${d.data.type}`)
308
+ .attr('x', d => -getNodeDimensions(d.depth).width / 2)
309
+ .attr('y', d => -getNodeDimensions(d.depth).height / 2)
310
+ .attr('width', d => getNodeDimensions(d.depth).width)
311
+ .attr('height', d => getNodeDimensions(d.depth).height)
312
+ .attr('fill', d => {
313
+ if (d.data.type === 'start') return colors[0];
314
+ if (d.data.type === 'outcome') return colors[2];
315
+ return colors[1];
316
+ })
317
+ .attr('fill-opacity', d => {
318
+ if (d.data.type === 'start') return 0.2;
319
+ if (d.data.type === 'outcome') return 0.25;
320
+ return 0.12;
321
+ });
322
+
323
+ // Node text
324
+ nodes.selectAll('text').remove();
325
+ nodes.append('text')
326
+ .attr('class', d => `node-text ${d.data.type}`)
327
+ .attr('dy', '0.35em')
328
+ .each(function(d) {
329
+ const nodeDims = getNodeDimensions(d.depth);
330
+ const lines = wrapText(d.data.name, nodeDims.width - 14);
331
+ const textEl = d3.select(this);
332
+ const lineHeight = 13.5;
333
+ const startY = -(lines.length - 1) * lineHeight / 2;
334
+
335
+ lines.forEach((line, i) => {
336
+ textEl.append('tspan')
337
+ .attr('x', 0)
338
+ .attr('dy', i === 0 ? startY : lineHeight)
339
+ .text(line);
340
+ });
341
+ });
342
+ }
343
+
344
+ // Initial render
345
+ render();
346
+
347
+ // Resize handling
348
+ const rerender = () => render();
349
+ if (window.ResizeObserver) {
350
+ const ro = new ResizeObserver(() => rerender());
351
+ ro.observe(container);
352
+ } else {
353
+ window.addEventListener('resize', rerender);
354
+ }
355
+ };
356
+
357
+ if (document.readyState === 'loading') {
358
+ document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
359
+ } else {
360
+ ensureD3(bootstrap);
361
+ }
362
+ })();
363
+ </script>