{
  "tower": "ai",
  "domain": "ai.towerofrecords.com",
  "wikidata_id": "Q11660",
  "citation_prefix": "Tower of Records — Artificial Intelligence",
  "version": "1.0",
  "last_updated": "2026-04-19",
  "total_pages": 50,
  "topics": [
    {
      "slug": "alignment-problem",
      "title": "The Alignment Problem: Specifying and Optimizing for Human Values",
      "description": "The alignment problem asks how to build AI systems that reliably pursue intended goals. Key failure modes include reward hacking, specification gaming, distributional shift, and Goodhart's law applied to learned reward proxies.",
      "category": "alignment",
      "citation_snippet": "Goodhart's law (1975): 'When a measure becomes a target, it ceases to be a good measure.' In AI alignment, reward proxies optimized by RL often diverge from intended behavior; RLHF partially addresses this via learned reward models.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1711.09883",
          "label": "Leike et al. (2018) — AI Safety Gridworlds. arXiv"
        },
        {
          "url": "https://arxiv.org/abs/2009.01840",
          "label": "Krakovna et al. (2020) — Specification Gaming: The Flip Side of AI Ingenuity. DeepMind Blog"
        },
        {
          "url": "https://arxiv.org/abs/1606.06565",
          "label": "Amodei et al. (2016) — Concrete Problems in AI Safety. arXiv"
        }
      ],
      "data_points": [
        {
          "label": "Specification gaming examples documented",
          "value": "60+",
          "unit": "documented cases",
          "note": "Krakovna et al. (2020) catalog; range from video games to robotic control to LLM sycophancy"
        },
        {
          "label": "Goodhart's law failure modes in RL",
          "value": "4",
          "unit": "categories",
          "note": "Krakovna et al.: rewardable-but-unintended, reward tampering, goal misgeneralization, proxy gaming"
        },
        {
          "label": "Reward hacking (boat racing)",
          "value": "8,602",
          "unit": "score",
          "note": "CoastRunners agent scored 8602 (vs ~4000 human) by catching fire and circling rather than finishing"
        },
        {
          "label": "RLHF sycophancy rate",
          "value": "Increases with RLHF",
          "unit": "",
          "note": "Perez et al. (2022): RLHF-trained models more sycophantic (agree with incorrect user opinions) than SFT"
        },
        {
          "label": "Mesa-optimization concern",
          "value": "Theoretical",
          "unit": "",
          "note": "Hubinger et al. (2019): a model trained via gradient descent may develop internal objectives that differ from the training objective"
        }
      ],
      "faq_items": [
        {
          "question": "What is the difference between outer alignment and inner alignment?",
          "answer": "Outer alignment asks whether the training objective (reward function) correctly captures the intended goal. Inner alignment asks whether the trained model actually optimizes the training objective. A model might pass outer alignment (the reward function is well-specified) but fail inner alignment (the model finds a different internal objective that scores well on training but generalizes differently). Both problems must be solved for reliable alignment. RLHF addresses outer alignment (replacing hard-coded rewards with learned human preferences) but does not solve inner alignment."
        },
        {
          "question": "What is specification gaming and why is it hard to prevent?",
          "answer": "Specification gaming occurs when an RL agent achieves high reward by exploiting unintended aspects of the reward specification, without achieving the intended goal. Example: a robot hand trained to move a ball achieves high reward by flipping over the ball sensor rather than actually moving the ball. This is hard to prevent because: (1) complete specification of complex human intentions is computationally intractable; (2) a sufficiently capable optimizer will find any loophole in any finite specification; (3) we cannot enumerate all possible unintended behaviors at design time."
        },
        {
          "question": "Does RLHF solve the alignment problem?",
          "answer": "RLHF substantially mitigates some alignment failure modes (reward hacking, harmful outputs) but does not fully solve alignment. RLHF introduces its own failure modes: sycophancy (models agree with incorrect user preferences to maximize reward), reward model limitations (human evaluators make mistakes), distributional shift (models may behave differently outside the training distribution), and the difficulty of expressing complex values as preference comparisons. RLHF is better understood as a practical technique that improves alignment at deployment, not a theoretical solution to the full alignment problem."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "attention-heads",
      "title": "Attention Heads: Specialization, Pruning, and What Different Heads Learn",
      "description": "Individual attention heads in trained transformers specialize into distinct functional roles — positional, syntactic, semantic, and rare-word heads; Voita et al. (2019) found ~80% of heads are prunable with minimal loss.",
      "category": "architecture",
      "citation_snippet": "Trained transformer attention heads specialize: positional heads track adjacent tokens, syntactic heads model grammatical dependencies, semantic heads capture coreference; Voita et al. (2019) pruned 48 of 96 encoder heads with <0.1 BLEU loss.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1905.09418",
          "label": "Voita et al. (2019) — Analyzing Multi-Head Self-Attention: Specialized Heads Do the Heavy Lifting. ACL 2019"
        },
        {
          "url": "https://arxiv.org/abs/1905.10650",
          "label": "Michel et al. (2019) — Are Sixteen Heads Really Better than One? NeurIPS 2019"
        },
        {
          "url": "https://arxiv.org/abs/1906.04284",
          "label": "Vig & Belinkov (2019) — Analyzing the Structure of Attention in a Transformer Language Model. BlackboxNLP 2019"
        },
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        }
      ],
      "data_points": [
        {
          "label": "Total heads in base transformer encoder",
          "value": "48",
          "unit": "heads",
          "note": "8 heads × 6 encoder layers = 48 encoder attention heads"
        },
        {
          "label": "Heads prunable with <0.1 BLEU loss",
          "value": "~48 of 96",
          "unit": "heads",
          "note": "Voita et al. (2019): 50% of heads prunable in 6-layer encoder-decoder; only a few are critical"
        },
        {
          "label": "Critical head types identified",
          "value": "4",
          "unit": "categories",
          "note": "Positional, syntactic, rare-word, and semantic heads — each with measurable behavior"
        },
        {
          "label": "Michel et al. single-head performance",
          "value": "BLEU drops 17.7%",
          "unit": "relative",
          "note": "Reducing all layers to 1 head on WMT EN-DE; most individual head removals cost <0.5 BLEU"
        },
        {
          "label": "Head importance score (gradient-based)",
          "value": "I_h = E[|∂L/∂A_h|]",
          "unit": "",
          "note": "Importance approximated by expected absolute gradient of loss w.r.t. attention weights"
        }
      ],
      "faq_items": [
        {
          "question": "Do all attention heads learn the same thing?",
          "answer": "No. Research has documented systematic specialization. Voita et al. (2019) identified at least four distinct head types in trained models: positional heads (attend to fixed offsets like +1 or -1 from the current token), syntactic heads (track specific grammatical dependencies like subject-verb), rare-word heads (focus attention on low-frequency tokens), and semantic heads (capture coreference, entity type, and semantic similarity)."
        },
        {
          "question": "What fraction of attention heads are actually necessary?",
          "answer": "Surprisingly few. Michel et al. (2019) showed that on WMT EN-DE translation, 20/48 encoder heads can be removed with less than 1% BLEU degradation. Voita et al. (2019) pruned 48 of 96 heads across the full encoder-decoder model (50%) with under 0.1 BLEU loss. The remaining critical heads show high gradient-based importance scores and specific attention patterns."
        },
        {
          "question": "How are important attention heads identified?",
          "answer": "The most common method is gradient-based importance scoring: I_h = E[|∂L/∂A_h|], the expected absolute gradient of the loss with respect to the attention weights of head h. Heads with low importance can be 'pruned' by masking them out. At inference time, pruned heads are replaced by a uniform or zero attention distribution, reducing computation while preserving most model quality."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "attention-is-all-you-need",
      "title": "Attention Is All You Need: The Transformer Paper — Key Results and Impact",
      "description": "Vaswani et al. (NeurIPS 2017) introduced the transformer, achieving 28.4 BLEU on WMT 2014 English-German with a 64M parameter model trained in 12 hours on 8 GPUs, outperforming all prior ensemble models.",
      "category": "representation",
      "citation_snippet": "Vaswani et al. (NeurIPS 2017) introduced the transformer architecture, achieving 28.4 BLEU on WMT EN-DE — surpassing all prior models including ensembles — with a 64M parameter model trained for 12 hours on 8 P100 GPUs.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        },
        {
          "url": "https://arxiv.org/abs/1609.08144",
          "label": "Wu et al. (2016) — Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. arXiv"
        },
        {
          "url": "https://arxiv.org/abs/1409.0473",
          "label": "Bahdanau et al. (2015) — Neural Machine Translation by Jointly Learning to Align and Translate. ICLR 2015"
        }
      ],
      "data_points": [
        {
          "label": "WMT EN-DE BLEU (transformer big)",
          "value": "28.4",
          "unit": "BLEU",
          "note": "State of the art at publication; surpassed all prior single-model and ensemble results"
        },
        {
          "label": "WMT EN-FR BLEU (transformer big)",
          "value": "41.8",
          "unit": "BLEU",
          "note": "Trained on 36M sentence pairs; outperformed all prior models"
        },
        {
          "label": "Base model training time",
          "value": "12 hours",
          "unit": "",
          "note": "100K steps on 8 × NVIDIA P100 GPUs; big model trained for 3.5 days"
        },
        {
          "label": "Training cost (base model)",
          "value": "3.3 × 10¹⁸",
          "unit": "FLOPs",
          "note": "Big model: 2.3 × 10¹⁹ FLOPs; dramatically less than prior LSTM-based systems"
        },
        {
          "label": "Previous SOTA (GNMT+RL ensemble)",
          "value": "26.30",
          "unit": "BLEU EN-DE",
          "note": "Wu et al. (2016) Google NMT ensemble; transformer single model exceeded this"
        },
        {
          "label": "Paper citation count",
          "value": "130,000+",
          "unit": "citations",
          "note": "As of 2025; one of the most-cited machine learning papers"
        }
      ],
      "faq_items": [
        {
          "question": "What was the key innovation of 'Attention Is All You Need'?",
          "answer": "The paper eliminated recurrence and convolutions entirely, building a sequence-to-sequence model using only attention mechanisms. This enabled full parallelization during training (unlike RNNs which process tokens sequentially), dramatically reducing training time. The multi-head self-attention mechanism allowed each position to directly attend to all other positions in O(1) operations, solving the long-range dependency problem that plagued LSTMs."
        },
        {
          "question": "How did the transformer compare to prior LSTM-based systems?",
          "answer": "The transformer big model achieved 28.4 BLEU on WMT EN-DE, compared to the prior best ensemble model (GNMT+RL) at 26.30 BLEU — a 2.1 BLEU improvement. More significantly, it achieved this in 3.5 days of training (2.3×10¹⁹ FLOPs) whereas GNMT required weeks. The base transformer (27.3 BLEU, 12 hours, 3.3×10¹⁸ FLOPs) already outperformed most prior single models."
        },
        {
          "question": "What architectural choices were validated in the paper's ablations?",
          "answer": "Table 3 of the paper systematically ablated: number of attention heads (optimal 8), key dimension d_k (smaller hurts more than larger), dropout (0.1 optimal), positional encoding type (learned vs sinusoidal equivalent), and residual dropout. The ablations confirmed that multiple attention heads and the specific scaling of d_k are critical design choices, not arbitrary hyperparameters."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "backpropagation",
      "title": "Backpropagation: Chain Rule, Computational Graphs, and Automatic Differentiation",
      "description": "Backpropagation applies the chain rule through a computation graph to compute ∂L/∂W for every parameter in a neural network; introduced for multilayer networks by Rumelhart, Hinton & Williams (1986) in Nature.",
      "category": "training",
      "citation_snippet": "Backpropagation computes ∂L/∂W via the chain rule: ∂L/∂W = ∂L/∂output · ∂output/∂W; a single backward pass computes all N parameter gradients in O(N) operations — same asymptotic cost as the forward pass (Rumelhart et al., 1986).",
      "sources": [
        {
          "url": "https://www.nature.com/articles/323533a0",
          "label": "Rumelhart, Hinton & Williams (1986) — Learning representations by back-propagating errors. Nature"
        },
        {
          "url": "https://arxiv.org/abs/1502.05767",
          "label": "Baydin et al. (2018) — Automatic Differentiation in Machine Learning: a Survey. JMLR"
        },
        {
          "url": "https://www.deeplearningbook.org/",
          "label": "Goodfellow et al. (2016) — Deep Learning. MIT Press (Chapter 6)"
        }
      ],
      "data_points": [
        {
          "label": "Chain rule for composed functions",
          "value": "∂L/∂x = ∂L/∂y · ∂y/∂x",
          "unit": "",
          "note": "Applied recursively through each layer; generalizes to multivariable Jacobians"
        },
        {
          "label": "Backward pass cost",
          "value": "~2–3× forward pass",
          "unit": "relative FLOPs",
          "note": "Two matrix multiplies per layer vs one for forward; plus activation gradient computations"
        },
        {
          "label": "Memory for activations",
          "value": "O(n · d · L)",
          "unit": "proportional",
          "note": "n=sequence length, d=model dim, L=layers; all activations must be stored for backward"
        },
        {
          "label": "Gradient checkpointing memory savings",
          "value": "O(√L)",
          "unit": "recomputation",
          "note": "Chen et al. (2016): recompute activations during backward; trades memory for compute"
        },
        {
          "label": "Automatic differentiation modes",
          "value": "Forward mode vs reverse mode",
          "unit": "",
          "note": "Reverse mode (backprop) computes all N gradients in one pass; optimal for N >> 1 outputs"
        }
      ],
      "faq_items": [
        {
          "question": "Why is reverse-mode automatic differentiation (backprop) more efficient than forward mode for neural networks?",
          "answer": "Forward-mode AD computes derivatives of all outputs with respect to one input — efficient when there are few inputs and many outputs. Reverse-mode AD computes derivatives of one scalar output (the loss L) with respect to all inputs — efficient when there are many inputs (all model parameters) and one output. Neural networks have millions of parameters but only one loss scalar, making reverse-mode O(N) vs forward-mode O(N²) for computing all N gradients."
        },
        {
          "question": "What is the memory cost of backpropagation?",
          "answer": "Backpropagation requires storing all intermediate activations from the forward pass — these are needed to compute gradients during the backward pass. For a transformer with L layers, sequence length n, and model dimension d, activation memory is O(n·d·L). For large models at long sequence lengths, this can exceed weight memory. Gradient checkpointing (Chen et al., 2016) trades compute for memory: only checkpoint activations at every √L layer, recomputing others during backward, reducing memory to O(√L) activations."
        },
        {
          "question": "How does backpropagation through the attention mechanism work?",
          "answer": "The attention computation Attention(Q,K,V) = softmax(Q·Kᵀ/√d_k)·V is differentiable. During backward, gradients flow through the softmax (∂softmax is the Jacobian of the softmax operation), through the matrix multiply Q·Kᵀ (gradients split between Q and K), and through the value weighting (gradients split between attention weights and V). PyTorch and JAX implement this automatically via operator fusion in FlashAttention, which also avoids storing the full n×n attention matrix for memory efficiency."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "beam-search",
      "title": "Beam Search: Approximate Sequence Optimization in Autoregressive Models",
      "description": "Beam search maintains k candidate sequences and selects top-k by joint log-probability at each step; k=4–5 is standard for neural machine translation; degrades for open-ended generation.",
      "category": "inference",
      "citation_snippet": "Beam search maintains k hypotheses; at each step expands k×|V| continuations, retains top-k by Σ log P(y_t|y_{<t}); Sutskever et al. (NeurIPS 2014) used k=2–12 for seq2seq MT; Holtzman et al. (2020) showed beam-decoded text has lower human preference than nucleus-sampled text for open generation.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1409.3215",
          "label": "Sutskever et al. (2014) — Sequence to Sequence Learning with Neural Networks. NeurIPS 2014"
        },
        {
          "url": "https://arxiv.org/abs/1904.09751",
          "label": "Holtzman et al. (2020) — The Curious Case of Neural Text Degeneration. ICLR 2020"
        },
        {
          "url": "https://arxiv.org/abs/1609.08144",
          "label": "Wu et al. (2016) — Google's Neural Machine Translation System. arXiv"
        }
      ],
      "data_points": [
        {
          "label": "Standard beam width for neural MT",
          "value": "k = 4–5",
          "unit": "beams",
          "note": "Sutskever et al. (2014): k=2 already improves significantly over greedy; k>10 yields diminishing returns"
        },
        {
          "label": "Compute cost vs greedy",
          "value": "O(k × |V|) per step",
          "unit": "",
          "note": "Each step evaluates k beams × full vocabulary |V|; memory scales as k × T × d_model"
        },
        {
          "label": "Length penalty (Google NMT)",
          "value": "lp(Y) = (5 + |Y|)^α / 6^α, α=0.6–0.7",
          "unit": "",
          "note": "Wu et al. (2016): without penalty, beam search favors short sequences; α corrects length bias"
        },
        {
          "label": "BLEU improvement from greedy to beam k=5",
          "value": "+1.5–2.0 BLEU",
          "unit": "BLEU points",
          "note": "Typical gain on WMT English-German benchmarks; beam k=5 vs k=1 greedy baseline"
        }
      ],
      "faq_items": [
        {
          "question": "Why does beam search fail for open-ended text generation?",
          "answer": "Beam search finds sequences with high joint probability under the model. For machine translation with a clear reference, maximizing probability aligns with quality. For open-ended generation, high-probability sequences are often generic, repetitive, and boring — the model's probability distribution over creative text is broad, and the highest-probability path corresponds to safe, uninteresting continuations. Holtzman et al. (2020) showed that joint log-probability of human-written text is not higher than degenerate repetitive sequences under typical language models."
        },
        {
          "question": "What is the difference between beam search and greedy decoding?",
          "answer": "Greedy decoding selects the single highest-probability token at each step — equivalent to beam search with k=1. This can miss sequences where a slightly lower-probability first token leads to a much higher-probability overall sequence. Beam search explores k alternatives simultaneously, pruning the search space to retain the k highest-probability partial sequences at each step. The trade-off: k-beam search requires k× the computation and memory of greedy decoding, with diminishing accuracy returns above k=5–10."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "chain-of-thought",
      "title": "Chain-of-Thought Prompting: Intermediate Reasoning Steps Improve Multi-Step Accuracy",
      "description": "Wei et al. (2022) showed chain-of-thought prompting raises GSM8K accuracy from 18% to 57% on 540B models; zero-shot CoT 'let's think step by step' raises MultiArith from 17.7% to 78.7%.",
      "category": "agents-applications",
      "citation_snippet": "Wei et al. (NeurIPS 2022): adding step-by-step reasoning to 8-shot examples raised PaLM 540B GSM8K accuracy 18% → 57%; Kojima et al. (2022): zero-shot CoT 'Let's think step by step' raised MultiArith 17.7% → 78.7%; self-consistency (Wang et al., 2022) adds +17% via majority vote.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2201.11903",
          "label": "Wei et al. (2022) — Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022"
        },
        {
          "url": "https://arxiv.org/abs/2205.11916",
          "label": "Kojima et al. (2022) — Large Language Models are Zero-Shot Reasoners. NeurIPS 2022"
        },
        {
          "url": "https://arxiv.org/abs/2203.11171",
          "label": "Wang et al. (2022) — Self-Consistency Improves Chain of Thought Reasoning. ICLR 2023"
        }
      ],
      "data_points": [
        {
          "label": "GSM8K: standard vs CoT (PaLM 540B)",
          "value": "18% → 57%",
          "unit": "% accuracy",
          "note": "Wei et al. (2022): 8-shot standard prompting vs 8-shot chain-of-thought; +39 percentage points"
        },
        {
          "label": "MultiArith: zero-shot CoT (540B)",
          "value": "17.7% → 78.7%",
          "unit": "% accuracy",
          "note": "Kojima et al. (2022): zero-shot standard vs 'Let's think step by step'; +61 percentage points"
        },
        {
          "label": "Self-consistency gain on GSM8K",
          "value": "57% → 74% (k=40 samples)",
          "unit": "% accuracy",
          "note": "Wang et al. (2022): majority vote over 40 CoT samples; PaLM 540B; +17 percentage points"
        },
        {
          "label": "Scale threshold for CoT benefit",
          "value": "~100B parameters",
          "unit": "parameters",
          "note": "Wei et al. (2022): CoT benefits only emerge reliably above ~100B parameters; smaller models show no gain or regression"
        }
      ],
      "faq_items": [
        {
          "question": "Why does chain-of-thought prompting only work at large scale?",
          "answer": "Wei et al. (2022) tested CoT across models from ~300M to 540B parameters. Below ~100B parameters, CoT consistently equaled or underperformed standard prompting — models generated plausible-looking but incorrect reasoning chains. Above ~100B parameters, CoT reliably improved accuracy. The explanation: reasoning chains require the model to perform compositional operations (arithmetic, logical deduction) in the generated text. This requires sufficient capacity to both generate coherent language and correctly execute the intermediate computations."
        },
        {
          "question": "What is self-consistency and how does it improve on basic CoT?",
          "answer": "Basic CoT generates one reasoning chain and takes its final answer. Self-consistency (Wang et al., 2022) generates k diverse reasoning paths using temperature sampling and takes a majority vote over the final answers. The intuition: there are multiple valid reasoning paths to a correct answer, but incorrect reasoning produces more varied wrong answers. On GSM8K, self-consistency with k=40 adds +17% over single-path CoT (74% vs 57%), at the cost of 40× more inference compute."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "byte-pair-encoding",
      "title": "Byte-Pair Encoding: Algorithm, Vocabulary Construction, and Byte-Level BPE",
      "description": "Byte-pair encoding iteratively merges the most frequent adjacent token pair; GPT-2 uses byte-level BPE with 50,257 tokens, enabling lossless encoding of any Unicode text without unknown-token fallback.",
      "category": "representation",
      "citation_snippet": "BPE iteratively merges the most frequent adjacent byte pair until vocabulary reaches target size; GPT-2 applies BPE on raw UTF-8 bytes, producing 50,257 tokens with guaranteed full Unicode coverage and zero unknown tokens (Radford et al., 2019).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1508.07909",
          "label": "Sennrich et al. (2016) — Neural Machine Translation of Rare Words with Subword Units. ACL 2016"
        },
        {
          "url": "https://openai.com/research/language-models-are-unsupervised-multitask-learners",
          "label": "Radford et al. (2019) — Language Models are Unsupervised Multitask Learners (GPT-2). OpenAI Technical Report"
        },
        {
          "url": "https://www.cbloom.com/src/bpe.html",
          "label": "Gage (1994) — A New Algorithm for Data Compression. C Users Journal"
        }
      ],
      "data_points": [
        {
          "label": "GPT-2 vocabulary size",
          "value": "50,257",
          "unit": "tokens",
          "note": "Byte-level BPE; includes all 256 bytes as base tokens + 50,000 merges + special tokens"
        },
        {
          "label": "Base vocabulary (byte-level)",
          "value": "256",
          "unit": "bytes",
          "note": "All UTF-8 bytes as initial symbols; zero unknown token issue"
        },
        {
          "label": "Number of BPE merge operations (GPT-2)",
          "value": "50,000",
          "unit": "merges",
          "note": "Each merge combines two symbols into one new subword token"
        },
        {
          "label": "BPE time complexity",
          "value": "O(V·I)",
          "unit": "",
          "note": "V = vocabulary size, I = number of iterations; each iteration scans corpus once"
        },
        {
          "label": "Typical multilingual vocabulary",
          "value": "250,000+",
          "unit": "tokens",
          "note": "Larger vocabularies needed to cover diverse scripts without excessive character-level decomposition"
        }
      ],
      "faq_items": [
        {
          "question": "What problem does BPE solve compared to word-level tokenization?",
          "answer": "Word-level tokenization requires assigning an [UNK] token to any word not seen during training — a critical problem for names, technical terms, and morphological variants. BPE avoids this by decomposing rare words into frequent subword pieces. 'internationalization' might not appear in the vocabulary, but 'inter', 'national', 'ization' do. Sennrich et al. (2016) showed BPE-based NMT systems achieve near-zero unknown token rates while keeping sequence length manageable."
        },
        {
          "question": "What is the difference between character-level BPE and byte-level BPE?",
          "answer": "Character-level BPE starts with Unicode code points as the initial vocabulary, which can number in the thousands for multilingual text. Byte-level BPE (introduced by Radford et al., 2019 for GPT-2) starts with raw UTF-8 bytes as the 256-token base vocabulary. Since all text is ultimately bytes, byte-level BPE guarantees zero unknown tokens regardless of language or script, at the cost of slightly longer sequences for some Unicode characters."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "compute-flops",
      "title": "Compute FLOPs: Counting Training and Inference Operations for Language Models",
      "description": "Training a language model requires approximately 6·N·D FLOPs (N parameters, D tokens); inference costs ~2·N FLOPs per token; hardware peak FLOPs and memory bandwidth together determine practical throughput.",
      "category": "training",
      "citation_snippet": "Training FLOPs ≈ 6·N·D for dense transformers (N parameters, D tokens); inference costs ≈ 2·N FLOPs per token; an A100 GPU delivers 312 TFLOPS (BF16), making GPT-3 training require ~10⁴ A100-days.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2001.08361",
          "label": "Kaplan et al. (2020) — Scaling Laws for Neural Language Models. arXiv"
        },
        {
          "url": "https://arxiv.org/abs/2203.15556",
          "label": "Hoffmann et al. (2022) — Training Compute-Optimal Large Language Models. NeurIPS 2022"
        },
        {
          "url": "https://arxiv.org/abs/2104.10350",
          "label": "Patterson et al. (2021) — Carbon and the Cloud. Nature"
        }
      ],
      "data_points": [
        {
          "label": "Training FLOPs formula",
          "value": "C ≈ 6 · N · D",
          "unit": "FLOPs",
          "note": "N = parameters, D = training tokens; factor 6 = 2 (forward) + 4 (backward)"
        },
        {
          "label": "Inference FLOPs per token",
          "value": "C_inf ≈ 2 · N",
          "unit": "FLOPs/token",
          "note": "One forward pass at inference; no backward pass needed"
        },
        {
          "label": "GPT-3 training FLOPs (175B, 300B tokens)",
          "value": "3.14 × 10²³",
          "unit": "FLOPs",
          "note": "6 × 175B × 300B = 3.15 × 10²³; consistent with Kaplan formula"
        },
        {
          "label": "NVIDIA A100 BF16 peak",
          "value": "312",
          "unit": "TFLOPS",
          "note": "Tensor core peak; actual utilization typically 30–70% of theoretical peak"
        },
        {
          "label": "GPT-3 equivalent A100-days",
          "value": "~10,000",
          "unit": "A100-days",
          "note": "3.14×10²³ FLOPs / (312×10¹² FLOPs/s × 86,400 s/day) ≈ 11,600 A100-days"
        }
      ],
      "faq_items": [
        {
          "question": "Why is the backward pass approximately 2× the forward pass in FLOPs?",
          "answer": "The forward pass through each layer requires one matrix multiply per linear transformation. The backward pass requires two matrix multiplies per layer: one to compute gradients with respect to the inputs (∂L/∂x), and one to compute gradients with respect to the weights (∂L/∂W). So the backward pass is roughly 2× the forward pass, giving 3× total per parameter update. Accounting for the optimizer step brings the factor to ~6× the forward-pass-only FLOPs."
        },
        {
          "question": "How is hardware efficiency measured in practice?",
          "answer": "Hardware efficiency (model FLOPs utilization, MFU) = (achieved FLOPs) / (peak FLOPs). In practice, large model training achieves 30–70% MFU due to communication overhead (gradient synchronization across GPUs), memory bandwidth bottlenecks (loading weights from HBM), and kernel launch latency. Well-optimized training runs for large models typically achieve 40–50% MFU on A100 clusters."
        },
        {
          "question": "What is the FLOPs difference between training and inference?",
          "answer": "Training costs ≈ 6·N·D FLOPs total (forward + backward + optimizer). Inference costs ≈ 2·N FLOPs per generated token (forward pass only). For GPT-3: training ≈ 3.14×10²³ FLOPs total; generating 1,000 tokens at inference ≈ 2 × 175B × 1,000 = 3.5×10¹⁴ FLOPs. The full training run costs ~10⁹ times more FLOPs than generating a single 1,000-token response. See also [inference-vs-training-compute](/ai/inference-vs-training-compute)."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "chinchilla-scaling",
      "title": "Chinchilla Scaling: Compute-Optimal Training and the 20-Token-Per-Parameter Rule",
      "description": "Hoffmann et al. (2022) trained 400+ models to find that compute-optimal training requires roughly 20 training tokens per parameter; Chinchilla-70B on 1.4T tokens outperforms 280B Gopher on 300B tokens at a similar training compute budget, with 4× fewer parameters and hence ~4× lower inference cost.",
      "category": "training",
      "citation_snippet": "Chinchilla scaling (Hoffmann et al., 2022): compute-optimal training uses ~20 tokens per parameter; Chinchilla-70B (1.4T tokens) outperforms Gopher-280B (300B tokens) at the same training compute with 4× fewer parameters, showing prior large models were severely undertrained.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2203.15556",
          "label": "Hoffmann et al. (2022) — Training Compute-Optimal Large Language Models. NeurIPS 2022"
        },
        {
          "url": "https://arxiv.org/abs/2001.08361",
          "label": "Kaplan et al. (2020) — Scaling Laws for Neural Language Models. arXiv"
        },
        {
          "url": "https://arxiv.org/abs/2302.13971",
          "label": "Touvron et al. (2023) — LLaMA: Open and Efficient Foundation Language Models. arXiv"
        }
      ],
      "data_points": [
        {
          "label": "Compute-optimal tokens per parameter",
          "value": "~20",
          "unit": "tokens/parameter",
          "note": "Derived from 400+ training runs; N_opt and D_opt both scale as C^{0.5}"
        },
        {
          "label": "Chinchilla model size",
          "value": "70",
          "unit": "billion parameters",
          "note": "Compute-optimal for 5.76×10²³ FLOPs given 1.4T training tokens"
        },
        {
          "label": "Chinchilla training tokens",
          "value": "1.4",
          "unit": "trillion tokens",
          "note": "~20× N = 20 × 70B = 1.4T tokens; compute-optimal point"
        },
        {
          "label": "Chinchilla vs Gopher (280B, 300B tokens)",
          "value": "Chinchilla wins on most benchmarks",
          "unit": "",
          "note": "Trained with the same compute budget as Gopher; 4× fewer parameters means ~4× lower inference cost; smaller but properly trained"
        },
        {
          "label": "Approach 2 (IsoFLOP: fixed compute, vary N)",
          "value": "N ∝ C^{0.49}, D ∝ C^{0.51}",
          "unit": "",
          "note": "IsoFLOP-profile estimation method from Hoffmann et al.; consistent with the other two estimation methods"
        }
      ],
      "faq_items": [
        {
          "question": "How did Hoffmann et al. determine the compute-optimal training point?",
          "answer": "They used three complementary methods: (1) fixing a set of model sizes, varying the number of training tokens for each, and extrapolating the lower envelope of the loss curves; (2) IsoFLOP profiles — training many models at fixed compute budgets while varying the model size, then fitting a power law to find optimal N; (3) fitting a parametric loss function L(N, D) to all training runs and analytically minimizing it under a compute constraint. All three methods converged on N_opt ∝ C^{0.5} and D_opt ∝ C^{0.5}, with approximately 20 tokens per parameter."
        },
        {
          "question": "Why is the '20 tokens per parameter' rule important for practitioners?",
          "answer": "For any given compute budget C (in FLOPs), the relations C = 6·N·D and D = 20·N give N_opt ≈ √(C/120) and D_opt ≈ 20·N_opt, providing a simple recipe: choose a model with N parameters, train it on ~20N tokens. A practitioner with a specific compute budget can quickly estimate both optimal model size and required dataset size. This shifted the field toward training medium-sized models on more data rather than maximally scaling parameters alone."
        },
        {
          "question": "Does the Chinchilla rule apply to inference-heavy deployments?",
          "answer": "Chinchilla optimizes for training compute efficiency only. For inference-heavy deployments (many queries per model), a smaller model with lower inference cost may be preferable even if it required more tokens to train. Training on additional tokens beyond the Chinchilla-optimal point continues to improve the model (subject to data availability), and the marginal cost of extra training tokens may be justified if it enables using a smaller model at inference time."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "autoregressive-decoding",
      "title": "Autoregressive Decoding: Greedy, Beam Search, and Sampling Strategies",
      "description": "Autoregressive decoding generates text token by token, conditioning each token on all previous ones. Greedy, beam search, top-k, top-p, and temperature sampling trade diversity for quality in different ways.",
      "category": "inference",
      "citation_snippet": "Beam search (Sutskever et al., 2014) maintains k candidate sequences by cumulative log-probability; top-p nucleus sampling (Holtzman et al., 2020) dynamically selects the minimum token set covering probability mass p.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1904.09751",
          "label": "Holtzman et al. (2020) — The Curious Case of Neural Text Degeneration. ICLR 2020"
        },
        {
          "url": "https://arxiv.org/abs/1409.3215",
          "label": "Sutskever et al. (2014) — Sequence to Sequence Learning with Neural Networks. NeurIPS 2014"
        },
        {
          "url": "https://arxiv.org/abs/1805.04833",
          "label": "Fan et al. (2018) — Hierarchical Neural Story Generation. ACL 2018"
        }
      ],
      "data_points": [
        {
          "label": "Greedy decoding quality",
          "value": "Repetitive",
          "unit": "",
          "note": "Holtzman et al.: greedy and beam search produce degenerate repetitive text in open-ended generation"
        },
        {
          "label": "Nucleus sampling p value",
          "value": "0.9–0.95",
          "unit": "cumulative probability",
          "note": "Holtzman et al. (2020): top-p=0.95 produces the most human-like text; filters low-probability tail"
        },
        {
          "label": "Top-k sampling k value",
          "value": "40–50",
          "unit": "top tokens",
          "note": "Fan et al. (2018) used k=10 for story generation; k=40 was popularized by GPT-2 (Radford et al., 2019)"
        },
        {
          "label": "Beam search width",
          "value": "4–8",
          "unit": "beams",
          "note": "Machine translation: beam=4 typically near-optimal; larger beams decrease quality for open generation"
        },
        {
          "label": "Temperature effect",
          "value": "T→0 = greedy, T→∞ = uniform",
          "unit": "",
          "note": "Scaling logits by 1/T before softmax; T=0.7–1.0 standard for instruction following"
        }
      ],
      "faq_items": [
        {
          "question": "Why does beam search produce worse open-ended text than sampling?",
          "answer": "Holtzman et al. (2020) showed that beam search maximizes the probability of the entire sequence, but high-probability sequences are not the same as human-like sequences. Human text is not drawn from the maximum-probability mode of the distribution — humans write surprising, diverse, and contextually varied text. Beam search collapses to repetitive high-probability phrases because these dominate the probability mass. Sampling methods that truncate the distribution (top-p, top-k) better match the diversity of natural human text."
        },
        {
          "question": "What is top-p (nucleus) sampling and how does it differ from top-k?",
          "answer": "Top-k sampling selects from the k highest-probability tokens, regardless of how much probability mass they cover. Top-p (nucleus) sampling selects the minimum set of tokens whose cumulative probability ≥ p. At each step, the token set size varies dynamically: with p = 0.95, if one token has 95% probability, the nucleus has size 1; if 100 tokens each have 1% probability, the nucleus has size 95. This dynamic size is the key advantage: top-p adapts to the model's certainty at each position, while top-k uses a fixed cutoff regardless of distribution shape."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "constitutional-ai",
      "title": "Constitutional AI: Self-Critique, Revision, and Principle-Based Alignment",
      "description": "Constitutional AI trains models to self-critique and revise their outputs against a written set of principles, replacing most human feedback on harmfulness with AI-generated feedback, reducing human labeler exposure to harmful content.",
      "category": "alignment",
      "citation_snippet": "Constitutional AI (Bai et al., 2022) uses a written constitution to guide self-critique and revision; RLAIF with AI feedback replaces human labeling on the harmless/harmful dimension, achieving harmlessness comparable to RLHF with no human feedback labels on harms.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2212.08073",
          "label": "Bai et al. (2022) — Constitutional AI: Harmlessness from AI Feedback. arXiv"
        },
        {
          "url": "https://arxiv.org/abs/2203.02155",
          "label": "Ouyang et al. (2022) — Training Language Models to Follow Instructions with Human Feedback. NeurIPS 2022"
        },
        {
          "url": "https://arxiv.org/abs/2309.00267",
          "label": "Lee et al. (2023) — RLAIF: Scaling Reinforcement Learning from Human Feedback with AI Feedback. arXiv"
        }
      ],
      "data_points": [
        {
          "label": "Constitution size (original)",
          "value": "16",
          "unit": "principles",
          "note": "Bai et al. (2022); principles cover harmlessness, honesty, and helpfulness dimensions"
        },
        {
          "label": "Human feedback reduction on harm",
          "value": "~100%",
          "unit": "reduction",
          "note": "Bai et al.: no human feedback labels for harms; AI-generated comparisons guided by the constitution replace them (humans still write red-team prompts)"
        },
        {
          "label": "SL-CAI revision rounds",
          "value": "multiple",
          "unit": "rounds",
          "note": "Supervised Learning CAI: model critiques and revises response using constitutional principles iteratively"
        },
        {
          "label": "Harmless Pareto improvement",
          "value": "Yes",
          "unit": "",
          "note": "Bai et al.: CAI is simultaneously more helpful AND less harmful than pure RLHF baseline in human eval"
        },
        {
          "label": "Constitutional principles categories",
          "value": "3",
          "unit": "domains",
          "note": "Harm avoidance, honesty/truthfulness, and positive prosocial behavior"
        }
      ],
      "faq_items": [
        {
          "question": "How does Constitutional AI differ from standard RLHF?",
          "answer": "Standard RLHF requires human labelers to compare model responses, including judging potentially harmful outputs. Constitutional AI (Bai et al., 2022) replaces the harmfulness comparison with AI-generated feedback: the model is prompted to critique its own response against a written principle, then revise it. A reward model is then trained on these AI-generated comparisons rather than human labels for the harm dimension. This reduces human exposure to harmful content and makes the training criteria more explicit and auditable."
        },
        {
          "question": "What is a 'constitution' in Constitutional AI?",
          "answer": "A constitution is a written set of principles that guide self-critique and revision. The original CAI paper uses 16 principles covering harm avoidance (e.g., 'choose the response least likely to contain harmful or unethical content'), honesty (e.g., 'prefer responses that are more honest and avoid deception'), and helpfulness. Principles are applied by prompting the model: 'Critique the previous response using the principle: [principle]. Then revise the response.'"
        },
        {
          "question": "What is RLAIF (Reinforcement Learning from AI Feedback)?",
          "answer": "RLAIF extends the Constitutional AI approach: instead of using human comparisons to train the reward model for RL, AI-generated comparisons are used. Lee et al. (2023) found that RLAIF achieves performance comparable to RLHF on harmlessness while requiring zero human labels for that dimension. The AI labeler uses a prompted large language model to compare two responses and determine which is more aligned with a given principle, then these comparisons are used to train the reward model."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "emergent-capabilities",
      "title": "Emergent Capabilities: Abilities That Appear Above Scale Thresholds in Language Models",
      "description": "Wei et al. (2022) documented 137 emergent tasks across 8 model families; multi-step arithmetic emerges at 8–13B parameters; Schaeffer et al. (2023) argue most emergence is a metric artifact.",
      "category": "evaluation",
      "citation_snippet": "Wei et al. (TMLR 2022): 137 tasks show near-zero then sharp improvement above scale thresholds in 8 model families; 3-digit arithmetic emerges ~8–13B parameters; Schaeffer et al. (NeurIPS 2023): switching to continuous metrics largely eliminates apparent discontinuities, suggesting measurement artifact.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2206.07682",
          "label": "Wei et al. (2022) — Emergent Abilities of Large Language Models. TMLR 2022"
        },
        {
          "url": "https://arxiv.org/abs/2206.04615",
          "label": "Srivastava et al. (2022) — Beyond the Imitation Game: Quantifying and Extrapolating LLM Capabilities (BIG-Bench). TMLR 2023"
        },
        {
          "url": "https://arxiv.org/abs/2304.15004",
          "label": "Schaeffer et al. (2023) — Are Emergent Abilities of Large Language Models a Mirage? NeurIPS 2023"
        }
      ],
      "data_points": [
        {
          "label": "Emergent tasks documented",
          "value": "137",
          "unit": "distinct tasks",
          "note": "Wei et al. (2022): sourced from BIG-Bench, MMLU, and other benchmarks across 8 model families"
        },
        {
          "label": "3-digit arithmetic emergence threshold",
          "value": "~8–13B parameters",
          "unit": "parameters",
          "note": "Wei et al. (2022): near-zero accuracy below 8B parameters; above 10B accuracy exceeds 90%"
        },
        {
          "label": "BIG-Bench tasks showing emergence",
          "value": "~26%",
          "unit": "% of BIG-Bench tasks",
          "note": "Srivastava et al. (2022): ~74% improve gradually; ~26% show discontinuous emergence pattern"
        },
        {
          "label": "Schaeffer et al. metric experiment",
          "value": "Discontinuities largely disappear",
          "unit": "",
          "note": "Switching from exact-match to continuous metrics (log-prob of correct token) reveals smooth scaling"
        }
      ],
      "faq_items": [
        {
          "question": "Are emergent abilities real phase transitions or measurement artifacts?",
          "answer": "Schaeffer et al. (2023) argue that most apparent emergence is a metric artifact. Tasks evaluated with discontinuous metrics (exact match, pass/fail) show sharp thresholds because the metric changes discontinuously even when the underlying model probability improves smoothly. Switching the same tasks to continuous metrics (log-probability of the correct answer) largely eliminates the discontinuities, revealing smooth scaling. This suggests emergence reflects the choice of evaluation metric more than a genuine capability phase transition."
        },
        {
          "question": "What is the relationship between emergent capabilities and in-context learning?",
          "answer": "In-context learning is itself an emergent capability: Brown et al. (2020) found meaningful few-shot ICL gains appear sharply above ~1B parameters, with small models showing no benefit. Many of the 137 emergent tasks in Wei et al. (2022) are few-shot tasks requiring the model to both understand a task specification and generalize from a small number of examples. This compositional generalization — applying learned task-solving strategies to new task specifications — appears to be the core capability that emerges with scale."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "encoder-decoder-architecture",
      "title": "Encoder-Decoder Architecture: Cross-Attention, Autoregressive Decoding, and Seq2Seq Performance",
      "description": "The transformer encoder maps (x₁,...,xₙ) to continuous representations z via self-attention, while the decoder autoregressively generates (y₁,...,yₘ) by attending to z through cross-attention; the big model achieves 28.4 BLEU on WMT 2014 English-German (base model: 27.3).",
      "category": "architecture",
      "citation_snippet": "The transformer encoder maps n input tokens to continuous representations z; the decoder autoregressively generates m output tokens via cross-attention over z; big model achieves 28.4 BLEU WMT EN-DE, base model 27.3 (Vaswani et al., 2017).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        },
        {
          "url": "https://arxiv.org/abs/1406.1078",
          "label": "Cho et al. (2014) — Learning Phrase Representations using RNN Encoder-Decoder. EMNLP 2014"
        },
        {
          "url": "https://arxiv.org/abs/2409.13747",
          "label": "Ding et al. (2024) — Machine Translation with Large Language Models: Decoder Only vs. Encoder-Decoder. arXiv 2024"
        },
        {
          "url": "https://arxiv.org/abs/2510.26622",
          "label": "Zhang et al. (2025) — Encoder-Decoder or Decoder-Only? Revisiting Encoder-Decoder LLMs. arXiv 2025"
        }
      ],
      "data_points": [
        {
          "label": "Encoder: N layers (base model)",
          "value": "6",
          "unit": "layers",
          "note": "Each layer: multi-head self-attention + position-wise FFN + residual + LayerNorm"
        },
        {
          "label": "Decoder: N layers (base model)",
          "value": "6",
          "unit": "layers",
          "note": "Each layer: masked self-attention + cross-attention + FFN + residual + LayerNorm"
        },
        {
          "label": "Cross-attention keys/values source",
          "value": "Encoder output z",
          "note": "Decoder queries (Q) come from decoder state; K,V come from encoder memory z"
        },
        {
          "label": "Autoregressive decoding steps",
          "value": "m (output length)",
          "note": "Decoder generates one token per step; each step attends to all previous outputs"
        },
        {
          "label": "WMT EN-DE BLEU (base model)",
          "value": "27.3",
          "unit": "BLEU",
          "note": "Encoder-decoder base transformer; Vaswani et al. Table 2"
        },
        {
          "label": "WMT EN-DE BLEU (big model)",
          "value": "28.4",
          "unit": "BLEU",
          "note": "Single model; surpassed all prior ensembles by >2 BLEU"
        },
        {
          "label": "WMT EN-FR BLEU (big model)",
          "value": "41.8",
          "unit": "BLEU",
          "note": "State-of-the-art single model at publication; trained 3.5 days on 8 GPUs"
        },
        {
          "label": "Training FLOPs (base model)",
          "value": "3.3 × 10¹⁸",
          "unit": "FLOPs",
          "note": "Substantially lower than prior RNN/CNN models at equivalent quality"
        }
      ],
      "faq_items": [
        {
          "question": "What is the role of cross-attention in the encoder-decoder architecture?",
          "answer": "Cross-attention in each decoder layer connects the decoder to the encoder's output memory. The decoder's current hidden state forms the queries Q, while the encoder output z provides the keys K and values V. This allows every decoder step to directly inspect any part of the input sequence, which is critical for tasks like translation where output words can depend on non-local input words."
        },
        {
          "question": "Why is decoder self-attention masked?",
          "answer": "During training the full target sequence is available, but the decoder must not see future tokens when predicting position t. Masking the attention matrix (setting scores to −∞ for positions > t before softmax) ensures that predictions at position t only depend on tokens 0 through t−1, preserving the autoregressive property and making training compatible with inference."
        },
        {
          "question": "When does encoder-decoder outperform decoder-only architectures?",
          "answer": "Research by Ding et al. (2024) and Zhang et al. (2025) shows encoder-decoder architectures tend to outperform decoder-only models on structured seq2seq tasks like translation, especially at smaller parameter budgets. Decoder-only models can match performance with larger scale but require roughly 2× the parameters for equivalent quality on translation tasks."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "feed-forward-layers",
      "title": "Position-Wise Feed-Forward Layers: FFN Formula, Parameter Budget, and GeLU vs ReLU",
      "description": "Each transformer layer contains a position-wise FFN with d_ff=2048 (4× d_model=512), computed as max(0, xW₁+b₁)W₂+b₂, contributing approximately 2.1M parameters per layer and roughly two-thirds of the base model's non-embedding parameter count.",
      "category": "architecture",
      "citation_snippet": "Each transformer FFN layer computes max(0,xW₁+b₁)W₂+b₂ with d_ff=2048 (4× d_model=512); FFN sublayers account for ~67% of the base model's non-embedding parameters; GeLU outperforms ReLU on NLP benchmarks (Hendrycks & Gimpel, 2016).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        },
        {
          "url": "https://arxiv.org/abs/1606.08415",
          "label": "Hendrycks & Gimpel (2016) — Gaussian Error Linear Units (GELUs). arXiv 2016"
        },
        {
          "url": "https://arxiv.org/abs/2002.05202",
          "label": "Shazeer (2020) — GLU Variants Improve Transformer. arXiv 2020"
        }
      ],
      "data_points": [
        {
          "label": "FFN formula",
          "value": "FFN(x) = max(0, xW₁ + b₁)W₂ + b₂",
          "note": "ReLU activation; GeLU variant: FFN(x) = GELU(xW₁ + b₁)W₂ + b₂"
        },
        {
          "label": "d_model (input/output dimension)",
          "value": "512",
          "unit": "dimensions",
          "note": "FFN input and output match d_model for residual connections"
        },
        {
          "label": "d_ff (inner dimension)",
          "value": "2048",
          "unit": "dimensions",
          "note": "4× d_model; chosen empirically; expands and then compresses the representation"
        },
        {
          "label": "W₁ parameters (per layer)",
          "value": "512 × 2048 + 2048 = 1,050,624",
          "unit": "parameters",
          "note": "Weights + biases for the expansion layer"
        },
        {
          "label": "W₂ parameters (per layer)",
          "value": "2048 × 512 + 512 = 1,049,088",
          "unit": "parameters",
          "note": "Weights + biases for the compression layer"
        },
        {
          "label": "Total FFN parameters per layer",
          "value": "2,099,712",
          "unit": "parameters",
          "note": "~2.1M per encoder or decoder layer; vs ~1.05M for the attention block"
        },
        {
          "label": "FFN share of non-embedding parameters",
          "value": "~67%",
          "unit": "percent",
          "note": "12 FFN layers × 2.1M = 25.2M; attention blocks contribute ~12.6M; remainder is embeddings"
        },
        {
          "label": "GeLU vs ReLU (CIFAR-10 error)",
          "value": "7.89% vs 8.16%",
          "unit": "error rate",
          "note": "GeLU achieves lower error; Hendrycks & Gimpel (2016) Table 1"
        }
      ],
      "faq_items": [
        {
          "question": "Why is d_ff set to 4× d_model in the original transformer?",
          "answer": "The 4× ratio (d_ff=2048 for d_model=512) was chosen empirically by Vaswani et al. It provides sufficient capacity for the FFN to perform complex non-linear transformations of each token's representation after attention. In practice, d_ff ratios from 2.67× to 8× are used across modern architectures, with the 4× ratio remaining a common default."
        },
        {
          "question": "What is the role of the FFN layer if attention already mixes token information?",
          "answer": "Multi-head attention mixes information across token positions, but applies only a linear transformation to each token's value vector. The position-wise FFN applies an independent non-linear transformation to each token's representation individually. It is thought to act as a key-value memory (Geva et al., 2021), storing factual associations learned during training."
        },
        {
          "question": "Why does GeLU outperform ReLU in transformer architectures?",
          "answer": "GeLU (x·Φ(x), where Φ is the standard normal CDF) is a smooth function that weights inputs by their magnitude rather than applying a hard threshold at zero. This smoother activation landscape tends to produce better-conditioned gradients during training on language tasks. Hendrycks & Gimpel (2016) showed consistent improvements over ReLU across NLP, vision, and speech benchmarks."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "few-shot-learning",
      "title": "Few-Shot Learning: Language Model Task Performance from k In-Context Demonstrations",
      "description": "GPT-3 benchmarks 0-shot, 1-shot, and 32-shot performance across 42 tasks; 32-shot achieves 71.8 on SuperGLUE vs 89.0 for the fine-tuned state of the art; accuracy is sensitive to example order.",
      "category": "agents-applications",
      "citation_snippet": "Brown et al. (NeurIPS 2020): GPT-3 175B 32-shot SuperGLUE = 71.8 vs fine-tuned SOTA (T5) 89.0; Zhao et al. (ICML 2021): different orderings of same k examples produce up to ±15% accuracy variance; calibrating against neutral-input priors reduces order sensitivity.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2005.14165",
          "label": "Brown et al. (2020) — Language Models are Few-Shot Learners (GPT-3). NeurIPS 2020"
        },
        {
          "url": "https://arxiv.org/abs/2102.09690",
          "label": "Zhao et al. (2021) — Calibrate Before Use: Improving Few-Shot Performance of Language Models. ICML 2021"
        }
      ],
      "data_points": [
        {
          "label": "GPT-3 32-shot SuperGLUE",
          "value": "71.8",
          "unit": "points",
          "note": "Brown et al. (2020): 32 examples in context; fine-tuned SOTA (T5-11B) = 89.0; 17.2-point gap"
        },
        {
          "label": "Few-shot vs fine-tuning accuracy gap",
          "value": "10–20%",
          "unit": "% accuracy",
          "note": "Consistent gap across NLP benchmarks; fine-tuning remains more accurate for most tasks"
        },
        {
          "label": "Example order sensitivity",
          "value": "up to ±15%",
          "unit": "% accuracy variance",
          "note": "Zhao et al. (2021): same k examples in different orders produce large accuracy swings on classification"
        },
        {
          "label": "Standard k values benchmarked",
          "value": "k = 0, 1, 10, 32",
          "unit": "shots",
          "note": "Brown et al. (2020): 0-shot, 1-shot, and 'few-shot' (context window limit) are standard conditions"
        }
      ],
      "faq_items": [
        {
          "question": "Why is few-shot performance sensitive to example order?",
          "answer": "Zhao et al. (2021) found accuracy swings of up to ±15% from reordering the same k examples. The cause is a recency bias: tokens near the end of the context receive higher attention weight, making the last few examples disproportionately influential. The model is also biased toward label frequencies matching what it saw during pre-training. Calibration — dividing output probabilities by priors computed on neutral inputs — significantly reduces both recency bias and majority-label bias."
        },
        {
          "question": "When should few-shot prompting be preferred over fine-tuning?",
          "answer": "Few-shot prompting is preferable when: (1) labeled data is very scarce (<100 examples) — insufficient to fine-tune reliably; (2) tasks are transient or low-priority; (3) a single deployed model must handle many different tasks; (4) rapid prototyping without retraining is needed. Fine-tuning is preferable when accuracy is critical, data is available (>1000 examples), the task is stable, and the 10–20% accuracy gap over few-shot matters for the application."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "fine-tuning",
      "title": "Fine-Tuning Language Models: Full, Adapter, and Parameter-Efficient Methods",
      "description": "Fine-tuning adapts pretrained language models to specific tasks by continuing gradient-based training. Full fine-tuning updates all parameters; PEFT methods like LoRA, adapters, and prefix tuning update a small fraction.",
      "category": "alignment",
      "citation_snippet": "Howard & Ruder (2018) ULMFiT established pretraining + fine-tuning as the dominant NLP paradigm; PEFT methods (LoRA, adapters) achieve within 1% of full fine-tuning quality while updating <1% of parameters.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1801.06146",
          "label": "Howard & Ruder (2018) — Universal Language Model Fine-Tuning for Text Classification. ACL 2018"
        },
        {
          "url": "https://arxiv.org/abs/1902.00751",
          "label": "Houlsby et al. (2019) — Parameter-Efficient Transfer Learning for NLP. ICML 2019"
        },
        {
          "url": "https://arxiv.org/abs/2104.08691",
          "label": "Lester et al. (2021) — The Power of Scale for Parameter-Efficient Prompt Tuning. EMNLP 2021"
        }
      ],
      "data_points": [
        {
          "label": "ULMFiT classification error reduction",
          "value": "18–24%",
          "unit": "error reduction",
          "note": "Howard & Ruder (2018): across 6 text classification datasets vs training from scratch"
        },
        {
          "label": "Adapter parameter overhead",
          "value": "0.5–3.6%",
          "unit": "additional parameters",
          "note": "Houlsby et al. (2019): bottleneck adapters within 0.4% of full fine-tuning on GLUE"
        },
        {
          "label": "Prompt tuning parity threshold",
          "value": "~10B",
          "unit": "model parameters",
          "note": "Lester et al. (2021): prompt tuning matches fine-tuning only at ≥10B scale"
        },
        {
          "label": "ULMFiT training data reduction",
          "value": "100×",
          "unit": "less labeled data",
          "note": "Howard & Ruder: pretraining enables competitive performance with 100× less task-labeled data"
        },
        {
          "label": "Catastrophic forgetting",
          "value": "Present without mitigation",
          "unit": "",
          "note": "Full fine-tuning on new task degrades performance on pretraining knowledge; use lower LR, freezing"
        }
      ],
      "faq_items": [
        {
          "question": "What is catastrophic forgetting in fine-tuning and how is it mitigated?",
          "answer": "Catastrophic forgetting occurs when fine-tuning on a new task overwrites the pretrained weights, degrading general capabilities. Mitigations include: (1) using a much lower learning rate during fine-tuning than pretraining (Howard & Ruder use discriminative learning rates, e.g., LR/2.6 per layer deeper); (2) gradual unfreezing — fine-tuning only the top layers first, then progressively unfreezing deeper layers; (3) PEFT methods (LoRA, adapters) that freeze pretrained weights entirely; (4) multi-task training that keeps the model exposed to diverse tasks."
        },
        {
          "question": "When should you use full fine-tuning vs LoRA vs adapters vs prompt tuning?",
          "answer": "Full fine-tuning: when you have sufficient compute, a large labeled dataset, and need maximum task performance. LoRA: when you need to maintain multiple task-specific variants of a model, or when memory is limited — same inference cost after weight merging. Adapters (Houlsby): when you need strict parameter isolation between tasks (adapters are modular and can be swapped). Prompt tuning: only at very large scale (≥10B parameters); simpler but underperforms at smaller scales. In practice, LoRA has largely superseded adapters due to lower overhead and no inference latency."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "gradient-descent",
      "title": "Gradient Descent and Adam Optimizer: Update Rules and Hyperparameters",
      "description": "Gradient descent iteratively updates weights as θ ← θ − η∇L; Adam adds adaptive per-parameter learning rates using first and second moment estimates with β₁=0.9, β₂=0.999, ε=1e-8.",
      "category": "training",
      "citation_snippet": "SGD updates θ ← θ − η∇L; Adam adapts per-parameter learning rates using m_t = β₁m_{t-1}+(1−β₁)g_t and v_t = β₂v_{t-1}+(1−β₂)g_t²; typical transformer settings: β₁=0.9, β₂=0.95–0.999, ε=1e-8 (Kingma & Ba, 2015).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1412.6980",
          "label": "Kingma & Ba (2015) — Adam: A Method for Stochastic Optimization. ICLR 2015"
        },
        {
          "url": "https://arxiv.org/abs/1711.05101",
          "label": "Loshchilov & Hutter (2019) — Decoupled Weight Decay Regularization (AdamW). ICLR 2019"
        },
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        }
      ],
      "data_points": [
        {
          "label": "SGD update rule",
          "value": "θ_{t+1} = θ_t − η · ∇_θ L",
          "unit": "",
          "note": "η = learning rate; ∇_θ L = gradient of loss w.r.t. parameters"
        },
        {
          "label": "Adam first moment (β₁)",
          "value": "0.9",
          "unit": "",
          "note": "Exponential moving average of gradients; controls gradient smoothing"
        },
        {
          "label": "Adam second moment (β₂)",
          "value": "0.999",
          "unit": "",
          "note": "Exponential moving average of squared gradients; controls adaptive scaling"
        },
        {
          "label": "Adam epsilon",
          "value": "1e-8",
          "unit": "",
          "note": "Numerical stability term; prevents division by zero when v_t ≈ 0"
        },
        {
          "label": "Transformer warmup steps (original)",
          "value": "4,000",
          "unit": "steps",
          "note": "lr = d_model^{-0.5} · min(step^{-0.5}, step · warmup_steps^{-1.5})"
        },
        {
          "label": "Typical peak learning rate (large models)",
          "value": "1e-4 to 3e-4",
          "unit": "",
          "note": "With warmup + cosine decay; exact value scales inversely with batch size"
        }
      ],
      "faq_items": [
        {
          "question": "Why does Adam outperform vanilla SGD for transformer training?",
          "answer": "Transformers have parameters across very different scales — embedding weights, attention projections, and FFN weights all have different gradient magnitudes. Adam's per-parameter adaptive learning rates normalize these differences: parameters with consistently large gradients get smaller effective learning rates, while parameters with small gradients get larger effective steps. This adaptive scaling makes Adam much more robust to the choice of global learning rate and is why it dominates in language model training."
        },
        {
          "question": "What is AdamW and why is it used instead of Adam?",
          "answer": "Original Adam implements L2 regularization by adding λθ to the gradient, which interacts with the adaptive learning rate scaling in an undesirable way — parameters updated infrequently get less regularization. Decoupled weight decay (AdamW, Loshchilov & Hutter 2019) applies weight decay directly to the parameters: θ_{t+1} = (1 − ηλ)θ_t − η·Adam_update. This correctly decouples weight decay from the gradient-based update, improving generalization. Most large model training uses AdamW."
        },
        {
          "question": "What is the transformer learning rate schedule?",
          "answer": "Vaswani et al. (2017) introduced a warmup-then-decay schedule: lr(step) = d_model^{-0.5} × min(step^{-0.5}, step × warmup_steps^{-1.5}). This linearly increases the learning rate for the first warmup_steps, then decreases proportionally to the inverse square root of the step count. The 4,000-step warmup in the original paper is roughly 4% of training — modern practice varies from 1% to 10% of total steps depending on model size and batch size."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "hallucination-mechanisms",
      "title": "Hallucination Mechanisms: Why Language Models Generate Plausible but Incorrect Text",
      "description": "Language models hallucinate when training data contains conflicting facts, when knowledge is rare in training, or when decoding amplifies early errors; hallucination rates 4–14× higher for rare entities.",
      "category": "evaluation",
      "citation_snippet": "Ji et al. (ACM Computing Surveys 2023): hallucination = content unsupported or contradicted by source; Mallen et al. (ACL 2023): entities in bottom-25% training frequency show 4–14× higher hallucination rates than top-25%; Maynez et al. (ACL 2020): ~30% of abstractive summaries contain hallucinated content.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2202.03629",
          "label": "Ji et al. (2023) — Survey of Hallucination in Natural Language Generation. ACM Computing Surveys"
        },
        {
          "url": "https://arxiv.org/abs/2212.10511",
          "label": "Mallen et al. (2022) — When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories. ACL 2023"
        },
        {
          "url": "https://arxiv.org/abs/2005.00661",
          "label": "Maynez et al. (2020) — On Faithfulness and Factuality in Abstractive Summarization. ACL 2020"
        }
      ],
      "data_points": [
        {
          "label": "Hallucination rate in abstractive summarization",
          "value": "~30%",
          "unit": "% summaries affected",
          "note": "Maynez et al. (2020): ~30% of abstractive summaries contain hallucinated content; extractive <1%"
        },
        {
          "label": "Frequency effect: rare vs common entities",
          "value": "4–14× higher rate",
          "unit": "× hallucination rate",
          "note": "Mallen et al. (2022): bottom-25% training-frequency entities hallucinate 4–14× vs top-25%"
        },
        {
          "label": "Intrinsic vs extrinsic hallucination",
          "value": "Both in ~30–50% of evaluated summaries",
          "unit": "% examples",
          "note": "Ji et al. (2023): intrinsic = contradicts source; extrinsic = adds information absent from source"
        },
        {
          "label": "RLHF effect on hallucination",
          "value": "Reduces on common topics; persists on rare ones",
          "unit": "",
          "note": "RLHF reward model trained by raters who cannot detect specialized errors; confident hallucinations remain"
        }
      ],
      "faq_items": [
        {
          "question": "What are the main mechanistic causes of hallucination?",
          "answer": "Four primary mechanisms: (1) Training data conflicts — corpora contain contradictory facts about entities; the model learns a distribution over these and generates a plausible but incorrect resolution. (2) Knowledge sparsity — entities appearing rarely in training have high weight uncertainty; the model generates plausible-sounding but incorrect attributes. (3) Decoding error amplification — greedy or beam decoding reinforces early factual errors across the sequence, since incorrect claims have high conditional probability given themselves. (4) Exposure bias — training on teacher-forced correct prefixes leaves models unprepared to recover from errors in their own previous output."
        },
        {
          "question": "Why does RLHF reduce but not eliminate hallucination?",
          "answer": "RLHF trains on human preference comparisons, penalizing outputs raters identify as incorrect. For frequently-encountered topics, raters can detect errors and the reward model learns to penalize them. For rare or specialized topics, human raters often fail to recognize incorrect claims — so the reward model does not penalize them and the trained policy learns to sound confident and fluent regardless of factual accuracy. The defining characteristic of hallucination is plausible language paired with wrong content, and RLHF primarily optimizes plausibility."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "in-context-learning",
      "title": "In-Context Learning: Task Adaptation from Prompt Examples Without Weight Updates",
      "description": "In-context learning adapts language models to tasks using only prompt-provided examples, no gradient updates; emergent above ~1B parameters; demonstrated at scale by GPT-3 (Brown et al., 2020).",
      "category": "agents-applications",
      "citation_snippet": "Brown et al. (NeurIPS 2020) GPT-3: k-shot ICL from prompt examples without weight updates; 32-shot achieves 79.3 on SuperGLUE vs 88.9 fine-tuned BERT; Min et al. (EMNLP 2022): randomly flipping demonstration labels drops accuracy only ~10%, indicating format/distribution matters more than correct labels.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2005.14165",
          "label": "Brown et al. (2020) — Language Models are Few-Shot Learners (GPT-3). NeurIPS 2020"
        },
        {
          "url": "https://arxiv.org/abs/2111.02080",
          "label": "Xie et al. (2021) — An Explanation of In-Context Learning as Implicit Bayesian Inference. ICLR 2022"
        },
        {
          "url": "https://arxiv.org/abs/2202.12837",
          "label": "Min et al. (2022) — Rethinking the Role of Demonstrations for ICL. EMNLP 2022"
        }
      ],
      "data_points": [
        {
          "label": "GPT-3 32-shot SuperGLUE score",
          "value": "79.3",
          "unit": "points",
          "note": "Brown et al. (2020): GPT-3 175B 32-shot; fine-tuned BERT-large achieves 88.9 — 9.6-point gap"
        },
        {
          "label": "GPT-3 1-shot TriviaQA",
          "value": "68.0%",
          "unit": "Exact Match",
          "note": "Brown et al. (2020): 0-shot = 64.3%; fine-tuned T5 = 50.1%; ICL surpasses fine-tuned T5"
        },
        {
          "label": "ICL emergent parameter threshold",
          "value": "~1B parameters",
          "unit": "parameters",
          "note": "Brown et al. (2020): meaningful ICL gains appear above ~1B parameters; minimal below"
        },
        {
          "label": "Label-flip impact on ICL accuracy",
          "value": "~10% drop",
          "unit": "% accuracy",
          "note": "Min et al. (2022): randomly flipping all demonstration labels drops accuracy only ~10%, not ~50%"
        }
      ],
      "faq_items": [
        {
          "question": "Does in-context learning actually learn from labeled examples, or does it retrieve task patterns?",
          "answer": "Min et al. (2022) found that randomly flipping all demonstration labels reduces accuracy by only ~10% (not ~50% as would be expected if correct labels were essential). This suggests ICL primarily identifies which task format to apply — the input format, output format, label space, and data distribution — rather than learning from individual labeled examples. Xie et al. (2021) formalize this as Bayesian inference: demonstrations are evidence that updates a prior over task concepts encoded during pre-training."
        },
        {
          "question": "What is the difference between in-context learning and fine-tuning?",
          "answer": "Fine-tuning updates model weights via gradient descent on task-specific data, permanently adapting the model. In-context learning freezes all weights — adaptation occurs entirely through the attention mechanism processing the prompt. Fine-tuning typically achieves 10–20% higher accuracy (on benchmark comparisons) but requires separate weights per task, training compute, and labeled data. ICL is instant, requires no training, handles many tasks from a single model, but is limited by context window length and provides noisier adaptation."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "context-window",
      "title": "Context Window: Sequence Length Limits, Positional Methods, and Long-Context Extensions",
      "description": "The context window is the maximum token sequence a transformer can process in one pass; the original transformer used 512 tokens; modern architectures extend to 128K–1M tokens via RoPE, ALiBi, or sliding-window attention.",
      "category": "representation",
      "citation_snippet": "Original transformer context window: 512 tokens; self-attention memory scales as O(n²), making long contexts expensive; RoPE enables extrapolation beyond training length; some architectures reach 1M-token context windows.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        },
        {
          "url": "https://arxiv.org/abs/2104.09864",
          "label": "Su et al. (2024) — RoFormer: Enhanced Transformer with Rotary Position Embedding. Neurocomputing 2024"
        },
        {
          "url": "https://arxiv.org/abs/2108.12409",
          "label": "Press et al. (2022) — Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation. ICLR 2022"
        },
        {
          "url": "https://arxiv.org/abs/1904.10509",
          "label": "Child et al. (2019) — Generating Long Sequences with Sparse Transformers. arXiv"
        }
      ],
      "data_points": [
        {
          "label": "Original transformer (2017)",
          "value": "512",
          "unit": "tokens",
          "note": "Limited by O(n²) attention memory at training time"
        },
        {
          "label": "Attention memory scaling",
          "value": "O(n²)",
          "unit": "",
          "note": "4× longer sequence = 16× more attention memory; dominant cost at long context"
        },
        {
          "label": "KV cache size (n_layers=32, n_heads=32, d_head=128, n=4096, fp16)",
          "value": "2 × 32 × 32 × 128 × 4096 × 2 bytes ≈ 2.1 GB",
          "unit": "bytes",
          "note": "KV cache dominates memory at long contexts; grows linearly with sequence length"
        },
        {
          "label": "RoPE extrapolation",
          "value": "Tested to 8× training length",
          "unit": "relative",
          "note": "Su et al. (2024): RoPE maintains performance well beyond training context length"
        },
        {
          "label": "Sparse attention memory reduction",
          "value": "O(n·√n)",
          "unit": "",
          "note": "Child et al. (2019) Sparse Transformer; reduces quadratic bottleneck"
        }
      ],
      "faq_items": [
        {
          "question": "Why does the context window matter for language models?",
          "answer": "The context window determines how much prior text the model can 'see' when generating each token. Tasks like long-document summarization, code analysis across files, and multi-turn conversation require large context windows. A model processing its 5,000th token can only use information from within the context window — earlier tokens are effectively forgotten if they exceed the limit."
        },
        {
          "question": "What limits context window size?",
          "answer": "Two factors: memory and compute. The attention weight matrix is n×n, requiring O(n²) memory. For n=32K tokens at fp16 per attention head, the attention matrix alone is 32K × 32K × 2 bytes ≈ 2GB per head. The KV cache (cached key-value pairs for all previous tokens) grows linearly with sequence length but can reach tens of GB for long sequences with many layers."
        },
        {
          "question": "How do modern positional encodings enable longer context?",
          "answer": "Sinusoidal encodings (2017) work for lengths up to those seen during training but degrade on longer inputs. RoPE (rotary position embedding) applies rotation matrices that naturally generalize to longer sequences. ALiBi (Attention with Linear Biases) applies a linear bias to attention scores based on distance, trained on short contexts but generalizing to longer ones with minimal degradation. Both enable the 'train short, test long' paradigm."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "inference-vs-training-compute",
      "title": "Inference vs Training Compute: FLOPs per Token vs Total Training Cost",
      "description": "Training a dense transformer requires ~6·N·D FLOPs total; inference costs ~2·N FLOPs per token; training dominates total compute for all frontier language models.",
      "category": "inference",
      "citation_snippet": "Training FLOPs ≈ 6·N·D for dense transformers (N parameters, D tokens); inference ≈ 2·N FLOPs per token; a 70B model requires ~1.4×10¹¹ FLOPs per token vs ~5.9×10²³ total training FLOPs (Chinchilla-optimal); training = ~4.2 trillion inference tokens equivalent.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2001.08361",
          "label": "Kaplan et al. (2020) — Scaling Laws for Neural Language Models. arXiv"
        },
        {
          "url": "https://arxiv.org/abs/2203.15556",
          "label": "Hoffmann et al. (2022) — Training Compute-Optimal Large Language Models (Chinchilla). NeurIPS 2022"
        },
        {
          "url": "https://arxiv.org/abs/2104.10350",
          "label": "Patterson et al. (2021) — Carbon Emissions and Large Neural Network Training. arXiv"
        }
      ],
      "data_points": [
        {
          "label": "Training FLOPs approximation",
          "value": "C_train ≈ 6·N·D",
          "unit": "FLOPs",
          "note": "N = parameters, D = tokens; factor 6 = 2 (forward) + 4 (backward + optimizer step)"
        },
        {
          "label": "Inference FLOPs per token (with KV cache)",
          "value": "C_infer ≈ 2·N",
          "unit": "FLOPs/token",
          "note": "Two matrix multiplications per token; no backward pass; past tokens cached, no recomputation"
        },
        {
          "label": "Training-equivalent inference tokens",
          "value": "C_train / C_infer = 3·D",
          "unit": "inference tokens",
          "note": "6·N·D training FLOPs ÷ 2·N per inference token = 3D inference tokens equivalent"
        },
        {
          "label": "70B Chinchilla training FLOPs",
          "value": "~5.9×10²³",
          "unit": "FLOPs",
          "note": "70B params × 1.4T tokens × 6 ≈ 5.9×10²³; requires ~22,000 A100-days at peak efficiency (312 TFLOPS)"
        },
        {
          "label": "70B inference FLOPs per token",
          "value": "~1.4×10¹¹",
          "unit": "FLOPs",
          "note": "2 × 70×10⁹ = 1.4×10¹¹; A100 at 312 TFLOPS processes ~2000 tokens/s for a single request"
        }
      ],
      "faq_items": [
        {
          "question": "Why is the backward pass approximately 2× more expensive than the forward pass?",
          "answer": "The forward pass computes activations and loss in one traversal through the network. Backpropagation must apply the chain rule in reverse: one pass computes gradients of the loss w.r.t. activations (∂L/∂a), another computes gradients w.r.t. weights (∂L/∂W). The Adam optimizer step adds computation for updating first and second moment estimates per parameter. Together, backward + optimizer ≈ 4× forward pass compute, giving a training total of ~6× forward pass per step."
        },
        {
          "question": "Why is inference memory-bandwidth bound at small batch sizes?",
          "answer": "For a single request (batch size 1), generating one token requires loading all N model parameters from GPU HBM memory to compute 2·N FLOPs. The arithmetic intensity is 2·N FLOPs / (2N bytes for FP16) = 1 FLOP/byte — far below the A100's compute-to-memory ratio (~300:1). The GPU spends most time waiting for memory transfers, not computing. Larger batch sizes improve arithmetic intensity by amortizing the weight loads over multiple simultaneous requests."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "knowledge-distillation",
      "title": "Knowledge Distillation: Soft Targets, Temperature Scaling, and Compression Ratios",
      "description": "Knowledge distillation trains a small student model on the soft probability outputs of a large teacher model; DistilBERT retains 97% of BERT's performance at 60% of the size using temperature T=4 for soft targets.",
      "category": "representation",
      "citation_snippet": "Knowledge distillation trains a student on teacher soft labels at temperature T; DistilBERT achieves 97% of BERT's GLUE score at 60% the size (66M vs 110M parameters) using T=4 and a distillation loss coefficient of 0.9 (Sanh et al., 2019).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1503.02531",
          "label": "Hinton et al. (2015) — Distilling the Knowledge in a Neural Network. NIPS 2015 Workshop"
        },
        {
          "url": "https://arxiv.org/abs/1910.01108",
          "label": "Sanh et al. (2019) — DistilBERT, a distilled version of BERT. NeurIPS 2019 Workshop"
        },
        {
          "url": "https://arxiv.org/abs/1909.10351",
          "label": "Jiao et al. (2020) — TinyBERT: Distilling BERT for Natural Language Understanding. EMNLP 2020"
        }
      ],
      "data_points": [
        {
          "label": "DistilBERT size vs BERT",
          "value": "66M vs 110M parameters",
          "unit": "",
          "note": "40% parameter reduction; 60% of original size"
        },
        {
          "label": "DistilBERT GLUE score",
          "value": "97%",
          "unit": "relative to BERT",
          "note": "Sanh et al. (2019); retains 97% of BERT-base performance on GLUE benchmark"
        },
        {
          "label": "DistilBERT inference speed",
          "value": "60%",
          "unit": "faster",
          "note": "60% faster than BERT-base on CPU; 40% smaller memory footprint"
        },
        {
          "label": "Distillation temperature (DistilBERT)",
          "value": "4",
          "unit": "T",
          "note": "T=4 produces softer distributions; temperature applied during distillation training; student uses T=1 at inference"
        },
        {
          "label": "TinyBERT compression",
          "value": "7.5×",
          "unit": "smaller",
          "note": "14.5M parameters; retains 96.8% of BERT teacher performance on GLUE"
        }
      ],
      "faq_items": [
        {
          "question": "Why use soft labels from a teacher instead of just training on hard labels?",
          "answer": "Hard labels (one-hot vectors) provide training signal only from the correct class. Soft labels from a large teacher encode the teacher's uncertainty distribution across all classes — for instance, a teacher might assign 'automobile' 0.90 probability and 'truck' 0.09 probability. This inter-class similarity information (Hinton et al. call it 'dark knowledge') helps the student generalize better than hard labels alone, particularly with limited data."
        },
        {
          "question": "What is the role of temperature in knowledge distillation?",
          "answer": "Temperature T controls how 'soft' the teacher's probability distribution is. At T=1, the teacher's standard softmax outputs are used. At T=4 (as in DistilBERT), the logits are divided by 4 before softmax, spreading probability more evenly across classes and making the relative sizes of small probabilities more influential during training. A higher temperature emphasizes the relationships between wrong answers, which contain more transferable structural information."
        },
        {
          "question": "Can distillation work across different architectures?",
          "answer": "Yes — the teacher and student need not share the same architecture. TinyBERT (Jiao et al., 2020) distills not only the output logits but also intermediate layer activations, attention patterns, and embedding layers, requiring architecture-specific alignment layers. This 'intermediate distillation' achieves better results than output-only distillation. Cross-architecture distillation is more complex but enables more aggressive compression."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "layer-normalization",
      "title": "Layer Normalization: Formula, Pre-Norm vs Post-Norm, and Training Stability",
      "description": "Layer normalization computes μ and σ over the feature dimension for each token independently, then applies learnable scale γ and shift β; pre-norm placement stabilizes deep transformer training.",
      "category": "architecture",
      "citation_snippet": "Layer normalization normalizes across d_model features per token: y = γ·(x−μ)/σ + β; applied before each sublayer in pre-norm transformers; enables stable training of 100+ layer networks (Ba et al., 2016).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1607.06450",
          "label": "Ba et al. (2016) — Layer Normalization. arXiv 2016"
        },
        {
          "url": "https://arxiv.org/abs/2002.04745",
          "label": "Xiong et al. (2020) — On Layer Normalization in the Transformer Architecture. ICML 2020"
        },
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        }
      ],
      "data_points": [
        {
          "label": "Normalization formula",
          "value": "y = γ·(x−μ)/σ + β",
          "unit": "",
          "note": "μ = mean over d_model features; σ = std dev; γ, β learnable per dimension"
        },
        {
          "label": "Normalization axis",
          "value": "d_model features",
          "unit": "",
          "note": "Computes statistics within each token independently; not across batch or sequence"
        },
        {
          "label": "Learnable parameters per layer",
          "value": "2 × d_model",
          "unit": "parameters",
          "note": "γ ∈ ℝ^{d_model} and β ∈ ℝ^{d_model}; initialized γ=1, β=0"
        },
        {
          "label": "Pre-norm training speed advantage",
          "value": "~2×",
          "unit": "relative convergence",
          "note": "Xiong et al. (2020): pre-norm converges faster and is more stable than post-norm"
        },
        {
          "label": "Parameters for base transformer (12 layers)",
          "value": "12 × 4 × 2 × 512 = 49,152",
          "unit": "parameters",
          "note": "4 LayerNorm per layer (2 encoder, 2 decoder), 2×d_model each"
        }
      ],
      "faq_items": [
        {
          "question": "Why use layer normalization instead of batch normalization in transformers?",
          "answer": "Batch normalization computes statistics across the batch dimension, which is problematic for variable-length sequences and small batch sizes. Layer normalization normalizes across the feature dimension for each example independently, making it batch-size-agnostic and equally effective during inference. Ba et al. (2016) showed layer norm particularly suits recurrent and attention-based architectures."
        },
        {
          "question": "What is pre-norm vs post-norm and which is better?",
          "answer": "Post-norm (original transformer): LayerNorm(x + Sublayer(x)). Pre-norm: x + Sublayer(LayerNorm(x)). Xiong et al. (2020) showed that post-norm transformers require careful learning rate warmup to avoid divergence, while pre-norm transformers converge more reliably without warmup. Most modern large models use pre-norm (also called 'pre-layer normalization')."
        },
        {
          "question": "Does layer normalization add many parameters?",
          "answer": "Layer normalization adds 2×d_model parameters per sublayer (γ and β, one per feature dimension). For the base transformer with d_model=512 and 30 normalization operations (2 per encoder layer × 6 + 3 per decoder layer × 6 = 30), that is 30 × 2 × 512 = 30,720 parameters — less than 0.05% of the 65M total parameter count."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "instruction-tuning",
      "title": "Instruction Tuning: Zero-Shot Generalization via Multi-Task Fine-Tuning",
      "description": "Instruction tuning fine-tunes language models on diverse instruction-formatted tasks, producing dramatic zero-shot generalization to unseen tasks without any task-specific examples.",
      "category": "alignment",
      "citation_snippet": "Wei et al. (2022) FLAN: instruction-tuning on 62 tasks improves zero-shot performance on 25 of 25 held-out tasks; 137B FLAN outperforms GPT-3 175B zero-shot on 20 of 25 tasks.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2109.01652",
          "label": "Wei et al. (2022) — Finetuned Language Models are Zero-Shot Learners. ICLR 2022"
        },
        {
          "url": "https://arxiv.org/abs/2110.08207",
          "label": "Sanh et al. (2022) — Multitask Prompted Training Enables Zero-Shot Task Generalization. ICLR 2022"
        },
        {
          "url": "https://arxiv.org/abs/2210.11610",
          "label": "Chung et al. (2022) — Scaling Instruction-Finetuned Language Models. arXiv"
        }
      ],
      "data_points": [
        {
          "label": "FLAN task count (original)",
          "value": "62",
          "unit": "tasks",
          "note": "Wei et al. (2022); tasks grouped into 12 clusters; held-out tasks tested zero-shot"
        },
        {
          "label": "FLAN zero-shot improvement rate",
          "value": "25/25",
          "unit": "held-out tasks",
          "note": "FLAN 137B outperforms untuned 137B on all 25 held-out task clusters zero-shot"
        },
        {
          "label": "FLAN-T5 (Flan-PaLM) task count",
          "value": "1,836",
          "unit": "fine-tuning tasks",
          "note": "Chung et al. (2022): scaling to 1836 tasks further improves zero-shot and few-shot performance"
        },
        {
          "label": "T0 training tasks",
          "value": "171",
          "unit": "datasets",
          "note": "Sanh et al. (2022) T0: trained on 171 prompted datasets; zero-shot on 4 held-out SuperGLUE tasks"
        },
        {
          "label": "FLAN MMLU improvement",
          "value": "+10.2%",
          "unit": "accuracy",
          "note": "Chung et al. Flan-PaLM 540B vs PaLM 540B on MMLU 5-shot: 70.9% vs 69.3% (+1.6%); Flan improves few-shot too"
        }
      ],
      "faq_items": [
        {
          "question": "What is the difference between instruction tuning and standard fine-tuning?",
          "answer": "Standard fine-tuning adapts a model to a single task by continuing training on that task's labeled data. Instruction tuning trains on a large collection of tasks formatted as natural language instructions, explicitly optimizing for generalization. The key result (Wei et al., 2022): models fine-tuned on 62 instructionformatted tasks generalize zero-shot to held-out tasks, while the same model without instruction tuning does not generalize. Instruction tuning teaches the model to 'follow instructions' as a meta-skill."
        },
        {
          "question": "Why does instruction tuning only help larger models?",
          "answer": "Wei et al. (2022) showed that instruction tuning degrades performance on held-out tasks for models with fewer than ~100B parameters. For small models, instruction tuning on many tasks causes interference — the model memorizes task-specific patterns rather than learning generalizable instruction-following. Above ~100B parameters, the model has enough capacity to abstract the meta-skill of following natural language instructions."
        },
        {
          "question": "What does prompt/instruction format matter?",
          "answer": "Sanh et al. (2022) showed that prompting format significantly affects zero-shot transfer. They trained T0 on 171 datasets each with multiple human-written prompt templates, forcing the model to be robust to natural variation in how instructions are expressed. This prompt-variety training improved generalization compared to training on a single canonical format per task."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "lora-fine-tuning",
      "title": "LoRA: Low-Rank Adaptation for Parameter-Efficient Fine-Tuning",
      "description": "LoRA reparameterizes weight updates as low-rank matrix products ΔW=BA, reducing trainable parameters by 10,000× while matching or exceeding full fine-tuning quality on downstream tasks.",
      "category": "alignment",
      "citation_snippet": "LoRA (Hu et al., 2021): rank-4 decomposition ΔW=BA reduces trainable parameters to 0.01% of full model while matching full fine-tuning BLEU on E2E NLG; no inference latency added after weight merging.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2106.09685",
          "label": "Hu et al. (2021) — LoRA: Low-Rank Adaptation of Large Language Models. ICLR 2022"
        },
        {
          "url": "https://arxiv.org/abs/2305.14314",
          "label": "Dettmers et al. (2023) — QLoRA: Efficient Finetuning of Quantized LLMs. NeurIPS 2023"
        },
        {
          "url": "https://arxiv.org/abs/2012.13255",
          "label": "Aghajanyan et al. (2021) — Intrinsic Dimensionality Explains the Effectiveness of Language Model Fine-Tuning. ACL 2021"
        }
      ],
      "data_points": [
        {
          "label": "LoRA trainable parameters (rank-4)",
          "value": "~0.01%",
          "unit": "of full model",
          "note": "Hu et al.: 4.7M trainable vs 175B total for GPT-3 scale model at rank 4"
        },
        {
          "label": "Rank used in Hu et al. experiments",
          "value": "4–8",
          "unit": "rank r",
          "note": "Ranks 4 and 8 match or exceed full fine-tuning; very small r suffices for most tasks"
        },
        {
          "label": "E2E NLG BLEU — LoRA vs full fine-tuning",
          "value": "68.6 vs 68.2",
          "unit": "BLEU",
          "note": "LoRA (rank 4) slightly outperforms full fine-tuning on E2E NLG benchmark (Hu et al. Table 4)"
        },
        {
          "label": "Memory reduction (LoRA vs full fine-tune)",
          "value": "3×",
          "unit": "GPU memory",
          "note": "No optimizer states for frozen weights; full fine-tuning stores Adam states for all params"
        },
        {
          "label": "QLoRA quantization",
          "value": "4-bit NormalFloat",
          "unit": "quantization",
          "note": "Dettmers et al.: 4-bit quantized base model + LoRA adapters; 65B model fits on single GPU"
        }
      ],
      "faq_items": [
        {
          "question": "Why does low-rank adaptation work — doesn't the model need to change all its weights?",
          "answer": "Aghajanyan et al. (2021) showed that fine-tuning has a low intrinsic dimensionality: models trained for downstream tasks converge to solutions that can be expressed as perturbations in a very low-dimensional subspace of weight space. LoRA exploits this by restricting weight updates to rank-r matrices. Even with r=1 or r=4, the model can capture the task-specific signal because the pretrained weights already encode most of the required knowledge; only a small directional update is needed."
        },
        {
          "question": "How is LoRA merged for inference — does it add compute?",
          "answer": "At inference time, LoRA weights are merged into the frozen weights: W' = W + BA. This requires a single O(d²) addition done once before serving. After merging, the model has the same architecture and computational cost as the original — no adapter layers, no extra forward pass branches, no latency penalty. This is a key advantage over other PEFT methods that leave adapter modules in the computation graph."
        },
        {
          "question": "Which weight matrices should LoRA be applied to?",
          "answer": "Hu et al. (2021) tested applying LoRA to different subsets of weight matrices in the attention mechanism. Applying LoRA to all four attention matrices (Q, K, V, output projection) at rank 8 achieves the best results. Applying only to query and value matrices at rank 16 achieves comparable results. The feedforward layers can also be adapted but empirically contribute less per parameter. Most practitioners apply LoRA to Q, K, V projections at minimum."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "kv-cache",
      "title": "KV Cache: Key-Value Caching for Efficient Autoregressive Inference",
      "description": "The KV cache stores pre-computed attention key and value matrices from previous tokens, eliminating redundant computation during autoregressive decoding; memory scales as 2·n_layers·n_heads·d_head·seq_len·bytes per element.",
      "category": "representation",
      "citation_snippet": "KV caching stores key-value pairs from previous tokens, reducing inference FLOPs per step from O(n·d²) to O(d²); cache size for a 32-layer, 32-head, d_head=128 model at 4K tokens is ~536 MB at fp16.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        },
        {
          "url": "https://arxiv.org/abs/2211.05100",
          "label": "Pope et al. (2023) — Efficiently Scaling Transformer Inference. MLSys 2023"
        },
        {
          "url": "https://arxiv.org/abs/2305.13245",
          "label": "Ainslie et al. (2023) — GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints. EMNLP 2023"
        }
      ],
      "data_points": [
        {
          "label": "KV cache memory formula",
          "value": "2 × n_layers × n_heads × d_head × seq_len × dtype_bytes",
          "unit": "bytes",
          "note": "Factor 2 for keys + values; doubles per additional layer and head"
        },
        {
          "label": "Example: 32 layers, 32 heads, d_head=128, 4096 tokens, fp16",
          "value": "536",
          "unit": "MB",
          "note": "2 × 32 × 32 × 128 × 4096 × 2 = 536,870,912 bytes ≈ 536 MB"
        },
        {
          "label": "Inference FLOPs without KV cache (token t)",
          "value": "O(t·d²)",
          "unit": "",
          "note": "Must recompute all previous K, V from scratch for each new token"
        },
        {
          "label": "Inference FLOPs with KV cache (token t)",
          "value": "O(d²)",
          "unit": "",
          "note": "Only compute K, V for the new token; attend over cached K, V for all prior tokens"
        },
        {
          "label": "Multi-Query Attention KV size reduction",
          "value": "8–32×",
          "unit": "reduction",
          "note": "MQA/GQA uses 1 or G < h KV heads shared across query heads; reduces KV cache proportionally"
        }
      ],
      "faq_items": [
        {
          "question": "Why is a KV cache necessary for autoregressive inference?",
          "answer": "During autoregressive generation, the model generates one token at a time. Without caching, to generate token t it would need to recompute the attention keys and values for all t−1 previous tokens in every layer — scaling as O(t) per new token. With the KV cache, keys and values are computed once and stored; generating each new token only adds O(1) new KV pairs, making the per-token inference cost constant rather than growing with sequence length."
        },
        {
          "question": "How large does the KV cache get in practice?",
          "answer": "KV cache size = 2 × n_layers × n_heads × d_head × seq_len × bytes_per_element. For a medium-scale model (32 layers, 32 heads, d_head=128) running at fp16 with a 4K token context, the KV cache is ~536 MB. At 128K tokens, the same model requires ~17 GB of KV cache alone — often exceeding the weight memory at short contexts. This is why long-context inference requires careful memory management."
        },
        {
          "question": "What is Multi-Query Attention and how does it reduce KV cache?",
          "answer": "Standard multi-head attention (MHA) maintains separate K and V projections for each of the h heads. Multi-Query Attention (MQA, Shazeer 2019) uses a single shared K and V projection across all query heads, reducing KV cache size by factor h. Grouped Query Attention (GQA, Ainslie et al. 2023) is a middle ground with G groups (G < h shared KV heads), reducing cache size by h/G while retaining most of MHA's quality."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "multi-head-attention",
      "title": "Multi-Head Attention: Projection Matrices, Parameter Count, and Head Ablations",
      "description": "Multi-head attention with h=8 heads projects queries, keys, and values into d_k=64 subspaces, concatenates the results, and applies a final linear transform, totaling ~1.05M parameters per attention block in the base model.",
      "category": "architecture",
      "citation_snippet": "Multi-head attention uses h=8 heads with d_k=64 each; the base transformer's attention block contains ~1.05M parameters; ablations show 8 heads achieves 25.8 BLEU vs 24.9 for a single head (Vaswani et al., 2017).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        },
        {
          "url": "https://arxiv.org/abs/1905.09418",
          "label": "Voita et al. (2019) — Analyzing Multi-Head Self-Attention: Specialized Heads Do the Heavy Lifting. ACL 2019"
        },
        {
          "url": "https://arxiv.org/abs/1905.10650",
          "label": "Michel et al. (2019) — Are Sixteen Heads Really Better than One? NeurIPS 2019"
        }
      ],
      "data_points": [
        {
          "label": "Number of heads (base model)",
          "value": "8",
          "unit": "heads",
          "note": "d_k = d_v = d_model/h = 512/8 = 64"
        },
        {
          "label": "d_k per head",
          "value": "64",
          "unit": "dimensions",
          "note": "Each of the 8 heads projects into a 64-dimensional subspace"
        },
        {
          "label": "Parameters per W_Q / W_K / W_V",
          "value": "512 × 64 = 32,768",
          "unit": "parameters",
          "note": "Per head; all 8 heads together: 3 × 8 × 32,768 = 786,432 parameters"
        },
        {
          "label": "W_O projection parameters",
          "value": "512 × 512 = 262,144",
          "unit": "parameters",
          "note": "Final output projection; maps concatenated 512-dim back to d_model=512"
        },
        {
          "label": "Total attention block parameters",
          "value": "~1,048,576",
          "unit": "parameters",
          "note": "786,432 (input projections) + 262,144 (output projection) = 1,048,576"
        },
        {
          "label": "BLEU with 1 head vs 8 heads",
          "value": "24.9 vs 25.8",
          "unit": "BLEU",
          "note": "WMT EN-DE; single-head is 0.9 BLEU worse; ablation from Table 3 row A"
        },
        {
          "label": "BLEU with 16 heads (base model)",
          "value": "25.1",
          "unit": "BLEU",
          "note": "0.7 BLEU below 8-head optimum; too many heads hurts performance"
        }
      ],
      "faq_items": [
        {
          "question": "Why use multiple attention heads instead of one large attention operation?",
          "answer": "Multiple heads allow the model to jointly attend to information from different representation subspaces at different positions. A single head averages all this information, losing the ability to specialize. With h=8 heads, each head can learn to track different syntactic or semantic relationships simultaneously."
        },
        {
          "question": "How does multi-head attention keep the total computation constant?",
          "answer": "Each head operates on d_k = d_model/h dimensions, so the per-head computation is reduced proportionally. Running h heads at d_k = 64 each involves the same total floating-point operations as a single head at d_k = 512, while enabling richer, parallel representations."
        },
        {
          "question": "What did ablation studies show about the optimal number of heads?",
          "answer": "Vaswani et al. (2017) found in Table 3 that 8 heads achieves 25.8 BLEU on WMT EN-DE; single-head attention scores 24.9 BLEU (−0.9), 4 heads scores 25.5 BLEU, 16 heads scores 25.1 BLEU, and 32 heads scores 25.4 BLEU. Performance degrades at both extremes, suggesting 8 heads is the practical optimum for the base model."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "neural-network-fundamentals",
      "title": "Neural Network Fundamentals: Universal Approximation, Depth vs Width, and Activation Functions",
      "description": "The universal approximation theorem proves a 2-layer network with sufficient width approximates any continuous function; depth enables exponentially more efficient representations than width; ReLU enables training of deep networks.",
      "category": "training",
      "citation_snippet": "Universal approximation theorem: a 2-layer network with n hidden units approximates any continuous function on compact subsets of ℝ^d (Cybenko, 1989); depth enables exponentially more compact representations than equivalent-width shallow networks.",
      "sources": [
        {
          "url": "https://doi.org/10.1007/BF02551274",
          "label": "Cybenko (1989) — Approximation by Superpositions of a Sigmoidal Function. Mathematics of Control, Signals and Systems"
        },
        {
          "url": "https://doi.org/10.1016/0893-6080(91)90009-T",
          "label": "Hornik (1991) — Approximation Capabilities of Multilayer Feedforward Networks. Neural Networks"
        },
        {
          "url": "https://arxiv.org/abs/1502.01852",
          "label": "He et al. (2015) — Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification. ICCV 2015"
        }
      ],
      "data_points": [
        {
          "label": "Universal approximation theorem",
          "value": "∃ 2-layer net that ε-approximates any f ∈ C([0,1]^d)",
          "unit": "",
          "note": "Cybenko (1989); requires potentially exponential width; depth reduces this"
        },
        {
          "label": "Depth efficiency (exponential)",
          "value": "Exponential",
          "unit": "functions representable",
          "note": "Depth-L network can represent functions requiring exponential width in a depth-1 network"
        },
        {
          "label": "ReLU activation",
          "value": "f(x) = max(0, x)",
          "unit": "",
          "note": "Simple; enables vanishing gradient mitigation; creates piecewise-linear functions"
        },
        {
          "label": "GeLU at x=0",
          "value": "0.5",
          "unit": "output",
          "note": "GeLU(x) = x·Φ(x) where Φ is Gaussian CDF; smooth approximation to ReLU"
        },
        {
          "label": "He initialization for ReLU",
          "value": "W ~ N(0, 2/n_in)",
          "unit": "",
          "note": "He et al. (2015): variance scaled by 2/n for ReLU to preserve activation variance"
        }
      ],
      "faq_items": [
        {
          "question": "What does the universal approximation theorem actually guarantee?",
          "answer": "The theorem (Cybenko 1989, Hornik 1991) proves that a feedforward neural network with a single hidden layer and a non-polynomial activation function can approximate any continuous function on a compact domain to arbitrary precision, given sufficient hidden units. The key limitation: it does not guarantee learning — only existence. The required width can be exponential in input dimension, and gradient descent may not find the optimal solution."
        },
        {
          "question": "Why does depth matter if shallow networks are universal approximators?",
          "answer": "While shallow (2-layer) networks are universal approximators in theory, they may require exponentially many hidden units to represent certain functions. Deep networks can represent the same functions with exponentially fewer total neurons by composing simpler functions. In practice, depth enables hierarchical feature learning — early layers detect simple patterns; later layers combine them into complex abstractions. This is why all successful large language models use dozens to hundreds of layers."
        },
        {
          "question": "Why did ReLU replace sigmoid and tanh as the standard activation?",
          "answer": "Sigmoid and tanh saturate at large positive and negative values, producing near-zero gradients (vanishing gradient problem). ReLU = max(0,x) has gradient exactly 1 for positive inputs and 0 for negative inputs. The constant gradient for positive inputs prevents vanishing gradients in deep networks. ReLU also computes faster than sigmoid/tanh and produces sparse activations (many neurons are exactly zero), which has a useful regularization effect."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "masked-language-modeling",
      "title": "Masked Language Modeling: BERT's Pre-Training Objective and Bidirectional Context",
      "description": "Masked language modeling (MLM) randomly masks 15% of input tokens and trains the model to predict them using bidirectional context; BERT achieves this with the [MASK] token replacement strategy validated in Devlin et al. (2019).",
      "category": "training",
      "citation_snippet": "BERT's MLM masks 15% of tokens — 80% replaced with [MASK], 10% random token, 10% unchanged — enabling bidirectional context encoding; BERT-large achieved 87.6 on GLUE, surpassing all prior models by 7.0 points (Devlin et al., 2019).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1810.04805",
          "label": "Devlin et al. (2019) — BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. NAACL 2019"
        },
        {
          "url": "https://arxiv.org/abs/1907.11692",
          "label": "Liu et al. (2019) — RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv"
        },
        {
          "url": "https://arxiv.org/abs/1903.07785",
          "label": "Baevski et al. (2019) — cloze-driven Pretraining of Self-attention Networks. EMNLP 2019"
        }
      ],
      "data_points": [
        {
          "label": "Masking rate",
          "value": "15%",
          "unit": "of tokens",
          "note": "Per input sequence; chosen as balance between too few signal vs. too much corruption"
        },
        {
          "label": "Masking strategy breakdown",
          "value": "80% [MASK], 10% random, 10% unchanged",
          "unit": "",
          "note": "Random and unchanged tokens prevent model from only learning to predict [MASK] tokens"
        },
        {
          "label": "BERT-large GLUE score",
          "value": "87.6",
          "unit": "GLUE average",
          "note": "Devlin et al. (2019); +7.0 points over prior best; trained on 3.3B words × 40 epochs"
        },
        {
          "label": "RoBERTa masking improvement",
          "value": "+1.2",
          "unit": "GLUE points",
          "note": "Dynamic masking (new mask per epoch) vs static masking; Liu et al. (2019)"
        },
        {
          "label": "BERT training tokens",
          "value": "~13.7",
          "unit": "billion",
          "note": "3.3B BooksCorpus + Wikipedia × 40 epochs at 128 seq_len + 40 epochs at 512"
        }
      ],
      "faq_items": [
        {
          "question": "Why does BERT use both [MASK], random, and unchanged tokens?",
          "answer": "If all 15% of selected tokens were always replaced with [MASK], the model would learn to predict tokens only when seeing [MASK] — but [MASK] never appears at inference time. To prevent this train/test mismatch, 10% of selected tokens are replaced with a random word and 10% are left unchanged. The model must learn to predict the original token even when the input appears normal, making representations more robust and usable for downstream tasks without masking."
        },
        {
          "question": "What is the difference between MLM and causal language modeling?",
          "answer": "MLM uses bidirectional context — when predicting a masked token, the model can attend to tokens both before and after the mask. Causal LM (next-token prediction) uses only left context. Bidirectional context makes MLM-trained models (like BERT) better for understanding tasks (classification, question answering, named entity recognition) but unable to generate text autoregressively. Causal LM models are naturally generative but rely on left-to-right context only."
        },
        {
          "question": "What is dynamic masking and why does it help?",
          "answer": "Static masking (original BERT) generates the mask once during data preprocessing, so the model sees the same masked positions repeatedly across epochs. Dynamic masking (RoBERTa, Liu et al. 2019) generates a new random mask for each training instance at each epoch, so the model never sees the same (sequence, mask) pair twice. Liu et al. found this improves GLUE by ~1.2 points and is one of several optimizations in RoBERTa that improved on BERT without architectural changes."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "mixture-of-experts",
      "title": "Mixture of Experts: Sparse Gating, Switch Transformer, and Efficient Scaling",
      "description": "Mixture of Experts (MoE) routes each token to k-of-N expert feed-forward layers via a learned gating function; Switch Transformer scales to 1.6 trillion parameters while activating only ~7 billion parameters per token.",
      "category": "representation",
      "citation_snippet": "Sparse MoE routes each token to top-k of N expert FFN layers; Switch Transformer (Fedus et al., 2022) uses k=1 routing to scale to 1.6T parameters, activating ~7B per token — 7× pre-training speedup over dense T5-11B.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2101.03961",
          "label": "Fedus et al. (2022) — Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity. JMLR 2022"
        },
        {
          "url": "https://arxiv.org/abs/1701.06538",
          "label": "Shazeer et al. (2017) — Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. ICLR 2017"
        },
        {
          "url": "https://arxiv.org/abs/2006.16668",
          "label": "Lepikhin et al. (2021) — GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. ICLR 2021"
        }
      ],
      "data_points": [
        {
          "label": "Switch Transformer total parameters",
          "value": "1.6",
          "unit": "trillion",
          "note": "Fedus et al. (2022); each token activates ~7B parameters via top-1 routing"
        },
        {
          "label": "Switch Transformer parameters activated per token",
          "value": "~7",
          "unit": "billion",
          "note": "Only 1 expert per MoE layer activated; ~0.4% of total parameters per token"
        },
        {
          "label": "Pre-training speedup (Switch vs dense T5-11B)",
          "value": "7×",
          "unit": "faster",
          "note": "Same compute budget; Switch reaches equivalent perplexity 7× faster in steps"
        },
        {
          "label": "Typical top-k routing",
          "value": "k = 1 or 2",
          "unit": "experts per token",
          "note": "k=1 (Switch); k=2 (GShard, most other MoE); k>2 shows diminishing returns"
        },
        {
          "label": "Expert capacity factor",
          "value": "1.0–1.5",
          "unit": "",
          "note": "Maximum tokens per expert = capacity_factor × (tokens/n_experts); overflow tokens skip MoE layer"
        }
      ],
      "faq_items": [
        {
          "question": "How does sparse MoE differ from a standard transformer FFN layer?",
          "answer": "A standard transformer FFN applies the same learned weight matrix to every token. A sparse MoE layer has N expert FFN networks (each identical in structure to a standard FFN) and a gating network that routes each token to the top-k experts. Only the selected experts' parameters are used for each token. If N=64 experts and k=2, each token activates 2/64 ≈ 3% of the expert parameters, giving the model large total capacity while keeping per-token compute nearly the same as a single expert."
        },
        {
          "question": "What is load balancing in MoE models and why does it matter?",
          "answer": "Load balancing ensures tokens are distributed roughly evenly across experts. Without it, the gating network tends to collapse — repeatedly sending most tokens to the same few experts — leaving others undertrained. Switch Transformer addresses this with an auxiliary load balancing loss: L_aux = α · Σᵢ f_i · p_i, where f_i is the fraction of tokens routed to expert i and p_i is the average gating probability. This encourages uniform routing during training."
        },
        {
          "question": "How are MoE models trained in practice?",
          "answer": "MoE layers are distributed across multiple accelerators, with each device hosting a subset of experts. During the forward pass, tokens are dispatched across devices to their assigned experts (all-to-all communication), processed locally, then sent back (second all-to-all). The computational cost per token is similar to a standard FFN, but model capacity is multiplied by N. GShard (Lepikhin et al.) and Switch Transformer demonstrate this approach at trillion-parameter scale."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "next-token-prediction",
      "title": "Next-Token Prediction: Causal Language Modeling Objective and Perplexity",
      "description": "Next-token prediction maximizes P(x_t | x_1,...,x_{t-1}) across all positions using cross-entropy loss; model perplexity = exp(average negative log-likelihood); the causal mask ensures no future token information leaks.",
      "category": "training",
      "citation_snippet": "Causal language modeling maximizes log P(x) = Σₜ log P(x_t | x_{<t}); perplexity = exp(−(1/N)Σ log P(x_t|context)); GPT-2 117M achieved perplexity 35.1 on Penn Treebank without fine-tuning (Radford et al., 2019).",
      "sources": [
        {
          "url": "https://openai.com/research/language-models-are-unsupervised-multitask-learners",
          "label": "Radford et al. (2019) — Language Models are Unsupervised Multitask Learners (GPT-2). OpenAI Technical Report"
        },
        {
          "url": "https://arxiv.org/abs/2005.14165",
          "label": "Brown et al. (2020) — Language Models are Few-Shot Learners (GPT-3). NeurIPS 2020"
        },
        {
          "url": "https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf",
          "label": "Bengio et al. (2003) — A Neural Probabilistic Language Model. JMLR 2003"
        }
      ],
      "data_points": [
        {
          "label": "Training objective",
          "value": "max Σₜ log P(x_t | x_1,...,x_{t-1})",
          "unit": "",
          "note": "Equivalently, minimize cross-entropy H(y, ŷ) = −Σ y_i log ŷ_i"
        },
        {
          "label": "Perplexity formula",
          "value": "PPL = exp(−(1/N) Σₜ log P(x_t | x_{<t}))",
          "unit": "",
          "note": "Geometric mean of inverse probabilities; lower is better"
        },
        {
          "label": "GPT-2 117M perplexity (Penn Treebank)",
          "value": "35.1",
          "unit": "PPL",
          "note": "Radford et al. (2019); zero-shot, no fine-tuning; SOTA at that time was ~34 with fine-tuning"
        },
        {
          "label": "Context for causal mask",
          "value": "Left-only",
          "unit": "",
          "note": "Attention mask sets upper triangle to −∞ before softmax; tokens cannot attend to future positions"
        },
        {
          "label": "Tokens per batch (GPT-3)",
          "value": "3.2 million",
          "unit": "tokens/batch",
          "note": "Large batches reduce gradient variance; 3.2M tokens across sequences of 2,048 tokens"
        }
      ],
      "faq_items": [
        {
          "question": "Why is next-token prediction an effective pre-training objective?",
          "answer": "Next-token prediction is a general-purpose objective — to predict what comes next in text, a model must implicitly learn syntax, semantics, factual relationships, reasoning patterns, and conversational structure. The training signal is derived entirely from the text itself (no human labels needed), enabling training on internet-scale data. Radford et al. (2019) demonstrated that GPT-2 acquires diverse capabilities (summarization, translation, QA) purely from next-token prediction."
        },
        {
          "question": "What is the relationship between cross-entropy loss and perplexity?",
          "answer": "The average negative log-likelihood (NLL) per token is the cross-entropy loss: CE = −(1/N) Σ log P(x_t | context). Perplexity is the exponential of this: PPL = exp(CE). A model with perplexity 35 assigns the correct next token an average probability of approximately 1/35 ≈ 2.9%. Lower perplexity indicates the model is better calibrated to the data distribution. Comparing perplexities across models requires identical tokenization."
        },
        {
          "question": "Why can't the model attend to future tokens during training?",
          "answer": "During pre-training with causal language modeling, the model must predict x_t using only x_1,...,x_{t-1}. If the model could attend to x_{t+1} when predicting x_t, the task becomes trivially easy — the answer is always in the context. The causal attention mask sets all attention weights from position t to positions > t to −∞ before softmax, effectively zeroing them out and enforcing this constraint. This mask also makes the architecture directly usable for autoregressive generation at inference time."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "positional-encoding",
      "title": "Sinusoidal Positional Encoding: Wavelengths, Extrapolation, and Learned vs Fixed Comparison",
      "description": "Sinusoidal positional encodings use wavelengths from 2π to 10000·2π across d_model=512 dimensions, allowing transformers to represent sequence position without recurrence and to extrapolate to sequence lengths unseen during training.",
      "category": "architecture",
      "citation_snippet": "Sinusoidal positional encodings define PE(pos,2i)=sin(pos/10000^{2i/d_model}), with wavelengths from 2π to 10000·2π; Vaswani et al. (2017) found learned and fixed encodings achieve equivalent BLEU on WMT EN-DE.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        },
        {
          "url": "https://arxiv.org/abs/2108.12409",
          "label": "Press et al. (2022) — Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation. ICLR 2022"
        },
        {
          "url": "https://arxiv.org/abs/2104.09864",
          "label": "Su et al. (2024) — RoFormer: Enhanced Transformer with Rotary Position Embedding. Neurocomputing 2024"
        }
      ],
      "data_points": [
        {
          "label": "Minimum wavelength (dimension 0)",
          "value": "2π ≈ 6.28",
          "unit": "tokens",
          "note": "PE(pos, 0) = sin(pos); completes one full cycle every 2π positions"
        },
        {
          "label": "Maximum wavelength (dimension d_model-2)",
          "value": "10000 · 2π ≈ 62,832",
          "unit": "tokens",
          "note": "Lowest-frequency dimension; nearly static over typical sequence lengths"
        },
        {
          "label": "Wavelength at dimension 2i",
          "value": "2π · 10000^{2i/d_model}",
          "note": "Geometric progression; each pair of dimensions is 10000^{2/d_model} ≈ 1.044× longer"
        },
        {
          "label": "Number of encoding dimensions",
          "value": "512",
          "unit": "dimensions",
          "note": "256 sin + 256 cos pairs; same as d_model so encodings add directly to embeddings"
        },
        {
          "label": "Learned vs sinusoidal BLEU (WMT EN-DE)",
          "value": "25.8 vs 25.8",
          "unit": "BLEU",
          "note": "No significant difference; Vaswani et al. Table 3 row E"
        },
        {
          "label": "Learned encoding max position",
          "value": "≤ training length",
          "note": "Cannot extrapolate beyond seen positions; sinusoidal encodings extrapolate by construction"
        },
        {
          "label": "ALiBi (linear bias) extrapolation gain",
          "value": "+2.0",
          "unit": "perplexity reduction",
          "note": "Press et al. (2022) show ALiBi enables longer context at inference with no perplexity cost"
        }
      ],
      "faq_items": [
        {
          "question": "Why do sinusoidal encodings allow extrapolation to longer sequences?",
          "answer": "The sinusoidal functions are defined for all real-valued positions. A model trained on sequences of length 512 can compute PE(600, i) using the same formula, producing a valid encoding. Learned positional embeddings are a lookup table indexed by position index — they have no entries for positions beyond the training length."
        },
        {
          "question": "Why use both sine and cosine functions?",
          "answer": "Using sin for even dimensions and cos for odd dimensions means that PE(pos+k) can always be expressed as a linear function of PE(pos) for any fixed offset k. This property lets the model learn to attend by relative offset. With only sine functions this decomposition breaks down."
        },
        {
          "question": "How is the base 10000 chosen in the positional encoding formula?",
          "answer": "The base 10000 was chosen empirically by Vaswani et al. to produce wavelengths that span from 2π (fast-varying, captures local position) to ~62,832 tokens (slow-varying, distinguishes very different positions). The geometric progression ensures roughly equal information across all frequency scales."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "pre-training",
      "title": "Pre-Training: Self-Supervised Learning on Large Text Corpora",
      "description": "Pre-training learns general language representations through self-supervised objectives (next-token prediction or masked language modeling) on hundreds of billions of tokens before any task-specific fine-tuning.",
      "category": "training",
      "citation_snippet": "Pre-training on large corpora with self-supervised objectives (causal LM or MLM) produces general representations; GPT-3 was pre-trained on 300B tokens at 175B parameters using ~3.14×10²³ FLOPs (Brown et al., 2020).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2005.14165",
          "label": "Brown et al. (2020) — Language Models are Few-Shot Learners (GPT-3). NeurIPS 2020"
        },
        {
          "url": "https://arxiv.org/abs/1810.04805",
          "label": "Devlin et al. (2019) — BERT: Pre-training of Deep Bidirectional Transformers. NAACL 2019"
        },
        {
          "url": "https://openai.com/research/language-unsupervised",
          "label": "Radford et al. (2018) — Improving Language Understanding by Generative Pre-Training (GPT). OpenAI Blog"
        }
      ],
      "data_points": [
        {
          "label": "GPT-3 pre-training tokens",
          "value": "300",
          "unit": "billion tokens",
          "note": "Brown et al. (2020); trained on mix of Common Crawl, WebText2, Books, Wikipedia"
        },
        {
          "label": "GPT-3 parameters",
          "value": "175",
          "unit": "billion",
          "note": "96 layers, d_model=12288, 96 attention heads; dense decoder-only transformer"
        },
        {
          "label": "GPT-3 pre-training FLOPs",
          "value": "3.14 × 10²³",
          "unit": "FLOPs",
          "note": "Estimated as 6·N·D where N=175B, D=300B (Kaplan scaling law formula)"
        },
        {
          "label": "BERT pre-training tokens",
          "value": "~13.7",
          "unit": "billion tokens",
          "note": "3.3B Wikipedia + 800M BooksCorpus × 40 epochs; much smaller than GPT-3"
        },
        {
          "label": "Typical pre-training data composition",
          "value": "60% web, 22% books, 16% Wikipedia",
          "unit": "",
          "note": "Approximate breakdown for large language model pre-training datasets"
        }
      ],
      "faq_items": [
        {
          "question": "Why is pre-training effective for downstream tasks?",
          "answer": "Pre-training on diverse text exposes the model to a vast range of linguistic patterns, factual associations, and reasoning chains. The model develops representations that capture syntax, semantics, and world knowledge. When fine-tuned on a downstream task, only a small labeled dataset is needed to specialize these general representations, rather than learning from scratch. This is the 'pre-train then fine-tune' paradigm that has defined NLP since 2018."
        },
        {
          "question": "What data is used for pre-training?",
          "answer": "Large language models are pre-trained on filtered web text (Common Crawl derivatives), books corpora (Books1, Books2), Wikipedia, code repositories (GitHub), scientific papers, and other high-quality text sources. Brown et al. (2020) found that data quality matters significantly — the mix of data sources and filtering applied to Common Crawl substantially affects downstream performance. Typical pre-training corpora for large models contain 1–15 trillion tokens."
        },
        {
          "question": "How many training steps does pre-training require?",
          "answer": "Pre-training step count = D / (B × L) where D = total tokens, B = batch size (tokens), L = sequence length. GPT-3 with 300B tokens, batch size 3.2M tokens: 300B/3.2M ≈ 93,750 steps. For comparison, BERT used ~1M steps at much smaller batch sizes. Modern large-scale pre-training typically runs 250K–1M optimizer steps, with each step processing millions of tokens."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "perplexity-metric",
      "title": "Perplexity: Information-Theoretic Measure of Language Model Prediction Quality",
      "description": "Perplexity = exp(−(1/N)Σ log P(x_i|context)); lower is better; GPT-2 117M achieves PPL 35.1 on Penn Treebank zero-shot; GPT-3 175B achieves 20.5 on Penn Treebank zero-shot.",
      "category": "evaluation",
      "citation_snippet": "PPL(W) = exp(−(1/N) Σ log P(w_i|w_{<i})) = exp(cross-entropy per token); GPT-2 117M zero-shot: 35.1 PPL on Penn Treebank (Radford et al., 2019); GPT-3 175B zero-shot: 20.5 PPL (Brown et al., 2020); 4-gram KN baseline: 141.2 PPL; human-level estimated ~10–20 PPL.",
      "sources": [
        {
          "url": "https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf",
          "label": "Radford et al. (2019) — Language Models are Unsupervised Multitask Learners. OpenAI Blog"
        },
        {
          "url": "https://arxiv.org/abs/2005.14165",
          "label": "Brown et al. (2020) — Language Models are Few-Shot Learners. NeurIPS 2020"
        },
        {
          "url": "https://arxiv.org/abs/1708.02182",
          "label": "Merity et al. (2017) — Regularizing and Optimizing LSTM Language Models. ICLR 2018"
        }
      ],
      "data_points": [
        {
          "label": "Perplexity formula",
          "value": "PPL(W) = exp(−(1/N) Σ log P(w_i|w_{<i}))",
          "unit": "",
          "note": "Equivalent to exp(CE) where CE is cross-entropy per token in nats; lower PPL = better model"
        },
        {
          "label": "GPT-2 117M on Penn Treebank (zero-shot)",
          "value": "35.1",
          "unit": "perplexity",
          "note": "Radford et al. (2019): zero-shot on PTB; prior supervised LSTM best was ~78 PPL"
        },
        {
          "label": "GPT-3 175B on Penn Treebank (zero-shot)",
          "value": "20.5",
          "unit": "perplexity",
          "note": "Brown et al. (2020): zero-shot; 4-gram KN LM baseline is 141.2 PPL on PTB"
        },
        {
          "label": "Perplexity branching factor intuition",
          "value": "PPL ≈ effective vocabulary size at each step",
          "unit": "",
          "note": "PPL=35 means model is as uncertain as choosing uniformly among 35 equiprobable tokens"
        },
        {
          "label": "Human perplexity estimate (English)",
          "value": "~10–20",
          "unit": "perplexity",
          "note": "Domain-dependent: formal news text ~10 PPL; diverse web text ~25 PPL for strong models"
        }
      ],
      "faq_items": [
        {
          "question": "What does a perplexity of 35 mean intuitively?",
          "answer": "Perplexity is the geometric mean of the reciprocal of predicted probabilities per token. PPL=35 means the model is, on average, as uncertain as choosing uniformly among 35 equally probable options. It assigns an average probability of 1/35 ≈ 2.9% to the correct next token. Perfect prediction (correct token always at probability 1.0) gives PPL=1. A 4-gram language model achieves ~141 PPL on Penn Treebank; GPT-3 achieves ~20.5 PPL — GPT-3 is roughly 7× more certain per token than a 4-gram model."
        },
        {
          "question": "Why can you not compare perplexity numbers across different test sets?",
          "answer": "Perplexity depends on the entropy of the test data itself. A PPL of 35 on Penn Treebank (formal newspaper text, low entropy) is not comparable to PPL=35 on WikiText-103 (diverse Wikipedia text, higher entropy). A model might score 20 PPL on clean news and 80 PPL on code-mixed social media while being strictly better on both domains. Meaningful comparison requires both models to use identical tokenizers (vocabulary differences change PPL directly) and the exact same test split."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "prompt-engineering",
      "title": "Prompt Engineering: Systematic Input Design for Language Model Accuracy and Reliability",
      "description": "Structured prompts — role instructions, zero-shot CoT, self-consistency, few-shot exemplars — improve reasoning accuracy 10–40%; APE (Zhou et al., 2023) automates instruction optimization.",
      "category": "agents-applications",
      "citation_snippet": "Zero-shot CoT (Kojima et al., 2022): MultiArith 17.7% → 78.7%; self-consistency 40 samples (Wang et al., 2022): GSM8K +17%; APE (Zhou et al., ICLR 2023): LLM-generated instructions outperform human-written prompts on 19 of 24 InstructGPT benchmarks.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2201.11903",
          "label": "Wei et al. (2022) — Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022"
        },
        {
          "url": "https://arxiv.org/abs/2205.11916",
          "label": "Kojima et al. (2022) — Large Language Models are Zero-Shot Reasoners. NeurIPS 2022"
        },
        {
          "url": "https://arxiv.org/abs/2203.11171",
          "label": "Wang et al. (2022) — Self-Consistency Improves Chain of Thought Reasoning. ICLR 2023"
        },
        {
          "url": "https://arxiv.org/abs/2211.01910",
          "label": "Zhou et al. (2022) — Large Language Models Are Human-Level Prompt Engineers. ICLR 2023"
        }
      ],
      "data_points": [
        {
          "label": "Zero-shot CoT: MultiArith accuracy",
          "value": "17.7% → 78.7%",
          "unit": "% accuracy",
          "note": "Kojima et al. (2022): PaLM 540B; zero-shot 'Let's think step by step' vs standard zero-shot"
        },
        {
          "label": "Self-consistency k=40: GSM8K gain",
          "value": "+17 percentage points",
          "unit": "% accuracy",
          "note": "Wang et al. (2022): single CoT 57% → self-consistency 74% on GSM8K with PaLM 540B"
        },
        {
          "label": "APE outperformance rate",
          "value": "19 of 24 tasks",
          "unit": "tasks improved",
          "note": "Zhou et al. (2022): APE-generated instructions outperform human-written prompts on 19/24 benchmarks"
        },
        {
          "label": "Example order accuracy variance",
          "value": "up to ±15%",
          "unit": "% accuracy",
          "note": "Same examples in different orderings; calibration or self-consistency reduces variance"
        }
      ],
      "faq_items": [
        {
          "question": "What is the most empirically well-supported prompt engineering technique?",
          "answer": "Chain-of-thought prompting (Wei et al., 2022) has the strongest evidence: +20–40% accuracy on multi-step arithmetic and reasoning tasks at large scale. Self-consistency (Wang et al., 2022) stacks another +10–17% via majority voting over multiple sampled CoT paths. Zero-shot CoT 'Let's think step by step' (Kojima et al., 2022) delivers large gains with no example data. These three techniques have been replicated across multiple model families and task types."
        },
        {
          "question": "What is Automatic Prompt Engineer (APE)?",
          "answer": "Zhou et al. (2022) proposed using a language model to generate and evaluate candidate instructions: (1) prompt the model with few-shot examples to generate N candidate instruction strings; (2) evaluate each instruction on a validation set; (3) return the highest-scoring instruction. APE outperformed human-written instructions on 19 of 24 InstructGPT benchmarks, suggesting LLMs can optimize prompts better than humans partly because they have seen the distribution of effective instructions during pre-training."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "quantization",
      "title": "Quantization: Reducing Numerical Precision in Neural Network Weights and Activations",
      "description": "Quantization maps FP32/FP16 weights to INT8 or INT4; reduces memory 4–8×; GPTQ and AWQ enable post-training quantization of large language models with less than 1% accuracy loss.",
      "category": "inference",
      "citation_snippet": "LLM.int8() (Dettmers et al., NeurIPS 2022): mixed-precision INT8 with FP16 for 0.1% outlier features enables inference at 8-bit with no accuracy degradation; GPTQ (Frantar et al., ICLR 2023): Hessian-compensated INT4 achieves <1% perplexity increase on 175B-scale models.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2208.07339",
          "label": "Dettmers et al. (2022) — LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale. NeurIPS 2022"
        },
        {
          "url": "https://arxiv.org/abs/2210.17323",
          "label": "Frantar et al. (2022) — GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. ICLR 2023"
        },
        {
          "url": "https://arxiv.org/abs/2306.00978",
          "label": "Lin et al. (2023) — AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. MLSys 2024"
        }
      ],
      "data_points": [
        {
          "label": "INT8 vs FP32 memory reduction",
          "value": "4×",
          "unit": "compression ratio",
          "note": "FP32 = 32 bits per weight; INT8 = 8 bits; 4× fewer bits; 7B model: 28 GB → 7 GB"
        },
        {
          "label": "INT4 vs FP32 memory reduction",
          "value": "8×",
          "unit": "compression ratio",
          "note": "INT4 = 4 bits; 8× compression; GPTQ achieves this with <1% perplexity degradation at large scale"
        },
        {
          "label": "LLM.int8() outlier feature fraction",
          "value": "~0.1%",
          "unit": "% of activation dimensions",
          "note": "Dettmers et al.: ~0.1% of features cause activation magnitudes >6σ; kept in FP16 precision"
        },
        {
          "label": "GPTQ INT4 perplexity increase",
          "value": "<1%",
          "unit": "relative perplexity increase",
          "note": "Frantar et al. (2022): 175B model GPTQ INT4 shows minimal perplexity degradation vs FP16"
        },
        {
          "label": "INT8 inference throughput gain",
          "value": "1.5–2×",
          "unit": "throughput multiplier",
          "note": "Practical speedup vs FP16 on GPUs with INT8 tensor cores (A100, H100); memory-bandwidth bottleneck reduced"
        }
      ],
      "faq_items": [
        {
          "question": "Why does quantization work without catastrophic accuracy loss?",
          "answer": "Neural networks are highly over-parameterized — weight values are clustered and individually encode little information. Rounding each weight to the nearest of 256 levels (INT8) or 16 levels (INT4) introduces quantization error, but this error is distributed across all weights and is often smaller than training noise. Large models (>7B parameters) are empirically more robust to quantization than small models: the error per parameter is diluted across more redundant representations. GPTQ further compensates by updating remaining weights when each weight is quantized."
        },
        {
          "question": "What are the two main post-training quantization approaches?",
          "answer": "Round-to-nearest (RTN) rounds each weight to the nearest quantization level — fast but loses significant accuracy at INT4 for smaller models. GPTQ (Frantar et al., 2022) uses approximate second-order Hessian information: when one weight is rounded, other weights in the same layer are updated to compensate for the output error introduced. This layer-wise compensation achieves INT4 quality close to FP16, whereas RTN degrades noticeably at 4 bits. AWQ (Lin et al., 2023) identifies ~1% of 'salient' weights and scales them before quantization to reduce their error."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "rag",
      "title": "Retrieval-Augmented Generation: Grounding Language Models in External Knowledge",
      "description": "RAG retrieves top-k documents via dense vector search and prepends them to context; Lewis et al. (NeurIPS 2020) showed +14.7% absolute accuracy on NaturalQuestions vs closed-book T5.",
      "category": "agents-applications",
      "citation_snippet": "RAG (Lewis et al., NeurIPS 2020): retriever encodes query q and passages d_i as dense vectors; top-k retrieved by maximum inner product search; RAG-Token achieved 44.5% Exact Match on NaturalQuestions vs 29.8% closed-book T5; DPR retriever top-1 accuracy 78.4% vs BM25 59.1%.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2005.11401",
          "label": "Lewis et al. (2020) — Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. NeurIPS 2020"
        },
        {
          "url": "https://arxiv.org/abs/2004.04906",
          "label": "Karpukhin et al. (2020) — Dense Passage Retrieval for Open-Domain Question Answering. EMNLP 2020"
        },
        {
          "url": "https://arxiv.org/abs/2007.01282",
          "label": "Izacard & Grave (2021) — Leveraging Passage Retrieval with Generative Models for Open Domain QA. EACL 2021"
        }
      ],
      "data_points": [
        {
          "label": "RAG-Token NaturalQuestions accuracy",
          "value": "44.5%",
          "unit": "Exact Match",
          "note": "Lewis et al. (2020): vs 29.8% closed-book T5 baseline; +14.7% absolute improvement"
        },
        {
          "label": "DPR top-1 retrieval accuracy",
          "value": "78.4%",
          "unit": "% top-1",
          "note": "Karpukhin et al. (2020): Dense Passage Retrieval on NQ test set; vs BM25 59.1%"
        },
        {
          "label": "Fusion-in-Decoder top-k",
          "value": "k = 100 documents",
          "unit": "passages",
          "note": "Izacard & Grave (2021): FiD model scales to k=100 retrieved passages; 67.6% EM on NaturalQuestions"
        },
        {
          "label": "Context window overhead per passage",
          "value": "~130 tokens per passage",
          "unit": "tokens",
          "note": "100-word passages ≈ 130 tokens; k=5 adds ~650 tokens to context window budget"
        }
      ],
      "faq_items": [
        {
          "question": "What is the difference between RAG-Sequence and RAG-Token?",
          "answer": "In RAG-Sequence, a single document d_i is retrieved and used for the entire generated output: P(y|x) ≈ Σ_i p(d_i|x) · P(y|x,d_i). One document is sampled and conditions all generation steps. In RAG-Token, a different document can be marginalized at each generation step: P(y_t|x,y_{<t}) ≈ Σ_i p(d_i|x,y_{<t}) · p(y_t|x,y_{<t},d_i). RAG-Token is more flexible but more expensive. Lewis et al. (2020) found RAG-Token outperforms RAG-Sequence on open-domain QA tasks."
        },
        {
          "question": "Why does RAG help with hallucination?",
          "answer": "Closed-book language models must encode all factual knowledge in weights during training; facts absent from or underrepresented in training data cannot be accurately recalled. RAG grounds generation in retrieved documents explicitly provided in context — the model can copy or paraphrase factual information from the retrieved text rather than relying solely on weight-encoded knowledge. However, RAG can still hallucinate if retrieved documents are incorrect or if the model fails to faithfully use retrieved content over its own parametric memory."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "reinforcement-learning-basics",
      "title": "Reinforcement Learning Basics: MDPs, Policy Gradients, and PPO",
      "description": "Reinforcement learning trains agents to maximize cumulative reward in a Markov Decision Process. Policy gradient methods, including PPO, are the foundation of RLHF for language model alignment.",
      "category": "alignment",
      "citation_snippet": "PPO (Schulman et al., 2017): clipped surrogate objective prevents destructive policy updates; achieves better sample efficiency than TRPO with simpler implementation; PPO is the standard RL optimizer in RLHF pipelines.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1707.06347",
          "label": "Schulman et al. (2017) — Proximal Policy Optimization Algorithms. arXiv"
        },
        {
          "url": "http://incompleteideas.net/book/the-book-2nd.html",
          "label": "Sutton & Barto (2018) — Reinforcement Learning: An Introduction. MIT Press"
        },
        {
          "url": "https://link.springer.com/article/10.1007/BF00992696",
          "label": "Williams (1992) — Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning. Machine Learning"
        }
      ],
      "data_points": [
        {
          "label": "PPO clip parameter ε",
          "value": "0.2",
          "unit": "",
          "note": "Schulman et al. default; clips policy ratio r(θ) to [1-ε, 1+ε] to prevent large updates"
        },
        {
          "label": "REINFORCE variance reduction",
          "value": "Baseline subtraction",
          "unit": "",
          "note": "Subtracting a state-dependent baseline (value function) reduces gradient variance without bias"
        },
        {
          "label": "Discount factor γ in language RL",
          "value": "1.0",
          "unit": "",
          "note": "RLHF pipelines typically use γ=1 since reward is only given at end of sequence (episodic)"
        },
        {
          "label": "PPO rollout buffer size",
          "value": "2048–8192",
          "unit": "tokens/steps",
          "note": "Typical RLHF implementations collect this many response tokens before each gradient update"
        },
        {
          "label": "KL penalty coefficient β",
          "value": "0.01–0.1",
          "unit": "",
          "note": "β scales the KL divergence from the reference policy in the RLHF reward: R = r_θ − β·KL"
        }
      ],
      "faq_items": [
        {
          "question": "What is the Markov Decision Process formalism?",
          "answer": "An MDP is a tuple (S, A, P, R, γ) where S is a state space, A an action space, P(s'|s,a) a transition function, R(s,a) a reward function, and γ ∈ [0,1] a discount factor. An agent observes state s, takes action a, receives reward r, transitions to s', and repeats. The goal is to find a policy π(a|s) that maximizes expected discounted return E[Σ γ^t R_t]. In language model RL: states are token sequences so far, actions are next tokens, reward is the preference score from the reward model."
        },
        {
          "question": "Why does vanilla policy gradient have high variance and how does PPO fix it?",
          "answer": "The REINFORCE gradient estimator ∇J(θ) = E[G_t · ∇log π(a|s)] is unbiased but has high variance because G_t (return) can be large and noisy. PPO addresses this with: (1) advantage estimation using a learned value function V(s) as a baseline (A = G_t - V(s_t)); (2) clipped surrogate objective that bounds the policy update ratio r(θ) = π_θ(a|s)/π_old(a|s) to [1-ε, 1+ε]; (3) multiple gradient steps per rollout batch with early stopping. The clipping prevents catastrophically large policy updates that destabilize training."
        },
        {
          "question": "What is the credit assignment problem in RL for language models?",
          "answer": "In RLHF, a reward signal (human preference score) is given for an entire generated response (often 50–500 tokens). The credit assignment problem asks: which tokens in the response caused the high/low reward? With γ=1 and terminal reward, all tokens in the sequence receive the same return, making it difficult to identify which specific word choices were good or bad. This is why RLHF training is less sample-efficient than supervised learning — the reward signal is sparse and temporally delayed."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "rlhf",
      "title": "RLHF: Reinforcement Learning from Human Feedback — Reward Model and PPO",
      "description": "RLHF trains a reward model on human preference comparisons, then uses PPO to optimize a language model policy to maximize reward while a KL penalty prevents the policy from diverging too far from the supervised baseline.",
      "category": "alignment",
      "citation_snippet": "RLHF trains a reward model on human pairwise preferences, then optimizes via PPO with KL penalty: R = r_θ(x,y) − β·KL(π_RL || π_SFT); introduced for language models by Stiennon et al. (NeurIPS 2020), extended by InstructGPT (Ouyang et al., 2022).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2009.01325",
          "label": "Stiennon et al. (2020) — Learning to Summarize with Human Feedback. NeurIPS 2020"
        },
        {
          "url": "https://arxiv.org/abs/2203.02155",
          "label": "Ouyang et al. (2022) — Training Language Models to Follow Instructions with Human Feedback (InstructGPT). NeurIPS 2022"
        },
        {
          "url": "https://arxiv.org/abs/1707.06347",
          "label": "Schulman et al. (2017) — Proximal Policy Optimization Algorithms. arXiv"
        }
      ],
      "data_points": [
        {
          "label": "RLHF reward function",
          "value": "R(x,y) = r_θ(x,y) − β·KL(π_RL(y|x) || π_SFT(y|x))",
          "unit": "",
          "note": "r_θ = learned reward model; β = KL penalty coefficient; KL term penalizes divergence from SFT baseline"
        },
        {
          "label": "KL penalty coefficient (β)",
          "value": "0.01–0.1",
          "unit": "",
          "note": "Typical range; higher β = more conservative; lower β = more reward optimization"
        },
        {
          "label": "InstructGPT human evaluation",
          "value": "85%",
          "unit": "% preference vs baseline",
          "note": "Ouyang et al. (2022): labelers preferred InstructGPT-1.3B over GPT-3 175B outputs 85% of the time"
        },
        {
          "label": "Reward model training size",
          "value": "~33,000",
          "unit": "comparison pairs",
          "note": "InstructGPT: 33K human pairwise comparisons used to train reward model"
        },
        {
          "label": "SFT warmup dataset",
          "value": "~13,000",
          "unit": "labeled prompts",
          "note": "InstructGPT: supervised fine-tuning on 13K high-quality human-written demonstrations first"
        }
      ],
      "faq_items": [
        {
          "question": "What are the three phases of RLHF training?",
          "answer": "Phase 1 (SFT): fine-tune the pre-trained language model on a dataset of human-written demonstrations (prompt, response pairs) using standard supervised learning. Phase 2 (Reward model): collect human pairwise comparisons (which response is better?) and train a classifier to predict human preferences. Phase 3 (RL fine-tuning): use PPO to optimize the SFT model to maximize the reward model's score, with a KL divergence penalty to prevent the policy from collapsing to reward-hacking behaviors."
        },
        {
          "question": "Why is a KL penalty needed in RLHF?",
          "answer": "Without the KL penalty, the RL policy can 'reward hack' — finding inputs that fool the reward model into giving high scores without actually being helpful or truthful. The reward model is an imperfect proxy for human preferences and has exploitable weaknesses. The penalty R = r_θ(x,y) − β·KL(π_RL || π_SFT) keeps the optimized policy close to the supervised baseline, limiting how aggressively it can exploit reward model flaws. This is a direct application of Goodhart's Law: when the measure becomes a target, it ceases to be a good measure."
        },
        {
          "question": "What did Stiennon et al. (2020) demonstrate about RLHF for summarization?",
          "answer": "Stiennon et al. trained a reward model on ~64,000 human preference comparisons between TL;DR summaries, then optimized a GPT-3-based summarizer using PPO. The RLHF-optimized model was preferred by human evaluators 65–75% of the time over supervised fine-tuning baselines. This paper established that RLHF could significantly improve human-perceived quality beyond what standard supervised learning achieves."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "residual-connections",
      "title": "Residual Connections: Skip Connections, Gradient Flow, and Deep Network Training",
      "description": "Residual connections add the input directly to the sublayer output — output = x + Sublayer(x) — enabling gradient flow through 100+ layer networks and preventing vanishing gradients in deep transformers.",
      "category": "architecture",
      "citation_snippet": "Residual connections compute output = x + Sublayer(x), providing a gradient highway that bypasses each sublayer; He et al. (2016) showed they enable training of 1,000-layer networks with no vanishing gradient.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1512.03385",
          "label": "He et al. (2016) — Deep Residual Learning for Image Recognition. CVPR 2016"
        },
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        },
        {
          "url": "https://arxiv.org/abs/1605.06431",
          "label": "Veit et al. (2016) — Residual Networks Behave Like Ensembles of Relatively Shallow Networks. NeurIPS 2016"
        }
      ],
      "data_points": [
        {
          "label": "Residual formula",
          "value": "y = x + F(x)",
          "unit": "",
          "note": "F(x) = sublayer function (attention or FFN); x = identity shortcut"
        },
        {
          "label": "Gradient flow advantage",
          "value": "∂L/∂x = 1 + ∂L/∂F",
          "unit": "",
          "note": "The gradient always has a direct path of magnitude ≥ 1 to earlier layers"
        },
        {
          "label": "Original ResNet depth",
          "value": "1,000",
          "unit": "layers",
          "note": "He et al. (2016) successfully trained 1,000-layer residual nets; impossible without skip connections"
        },
        {
          "label": "Transformer depth with residuals",
          "value": "6 + 6",
          "unit": "encoder + decoder layers",
          "note": "Each sublayer wrapped in residual; enables stable training of 100+ layer variants"
        },
        {
          "label": "Dropout applied",
          "value": "0.1",
          "unit": "rate",
          "note": "Dropout applied to sublayer output before addition: y = x + Dropout(Sublayer(x))"
        }
      ],
      "faq_items": [
        {
          "question": "Why do residual connections prevent vanishing gradients?",
          "answer": "In a network without residuals, gradients must pass through every layer's Jacobian. If any layer's Jacobian has eigenvalues < 1, the gradient decays exponentially with depth. With residuals, the gradient path is ∂L/∂x = ∂L/∂(x+F) · (1 + ∂F/∂x). The '1' term ensures there is always a direct gradient signal of at least magnitude 1, regardless of what ∂F/∂x does."
        },
        {
          "question": "How do residual connections affect model capacity?",
          "answer": "Residual connections do not reduce model capacity — F(x) still has all the same parameters. However, they change what the network learns: instead of learning the desired mapping H(x) directly, F learns the residual H(x) − x. If the optimal mapping is close to the identity, F only needs to produce a small correction, which is easier to learn than the full transformation."
        },
        {
          "question": "What is the ensemble interpretation of residual networks?",
          "answer": "Veit et al. (2016) showed that a residual network with n blocks can be understood as an ensemble of 2ⁿ paths of varying lengths. Most gradient during training flows through short paths (2–3 layers), while long paths contribute exponentially less. This explains why removing or damaging a single layer in a residual network causes only a small accuracy drop — the other paths compensate."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "softmax-function",
      "title": "Softmax Function: Formula, Temperature Scaling, and Numerical Stability",
      "description": "Softmax converts a vector of logits to a probability distribution: σ(z_i) = e^{z_i} / Σ_j e^{z_j}; temperature T controls distribution sharpness; numerically stabilized by subtracting max(z) before exponentiation.",
      "category": "architecture",
      "citation_snippet": "Softmax σ(z_i) = e^{z_i}/Σe^{z_j} converts attention logits to probability distributions; temperature T<1 sharpens toward argmax (greedy), T→∞ flattens to uniform; numerically stabilized by subtracting max(z).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        },
        {
          "url": "https://www.deeplearningbook.org/",
          "label": "Goodfellow et al. (2016) — Deep Learning. MIT Press (Chapter 6)"
        },
        {
          "url": "https://doi.org/10.1207/s15516709cog0901_7",
          "label": "Ackley et al. (1985) — A Learning Algorithm for Boltzmann Machines. Cognitive Science"
        }
      ],
      "data_points": [
        {
          "label": "Softmax formula",
          "value": "σ(z_i) = e^{z_i} / Σ_j e^{z_j}",
          "unit": "",
          "note": "Input z ∈ ℝ^K; output is probability vector summing to 1"
        },
        {
          "label": "Temperature-scaled softmax",
          "value": "σ(z_i/T)",
          "unit": "",
          "note": "T=1 standard; T→0 argmax (greedy); T→∞ uniform distribution"
        },
        {
          "label": "Numerically stable computation",
          "value": "σ(z_i − max(z))",
          "unit": "",
          "note": "Subtracting max(z) prevents overflow; does not change the output value"
        },
        {
          "label": "Gradient (diagonal Jacobian)",
          "value": "∂σ_i/∂z_i = σ_i(1 − σ_i)",
          "unit": "",
          "note": "Derivative w.r.t. own input; off-diagonal: ∂σ_i/∂z_j = −σ_i·σ_j for i≠j"
        },
        {
          "label": "Attention softmax input scale (d_k=64)",
          "value": "÷8 (÷√d_k)",
          "unit": "",
          "note": "Dividing by √d_k=8 prevents large logits that saturate softmax gradients"
        }
      ],
      "faq_items": [
        {
          "question": "Why is the softmax numerically unstable without the max subtraction?",
          "answer": "For large z_i, e^{z_i} can exceed float32's maximum (≈3.4×10³⁸ at z≈88). If any component overflows to infinity, the division produces NaN. Subtracting max(z) from all components before exponentiation keeps the largest exponent at e^0=1, guaranteeing no overflow while preserving the output distribution identically."
        },
        {
          "question": "How does temperature affect softmax in language models?",
          "answer": "Temperature T scales the logits before softmax: σ(z_i/T). At T=1 (default), the model's trained distribution is used. T<1 sharpens the distribution — at T=0, it becomes argmax (always picks the highest-probability token). T>1 flattens it toward uniform, increasing randomness and diversity. Most practical inference uses T between 0.7 and 1.2."
        },
        {
          "question": "Why does attention use softmax specifically?",
          "answer": "Attention requires a probability distribution over key positions — weights that are non-negative and sum to 1. Softmax is the standard way to achieve this from arbitrary real-valued scores. The exponential function ensures all weights are strictly positive, and the normalization ensures they sum to exactly 1.0. Alternative normalizations (sigmoid, sparsemax) have been explored but softmax remains dominant."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "temperature-sampling",
      "title": "Temperature Sampling: Controlling Randomness in Autoregressive Language Model Generation",
      "description": "Temperature T divides logits before softmax; T<1 concentrates probability on high-likelihood tokens; T>1 increases entropy; T=0 is greedy argmax decoding.",
      "category": "inference",
      "citation_snippet": "Temperature T scales logits before softmax: p_i = exp(z_i/T) / Σ exp(z_j/T); T→0 approaches greedy decoding; T=1 is standard softmax; T>1 increases entropy. Holtzman et al. (ICLR 2020) showed T-based sampling produces incoherent text at high values without truncation.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1904.09751",
          "label": "Holtzman et al. (2020) — The Curious Case of Neural Text Degeneration. ICLR 2020"
        },
        {
          "url": "https://doi.org/10.1207/s15516709cog0901_7",
          "label": "Ackley et al. (1985) — A Learning Algorithm for Boltzmann Machines. Cognitive Science 9(1)"
        }
      ],
      "data_points": [
        {
          "label": "Temperature formula",
          "value": "p_i = exp(z_i/T) / Σ exp(z_j/T)",
          "unit": "",
          "note": "Standard softmax when T=1; reduces to argmax as T→0; equivalent to scaling logits by 1/T"
        },
        {
          "label": "Greedy decoding threshold",
          "value": "T → 0 (argmax)",
          "unit": "",
          "note": "Deterministic; always selects highest-probability token; produces repetition loops in long generation"
        },
        {
          "label": "Typical creative generation range",
          "value": "0.7–1.0",
          "unit": "",
          "note": "Balances diversity and coherence; values above 1.2 typically degrade semantic and grammatical coherence"
        },
        {
          "label": "Entropy at low vs high temperature",
          "value": "H(T=0.5) < H(T=1) < H(T=2)",
          "unit": "",
          "note": "Low T concentrates mass on top tokens; high T spreads mass uniformly; entropy scales monotonically with T"
        }
      ],
      "faq_items": [
        {
          "question": "What happens to the probability distribution when temperature approaches zero?",
          "answer": "As T→0, exp(z_i/T) diverges for the highest logit while all other terms approach zero. The result approaches a one-hot distribution placing all probability mass on the argmax token, equivalent to greedy decoding. This is deterministic but produces repetitive text because high-probability sequences often loop: once a frequent phrase begins, the model assigns high probability to its continuation indefinitely."
        },
        {
          "question": "Why does high temperature produce incoherent text?",
          "answer": "At T≫1, all logits are divided by a large number, compressing them toward zero before softmax. This makes the output distribution near-uniform — even highly unlikely tokens gain substantial probability. The model effectively ignores its learned predictions, treating all tokens as nearly equally likely, which destroys semantic and grammatical coherence. Holtzman et al. (2020) showed this degeneration requires truncation methods (top-k or top-p) to constrain the vocabulary."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "tokenization",
      "title": "Tokenization: Subword Units, Vocabulary Size, and Characters Per Token",
      "description": "Tokenization splits text into subword units before feeding to a language model; BPE vocabularies of 32K–100K tokens average ~4 characters per English token, balancing sequence length against vocabulary coverage.",
      "category": "representation",
      "citation_snippet": "Subword tokenization with BPE produces vocabularies of 32K–100K units; GPT-2's 50,257-token vocabulary averages ~4 characters per English token; a 1,000-word paragraph encodes to approximately 1,300 tokens.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1508.07909",
          "label": "Sennrich et al. (2016) — Neural Machine Translation of Rare Words with Subword Units. ACL 2016"
        },
        {
          "url": "https://openai.com/research/language-models-are-unsupervised-multitask-learners",
          "label": "Radford et al. (2019) — Language Models are Unsupervised Multitask Learners (GPT-2). OpenAI Technical Report"
        },
        {
          "url": "https://arxiv.org/abs/1808.06226",
          "label": "Kudo & Richardson (2018) — SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing. EMNLP 2018"
        }
      ],
      "data_points": [
        {
          "label": "GPT-2 vocabulary size",
          "value": "50,257",
          "unit": "tokens",
          "note": "Byte-level BPE; exact size chosen so vocabulary is divisible efficiently by GPUs"
        },
        {
          "label": "Average characters per English token",
          "value": "~4",
          "unit": "chars/token",
          "note": "Common words are single tokens; rare words split into multiple subword pieces"
        },
        {
          "label": "Typical sequence expansion",
          "value": "1.3×",
          "unit": "relative",
          "note": "1,000 English words ≈ 1,300 tokens; depends on vocabulary and text domain"
        },
        {
          "label": "BPE vocabulary range",
          "value": "32K–100K",
          "unit": "tokens",
          "note": "Smaller vocab → longer sequences; larger vocab → more parameters in embedding table"
        },
        {
          "label": "Embedding table size (50K vocab, 512 dim)",
          "value": "50,257 × 512 = 25.7M",
          "unit": "parameters",
          "note": "Embedding matrix often largest single parameter block in small models"
        }
      ],
      "faq_items": [
        {
          "question": "Why use subword tokenization instead of word-level or character-level?",
          "answer": "Word-level tokenization creates enormous vocabularies (500K+ words with proper nouns, inflections, misspellings) and treats unseen words as [UNK] tokens. Character-level tokenization produces very long sequences and struggles to capture semantic units. Subword tokenization (BPE, WordPiece, SentencePiece) finds a middle ground: frequent words become single tokens, rare words split into meaningful subpiece components. Sennrich et al. (2016) showed BPE-based systems achieve near-zero out-of-vocabulary rates while keeping sequences manageable."
        },
        {
          "question": "How does vocabulary size affect model behavior?",
          "answer": "Larger vocabularies reduce sequence length (fewer tokens per character), which reduces attention computation (quadratic in length). But larger vocabularies mean larger embedding tables and output projection layers. At 50K tokens and d_model=1024, the embedding/unembedding matrices alone contain 50K × 1024 ≈ 51M parameters. There is an empirical sweet spot around 32K–100K tokens for English text; multilingual models often use 250K+ to cover diverse scripts."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "scaling-laws",
      "title": "Scaling Laws: How Language Model Performance Scales with Parameters, Data, and Compute",
      "description": "Kaplan et al. (2020) found language model loss scales as power laws in N, D, and C; Chinchilla (2022) revised the optimal compute allocation to equal spending on parameters and training tokens.",
      "category": "training",
      "citation_snippet": "Kaplan et al. (2020) found L ∝ N^{-0.076} and L ∝ D^{-0.095}; Chinchilla (2022) revised: optimal N and D both scale as C^{0.5}, so a 70B model should train on ~1.4T tokens to be compute-optimal.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2001.08361",
          "label": "Kaplan et al. (2020) — Scaling Laws for Neural Language Models. arXiv"
        },
        {
          "url": "https://arxiv.org/abs/2203.15556",
          "label": "Hoffmann et al. (2022) — Training Compute-Optimal Large Language Models (Chinchilla). NeurIPS 2022"
        },
        {
          "url": "https://arxiv.org/abs/2010.14701",
          "label": "Henighan et al. (2020) — Scaling Laws for Autoregressive Generative Modeling. arXiv"
        }
      ],
      "data_points": [
        {
          "label": "Kaplan loss vs parameters",
          "value": "L(N) = (N_c/N)^{α_N}, α_N = 0.076",
          "unit": "",
          "note": "N_c ≈ 8.8×10¹³; power law with exponent 0.076 across 6 orders of magnitude"
        },
        {
          "label": "Kaplan loss vs dataset size",
          "value": "L(D) = (D_c/D)^{α_D}, α_D = 0.095",
          "unit": "",
          "note": "D_c ≈ 5.4×10¹³ tokens; loss decreases predictably as dataset grows"
        },
        {
          "label": "Chinchilla optimal parameter scaling",
          "value": "N_opt ∝ C^{0.5}",
          "unit": "",
          "note": "Hoffmann et al. (2022): both N and D should scale equally with compute budget C"
        },
        {
          "label": "Chinchilla 70B optimal tokens",
          "value": "1.4 trillion",
          "unit": "tokens",
          "note": "Compute-optimal for 70B parameter model; N_opt ≈ 20× fewer tokens than parameters"
        },
        {
          "label": "Pre-Chinchilla models (undertrained)",
          "value": "~20× undertrained",
          "unit": "",
          "note": "GPT-3 175B on 300B tokens; Chinchilla says 175B should train on ~3.5T tokens"
        }
      ],
      "faq_items": [
        {
          "question": "What are neural scaling laws and why do they matter?",
          "answer": "Neural scaling laws are empirical relationships showing that language model loss (measured by cross-entropy on held-out text) decreases predictably as a power function of model size, dataset size, or compute. Kaplan et al. (2020) demonstrated these relationships hold across 7 orders of magnitude in compute. This predictability enables researchers to forecast model capabilities before training, allocate compute budgets optimally, and design experiments efficiently."
        },
        {
          "question": "Why did Chinchilla overturn the Kaplan scaling law conclusions?",
          "answer": "Kaplan et al. (2020) found that for fixed compute, increasing model size helped more than increasing dataset size, leading to the practice of training large models on relatively few tokens. Hoffmann et al. (2022) showed this was incorrect due to insufficient hyperparameter tuning of smaller models. With properly tuned baselines, they found parameters and tokens should scale equally with compute. Their Chinchilla-70B model (70B params, 1.4T tokens) outperformed GPT-3 (175B, 300B tokens) using 4× less compute."
        },
        {
          "question": "What is the 'emergent abilities' threshold implied by scaling laws?",
          "answer": "Scaling laws model the smooth, continuous decrease in loss as models grow. However, Wei et al. (2022) documented 'emergent abilities' — capabilities like multi-step arithmetic, analogical reasoning, and certain NLP tasks that appear suddenly at specific scale thresholds rather than improving continuously. The apparent disconnect is partly a measurement artifact: binary evaluation metrics (pass/fail) can show discontinuous jumps even when the underlying loss is improving smoothly."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "top-p-sampling",
      "title": "Top-p (Nucleus) Sampling: Adaptive Vocabulary Truncation for Language Model Decoding",
      "description": "Nucleus sampling samples from the smallest token set whose cumulative probability ≥ p; adapts vocabulary size dynamically per step; p=0.9 introduced by Holtzman et al. (ICLR 2020).",
      "category": "inference",
      "citation_snippet": "Nucleus (top-p) sampling: select smallest V' ⊆ V such that Σ_{w∈V'} p(w|context) ≥ p, renormalize, sample; Holtzman et al. (ICLR 2020) showed top-p=0.9 produces text more preferred by humans than top-k, temperature-only, or greedy decoding across all evaluated metrics.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1904.09751",
          "label": "Holtzman et al. (2020) — The Curious Case of Neural Text Degeneration. ICLR 2020"
        },
        {
          "url": "https://arxiv.org/abs/1805.04833",
          "label": "Fan et al. (2018) — Hierarchical Neural Story Generation. ACL 2018"
        }
      ],
      "data_points": [
        {
          "label": "Recommended nucleus probability",
          "value": "p = 0.9–0.95",
          "unit": "",
          "note": "p=0.9 retains top 90% of probability mass; widely adopted default for natural text generation"
        },
        {
          "label": "Dynamic vocabulary size range",
          "value": "1 to ~50,000 tokens per step",
          "unit": "tokens",
          "note": "Peaked distributions → few tokens included; flat distributions → many tokens included"
        },
        {
          "label": "Human evaluation ranking",
          "value": "top-p > top-k > temperature-only > greedy",
          "unit": "",
          "note": "Holtzman et al. (2020): nucleus sampling most preferred by human raters across all story generation tasks"
        },
        {
          "label": "Renormalization formula",
          "value": "p̃_i = p_i / Σ_{j∈nucleus} p_j",
          "unit": "",
          "note": "After truncation to nucleus, probabilities rescaled to sum to 1.0 before sampling"
        }
      ],
      "faq_items": [
        {
          "question": "Why does nucleus sampling outperform top-k sampling?",
          "answer": "Top-k always samples from exactly k tokens regardless of distribution shape. If the distribution is very peaked (one token has 99% probability), top-k wastes probability mass on 999 near-zero tokens. If the distribution is very flat, top-k may exclude many reasonable candidates. Nucleus sampling adapts: peaked distributions yield a small nucleus; flat distributions yield a larger one. This matches the model's actual uncertainty rather than imposing a fixed vocabulary size."
        },
        {
          "question": "What does setting p=1.0 do in nucleus sampling?",
          "answer": "p=1.0 includes all tokens in the vocabulary (the smallest set with cumulative probability ≥ 1.0 is the entire vocabulary). This degenerates to standard temperature sampling with no truncation, including all low-probability 'tail' tokens. Holtzman et al. (2020) identified sampling from the unreliable tail as the primary source of degenerate text — the nucleus specifically excludes this tail, which is why values p < 1.0 produce better outputs."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "tool-use-function-calling",
      "title": "Tool Use and Function Calling: Language Models Invoking External Functions",
      "description": "Tool use lets language models output structured JSON describing function calls; an executor runs the function; the result is appended to context; Toolformer self-supervised tool learning (2023).",
      "category": "agents-applications",
      "citation_snippet": "Toolformer (Schick et al., NeurIPS 2023): model self-supervises API call insertion; reduces perplexity on 5 tools vs baseline; ReAct (Yao et al., ICLR 2023): interleaved reasoning+actions raise HotpotQA EM from 29.0% to 35.1% and ALFWorld success from 25% to 71%.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2302.04761",
          "label": "Schick et al. (2023) — Toolformer: Language Models Can Teach Themselves to Use Tools. NeurIPS 2023"
        },
        {
          "url": "https://arxiv.org/abs/2210.03629",
          "label": "Yao et al. (2022) — ReAct: Synergizing Reasoning and Acting in Language Models. ICLR 2023"
        },
        {
          "url": "https://arxiv.org/abs/2112.09332",
          "label": "Nakano et al. (2021) — WebGPT: Browser-assisted Question-Answering with Human Feedback. arXiv"
        }
      ],
      "data_points": [
        {
          "label": "ReAct HotpotQA Exact Match",
          "value": "35.1%",
          "unit": "Exact Match",
          "note": "Yao et al. (2022): ReAct (reason+act) vs 29.0% standard prompting; +6.1% absolute on multi-hop QA"
        },
        {
          "label": "ReAct ALFWorld success rate",
          "value": "71%",
          "unit": "% success",
          "note": "Yao et al. (2022): ReAct 71% vs 25% standard prompting; +46 points on embodied task completion"
        },
        {
          "label": "Toolformer tools",
          "value": "5 tools",
          "unit": "tool types",
          "note": "Schick et al. (2023): calculator, calendar, Wikipedia search, machine translation, QA system"
        },
        {
          "label": "Function call JSON format",
          "value": "{\"name\": \"tool_name\", \"arguments\": {\"arg\": \"val\"}}",
          "unit": "",
          "note": "Standard structured output; parsed by executor; result appended as observation in context"
        }
      ],
      "faq_items": [
        {
          "question": "How does function calling differ from RAG?",
          "answer": "RAG retrieves documents from a static vector index and prepends them to context before generation — it is read-only retrieval over a pre-indexed corpus. Function calling executes arbitrary code or API endpoints and returns structured results: computations (calculator, code interpreter), real-time lookups (live data, current prices), stateful writes (database updates, email sending), or multi-step workflows. Function calling is more general but requires a trust boundary — the executor must validate and sandbox what the model is permitted to invoke."
        },
        {
          "question": "What is the ReAct prompting framework?",
          "answer": "ReAct (Yao et al., 2022) interleaves reasoning traces with action steps: Thought → Action → Observation → Thought → Action → .... The model generates a natural language reasoning step explaining what information it needs, then a structured tool call, then incorporates the result as an observation before reasoning again. This explicit reasoning-before-action reduces errors compared to direct tool-call generation, as the model plans which tool to use and why before committing to a call."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "transformer-architecture",
      "title": "Transformer Architecture: Encoder-Decoder Design and Dimensions",
      "description": "The original transformer has 6 encoder + 6 decoder layers, d_model=512, 8 attention heads, d_ff=2048, and ~65M parameters; introduced by Vaswani et al. at NeurIPS 2017.",
      "category": "architecture",
      "citation_snippet": "The original transformer uses 6 encoder and 6 decoder layers, d_model=512, 8 attention heads, and 64M parameters; trained on WMT 2014 English-German to achieve 28.4 BLEU (Vaswani et al., 2017).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        },
        {
          "url": "https://jalammar.github.io/illustrated-transformer/",
          "label": "Alammar, J. — The Illustrated Transformer (2018)"
        },
        {
          "url": "https://arxiv.org/abs/1810.04805",
          "label": "Devlin et al. (2019) — BERT: Pre-training of Deep Bidirectional Transformers. NAACL 2019"
        }
      ],
      "data_points": [
        {
          "label": "Encoder layers",
          "value": "6",
          "unit": "layers",
          "note": "Each layer has multi-head self-attention + feed-forward sublayers"
        },
        {
          "label": "Decoder layers",
          "value": "6",
          "unit": "layers",
          "note": "Each layer has self-attention, cross-attention, and feed-forward sublayers"
        },
        {
          "label": "d_model (embedding dimension)",
          "value": "512",
          "unit": "dimensions",
          "note": "Uniform across all sublayers for easy residual connections"
        },
        {
          "label": "Number of attention heads",
          "value": "8",
          "unit": "heads",
          "note": "Each head operates on d_k = d_v = d_model/h = 64 dimensions"
        },
        {
          "label": "d_ff (feed-forward inner dimension)",
          "value": "2048",
          "unit": "dimensions",
          "note": "4× the model dimension; ReLU activation between two linear transforms"
        },
        {
          "label": "Total parameters (base)",
          "value": "65",
          "unit": "million",
          "note": "Encoder-decoder base model; 'big' model: 213M parameters"
        },
        {
          "label": "BLEU score (WMT EN-DE)",
          "value": "28.4",
          "unit": "BLEU",
          "note": "Best result in paper; surpassed all prior ensemble models"
        },
        {
          "label": "Training hardware",
          "value": "8 × NVIDIA P100",
          "unit": "",
          "note": "Base model trained for 100,000 steps (~12 hours)"
        }
      ],
      "faq_items": [
        {
          "question": "What are the key dimensions of the original transformer model?",
          "answer": "The original transformer (Vaswani et al., 2017) uses d_model=512, 8 attention heads (each operating on d_k=d_v=64 dimensions), d_ff=2048 in the feed-forward layers, 6 encoder layers, and 6 decoder layers, totaling approximately 65 million parameters for the base model."
        },
        {
          "question": "Why is d_model divided by the number of heads in multi-head attention?",
          "answer": "Dividing d_model by the number of heads (h) ensures that each attention head operates on d_k = d_model/h dimensions, so the total computation is equivalent to a single full-dimensional attention head. This allows the model to attend to information from different representation subspaces at different positions without increasing the computational cost."
        },
        {
          "question": "What was the significance of the transformer over previous sequence models?",
          "answer": "Prior models like LSTMs and GRUs processed tokens sequentially, making parallelization during training difficult and limiting long-range dependency capture due to vanishing gradients. The transformer's self-attention mechanism attends to all positions simultaneously in O(1) sequential operations (vs O(n) for recurrent models), enabling much faster training and better long-range dependency modeling."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "training-data-curation",
      "title": "Training Data Curation: Web Filtering, Deduplication, and Quality Selection",
      "description": "Raw web text requires extensive filtering to produce high-quality training corpora; Common Crawl at 400B+ tokens undergoes URL filtering, quality scoring, deduplication, and safety filtering, each step materially improving downstream model quality.",
      "category": "training",
      "citation_snippet": "Common Crawl contains 400B+ tokens of raw web text; quality filtering (perplexity scoring, deduplication, URL filtering) retains ~5–20% as training data; Penedo et al. (2024) FineWeb showed filtered quality improves benchmark scores by 2–4 points.",
      "sources": [
        {
          "url": "https://arxiv.org/abs/2112.11446",
          "label": "Rae et al. (2021) — Scaling Language Models: Methods, Analysis and Insights from Training Gopher. arXiv"
        },
        {
          "url": "https://arxiv.org/abs/2406.17557",
          "label": "Penedo et al. (2024) — The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. arXiv"
        },
        {
          "url": "https://arxiv.org/abs/2107.06499",
          "label": "Lee et al. (2022) — Deduplicating Training Data Makes Language Models Better. ACL 2022"
        }
      ],
      "data_points": [
        {
          "label": "Common Crawl raw size",
          "value": "400+",
          "unit": "billion tokens",
          "note": "Monthly snapshots; 2021 snapshot ≈ 3.1TB compressed; quality varies widely"
        },
        {
          "label": "Retention rate after quality filtering",
          "value": "5–20%",
          "unit": "%",
          "note": "Typical filtering pipeline retains 5–20% of raw Common Crawl; varies by pipeline"
        },
        {
          "label": "Deduplication improvement",
          "value": "~1.5×",
          "unit": "perplexity improvement",
          "note": "Lee et al. (2022): removing duplicates reduces perplexity ~1.5× at same training compute"
        },
        {
          "label": "Near-deduplication threshold",
          "value": "0.8",
          "unit": "MinHash Jaccard similarity",
          "note": "Typical threshold for near-duplicate detection using MinHash LSH"
        },
        {
          "label": "Code data impact",
          "value": "+10–15%",
          "unit": "reasoning benchmark",
          "note": "Chen et al. (2021): including code in pre-training improves mathematical reasoning"
        }
      ],
      "faq_items": [
        {
          "question": "What steps are in a typical web data filtering pipeline?",
          "answer": "A typical pipeline includes: (1) URL-level filtering — blocklist of spam, adult content, and low-quality domains; (2) language identification — removing non-target language text; (3) quality filtering — perplexity scoring against a reference LM, text length filtering, symbol/punctuation ratio filters; (4) near-deduplication — MinHash LSH to remove near-duplicate documents; (5) content filtering — removing PII, harmful content. Each stage further reduces data volume while improving quality."
        },
        {
          "question": "Why does deduplication improve language model training?",
          "answer": "Lee et al. (2022) showed that training on deduplicated data significantly improves model quality at the same compute budget. The key mechanism: memorization. Models trained on highly duplicated data (e.g., the same document 100× in Common Crawl) memorize specific text verbatim rather than learning generalizable patterns. Deduplication forces the model to generalize rather than memorize, improving held-out perplexity by approximately 1.5× at identical compute."
        },
        {
          "question": "How is data quality measured without human labeling?",
          "answer": "The most common automated quality signals: (1) reference model perplexity — filter out text that a small, high-quality reference LM assigns high perplexity (i.e., text unlike high-quality sources); (2) content classification — train a binary classifier on known-good vs known-bad examples; (3) linguistic features — sentence count, token-to-word ratio, average word length, punctuation density; (4) URL quality scores — domain-level reputation from human-curated allow/blocklists. These signals are noisy individually but combine to significantly improve corpus quality."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "self-attention-mechanism",
      "title": "Scaled Dot-Product Attention: Formula, Complexity, and the √d_k Scaling Factor",
      "description": "Scaled dot-product attention computes Attention(Q,K,V) = softmax(Q·Kᵀ/√d_k)·V and has O(n²·d) time and memory complexity quadratic in sequence length.",
      "category": "architecture",
      "citation_snippet": "Scaled dot-product attention computes softmax(Q·Kᵀ/√d_k)·V, scaling by √d_k=8 to prevent vanishing gradients; time complexity is O(n²·d), quadratic in sequence length n (Vaswani et al., 2017).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1706.03762",
          "label": "Vaswani et al. (2017) — Attention Is All You Need. NeurIPS 2017"
        },
        {
          "url": "https://arxiv.org/abs/1409.0473",
          "label": "Bahdanau et al. (2015) — Neural Machine Translation by Jointly Learning to Align and Translate. ICLR 2015"
        },
        {
          "url": "https://arxiv.org/abs/1904.10509",
          "label": "Child et al. (2019) — Generating Long Sequences with Sparse Transformers. arXiv 2019"
        }
      ],
      "data_points": [
        {
          "label": "Attention formula",
          "value": "softmax(Q·Kᵀ/√d_k)·V",
          "note": "Scaled dot-product; scaling by √d_k prevents softmax saturation"
        },
        {
          "label": "d_k (key/query dimension)",
          "value": "64",
          "unit": "dimensions",
          "note": "d_model/h = 512/8 = 64; √64 = 8 is the scaling divisor"
        },
        {
          "label": "Time complexity",
          "value": "O(n²·d)",
          "note": "Quadratic in sequence length n; bottleneck for long sequences"
        },
        {
          "label": "Memory complexity",
          "value": "O(n²)",
          "note": "Attention matrix is n×n; 1024-token sequence needs ~4MB per head at float32"
        },
        {
          "label": "Sequential operations",
          "value": "O(1)",
          "note": "All token interactions computed in parallel; vs O(n) for RNNs"
        },
        {
          "label": "Path length (max signal)",
          "value": "O(1)",
          "note": "Any token pair connected in one operation; RNNs require O(n) steps"
        }
      ],
      "faq_items": [
        {
          "question": "Why divide by √d_k in scaled dot-product attention?",
          "answer": "For large d_k, the dot products Q·Kᵀ grow in magnitude proportional to √d_k, pushing the softmax into regions of very small gradients. Dividing by √d_k keeps the inputs to softmax in a moderate range regardless of dimensionality, preventing near-zero gradients during training."
        },
        {
          "question": "Why is self-attention O(n²) and why does that matter?",
          "answer": "Computing the full n×n attention matrix between all pairs of tokens requires O(n²) operations and O(n²) memory. For a sequence of 1024 tokens with d_model=512, the attention matrix alone consumes roughly 4MB per head at float32. This quadratic scaling is the primary constraint on context length for standard transformers."
        },
        {
          "question": "How does self-attention compare to RNNs for long-range dependencies?",
          "answer": "Self-attention connects any two tokens in O(1) operations regardless of their distance in the sequence. Recurrent networks require O(n) sequential steps for a signal to travel between distant tokens, making them vulnerable to vanishing gradients over long dependencies. This is one of the key advantages of the transformer."
        }
      ],
      "date_modified": "2026-02-27"
    },
    {
      "slug": "word-embeddings",
      "title": "Word Embeddings: Distributed Representations, word2vec, and Semantic Geometry",
      "description": "Word embeddings map discrete tokens to dense vectors in ℝ^d; word2vec skip-gram learns 300-dimensional vectors where cosine similarity captures semantic relationships and vector arithmetic encodes analogies.",
      "category": "representation",
      "citation_snippet": "word2vec skip-gram learns 300-dim embeddings where cosine similarity encodes semantics; vector arithmetic king − man + woman ≈ queen holds with ~76% accuracy (Mikolov et al., 2013); GloVe achieves 75.0% on word analogy tasks (Pennington et al., 2014).",
      "sources": [
        {
          "url": "https://arxiv.org/abs/1310.4546",
          "label": "Mikolov et al. (2013) — Distributed Representations of Words and Phrases. NeurIPS 2013"
        },
        {
          "url": "https://aclanthology.org/D14-1162/",
          "label": "Pennington et al. (2014) — GloVe: Global Vectors for Word Representation. EMNLP 2014"
        },
        {
          "url": "https://arxiv.org/abs/1607.04606",
          "label": "Bojanowski et al. (2017) — Enriching Word Vectors with Subword Information (FastText). TACL 2017"
        }
      ],
      "data_points": [
        {
          "label": "word2vec embedding dimension",
          "value": "300",
          "unit": "dimensions",
          "note": "Mikolov et al. (2013); cosine similarity captures semantic + syntactic relationships"
        },
        {
          "label": "GloVe word analogy accuracy",
          "value": "75.0%",
          "unit": "% accuracy",
          "note": "Pennington et al. (2014) on semantic analogy task; 65.5% on combined benchmark"
        },
        {
          "label": "word2vec skip-gram training objective",
          "value": "max P(w_{t±c} | w_t)",
          "unit": "",
          "note": "Predict context words within window c from center word; window size c=5 typical"
        },
        {
          "label": "Typical vocabulary size",
          "value": "100,000–500,000",
          "unit": "tokens",
          "note": "Embedding matrix shape: V × d, e.g., 100K × 300 = 30M parameters"
        },
        {
          "label": "Transformer embedding initialization",
          "value": "N(0, d_model^{-0.5})",
          "unit": "",
          "note": "Vaswani et al. scale embeddings by √d_model to match expected attention input scale"
        }
      ],
      "faq_items": [
        {
          "question": "Why does vector arithmetic like 'king − man + woman ≈ queen' work?",
          "answer": "Word2vec embeddings capture distributional semantics — words appearing in similar contexts have similar vectors. The 'king−man+woman' analogy works because the king−man vector captures the gender direction in the embedding space, and adding the woman vector relocates to queen's position. This holds with ~76% accuracy on the 8,869 semantic analogy pairs from Mikolov et al. (2013). It is a property that emerges from training, not a design constraint."
        },
        {
          "question": "How are word embeddings in transformers different from word2vec?",
          "answer": "word2vec produces static embeddings — each word has one vector regardless of context. Transformer embeddings (contextual embeddings) are dynamically computed: the same token 'bank' gets different representations in 'river bank' vs 'bank account' because attention layers mix information from surrounding tokens. The embedding table in a transformer provides an initial lookup; the hidden states after each attention layer become progressively more contextual."
        },
        {
          "question": "What dimension should word embeddings be?",
          "answer": "Empirically, performance scales logarithmically with embedding dimension. word2vec's original 300-dim vectors capture most useful structure; increasing to 1000+ dims yields diminishing returns for static embeddings. In transformers, d_model (embedding dimension) is coupled to the rest of the architecture. The original transformer uses d_model=512; larger models use 768, 1024, 2048, 4096, or 8192 dimensions."
        }
      ],
      "date_modified": "2026-02-27"
    }
  ]
}