ai-engineering-from-scratch-zh/phases/17-infrastructure-and-production/05-eagle3-speculative-decoding/quiz.json at main · fancyboi999/ai-engineering-from-scratch-zh · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
{
  "lesson": "05-eagle3-speculative-decoding",
  "title": "生产环境中的 EAGLE-3 推测解码",
  "questions": [
    {
      "stage": "pre",
      "question": "为什么推测解码（speculative decoding）能利用普通 decode 中存在的一个空隙？",
      "options": [
        "decode 是计算受限的，所以增加算力是免费的",
        "decode 无法从批处理中获益",
        "decode 比 prefill 需要更多的网络带宽",
        "decode 是内存受限的，所以 GPU 大部分时间都在空闲等待从 HBM 读取权重"
      ],
      "correct": 3,
      "explanation": "decode 受内存带宽限制：每生成一个 token 都要从 HBM 读取全部权重，算力大量闲置；推测解码正是利用这部分闲置算力，在一次前向中验证多个 draft token。"
    },
    {
      "stage": "check",
      "question": "接受率 alpha 衡量的是什么？",
      "options": [
        "decode 期间使用的 GPU 显存比例",
        "draft 模型提议的 token 中被目标模型接受的比例",
        "draft 模型带来的延迟开销",
        "KV 缓存的命中率"
      ],
      "correct": 1,
      "explanation": "alpha 是 draft 模型提议的 token 中被目标模型验证后接受的比例；alpha 越高，每次验证平均能产出的 token 越多，加速也越明显。"
    },
    {
      "stage": "check",
      "question": "相比 EAGLE-2，EAGLE-3 改变了什么，使其在通用对话上把 alpha 推到大约 0.6-0.8？",
      "options": [
        "它使用同系列的全尺寸 draft 模型",
        "它运行在 CPU 而非 GPU 上",
        "它完全移除了 verify 步骤",
        "draft head 利用目标模型多个层的特征进行训练，而不仅仅是最后一层"
      ],
      "correct": 3,
      "explanation": "EAGLE-3 的 draft head 融合了目标模型多个层的特征，而不只依赖最后一层，因此它的预测更接近目标模型的输出，接受率也更高。"
    },
    {
      "stage": "check",
      "question": "本课指出，在大多数 2026 年硬件上、高并发场景下，alpha 大约低于多少时推测解码就会变成净负收益？",
      "options": [
        "0.95",
        "0.05",
        "0.55",
        "0.85"
      ],
      "correct": 2,
      "explanation": "本课给出的经验阈值约为 0.55：alpha 低于它时，生成 draft 以及处理被拒绝 token 的额外开销会超过被接受 token 带来的收益，高并发下整体反而变慢。"
    },
    {
      "stage": "post",
      "question": "在打开 EAGLE-3 之后，即使平均 ITL 下降了，你也应该最密切关注哪个指标？",
      "options": [
        "冷启动时间",
        "GPU 显存利用率",
        "P99 ITL，因为被拒绝的 draft 导致的双趟（two-pass）处理在满批次下可能被串行化",
        "平均端到端（E2E）延迟"
      ],
      "correct": 2,
      "explanation": "平均 ITL 会掩盖尾部延迟：draft 被拒绝时需要额外一趟处理，在满批次下这些处理可能被串行化，从而拉高 P99 ITL，所以必须单独监控它。"
    },
    {
      "stage": "post",
      "question": "按本课所述，为什么在 2026 年的 vLLM 中推测解码是按需开启（而非默认）的？",
      "options": [
        "接受率取决于工作负载，未测量 alpha 就开启它是一种生产反模式",
        "它只在 Blackwell GPU 上工作",
        "它与 PagedAttention 不兼容",
        "它需要单独的许可证"
      ],
      "correct": 0,
      "explanation": "接受率随工作负载变化，必须先在自己的真实流量上测量 alpha；未经测量就开启属于生产反模式，因此它被设计为按需开启而非默认启用。"
    }
  ]
}