{
  "lesson": "08-inference-metrics-goodput",
  "title": "推理指标 —— TTFT、TPOT、ITL、Goodput、P99",
  "questions": [
    {
      "stage": "pre",
      "question": "哪些组成部分主导了 TTFT（首 token 时延）？",
      "options": [
        "仅 decode 的前向计算时间",
        "权重的磁盘 I/O",
        "Tokenizer 的 GIL 开销",
        "排队时间、网络请求时间和 prefill 时间"
      ],
      "correct": 3,
      "explanation": ""
    },
    {
      "stage": "check",
      "question": "本课把哪个指标称为对产品真正重要的那个？",
      "options": [
        "GPU 占空比",
        "以每秒 token 数计的总吞吐量",
        "Goodput —— 同时满足所有 SLO 约束的请求所占比例",
        "平均 ITL"
      ],
      "correct": 2,
      "explanation": ""
    },
    {
      "stage": "check",
      "question": "为什么用平均值来报告 LLM 延迟是错误的统计量？",
      "options": [
        "平均值在流式响应上永远无法计算",
        "平均值只适用于 prefill，不适用于 decode",
        "LLM 延迟分布是右偏的；用户会经常遭遇被平均值掩盖的 P99 离群值",
        "GenAI-Perf 不支持平均值"
      ],
      "correct": 2,
      "explanation": ""
    },
    {
      "stage": "check",
      "question": "为什么 GenAI-Perf 和 LLMPerf 对同一次运行的 TPOT 会得出不一致的结果？",
      "options": [
        "GenAI-Perf 只在 Blackwell 上运行",
        "GenAI-Perf 在计算 ITL 时排除了 TTFT，而 LLMPerf 把它包含进去，因此工具的选择改变了数值",
        "LLMPerf 用微秒，GenAI-Perf 用毫秒",
        "它们采样了不同的请求"
      ],
      "correct": 1,
      "explanation": ""
    },
    {
      "stage": "post",
      "question": "对于长输出请求（>500 个 token），哪个指标主导了端到端延迟？",
      "options": [
        "TPOT 乘以输出长度",
        "冷启动时间",
        "TTFT",
        "网络响应时间"
      ],
      "correct": 0,
      "explanation": ""
    },
    {
      "stage": "post",
      "question": "哪一组最能体现本课对 2026 年一个 70B 对话模型所设定的合理面向消费者的 SLO？",
      "options": [
        "仅 P50，不设更高的分位数",
        "TTFT P99 800 ms，TPOT P99 25 ms，输出 <300 token 时 E2E P99 3 秒，goodput >= 99%",
        "TTFT P99 8 秒，TPOT P99 200 ms，goodput 50%",
        "平均 TTFT 10 ms，平均 TPOT 1 ms"
      ],
      "correct": 1,
      "explanation": ""
    }
  ]
}