ai-engineering-from-scratch-zh/phases/19-capstone-projects/31-tokenized-dataset-sliding-window/quiz.json at main · fancyboi999/ai-engineering-from-scratch-zh · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
{
  "lesson": "31-tokenized-dataset-sliding-window",
  "title": "顶点课 31 —— 带 Sliding Window 的 Tokenized Dataset",
  "questions": [
    {
      "stage": "pre",
      "question": "因果语言模型训练 batch 的 shape 约定是什么？",
      "options": [
        "(B, T) input ids 和 (B, T) target ids，其中 target 是 input 左移一位",
        "(B, V) 词表上的 one-hot 向量",
        "(B, T, T) attention mask，input ids 放在对角线上",
        "(B, 2) input id 和 label 的配对"
      ],
      "correct": 0,
      "explanation": "因果 LM 训练读入 (B, T) 的 input ids，预测同样 shape 的 target ids，其中 target[t] = input[t+1]。"
    },
    {
      "stage": "check",
      "question": "在长度为 N 的 id 流中，步长为 S 时，能放下多少个长度为 T+1 的 window？",
      "options": [
        "N // (T + 1)",
        "max(0, 1 + (N - (T + 1)) // S)",
        "(N - T) * S",
        "N - T - S"
      ],
      "correct": 1,
      "explanation": "第一个 window 从 0 开始，后续 window 从 S 的倍数开始。最后一个 window 仍需完整覆盖 T+1 个 id。"
    },
    {
      "stage": "check",
      "question": "把 stride 减半会对 dataset 产生什么影响？",
      "options": [
        "每个 epoch 的训练样本数减半",
        "每个 epoch 的训练样本数大约翻倍",
        "没有影响，因为 window 从不重叠",
        "消除了相邻 window 之间的重叠"
      ],
      "correct": 1,
      "explanation": "更小的 stride 产生更多重叠的 window，增加了样本数量和边界多样性，代价是每个 epoch 计算量更大。"
    },
    {
      "stage": "check",
      "question": "为什么要给 DataLoader 传一个显式的 torch.Generator？",
      "options": [
        "加快数据加载速度",
        "不需要 batch padding",
        "让 shuffle 在相同 seed 下跨次运行可复现",
        "启用多 GPU 训练"
      ],
      "correct": 2,
      "explanation": "设置了 seed 的 generator 能复现相同的 shuffle 顺序。相同 seed 多次运行看到的 batch 顺序一样，这对公平地做超参比较是必要的。"
    },
    {
      "stage": "post",
      "question": "为什么 dataset 从大小为 T+1 的每个 window 中返回 (input[:-1], input[1:])？",
      "options": [
        "丢掉一个 id 来节省内存",
        "实现了 shift-by-one 的 target，使 loss 衡量的是 next-token prediction",
        "从输入中移除一个 special token",
        "这只是 PyTorch tensor 索引的一个怪癖"
      ],
      "correct": 1,
      "explanation": "位置 t 的 target 是位置 t+1 的 input。把 window 切成 [:-1] 和 [1:] 就是在表达这个约定。"
    },
    {
      "stage": "post",
      "question": "为什么本课丢弃不完整的尾部 window 而不是 padding？",
      "options": [
        "padding 会改变词表大小",
        "丢弃后每个样本长度一致，不需要 loss mask",
        "模型不支持 padded token",
        "被丢弃的数据会在下一个 epoch 恢复"
      ],
      "correct": 1,
      "explanation": "所有样本的长度都恰好是 T 个 token，batch 就是一个干净的矩形 tensor，loss 可以均匀地施加，不需要 mask。"
    }
  ]
}