ai-engineering-from-scratch-zh/phases/19-capstone-projects/32-token-positional-embeddings/quiz.json at main · fancyboi999/ai-engineering-from-scratch-zh · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
{
  "lesson": "32-token-positional-embeddings",
  "title": "顶点课 32 —— Token Embedding 与 Positional Embedding",
  "questions": [
    {
      "stage": "pre",
      "question": "embedding 阶段把 (B, T) 的 ids 变成什么 shape？",
      "options": [
        "(B, T)",
        "(B, V)",
        "(B, T, D)，其中 D 是模型维度",
        "(B, T, V)，其中 V 是词表大小"
      ],
      "correct": 2,
      "explanation": "每个 id 被映射到一个 D 维向量。batch 保留 B 和 T，多出一个特征维度 D。"
    },
    {
      "stage": "check",
      "question": "本课中 token embedding 和 positional embedding 是怎么组合的？",
      "options": [
        "沿特征轴拼接",
        "逐元素相加，position 向量在 batch 维度上广播",
        "额外加一层线性投影",
        "取逐元素最大值"
      ],
      "correct": 1,
      "explanation": "shape 为 (B, T, D) 和 (T, D) 的两个 tensor 做加法。广播把 positional table 沿 batch 轴复制。"
    },
    {
      "stage": "check",
      "question": "SinusoidalPositionalEmbedding 贡献了多少可学习参数？",
      "options": [
        "max_context_length",
        "max_context_length * d_model",
        "d_model",
        "零"
      ],
      "correct": 3,
      "explanation": "正弦表是根据公式计算出来并存为 buffer 的，不贡献任何可学习参数。"
    },
    {
      "stage": "check",
      "question": "哪种 positional encoding 不能接受超过其构造上限的序列长度？",
      "options": [
        "Sinusoidal positional embedding",
        "Learned positional embedding",
        "两者都是",
        "两者都不是"
      ],
      "correct": 2,
      "explanation": "在本课的实现中，两个类都在 forward 时强制检查 max_context_length。Learned 本身受表大小限制；这里的 sinusoidal 也受限，因为它用的是预先构建好的表。"
    },
    {
      "stage": "post",
      "question": "正弦方案为什么在相邻特征对上同时使用 sin 和 cos？",
      "options": [
        "因为单独的 sin 不是周期性的",
        "因为单独的 cos 不能做反向传播",
        "这样位置 p+k 处的向量就是位置 p 处向量的线性函数",
        "因为这样能翻倍参数量"
      ],
      "correct": 2,
      "explanation": "在同一波长上配对 sin 和 cos，意味着位置偏移 k 等价于一次旋转，而旋转是线性变换。attention 因此可以通过线性投影来学习相对偏移。"
    },
    {
      "stage": "post",
      "question": "为什么 embedding 的反向传播只更新 token table 中在本 batch 出现过的行？",
      "options": [
        "PyTorch 只存储非零 entry 的梯度",
        "前向传播中没被碰到的行，根据链式法则对 loss 的梯度为零",
        "优化器会跳过它没见过的行",
        "token id 是用稀疏数据结构存储的"
      ],
      "correct": 1,
      "explanation": "一行如果没有参与任何输出的计算，它对 loss 的导数就是零，所以这一步它的梯度为零。"
    }
  ]
}