# Grouped Relative Policy Optimization (GRPO)
# >>> python -m apps.grpo.main --config apps/grpo/qwen3_8b.yaml

# Global configuration
group_size: 8
local_batch_size: 16 # per-device batch size
max_req_tokens: 512
max_res_tokens: 512
model: "Qwen/Qwen3-8B"
off_by_n: 1 # Off-by-one by default: rollouts may lag the trainer by at most one policy version (see replay_buffer.max_policy_age)
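
# Sizing note (editorial, not part of the original config): with these caps a
# single sample is at most max_req_tokens + max_res_tokens = 1024 tokens, which
# fits inside trainer.training.seq_len (2048) below, assuming prompt and
# response share one packed training sequence.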

# Observability configuration
metric_logging:
  wandb:
    project: "grpo-training"
    group: "grpo_exp_${oc.env:USER}"
    reduce_across_ranks: True
  console:
    reduce_across_ranks: True

# Dataset configuration
dataset:
  path: "openai/gsm8k"
  revision: "main"
  data_split: "train"
  streaming: true
  model: ${model}

# Policy configuration
policy:
  engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
    model: ${model}
    tensor_parallel_size: 2
    pipeline_parallel_size: 1
    enforce_eager: false
  sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
    n: ${group_size}
    max_tokens: ${max_res_tokens}
    temperature: 1.0
    top_p: 1.0
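
# Editorial note: n = ${group_size} asks vLLM for 8 completions per prompt;
# those completions form the GRPO group whose mean reward serves as the
# baseline for relative advantages. temperature/top_p of 1.0 sample the
# unmodified policy distribution.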

# Trainer configuration
trainer:
  model:
    name: qwen3
    flavor: 8B
    hf_assets_path: hf://${model}
  optimizer:
    name: AdamW
    lr: 1e-5
    eps: 1e-8
  lr_scheduler:
    warmup_steps: 1
  training:
    local_batch_size: ${local_batch_size}
    seq_len: 2048
    max_norm: 1.0
    steps: 1000000
    dtype: bfloat16
    gc_freq: 1
  compile:
    enable: false
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: -1
    tensor_parallel_degree: 1
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
    disable_loss_parallel: true
  checkpoint:
    enable: true
    initial_load_path: hf://${model}
    initial_load_in_hf: true
    last_save_in_hf: true
    interval: 500
    async_mode: "disabled"
  activation_checkpoint:
    mode: selective
    selective_ac_option: op
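
# Batch arithmetic (editorial sketch): local_batch_size (16) per rank times the
# trainer's data-parallel size (2, see replay_buffer.dp_size) gives 32 sequences
# per optimizer step. data_parallel_shard_degree: -1 asks TorchTitan to infer
# the FSDP shard degree from the ranks left over by the other parallelism
# degrees.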

# Replay buffer configuration
replay_buffer:
  batch_size: ${local_batch_size}
  max_policy_age: ${off_by_n}
  # This should match the dp_size of TorchTitan.
  # Here it's set explicitly to 2 because we've given the trainer
  # 2 GPUs and we're using full FSDP.
  dp_size: 2
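  # Worked out (editorial): dp_size = trainer procs / (tensor_parallel_degree *
  # pipeline_parallel_degree * context_parallel_degree) = 2 / (1 * 1 * 1) = 2.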

# Reference model configuration
ref_model:
  model:
    name: qwen3
    flavor: 8B
    hf_assets_path: hf://${model}
  training:
    seq_len: ${trainer.training.seq_len}
    dtype: bfloat16
    gc_freq: 1
  compile:
    enable: false
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: 1
    tensor_parallel_degree: 1
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
  checkpoint:
    initial_load_path: hf://${model}
    initial_load_in_hf: true
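
# Editorial note: the reference model only does forward passes to score
# logprobs of sampled tokens for GRPO's KL term against the frozen starting
# policy, so it trains nothing and runs unsharded on a single GPU here
# (data_parallel_shard_degree: 1; services.ref_model.procs: 1).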

# All resource allocations
services:
  policy:
    procs: ${policy.engine_args.tensor_parallel_size}
    num_replicas: 1
    with_gpus: true
    mesh_name: policy
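  # Editorial note: procs is tied to tensor_parallel_size, so each policy
  # replica owns one process per TP rank (2 GPUs per replica here); presumably
  # raising num_replicas is how you scale generation throughput.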
  ref_model:
    procs: 1
    num_replicas: 1
    with_gpus: true
    mesh_name: ref_model
  reward_actor:
    procs: 1
    num_replicas: 1
    with_gpus: false
    mesh_name: reward_actor
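
# Editorial reading (an assumption, not documented here): services carry
# num_replicas and are replicated/routed, while the actors below are
# singletons.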

actors:
  dataset:
    procs: 1
    with_gpus: false
    mesh_name: dataset
  trainer:
    procs: 2
    with_gpus: true
    mesh_name: trainer
  replay_buffer:
    procs: 1
    with_gpus: false
    mesh_name: replay_buffer
  compute_advantages:
    procs: 1
    with_gpus: false
    mesh_name: compute_advantages
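
# GPU accounting (editorial): policy 2 (tensor_parallel_size 2 x 1 replica)
# + ref_model 1 + trainer 2 = 5 GPUs total; dataset, replay_buffer,
# compute_advantages, and reward_actor run on CPU (with_gpus: false).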