Commit 0754eff

add llama 8b example (#623)

Authored-by: Felipe Mello (felipemello1)
Co-authored-by: Felipe Mello <felipemello@fb.com>
1 parent 700b2f5 commit 0754eff

File tree: 4 files changed, +219 -21 lines

apps/grpo/README.md

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@

Last updated: 2025-12-05

# GRPO on GSM8K

Training with GRPO (Grouped Relative Policy Optimization) on GSM8K grade-school math word problems.
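The "grouped" part is what replaces a learned value baseline: each prompt is sampled `group_size` times, and each completion's advantage is its reward normalized against the other completions in the same group. The app's actual `ComputeAdvantages` actor is not reproduced here; a minimal sketch of the idea (function name and epsilon value are illustrative assumptions) looks like:

```python
from statistics import mean, stdev

def group_relative_advantages(rewards, eps=1e-4):
    """Normalize each reward against its own group:
    advantage = (r - mean(group)) / (std(group) + eps)."""
    mu = mean(rewards)
    sd = stdev(rewards) if len(rewards) > 1 else 0.0
    return [(r - mu) / (sd + eps) for r in rewards]

# One prompt sampled group_size=4 times, with per-completion rewards:
advs = group_relative_advantages([1.0, 0.0, 1.0, 0.0])
```

Correct completions get positive advantages and incorrect ones negative, so the policy gradient pushes toward the better samples within each group without needing a critic.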

## Dataset

GSM8K consists of grade-school math word problems that require multi-step arithmetic reasoning. Models generate solutions with chain-of-thought reasoning between `<think>` tags and provide final answers between `<answer>` tags.

**Example Input:**
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Put all your scratchpad work between <think> and </think> tags.
Your final answer should be between <answer> and </answer> tags otherwise it will not be scored.<|eot_id|><|start_header_id|>user<|end_header_id|>

Lucille is painting her room. Two of her walls are 3 meters wide and 2 meters tall. The third wall is 5 meters wide and 2 meters tall. The final wall is 4 meters wide and 2 meters tall. If each can of paint covers 2 square meters, how many cans of paint does Lucille need?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
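The special tokens in the example come from the Llama 3.1 instruct chat template; in practice the tokenizer renders them (e.g. via `apply_chat_template`), but the format itself can be sketched directly. This is a simplified rendering for illustration, not the full template (it omits the knowledge-cutoff and date headers):

```python
def to_llama31_prompt(messages: list[dict]) -> str:
    """Render chat messages in the Llama 3.1 instruct format shown above
    (simplified: no knowledge-cutoff or date headers)."""
    parts = ["<|begin_of_text|>"]
    for msg in messages:
        parts.append(
            f"<|start_header_id|>{msg['role']}<|end_header_id|>\n\n{msg['content']}<|eot_id|>"
        )
    # Open the assistant turn so the model generates the completion.
    parts.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
    return "".join(parts)

prompt = to_llama31_prompt(
    [
        {"role": "system", "content": "Put all your scratchpad work between <think> and </think> tags."},
        {"role": "user", "content": "What is 12 + 5?"},
    ]
)
```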

**Example Output:**
```
<think>
First, let's calculate the area of each wall:

- Two walls of 3 meters x 2 meters = 3 x 2 = 6 square meters per wall. Since there are two of these, 6 * 2 = 12 square meters.
- The third wall is 5 meters x 2 meters = 10 square meters.
- The final wall is 4 meters x 2 meters = 8 square meters.
Total wall area = 12 + 10 + 8 = 30 square meters.

Since each can covers 2 square meters, we need to divide the total wall area by the area one can covers: 30 / 2 = 15.

</think>
<answer>15</answer>
```
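Scoring hinges on pulling the number out of the `<answer>` tags and comparing it to the GSM8K target. The shipped `MathReward` is not reproduced here; a minimal stand-in (the function name and 1.0/0.0 scoring are assumptions, not forge's API) could look like:

```python
import re

def score_answer(completion: str, target: str) -> float:
    """Extract the <answer>...</answer> span and compare it numerically
    to the target. Returns 1.0 on a match, 0.0 otherwise."""
    m = re.search(r"<answer>(.*?)</answer>", completion, re.DOTALL)
    if m is None:
        return 0.0  # no tags: unscored, as the system prompt warns
    try:
        return 1.0 if float(m.group(1).strip()) == float(target) else 0.0
    except ValueError:
        return 0.0  # non-numeric answer

reward = score_answer("<think>30 / 2 = 15.</think>\n<answer>15</answer>", "15")
```

Comparing as floats rather than strings tolerates surface differences like `15` vs `15.0`.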

## Quick Start

**Llama 3.1 8B** (recommended for learning; as configured it requires 5 GPUs: 2 for the policy, 2 for the trainer, and 1 for the reference model; not optimized):
```bash
python -m apps.grpo.main --config apps/grpo/llama3_8b.yaml
```

**Qwen3 1.7B** (NOTE: Qwen3 is already saturated on GSM8K, so rewards will **not** increase; requires 3 GPUs, not optimized):
```bash
python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml
```

## Expected Results

For **Llama 3.1 8B**, training rewards should rise above 0.8 within the first few steps as the model learns the task.

![Llama 3.1 8B Training Rewards](wandb_llama8b.png)

## Configurations

- `llama3_8b.yaml` - Meta Llama 3.1 8B Instruct
- `qwen3_1_7b.yaml` - Qwen3 1.7B
- `qwen3_8b.yaml` - Qwen3 8B
- `qwen3_32b.yaml` - Qwen3 32B

apps/grpo/llama3_8b.yaml

Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@

```yaml
# Grouped Relative Policy Optimization (GRPO)
# >>> python -m apps.grpo.main --config apps/grpo/llama3_8b.yaml

# Global configuration
group_size: 4
local_batch_size: 4 # per-device batch size
max_req_tokens: 1024
max_res_tokens: 2048
model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
off_by_n: 1 # Off by one by default

# Observability configuration
metric_logging:
  wandb:
    project: grpo-training
    group: grpo_exp_${oc.env:USER}
    logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce
  console:
    logging_mode: global_reduce

# Dataset configuration
dataset:
  path: "openai/gsm8k"
  revision: "main"
  data_split: "train"
  streaming: true
  model: ${model}

# Policy configuration
policy:
  engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
    model: ${model}
    tensor_parallel_size: 2
    pipeline_parallel_size: 1
    enforce_eager: false
  sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
    n: ${group_size}
    max_tokens: ${max_res_tokens}
    temperature: 1.0
    top_p: 1.0

# Trainer configuration
trainer:
  model:
    name: llama3
    flavor: 8B
    hf_assets_path: hf://${model}
  optimizer:
    name: AdamW
    lr: 1e-5
    eps: 1e-8
  lr_scheduler:
    warmup_steps: 1
  training:
    local_batch_size: ${local_batch_size}
    seq_len: ${sum:${max_req_tokens},${max_res_tokens}} # seq_len >= max_req_tokens + max_res_tokens
    max_norm: 1.0
    steps: 1000000
    dtype: bfloat16
    gc_freq: 1
  compile:
    enable: false
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: -1
    tensor_parallel_degree: 1
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
    disable_loss_parallel: true
  checkpoint:
    enable: true
    folder: ./checkpoint # The folder to save checkpoints to.
    initial_load_path: hf://${model} # The path to load the initial checkpoint from. Ignored if `folder` exists.
    initial_load_in_hf: true # If true, interpret initial_load_path as a HuggingFace model repo
    last_save_in_hf: true
    interval: 500
    async_mode: "disabled"
  activation_checkpoint:
    mode: selective
    selective_ac_option: op

# Replay buffer configuration
replay_buffer:
  batch_size: ${local_batch_size}
  max_policy_age: ${off_by_n}
  # This should match the dp_size of TorchTitan.
  # Here it's set explicitly to 2, because we've set
  # 2 GPUs for the trainer and we're using full FSDP.
  dp_size: 2

# Reference model configuration
ref_model:
  model:
    name: llama3
    flavor: 8B
    hf_assets_path: hf://${model}
  training:
    seq_len: ${trainer.training.seq_len}
    dtype: bfloat16
    gc_freq: 1
  compile:
    enable: false
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: 1
    tensor_parallel_degree: 1
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
  checkpoint:
    initial_load_path: hf://${model}
    initial_load_in_hf: true

# All resource allocations
services:
  policy:
    procs: ${policy.engine_args.tensor_parallel_size}
    num_replicas: 1
    with_gpus: true
    mesh_name: policy
  ref_model:
    procs: 1
    num_replicas: 1
    with_gpus: true
    mesh_name: ref_model
  reward_actor:
    procs: 1
    num_replicas: 1
    with_gpus: false
    mesh_name: reward_actor

actors:
  dataset:
    procs: 1
    with_gpus: false
    mesh_name: dataset
  trainer:
    procs: 2
    with_gpus: true
    mesh_name: trainer
  replay_buffer:
    procs: 1
    with_gpus: false
    mesh_name: replay_buffer
  compute_advantages:
    procs: 1
    with_gpus: false
    mesh_name: compute_advantages
```

apps/grpo/main.py

Lines changed: 6 additions & 21 deletions

```diff
@@ -27,7 +27,7 @@
 from forge.actors.trainer import TitanTrainer
 from forge.controller.actor import ForgeActor
 from forge.controller.provisioner import init_provisioner, shutdown
-from forge.data.rewards import LanguageReward, MathReward, ThinkingReward
+from forge.data.rewards import MathReward, ThinkingReward
 from forge.data_models.completion import Completion
 from forge.observability.metric_actors import get_or_create_metric_logger
 from forge.observability.metrics import record_metric, Reduce
@@ -274,15 +274,10 @@ async def setup(self):
         self._epoch = 0

 def gsm8k_transform(sample):
-    system_prompt = """You are a helpful AI assistant that solves math problems.
-
-Please show your reasoning inside <思考></思考> tags, then provide your final numerical answer inside <answer></answer> tags.
-
-Example:
-Question: What is 12 + 5?
-<思考>12と5を足します。12 + 5 = 17です。</思考>
-<answer>17</answer>
-"""
+    system_prompt = """
+Put all your scratchpad work between <think> and </think> tags.
+Your final answer should be between <answer> and </answer> tags otherwise it will not be scored.
+"""
     request: str = sample["question"]
     as_chat = [
         {"role": "system", "content": system_prompt},
@@ -409,17 +404,7 @@ async def main(cfg: DictConfig):
         ComputeAdvantages.options(**cfg.actors.compute_advantages).as_actor(),
         ReferenceModel.options(**cfg.services.ref_model).as_service(**cfg.ref_model),
         RewardActor.options(**cfg.services.reward_actor).as_service(
-            reward_functions=[
-                MathReward(),
-                ThinkingReward(tag="思考"),  # Use Japanese tag
-                LanguageReward(
-                    target_language="ja",
-                    tag="思考",
-                    match_reward=2.0,
-                    debug=False,  # set to true for verbose logging
-                    debug_sample_rate=0.1,
-                ),  # Japanese language reward with debug
-            ]
+            reward_functions=[MathReward(), ThinkingReward()]
         ),
     )
```
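After this change the actor scores each completion with two criteria: `MathReward` for answer correctness and `ThinkingReward` for using the `<think>` scratchpad. How `RewardActor` combines the list is not shown in this diff; a simple composition sketch (plain averaging and the stand-in reward functions below are assumptions, not forge's implementations) might look like:

```python
import re

# Stand-ins for forge.data.rewards.MathReward / ThinkingReward (assumed behavior).
def math_reward(completion: str, target: str) -> float:
    """1.0 if the <answer> span matches the target, else 0.0."""
    m = re.search(r"<answer>(.*?)</answer>", completion, re.DOTALL)
    return 1.0 if m and m.group(1).strip() == target else 0.0

def thinking_reward(completion: str, target: str) -> float:
    """1.0 for any non-empty <think>...</think> scratchpad, else 0.0."""
    m = re.search(r"<think>(.*?)</think>", completion, re.DOTALL)
    return 1.0 if m and m.group(1).strip() else 0.0

def evaluate(reward_functions, completion, target):
    """Average the per-function scores into a single scalar reward."""
    scores = [fn(completion, target) for fn in reward_functions]
    return sum(scores) / len(scores)

reward = evaluate(
    [math_reward, thinking_reward],
    "<think>30 / 2 = 15</think>\n<answer>15</answer>",
    "15",
)
```

A correct answer without a scratchpad earns only partial reward, which nudges the model toward the `<think>` format the system prompt asks for.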

apps/grpo/wandb_llama8b.png

166 KB

0 commit comments