Skip to content

Commit 898be00

Browse files
felipemello1 (Felipe Mello)
and co-authors authored
make dapo default (#736)
Co-authored-by: Felipe Mello <felipemello@fb.com>
1 parent b288ff4 commit 898be00

File tree

4 files changed

+34
-30
lines changed

4 files changed

+34
-30
lines changed

apps/grpo/llama3_8b.yaml

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
# >>> python -m apps.grpo.main --config apps/grpo/llama3_8b.yaml
33

44
# Global configuration
5-
group_size: 4
6-
local_batch_size: 4 # per-device batch size
5+
group_size: 8
6+
local_batch_size: 8 # per-device batch size
77
max_req_tokens: 1024
88
max_res_tokens: 2048
99
model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
@@ -91,7 +91,7 @@ replay_buffer:
9191
# 2 GPUs for the trainer and we're using full FSDP.
9292
dp_size: 2
9393

94-
# Reference model configuration
94+
# Reference model configuration (if enabled in services)
9595
ref_model:
9696
model:
9797
name: llama3
@@ -117,16 +117,19 @@ ref_model:
117117

118118
# All resource allocations
119119
services:
120+
# Ref model is only necessary if the loss requires it (GRPO with beta>0),
121+
# but we recommend using DAPO instead
122+
# ref_model:
123+
# ref_model:
124+
# procs: 1
125+
# num_replicas: 1
126+
# with_gpus: true
127+
# mesh_name: ref_model
120128
generator:
121129
procs: ${generator.engine_args.tensor_parallel_size}
122130
num_replicas: 1
123131
with_gpus: true
124132
mesh_name: generator
125-
ref_model:
126-
procs: 1
127-
num_replicas: 1
128-
with_gpus: true
129-
mesh_name: ref_model
130133
reward_actor:
131134
procs: 1
132135
num_replicas: 1

apps/grpo/main.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from forge.observability.metrics import record_metric, Reduce
2525
from forge.observability.perf_tracker import Tracer
2626
from forge.rl import collate, ComputeAdvantages, Episode, RewardActor
27-
from forge.rl.loss import GRPOLoss
27+
from forge.rl.loss import DAPOLoss, GRPOLoss
2828
from forge.types import LauncherConfig, ProvisionerConfig
2929
from forge.util.checkpoint import drop_weights
3030
from forge.util.config import parse
@@ -68,12 +68,7 @@ async def main(cfg: DictConfig):
6868
)
6969

7070
# ---- Setup loss function ---- #
71-
loss_fn = GRPOLoss(
72-
clip_low=0.2,
73-
clip_high=0.28,
74-
beta=0.1,
75-
agg_type="fixed_horizon",
76-
)
71+
loss_fn = DAPOLoss()
7772

7873
# Fail-fast: Check loss/ref_model compatibility before spawning actors
7974
uses_ref_model = cfg.get("services", {}).get("ref_model") is not None

apps/grpo/qwen3_1_7b.yaml

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
# Global configuration
55
group_size: 8
6-
local_batch_size: 16 # per-device batch size
6+
local_batch_size: 8 # per-device batch size
77
max_req_tokens: 1024
88
max_res_tokens: 2048
99
model: "Qwen/Qwen3-1.7B"
@@ -92,7 +92,7 @@ replay_buffer:
9292
max_policy_age: ${off_by_n}
9393
dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
9494

95-
# Reference model configuration
95+
# Reference model configuration (if enabled in services)
9696
ref_model:
9797
model:
9898
name: qwen3
@@ -118,16 +118,19 @@ ref_model:
118118

119119
# All resource allocations
120120
services:
121+
# Ref model is only necessary if the loss requires it (GRPO with beta>0),
122+
# but we recommend using DAPO instead
123+
# ref_model:
124+
# ref_model:
125+
# procs: 1
126+
# num_replicas: 1
127+
# with_gpus: true
128+
# mesh_name: ref_model
121129
generator:
122130
procs: ${generator.engine_args.tensor_parallel_size}
123131
num_replicas: 1
124132
mesh_name: generator
125133
with_gpus: true
126-
ref_model:
127-
procs: 1
128-
num_replicas: 1
129-
mesh_name: ref_model
130-
with_gpus: true
131134
reward_actor:
132135
procs: 1
133136
num_replicas: 1

apps/grpo/qwen3_8b.yaml

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
# >>> python -m apps.grpo.main --config apps/grpo/qwen3_8b.yaml
33

44
# Global configuration
5-
group_size: 16
6-
local_batch_size: 4 # per-device batch size
5+
group_size: 8
6+
local_batch_size: 8 # per-device batch size
77
max_req_tokens: 1024
88
max_res_tokens: 2048
99
model: "Qwen/Qwen3-8B"
@@ -91,7 +91,7 @@ replay_buffer:
9191
# 2 GPUs for the trainer and we're using full FSDP.
9292
dp_size: 2
9393

94-
# Reference model configuration
94+
# Reference model configuration (if enabled in services)
9595
ref_model:
9696
model:
9797
name: qwen3
@@ -117,16 +117,19 @@ ref_model:
117117

118118
# All resource allocations
119119
services:
120+
# Ref model is only necessary if the loss requires it (GRPO with beta>0),
121+
# but we recommend using DAPO instead
122+
# ref_model:
123+
# ref_model:
124+
# procs: 1
125+
# num_replicas: 1
126+
# with_gpus: true
127+
# mesh_name: ref_model
120128
generator:
121129
procs: ${generator.engine_args.tensor_parallel_size}
122130
num_replicas: 1
123131
with_gpus: true
124132
mesh_name: generator
125-
ref_model:
126-
procs: 1
127-
num_replicas: 1
128-
with_gpus: true
129-
mesh_name: ref_model
130133
reward_actor:
131134
procs: 1
132135
num_replicas: 1

0 commit comments

Comments (0)