Skip to content

The actor/entropy loss appears somewhat abnormal—it's too large. We launched DeepSWE training based on the Qwen3-4B model, using the following script: #391

@Cesilina

Description

@Cesilina

scripts:
# Launch script for DeepSWE agent training on Qwen3-4B.
#
# Exports the vLLM / PyTorch / Hydra environment settings below and then runs
# `python3 -m rllm.trainer.verl.train_agent_ppo` with a list of key=value
# configuration overrides.
#
# NOTE(review): actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu is passed
# twice (same value both times); kept exactly as in the original command.
# NOTE(review): RLLM_DIR is computed but not referenced in the portion of the
# script shown here.

# Trace every command as it is executed.
set -x

export VLLM_ATTENTION_BACKEND=FUSED_ATTN

export PYTORCH_ALLOC_CONF="expandable_segments:False"
export VLLM_USE_V1=1
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
export HYDRA_FULL_ERROR=1
export VLLM_ENGINE_ITERATION_TIMEOUT_S=100000000000
export HF_DATASETS_CACHE="/mnt/moe_nas/public/datasets/.cache/hf_datasets"

# Find the directory where rllm package is located
# (two dirname() calls: parent of the package directory that holds __init__.py).
RLLM_DIR=$(python3 -c "import rllm; import os; print(os.path.dirname(os.path.dirname(rllm.__file__)))")

# Every override line must end with a backslash so that the whole list is
# passed to a single python3 invocation; without the continuations each line
# would be executed as a separate shell command.
python3 -m rllm.trainer.verl.train_agent_ppo \
    algorithm.adv_estimator=rloo \
    data.train_files=/mnt/moe_nas/home/zxn/code/rllm/data/swe/R2E-Gym-Subset/train_verl.parquet \
    data.val_files=/mnt/moe_nas/home/zxn/code/rllm/data/swe/R2E-Gym-Subset/test_verl.parquet \
    data.train_batch_size=2 \
    data.val_batch_size=2 \
    data.max_prompt_length=2048 \
    data.max_response_length=4096 \
    data.filter_overlong_prompts=True \
    data.filter_overlong_prompts_workers=2 \
    actor_rollout_ref.model.path=/mnt/moe_nas/public/models/Qwen3-4B \
    actor_rollout_ref.hybrid_engine=True \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-sum \
    actor_rollout_ref.actor.ppo_mini_batch_size=2 \
    actor_rollout_ref.actor.use_dynamic_bsz=False \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=8192 \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.actor.clip_ratio_high=0.28 \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=True \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.mode="async" \
    actor_rollout_ref.rollout.enforce_eager=False \
    actor_rollout_ref.rollout.temperature=1.0 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=4 \
    actor_rollout_ref.rollout.val_kwargs.n=1 \
    actor_rollout_ref.rollout.val_kwargs.temperature=0 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    actor_rollout_ref.actor.entropy_coeff=0.0 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    rllm.mask_truncated_samples=False \
    trainer.critic_warmup=0 \
    trainer.logger=['console','tensorboard'] \
    trainer.project_name='deepscaler-agent' \
    trainer.experiment_name='swe-agent-rl-qwen3-exp3' \
    trainer.val_before_train=False \
    trainer.n_gpus_per_node=2 \
    +trainer.tensorboard_dir=/mnt/moe_nas/home/zxn/code/rllm/deepswe_qwen3_tensorboards/ \
    trainer.nnodes=1 \
    trainer.save_freq=100000 \
    trainer.test_freq=100000 \
    trainer.default_hdfs_dir=null \
    rllm.env.name=swe \
    rllm.agent.name=sweagent \
    rllm.agent.max_steps=50 \
    rllm.agent.overlong_filter=True \
    rllm.agent.trajectory_timeout=5400 \
    trainer.total_epochs=1000

experiment results:

(TaskRunner pid=14937) step:12 - traj/steps_mean:np.float64(5.625) - traj/steps_min:np.int64(4) - traj/steps_max:np.int64(7) - traj/env_time_mean:np.float64(1.8401756286621094) - traj/env_time_min:np.float64(0.8794617652893066) - traj/env_time_max:np.float64(5.168613433837891) - traj/llm_time_mean:np.float64(51.190229684114456) - traj/llm_time_min:np.float64(35.602015256881714) - traj/llm_time_max:np.float64(69.37657141685486) - traj/total_time_mean:np.float64(53.030405312776566) - traj/total_time_min:np.float64(36.48147702217102) - traj/total_time_max:np.float64(74.54518485069275) - traj/token_mismatch_mean:np.float64(0.0) - traj/token_mismatch_min:np.float64(0.0) - traj/token_mismatch_max:np.float64(0.0) - batch/solve_none:2 - batch/solve_all:0 - batch/solve_partial:0 - actor/entropy:246.14312744140625 - actor/pg_clipfrac:np.float64(0.0) - actor/ppo_kl:np.float64(0.0) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/pg_loss:np.float64(0.0) - actor/grad_norm:np.float64(0.0) - perf/mfu/actor:np.float64(0.08421234222537655) - perf/max_memory_allocated_gb:np.float64(44.57679033279419) - perf/max_memory_reserved_gb:np.float64(48.51171875) - perf/cpu_memory_used_gb:np.float64(109.08635711669922) - actor/lr:np.float64(1e-06) - critic/score/mean:0.0 - critic/score/max:0.0 - critic/score/min:0.0 - critic/rewards/mean:0.0 - critic/rewards/max:0.0 - critic/rewards/min:0.0 - critic/advantages/mean:0.0 - critic/advantages/max:0.0 - critic/advantages/min:0.0 - critic/returns/mean:0.0 - critic/returns/max:0.0 - critic/returns/min:0.0 - response_length/mean:4051.625 - response_length/max:4096.0 - response_length/min:3909.0 - response_length/clip_ratio:0.75 - response_length_non_aborted/mean:4051.625 - response_length_non_aborted/max:4096.0 - response_length_non_aborted/min:3909.0 - response_length_non_aborted/clip_ratio:0.75 - response/aborted_ratio:0.0 - prompt_length/mean:1798.0 - prompt_length/max:1818.0 - prompt_length/min:1778.0 - prompt_length/clip_ratio:0.0 - 
timing_s/collect_trajectory:103.37594948615879 - timing_s/transform_trajectory:0.1359260380268097 - timing_s/old_log_prob:11.141448149457574 - timing_s/adv:11.143349538557231 - timing_s/update_actor:36.031790810637176 - timing_s/step:150.69677534047514 - timing_per_token_ms/update_actor:0.769959416429198 - timing_per_token_ms/adv:0.2381210235390566

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions