-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun.sh
More file actions
executable file
·78 lines (63 loc) · 1.81 KB
/
run.sh
File metadata and controls
executable file
·78 lines (63 loc) · 1.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/bin/bash
# Exit as soon as a command fails (errexit).
set -e

# Best effort: try to leave any active conda environment; a failure here is ignored.
conda deactivate || true

# If conda's shell hook exists at the default Anaconda location, load it and
# switch to the "verl" environment unless that is already the active one.
# A failed activation is reported but does not stop the script.
conda_profile_script="$HOME/anaconda3/etc/profile.d/conda.sh"
if [[ -f "$conda_profile_script" ]]; then
  . "$conda_profile_script"
  if [[ "$CONDA_DEFAULT_ENV" != "verl" ]] && ! conda activate verl 2>/dev/null; then
    echo "Note: 'verl' environment not found or could not be activated. Continuing with current environment..."
  fi
fi
# Login to Weights & Biases (wandb) for experiment tracking.
# This runs before the argument parsing below, so it executes on every
# invocation, including ones later rejected with "Unknown argument".
# Because `set -e` is in effect, a failing login stops the script here.
wandb login
# Defaults; each can be overridden by the command-line flags handled in parse_args.
SIZE="small"
MULTI_GPU=0
BACKEND="fsdp"
DEMO=0

# Print the one-line usage summary to stdout.
usage() {
  echo "Usage: $0 [--size micro|small|medium|large] [--multi-gpu] [--backend fsdp|megatron] [--demo]"
}

# Report a usage error on stderr and exit with status 1.
# Arguments: $1 - the error message to print before the usage line.
die_usage() {
  echo "$1" >&2
  usage >&2
  exit 1
}

# Parse command-line flags into the globals SIZE, BACKEND, MULTI_GPU and DEMO.
# Arguments: the script's positional parameters ("$@").
# Exits with status 1 on an unknown flag, or when --size/--backend has no value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --size)
        # Guard first: `shift 2` fails when only the flag itself remains.
        [[ $# -ge 2 ]] || die_usage "Missing value for --size"
        SIZE="$2"
        shift 2
        ;;
      --backend)
        [[ $# -ge 2 ]] || die_usage "Missing value for --backend"
        BACKEND="$2"
        shift 2
        ;;
      --multi-gpu)
        MULTI_GPU=1
        shift
        ;;
      --demo)
        DEMO=1
        shift
        ;;
      *)
        die_usage "Unknown argument: $1"
        ;;
    esac
  done
}

# Check SIZE and BACKEND against the values listed in the usage line.
# Exits with status 1 (via die_usage) when either one is not an allowed value.
validate_args() {
  case "$SIZE" in
    micro | small | medium | large) ;;
    *) die_usage "Invalid size: $SIZE" ;;
  esac
  case "$BACKEND" in
    fsdp | megatron) ;;
    *) die_usage "Invalid backend: $BACKEND" ;;
  esac
}

parse_args "$@"
# Validate size and backend
validate_args

echo "Size: $SIZE"
echo "Multi-GPU: $MULTI_GPU"
echo "Backend: $BACKEND"

# Export the settings into the environment of the commands started afterwards.
export SIZE BACKEND DEMO
# Absolute path of the directory holding this script, so the paths below do
# not depend on the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# # Copy the training-scripts/ under the verl/
# cp -r "$SCRIPT_DIR/training-scripts" "$SCRIPT_DIR/verl/"

# Each run logs to its own file, named after the model size and start time.
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
mkdir -p "$SCRIPT_DIR/logs"
LOG_FILE="$SCRIPT_DIR/logs/training_${SIZE}_${TIMESTAMP}.log"

# Pick the launcher script according to the --multi-gpu flag.
if [[ $MULTI_GPU -eq 1 ]]; then
  echo "Running multi-GPU training..."
  TRAIN_SCRIPT="$SCRIPT_DIR/training-scripts/run_qwen3_multi_gpu.sh"
else
  echo "Running single-GPU training..."
  TRAIN_SCRIPT="$SCRIPT_DIR/training-scripts/run_qwen3_single_gpu.sh"
fi

# A pipeline's exit status is that of its last command (tee), which would hide
# a failing training run. PIPESTATUS[0] holds the training script's own status
# and must be read immediately after the pipeline.
# Only stdout is piped into the log file; stderr is not redirected.
bash "$TRAIN_SCRIPT" | tee "$LOG_FILE"
TRAIN_STATUS=${PIPESTATUS[0]}

echo "Training log saved to: $LOG_FILE"

# Propagate a training failure to the caller instead of exiting 0.
if [[ $TRAIN_STATUS -ne 0 ]]; then
  echo "Training failed with exit code $TRAIN_STATUS" >&2
  exit "$TRAIN_STATUS"
fi