|
2 | 2 | set -euo pipefail |
3 | 3 | offset=0 |
4 | 4 | dry=false |
| 5 | +NUM_THREADS=4 # number of parallel jobs |
5 | 6 | usage() { |
6 | 7 | echo """ |
7 | 8 | Run jobs from a slurm submission script's job list on |
@@ -35,29 +36,74 @@ if [ ! -f "$job_list" ]; then |
35 | 36 | exit 1 |
36 | 37 | fi |
37 | 38 |
|
38 | | -if [ $num_jobs -gt 16 ]; then |
39 | | - echo "ERROR: too many jobs!" |
# Make sure the log directory exists before any job redirects output into it.
# Quoted (and guarded with `--`) so paths containing spaces, glob characters
# or a leading dash are passed to mkdir as a single operand.
mkdir -p -- "$log_dir"

# PIDs of the background jobs launched by this script.
job_ids=()
| 42 | + |
#######################################
# Signal handler: stop every tracked job that is still alive, then exit.
# Sends SIGTERM first, waits a grace period, then sends SIGKILL to whatever
# is still present.
# Globals:
#   job_ids             (read) PIDs of the launched background jobs
#   KILL_GRACE_SECONDS  (read, optional) seconds to wait between SIGTERM and
#                       SIGKILL; defaults to 1
# Arguments: none
# Outputs:   progress messages and follow-up hints on stdout
# Returns:   does not return; exits the shell with status 1
#######################################
function cleanup_jobs() {
  local job_id
  local -r grace="${KILL_GRACE_SECONDS:-1}"

  echo ""
  echo ">>> Caught signal, killing all jobs..."
  # `${arr[@]+"${arr[@]}"}` expands to nothing for an empty array, which keeps
  # `set -u` from aborting the handler on bash versions older than 4.4.
  for job_id in ${job_ids[@]+"${job_ids[@]}"}; do
    # `kill -0` only probes for existence; it is a builtin, so no fork.
    if kill -0 "$job_id" 2>/dev/null; then
      # Prefer the whole process group (negative PID). Unless job control
      # (`set -m`) is enabled, a background job is not a group leader and
      # that call fails, so fall back to signalling the PID itself.
      # Failures are ignored on purpose: the job may have exited meanwhile.
      kill -- -"$job_id" 2>/dev/null || kill -- "$job_id" 2>/dev/null || true # SIGTERM
    fi
  done

  sleep "$grace"

  # SIGKILL, if still alive
  for job_id in ${job_ids[@]+"${job_ids[@]}"}; do
    if kill -0 "$job_id" 2>/dev/null; then
      kill -9 -- -"$job_id" 2>/dev/null || kill -9 -- "$job_id" 2>/dev/null || true
    fi
  done

  cat <<EOF
>>> All jobs killed.
    To check if any remain:
        ps -ef | grep $(whoami)
    Kill zombies with, e.g.,:
        pkill -u $(whoami) java

EOF
  exit 1
}
trap cleanup_jobs SIGINT SIGTERM
42 | 68 |
|
43 | | -mkdir -p $log_dir |
#######################################
# Block until at most a given number of tracked jobs are still running.
# Finished jobs are removed from job_ids as they are detected.
# Globals:
#   job_ids  (read/written) PIDs of the launched background jobs
# Arguments:
#   $1 - maximum number of jobs allowed to remain; defaults to 0, i.e. wait
#        for all of them. When 0, a status line is printed each time the
#        remaining count drops (once it is below 10).
# Outputs:   status lines on stdout
# Returns:   0 once the number of tracked jobs is <= $1
#######################################
function wait_for_jobs() {
  local -r limit="${1:-0}"
  # Locals matter here: the caller uses a global `i` as its job counter, and
  # a global loop index in this function would overwrite it.
  local idx
  local report_below=10

  while (( ${#job_ids[@]} > limit )); do
    for idx in "${!job_ids[@]}"; do
      if (( limit == 0 && ${#job_ids[@]} < report_below )); then
        echo ">>> $(date) >>> waiting on ${#job_ids[@]} jobs"
        report_below=${#job_ids[@]}
      fi
      # Probe inside `if` so a dead PID does not trip `set -e`, and so the
      # caller's errexit setting is left exactly as it was.
      if ! kill -0 "${job_ids[$idx]}" 2>/dev/null; then
        echo ">>> jobid ${job_ids[$idx]} finished."
        # Quoted so the subscript is never subject to globbing.
        unset "job_ids[$idx]"
      fi
    done
    # Only pause when we still have to wait; returning immediately once a
    # slot is free avoids a wasted second per launched job.
    if (( ${#job_ids[@]} > limit )); then
      sleep 1
    fi
  done
}
44 | 90 |
|
# Launch the selected slice of the job list, keeping a bounded number of jobs
# running at once, then wait for the remainder and print a summary.
# Reads: offset, num_jobs, job_list, log_dir, dry, NUM_THREADS.
# The counter has a distinctive name so that loop indices used inside helper
# functions cannot overwrite it; a clobbered counter would reuse job numbers
# and make later jobs overwrite earlier jobs' log files.
job_num=0
echo "===== JOBS: ====="
# Process substitution (not a pipe) keeps the loop in the current shell, so
# job_ids and job_num survive past `done`.
while IFS= read -r cmd; do
  job_num=$((job_num + 1))
  echo "JOB $job_num: $cmd"
  if ! $dry; then
    # $cmd is deliberately unquoted: each job-list line is a command line
    # that is word-split into a program and its arguments.
    $cmd > "$log_dir/job.$job_num.out" 2> "$log_dir/job.$job_num.err" &
    job_ids+=("$!")
    # Throttle: do not start the next job until a slot is free.
    wait_for_jobs "$NUM_THREADS"
  fi
done < <(tail -n "+$((offset + 1))" "$job_list" | head -n "$num_jobs")
# Drain whatever is still running.
wait_for_jobs 0

echo "================="
if $dry; then
  echo "THIS WAS A DRY-RUN; no jobs submitted"
else
  echo "DONE!"
fi
0 commit comments