Skip to content

Commit 3cc90d2

Browse files
committed
fix: better loop
1 parent 150d001 commit 3cc90d2

1 file changed

Lines changed: 60 additions & 14 deletions

File tree

util/run-here.sh

Lines changed: 60 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
set -euo pipefail
33
offset=0
44
dry=false
5+
NUM_THREADS=4 # number of parallel jobs
56
usage() {
67
echo """
78
Run jobs from a slurm submission script's job list on
@@ -35,29 +36,74 @@ if [ ! -f "$job_list" ]; then
3536
exit 1
3637
fi
3738

38-
if [ $num_jobs -gt 16 ]; then
39-
echo "ERROR: too many jobs!"
39+
mkdir -p $log_dir
40+
41+
job_ids=()
42+
43+
#######################################
# Signal handler: terminate every tracked background job, then exit.
#
# Sends SIGTERM to each job that is still alive, waits a short grace period,
# then sends SIGKILL to any survivor.
#
# Globals:
#   job_ids             (read) PIDs of the background jobs started by this script.
#   KILL_GRACE_SECONDS  (read, optional) seconds to wait between SIGTERM and
#                       SIGKILL; defaults to 1.
# Outputs:
#   Progress messages and follow-up hints on stdout.
# Exits:
#   Always terminates the shell with status 1.
#######################################
function cleanup_jobs() {
  local pid
  local grace="${KILL_GRACE_SECONDS:-1}"

  echo ""
  echo ">>> Caught signal, killing all jobs..."

  # The ${arr[@]+"..."} form expands to nothing for an empty array instead of
  # tripping `set -u` on bash versions older than 4.4.
  for pid in ${job_ids[@]+"${job_ids[@]}"}; do
    if kill -0 "$pid" 2>/dev/null; then
      # Prefer the whole process group (negative PID). A job started with `&`
      # is only a group leader when job control is enabled; otherwise the
      # group signal fails, so fall back to signalling the PID itself.
      kill -- -"$pid" 2>/dev/null || kill -- "$pid" 2>/dev/null || true # SIGTERM
    fi
  done

  sleep "$grace"

  # SIGKILL, if still alive
  for pid in ${job_ids[@]+"${job_ids[@]}"}; do
    if kill -0 "$pid" 2>/dev/null; then
      kill -9 -- -"$pid" 2>/dev/null || kill -9 -- "$pid" 2>/dev/null || true
    fi
  done

  echo ">>> All jobs killed.
To check if any remain:
ps -ef | grep $(whoami)
Kill zombies with, e.g.,:
pkill -u $(whoami) java
"
  exit 1
}
67+
trap cleanup_jobs SIGINT SIGTERM
4268

43-
mkdir -p $log_dir
69+
#######################################
# Block until at most a given number of tracked background jobs remain.
#
# Polls every PID in job_ids, removes the ones that no longer exist, and
# sleeps one second between polling rounds while too many are still running.
#
# Globals:
#   job_ids (read/written) PIDs of running jobs; finished ones are removed.
# Arguments:
#   $1 - maximum number of jobs allowed to remain (default 0 = wait for all).
# Outputs:
#   One line per finished job; when $1 is 0, also a timestamped
#   "waiting on N jobs" line each time the remaining count drops.
#######################################
function wait_for_jobs() {
  local limit="${1:-0}"
  local idx pid
  # Start one above the current count so the first status line is always
  # printed, however many jobs are pending.
  local last_reported=$(( ${#job_ids[@]} + 1 ))

  while (( ${#job_ids[@]} > limit )); do
    for idx in "${!job_ids[@]}"; do
      if (( limit == 0 && ${#job_ids[@]} < last_reported )); then
        echo ">>> $(date) >>> waiting on ${#job_ids[@]} jobs"
        last_reported=${#job_ids[@]}
      fi
      pid=${job_ids[$idx]}
      # Testing inside `if` keeps a failing probe from tripping errexit, so the
      # caller's `set -e` state never has to be toggled.
      if ! kill -0 "$pid" 2>/dev/null; then
        echo ">>> jobid $pid finished."
        unset "job_ids[$idx]"
      fi
    done
    # Only pause when another polling round is actually needed.
    if (( ${#job_ids[@]} > limit )); then
      sleep 1
    fi
  done
}
4490

4591
# Run the selected commands from the job list as background jobs, writing each
# job's stdout/stderr to per-job files in $log_dir, then wait for all of them.
i=0
# Highest number of jobs that may still be running when a new one is started,
# so that no more than NUM_THREADS run at the same time (never below 0).
max_running=$(( NUM_THREADS > 1 ? NUM_THREADS - 1 : 0 ))
echo "===== JOBS: ====="
# `|| [[ -n "$cmd" ]]` keeps a final line that lacks a trailing newline.
while IFS= read -r cmd || [[ -n "$cmd" ]]; do
  i=$((i+1))
  echo "JOB $i: $cmd"
  if ! $dry; then
    # Throttle BEFORE launching; waiting only afterwards lets NUM_THREADS + 1
    # jobs run concurrently.
    wait_for_jobs "$max_running"
    # $cmd is deliberately unquoted: the line is word-split into a command and
    # its arguments (shell quoting, pipes and redirections inside the line are
    # not interpreted).
    # shellcheck disable=SC2086
    $cmd > "$log_dir/job.$i.out" 2> "$log_dir/job.$i.err" &
    job_ids+=("$!")
  fi
done < <(tail -n +"$((offset + 1))" -- "$job_list" | head -n "$num_jobs")
wait_for_jobs 0

echo "================="
if $dry; then
  echo "THIS WAS A DRY-RUN; no jobs submitted"
else
  echo "DONE!"
fi

0 commit comments

Comments
 (0)