From a3deff06735f57f24a369230c5006456c5136088 Mon Sep 17 00:00:00 2001
From: daxellwells <daxelltyronwells@gmail.com>
Date: Sun, 3 May 2026 15:01:26 +0200
Subject: [PATCH 1/5] fix: prevent ZeroDivisionError in SwinUNETR training when
 validation fold has classes with no samples

---
 .../swinunetr/scripts/train.py                | 28 ++++++++++++-------
 auto3dseg/configs/metadata.json               |  5 ++--
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py
index 88a2a1c0..34bf0cf2 100644
--- a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py
+++ b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py
@@ -640,24 +640,29 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
                     metric = metric.tolist()
                     if torch.cuda.device_count() == 1 or dist.get_rank() == 0:
                         for _c in range(metric_dim):
-                            logger.debug(f"Evaluation metric - class {_c + 1}: {metric[2 * _c] / metric[2 * _c + 1]}")
+                            if metric[2 * _c +1] == 0:
+                                logger.debug(f"Warning: class {_c + 1} has no samples in validation fold, skipping.")
+                            logger.debug(f"Evaluation metric - class {_c + 1}: {metric[2 * _c] / metric[2 * _c + 1] if metric[2 * _c + 1] != 0 else float('nan')}")
                             try:
                                 writer.add_scalar(
-                                    f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1], epoch
+                                    f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), epoch
                                 )
                                 mlflow.log_metric(
-                                    f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1], step=epoch
+                                    f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), step=epoch
                                 )
                             except BaseException:
-                                writer.add_scalar(f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1], epoch)
+                                writer.add_scalar(f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), epoch)
                                 mlflow.log_metric(
-                                    f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1], step=epoch
+                                    f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), step=epoch
                                 )
 
                         avg_metric = 0
+                        count = 0
                         for _c in range(metric_dim):
-                            avg_metric += metric[2 * _c] / metric[2 * _c + 1]
-                        avg_metric = avg_metric / float(metric_dim)
+                            if metric[2 * _c + 1] != 0:
+                                avg_metric += metric[2 * _c] / metric[2 * _c + 1]
+                                count +=1
+                        avg_metric = avg_metric / float(count)
                         logger.debug(f"Avg_metric: {avg_metric}")
 
                         writer.add_scalar("val/acc", avg_metric, epoch)
@@ -801,13 +806,16 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
                 if torch.cuda.device_count() == 1 or dist.get_rank() == 0:
                     for _c in range(metric_dim):
                         logger.debug(
-                            f"Evaluation metric at original resolution - class {_c + 1}: {metric[2 * _c] / metric[2 * _c + 1]}"
+                            f"Evaluation metric at original resolution - class {_c + 1}: {metric[2 * _c] / metric[2 * _c + 1] if metric[2 * _c + 1] != 0 else float('nan')}"
                         )
 
                     avg_metric = 0
+                    count = 0
                     for _c in range(metric_dim):
-                        avg_metric += metric[2 * _c] / metric[2 * _c + 1]
-                    avg_metric = avg_metric / float(metric_dim)
+                        if metric[2 * _c + 1] != 0:
+                            avg_metric += metric[2 * _c] / metric[2 * _c + 1]
+                            count += 1
+                    avg_metric = avg_metric / float(count)
                     logger.debug(f"Avg_metric at original resolution: {avg_metric}")
 
                     with open(os.path.join(ckpt_path, "progress.yaml"), "r") as out_file:
diff --git a/auto3dseg/configs/metadata.json b/auto3dseg/configs/metadata.json
index 68ee514e..907212ef 100644
--- a/auto3dseg/configs/metadata.json
+++ b/auto3dseg/configs/metadata.json
@@ -1,6 +1,7 @@
 {
-    "version": "0.0.8",
+    "version": "0.0.9",
     "changelog": {
+        "0.0.9": "Fix ZeroDivisionError in swinunetr training script for missing classes in validation fold.",
         "0.0.8": "Update swin unetr pretrained weights link",
         "0.0.7": "Add support for MLFlow experiment name.",
         "0.0.6": "Move metadata.json under 'configs' to be consistent with bundles.",
@@ -10,4 +11,4 @@
         "0.0.2": "update hyper-parameter naming in dints algorithm template.",
         "0.0.1": "this version is based on commit 03a6d4effb9223670f439c3a29198ef34938922f."
     }
-}
+}
\ No newline at end of file

From 9fa7e04ec71a0b3853ab5e2165b9f32f870f2317 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 3 May 2026 13:13:47 +0000
Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto3dseg/configs/metadata.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto3dseg/configs/metadata.json b/auto3dseg/configs/metadata.json
index 907212ef..80a645a2 100644
--- a/auto3dseg/configs/metadata.json
+++ b/auto3dseg/configs/metadata.json
@@ -11,4 +11,4 @@
         "0.0.2": "update hyper-parameter naming in dints algorithm template.",
         "0.0.1": "this version is based on commit 03a6d4effb9223670f439c3a29198ef34938922f."
     }
-}
\ No newline at end of file
+}

From b344c3c7010451df68f881b67cddf86c8dc8166b Mon Sep 17 00:00:00 2001
From: daxellwells <daxelltyronwells@gmail.com>
Date: Sun, 3 May 2026 15:23:43 +0200
Subject: [PATCH 3/5] fix: guard avg_metric division against zero count when
 all classes absent from validation fold

---
 auto3dseg/algorithm_templates/swinunetr/scripts/train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py
index 34bf0cf2..dc0d2bc4 100644
--- a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py
+++ b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py
@@ -662,7 +662,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
                             if metric[2 * _c + 1] != 0:
                                 avg_metric += metric[2 * _c] / metric[2 * _c + 1]
                                 count +=1
-                        avg_metric = avg_metric / float(count)
+                        avg_metric = avg_metric / float(count) if count > 0 else float('nan')
                         logger.debug(f"Avg_metric: {avg_metric}")
 
                         writer.add_scalar("val/acc", avg_metric, epoch)
@@ -815,7 +815,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
                         if metric[2 * _c + 1] != 0:
                             avg_metric += metric[2 * _c] / metric[2 * _c + 1]
                             count += 1
-                    avg_metric = avg_metric / float(count)
+                    avg_metric = avg_metric / float(count) if count > 0 else float('nan')
                     logger.debug(f"Avg_metric at original resolution: {avg_metric}")
 
                     with open(os.path.join(ckpt_path, "progress.yaml"), "r") as out_file:

From d4a7af543f09c34998118ad66a16a8d6258e076e Mon Sep 17 00:00:00 2001
From: daxellwells <daxelltyronwells@gmail.com>
Date: Sun, 3 May 2026 15:57:55 +0200
Subject: [PATCH 4/5] fix: skip NaN metrics in MLflow/TensorBoard logging and
 promote missing-class warning level

---
 .../swinunetr/scripts/train.py                | 36 +++++++++----------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py
index dc0d2bc4..b5d5b61d 100644
--- a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py
+++ b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py
@@ -640,21 +640,17 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
                     metric = metric.tolist()
                     if torch.cuda.device_count() == 1 or dist.get_rank() == 0:
                         for _c in range(metric_dim):
+                            class_metric = metric[2 * _c] / metric[2 * _c + 1] if metric[2 * _c + 1] != 0 else float('nan')
                             if metric[2 * _c +1] == 0:
-                                logger.debug(f"Warning: class {_c + 1} has no samples in validation fold, skipping.")
-                            logger.debug(f"Evaluation metric - class {_c + 1}: {metric[2 * _c] / metric[2 * _c + 1] if metric[2 * _c + 1] != 0 else float('nan')}")
-                            try:
-                                writer.add_scalar(
-                                    f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), epoch
-                                )
-                                mlflow.log_metric(
-                                    f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), step=epoch
-                                )
-                            except BaseException:
-                                writer.add_scalar(f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), epoch)
-                                mlflow.log_metric(
-                                    f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), step=epoch
-                                )
+                                logger.warning(f"Class {_c + 1} has no samples in validation fold; logging as NaN.")
+                            logger.debug(f"Evaluation metric - class {_c + 1}: {class_metric}")
+                            if not math.isnan(class_metric):
+                                try:
+                                    writer.add_scalar(f"val_class/acc_{class_names[_c]}", class_metric, epoch)
+                                    mlflow.log_metric(f"val_class/acc_{class_names[_c]}", class_metric, step=epoch)
+                                except BaseException:
+                                    writer.add_scalar(f"val_class/acc_{_c}", class_metric, epoch)
+                                    mlflow.log_metric(f"val_class/acc_{_c}", class_metric, step=epoch)
 
                         avg_metric = 0
                         count = 0
@@ -665,8 +661,9 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
                         avg_metric = avg_metric / float(count) if count > 0 else float('nan')
                         logger.debug(f"Avg_metric: {avg_metric}")
 
-                        writer.add_scalar("val/acc", avg_metric, epoch)
-                        mlflow.log_metric("val/acc", avg_metric, step=epoch)
+                        if not math.isnan(avg_metric):
+                            writer.add_scalar("val/acc", avg_metric, epoch)
+                            mlflow.log_metric("val/acc", avg_metric, step=epoch)
 
                         if avg_metric > best_metric:
                             best_metric = avg_metric
@@ -805,9 +802,10 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
                 metric = metric.tolist()
                 if torch.cuda.device_count() == 1 or dist.get_rank() == 0:
                     for _c in range(metric_dim):
-                        logger.debug(
-                            f"Evaluation metric at original resolution - class {_c + 1}: {metric[2 * _c] / metric[2 * _c + 1] if metric[2 * _c + 1] != 0 else float('nan')}"
-                        )
+                        class_metric = metric[2 * _c] / metric[2 * _c + 1] if metric[2 * _c + 1] != 0 else float('nan')
+                        if metric[2 * _c + 1] == 0:
+                            logger.warning(f"Class {_c + 1} has no samples in validation fold; logging as NaN.")
+                        logger.debug(f"Evaluation metric at original resolution - class {_c + 1}: {class_metric}")
 
                     avg_metric = 0
                     count = 0

From 812f55784b8a055329f1c8e4b67d164f221d82c1 Mon Sep 17 00:00:00 2001
From: daxellwells <daxelltyronwells@gmail.com>
Date: Sun, 3 May 2026 16:16:54 +0200
Subject: [PATCH 5/5] fix: skip early stopping and progress.yaml write when
 avg_metric is NaN

---
 .../swinunetr/scripts/train.py                  | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py
index b5d5b61d..1b57d45d 100644
--- a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py
+++ b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py
@@ -696,7 +696,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
                                 )
                             )
 
-                        if es:
+                        if es and not math.isnan(avg_metric):
                             early_stopping(val_acc=avg_metric)
                             stop_train = torch.tensor(early_stopping.early_stop).to(device)
 
@@ -819,13 +819,14 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
                     with open(os.path.join(ckpt_path, "progress.yaml"), "r") as out_file:
                         progress = yaml.safe_load(out_file)
 
-                    dict_file = {}
-                    dict_file["best_avg_dice_score"] = float(avg_metric)
-                    dict_file["best_avg_dice_score_epoch"] = int(progress[-1]["best_avg_dice_score_epoch"])
-                    dict_file["best_avg_dice_score_iteration"] = int(progress[-1]["best_avg_dice_score_iteration"])
-                    dict_file["inverted_best_validation"] = True
-                    with open(os.path.join(ckpt_path, "progress.yaml"), "a") as out_file:
-                        yaml.dump([dict_file], stream=out_file)
+                    if not math.isnan(avg_metric):
+                        dict_file = {}
+                        dict_file["best_avg_dice_score"] = float(avg_metric)
+                        dict_file["best_avg_dice_score_epoch"] = int(progress[-1]["best_avg_dice_score_epoch"])
+                        dict_file["best_avg_dice_score_iteration"] = int(progress[-1]["best_avg_dice_score_iteration"])
+                        dict_file["inverted_best_validation"] = True
+                        with open(os.path.join(ckpt_path, "progress.yaml"), "a") as out_file:
+                            yaml.dump([dict_file], stream=out_file)
 
                 if torch.cuda.device_count() > 1:
                     dist.barrier()