From a3deff06735f57f24a369230c5006456c5136088 Mon Sep 17 00:00:00 2001 From: daxellwells Date: Sun, 3 May 2026 15:01:26 +0200 Subject: [PATCH 1/5] fix: prevent ZeroDivisionError in SwinUNETR training when validation fold has classes with no samples --- .../swinunetr/scripts/train.py | 28 ++++++++++++------- auto3dseg/configs/metadata.json | 5 ++-- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py index 88a2a1c0..34bf0cf2 100644 --- a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py +++ b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py @@ -640,24 +640,29 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override): metric = metric.tolist() if torch.cuda.device_count() == 1 or dist.get_rank() == 0: for _c in range(metric_dim): - logger.debug(f"Evaluation metric - class {_c + 1}: {metric[2 * _c] / metric[2 * _c + 1]}") + if metric[2 * _c +1] == 0: + logger.debug(f"Warning: class {_c + 1} has no samples in validation fold, skipping.") + logger.debug(f"Evaluation metric - class {_c + 1}: {metric[2 * _c] / metric[2 * _c + 1] if metric[2 * _c + 1] != 0 else float('nan')}") try: writer.add_scalar( - f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1], epoch + f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), epoch ) mlflow.log_metric( - f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1], step=epoch + f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), step=epoch ) except BaseException: - writer.add_scalar(f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1], epoch) + writer.add_scalar(f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), epoch) mlflow.log_metric( - f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1], step=epoch + f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), step=epoch ) avg_metric = 0 + count = 0 for _c in range(metric_dim): - avg_metric += metric[2 * _c] / metric[2 * _c + 1] - avg_metric = avg_metric / float(metric_dim) + if metric[2 * _c + 1] != 0: + avg_metric += metric[2 * _c] / metric[2 * _c + 1] + count +=1 + avg_metric = avg_metric / float(count) logger.debug(f"Avg_metric: {avg_metric}") writer.add_scalar("val/acc", avg_metric, epoch) @@ -801,13 +806,16 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override): if torch.cuda.device_count() == 1 or dist.get_rank() == 0: for _c in range(metric_dim): logger.debug( - f"Evaluation metric at original resolution - class {_c + 1}: {metric[2 * _c] / metric[2 * _c + 1]}" + f"Evaluation metric at original resolution - class {_c + 1}: {metric[2 * _c] / metric[2 * _c + 1] if metric[2 * _c + 1] != 0 else float('nan')}" ) avg_metric = 0 + count = 0 for _c in range(metric_dim): - avg_metric += metric[2 * _c] / metric[2 * _c + 1] - avg_metric = avg_metric / float(metric_dim) + if metric[2 * _c + 1] != 0: + avg_metric += metric[2 * _c] / metric[2 * _c + 1] + count += 1 + avg_metric = avg_metric / float(count) logger.debug(f"Avg_metric at original resolution: {avg_metric}") with open(os.path.join(ckpt_path, "progress.yaml"), "r") as out_file: diff --git a/auto3dseg/configs/metadata.json b/auto3dseg/configs/metadata.json index 68ee514e..907212ef 100644 --- a/auto3dseg/configs/metadata.json +++ b/auto3dseg/configs/metadata.json @@ -1,6 +1,7 @@ { - "version": "0.0.8", + "version": "0.0.9", "changelog": { + "0.0.9": "Fix ZeroDivisionError in swinunetr training script for missing classes in validation fold.", "0.0.8": "Update swin unetr pretrained weights link", "0.0.7": "Add support for MLFlow experiment name.", "0.0.6": "Move metadata.json under 'configs' to be consistent with bundles.", @@ -10,4 +11,4 @@ "0.0.2": "update hyper-parameter naming in dints algorithm template.", "0.0.1": "this version is based on commit 03a6d4effb9223670f439c3a29198ef34938922f." } -} +} \ No newline at end of file From 9fa7e04ec71a0b3853ab5e2165b9f32f870f2317 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 3 May 2026 13:13:47 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto3dseg/configs/metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto3dseg/configs/metadata.json b/auto3dseg/configs/metadata.json index 907212ef..80a645a2 100644 --- a/auto3dseg/configs/metadata.json +++ b/auto3dseg/configs/metadata.json @@ -11,4 +11,4 @@ "0.0.2": "update hyper-parameter naming in dints algorithm template.", "0.0.1": "this version is based on commit 03a6d4effb9223670f439c3a29198ef34938922f." } -} \ No newline at end of file +} From b344c3c7010451df68f881b67cddf86c8dc8166b Mon Sep 17 00:00:00 2001 From: daxellwells Date: Sun, 3 May 2026 15:23:43 +0200 Subject: [PATCH 3/5] fix: guard avg_metric division against zero count when all classes absent from validation fold --- auto3dseg/algorithm_templates/swinunetr/scripts/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py index 34bf0cf2..dc0d2bc4 100644 --- a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py +++ b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py @@ -662,7 +662,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override): if metric[2 * _c + 1] != 0: avg_metric += metric[2 * _c] / metric[2 * _c + 1] count +=1 - avg_metric = avg_metric / float(count) + avg_metric = avg_metric / float(count) if count > 0 else float('nan') logger.debug(f"Avg_metric: {avg_metric}") writer.add_scalar("val/acc", avg_metric, epoch) @@ -815,7 +815,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override): if metric[2 * _c + 1] != 0: avg_metric += metric[2 * _c] / metric[2 * _c + 1] count += 1 - avg_metric = avg_metric / float(count) + avg_metric = avg_metric / float(count) if count > 0 else float('nan') logger.debug(f"Avg_metric at original resolution: {avg_metric}") with open(os.path.join(ckpt_path, "progress.yaml"), "r") as out_file: From d4a7af543f09c34998118ad66a16a8d6258e076e Mon Sep 17 00:00:00 2001 From: daxellwells Date: Sun, 3 May 2026 15:57:55 +0200 Subject: [PATCH 4/5] fix: skip NaN metrics in MLflow/TensorBoard logging and promote missing-class warning level --- .../swinunetr/scripts/train.py | 36 +++++++++---------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py index dc0d2bc4..b5d5b61d 100644 --- a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py +++ b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py @@ -640,21 +640,17 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override): metric = metric.tolist() if torch.cuda.device_count() == 1 or dist.get_rank() == 0: for _c in range(metric_dim): + class_metric = metric[2 * _c] / metric[2 * _c + 1] if metric[2 * _c + 1] != 0 else float('nan') if metric[2 * _c +1] == 0: - logger.debug(f"Warning: class {_c + 1} has no samples in validation fold, skipping.") - logger.debug(f"Evaluation metric - class {_c + 1}: {metric[2 * _c] / metric[2 * _c + 1] if metric[2 * _c + 1] != 0 else float('nan')}") - try: - writer.add_scalar( - f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), epoch - ) - mlflow.log_metric( - f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), step=epoch - ) - except BaseException: - writer.add_scalar(f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), epoch) - mlflow.log_metric( - f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1]if metric[2 * _c + 1] != 0 else float('nan'), step=epoch - ) + logger.warning(f"Class {_c + 1} has no samples in validation fold; logging as NaN.") + logger.debug(f"Evaluation metric - class {_c + 1}: {class_metric}") + if not math.isnan(class_metric): + try: + writer.add_scalar(f"val_class/acc_{class_names[_c]}", class_metric, epoch) + mlflow.log_metric(f"val_class/acc_{class_names[_c]}", class_metric, step=epoch) + except BaseException: + writer.add_scalar(f"val_class/acc_{_c}", class_metric, epoch) + mlflow.log_metric(f"val_class/acc_{_c}", class_metric, step=epoch) avg_metric = 0 count = 0 @@ -665,8 +661,9 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override): avg_metric = avg_metric / float(count) if count > 0 else float('nan') logger.debug(f"Avg_metric: {avg_metric}") - writer.add_scalar("val/acc", avg_metric, epoch) - mlflow.log_metric("val/acc", avg_metric, step=epoch) + if not math.isnan(avg_metric): + writer.add_scalar("val/acc", avg_metric, epoch) + mlflow.log_metric("val/acc", avg_metric, step=epoch) if avg_metric > best_metric: best_metric = avg_metric @@ -805,9 +802,10 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override): metric = metric.tolist() if torch.cuda.device_count() == 1 or dist.get_rank() == 0: for _c in range(metric_dim): - logger.debug( - f"Evaluation metric at original resolution - class {_c + 1}: {metric[2 * _c] / metric[2 * _c + 1] if metric[2 * _c + 1] != 0 else float('nan')}" - ) + class_metric = metric[2 * _c] / metric[2 * _c + 1] if metric[2 * _c + 1] != 0 else float('nan') + if metric[2 * _c + 1] == 0: + logger.warning(f"Class {_c + 1} has no samples in validation fold; logging as NaN.") + logger.debug(f"Evaluation metric at original resolution - class {_c + 1}: {class_metric}") avg_metric = 0 count = 0 From 812f55784b8a055329f1c8e4b67d164f221d82c1 Mon Sep 17 00:00:00 2001 From: daxellwells Date: Sun, 3 May 2026 16:16:54 +0200 Subject: [PATCH 5/5] fix: skip early stopping and progress.yaml write when avg_metric is NaN --- .../swinunetr/scripts/train.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py index b5d5b61d..1b57d45d 100644 --- a/auto3dseg/algorithm_templates/swinunetr/scripts/train.py +++ b/auto3dseg/algorithm_templates/swinunetr/scripts/train.py @@ -696,7 +696,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override): ) ) - if es: + if es and not math.isnan(avg_metric): early_stopping(val_acc=avg_metric) stop_train = torch.tensor(early_stopping.early_stop).to(device) @@ -819,13 +819,14 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override): with open(os.path.join(ckpt_path, "progress.yaml"), "r") as out_file: progress = yaml.safe_load(out_file) - dict_file = {} - dict_file["best_avg_dice_score"] = float(avg_metric) - dict_file["best_avg_dice_score_epoch"] = int(progress[-1]["best_avg_dice_score_epoch"]) - dict_file["best_avg_dice_score_iteration"] = int(progress[-1]["best_avg_dice_score_iteration"]) - dict_file["inverted_best_validation"] = True - with open(os.path.join(ckpt_path, "progress.yaml"), "a") as out_file: - yaml.dump([dict_file], stream=out_file) + if not math.isnan(avg_metric): + dict_file = {} + dict_file["best_avg_dice_score"] = float(avg_metric) + dict_file["best_avg_dice_score_epoch"] = int(progress[-1]["best_avg_dice_score_epoch"]) + dict_file["best_avg_dice_score_iteration"] = int(progress[-1]["best_avg_dice_score_iteration"]) + dict_file["inverted_best_validation"] = True + with open(os.path.join(ckpt_path, "progress.yaml"), "a") as out_file: + yaml.dump([dict_file], stream=out_file) if torch.cuda.device_count() > 1: dist.barrier()