Commit cb6a51e

Generate repro test errors when model does not run successfully (#173)
* Raise RuntimeErrors for failed model runs, and add more detailed logs to exceptions so they are displayed in the test output on GitHub
* Repro tests: move checking of experiment runs into the requested_experiments fixture. This means any errors from a non-zero exit status in model runs are reported as test setup errors rather than reproducibility test failures
* Add a test to check errors are caught and re-raised in the Experiments class
* Add a test_test_repro_determinism test
* Add coverage subprocess patching (see https://pytest-cov.readthedocs.io/en/latest/subprocess-support.html)
1 parent 5b34e0c commit cb6a51e

5 files changed

Lines changed: 180 additions & 51 deletions

pyproject.toml

Lines changed: 3 additions & 1 deletion
@@ -112,6 +112,8 @@ tag_prefix = "v"
 parentdir_prefix = "model_config_tests-"
 
 [tool.coverage.run]
+patch = ["subprocess"]
 omit = [
-    "src/model_config_tests/_version.py"
+    "*/model_config_tests/_version.py",
+    "src/model_config_tests/_version.py",
 ]
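
The new patch = ["subprocess"] entry follows the pytest-cov subprocess-support guidance linked in the commit message: it asks coverage.py to also measure Python child processes. That matters here because the self-tests further down launch the model-config-tests CLI via subprocess.run. A rough sketch of that kind of invocation (the command string and output path are illustrative assumptions, not taken from this diff):

# Illustrative sketch only: the sort of child process the coverage subprocess
# patch is meant to capture. Command and paths are assumptions.
import shlex
import subprocess

test_cmd = "model-config-tests -s -m repro_determinism --output-path /tmp/test-output"
result = subprocess.run(shlex.split(test_cmd), capture_output=True, text=True)

# Without subprocess patching, lines executed inside this child process would
# not appear in the parent's coverage report.
print(result.returncode)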

src/model_config_tests/config_tests/test_bit_reproducibility.py

Lines changed: 33 additions & 26 deletions
@@ -9,7 +9,7 @@
 
 import pytest
 
-from model_config_tests.exp_test_helper import Experiments
+from model_config_tests.exp_test_helper import Experiments, ExpTestHelper
 from model_config_tests.util import DAY_IN_SECONDS, HOUR_IN_SECONDS
 
 # Names of shared experiments
@@ -147,6 +147,20 @@ def experiments(
     return _experiments(experiments_markers, output_path, control_path, keep_archive)
 
 
+@pytest.fixture
+def requested_experiments(request, experiments: Experiments):
+    """Fixture to check that requested experiments have run successfully
+    and return a dictionary of ExpTestHelper instances for each experiment."""
+    exp_marker = request.node.get_closest_marker("experiments").args[0]
+    requested_exps = {}
+    for exp_name in exp_marker:
+        # Check experiment has run successfully - this will raise an
+        # error if there are any non-zero exit codes in the outputs
+        experiments.check_experiment(exp_name)
+        requested_exps[exp_name] = experiments.get_experiment(exp_name)
+    return requested_exps
+
+
 class TestBitReproducibility:
 
     @pytest.mark.repro
@@ -160,7 +174,7 @@ def test_repro_historical(
         self,
         output_path: Path,
         control_path: Path,
-        experiments: Experiments,
+        requested_experiments: dict[str, ExpTestHelper],
         checksum_path: Optional[Path],
     ):
         """
@@ -178,9 +192,9 @@
             Path to the model configuration to test. This is copied for
             for control directories in experiments. Default is set in
             conftests.py.
-        experiments: Experiments
-            Class that manages the shared experiments. This is a fixture
-            defined in this file.
+        requested_experiments: dict[str, ExpTestHelper]
+            A dictionary of requested experiments, where the key is the
+            experiment name and the value is an instance of ExpTestHelper.
         checksum_path: Optional[Path]
             Path to checksums to compare model output against. Default is
             set to checksums saved on model configuration. This is a
@@ -190,12 +204,7 @@
         checksum_output_dir = set_checksum_output_dir(output_path=output_path)
 
         # Use default runtime experiment to get the historical checksums
-        experiments.check_experiments([EXP_DEFAULT_RUNTIME])
-        exp = experiments.get_experiment(EXP_DEFAULT_RUNTIME)
-
-        assert (
-            exp.model.output_exists()
-        ), "Output file required for model checksums does not exist"
+        exp = requested_experiments.get(EXP_DEFAULT_RUNTIME)
 
         # Set the checksum output filename using the model default runtime
         runtime_hours = exp.model.default_runtime_seconds // HOUR_IN_SECONDS
@@ -235,20 +244,16 @@
             EXP_1D_RUNTIME_REPEAT: {"n_runs": 1, "model_runtime": DAY_IN_SECONDS},
         }
     )
-    def test_repro_determinism(self, experiments: Experiments):
+    def test_repro_determinism(self, requested_experiments: dict[str, ExpTestHelper]):
         """
         Determinism test that confirms repeated model runs for 1 day
         give the same results
         """
-        experiments.check_experiments([EXP_1D_RUNTIME, EXP_1D_RUNTIME_REPEAT])
-        exp_1d_runtime = experiments.get_experiment(EXP_1D_RUNTIME)
-        exp_1d_runtime_repeat = experiments.get_experiment(EXP_1D_RUNTIME_REPEAT)
+        exp_1d_runtime = requested_experiments.get(EXP_1D_RUNTIME)
+        exp_1d_runtime_repeat = requested_experiments.get(EXP_1D_RUNTIME_REPEAT)
 
         # Compare expected to produced.
-        assert exp_1d_runtime.model.output_exists()
         expected = exp_1d_runtime.extract_checksums()
-
-        assert exp_1d_runtime_repeat.model.output_exists()
         produced = exp_1d_runtime_repeat.extract_checksums()
 
         assert produced == expected
@@ -262,16 +267,17 @@ def test_repro_determinism(self, experiments: Experiments):
             EXP_2D_RUNTIME: {"n_runs": 1, "model_runtime": 2 * DAY_IN_SECONDS},
         }
     )
-    def test_repro_restart(self, output_path: Path, experiments: Experiments):
+    def test_repro_restart(
+        self, output_path: Path, requested_experiments: dict[str, ExpTestHelper]
+    ):
         """
         Restart reproducibility test that confirms two short consecutive
         1-day model runs give the same results as a longer single 2-day model
         run.
         """
         # Get experiments with 2x1 day and 2 day runtimes
-        experiments.check_experiments([EXP_1D_RUNTIME, EXP_2D_RUNTIME])
-        exp_1d_runtime = experiments.get_experiment(EXP_1D_RUNTIME)
-        exp_2d_runtime = experiments.get_experiment(EXP_2D_RUNTIME)
+        exp_1d_runtime = requested_experiments.get(EXP_1D_RUNTIME)
+        exp_2d_runtime = requested_experiments.get(EXP_2D_RUNTIME)
 
         # Now compare the output between our two short and one long run.
         checksums_1d_0 = exp_1d_runtime.extract_checksums()
@@ -305,14 +311,15 @@ def test_repro_restart(self, output_path: Path, experiments: Experiments):
             EXP_1D_RUNTIME_REPEAT: {"n_runs": 2, "model_runtime": DAY_IN_SECONDS},
         }
     )
-    def test_repro_determinism_restart(self, experiments: Experiments):
+    def test_repro_determinism_restart(
+        self, requested_experiments: dict[str, ExpTestHelper]
+    ):
         """
         Determinism test that confirms repeated experiments with two
         consecutive 1-day model runs give the same results
         """
-        experiments.check_experiments([EXP_1D_RUNTIME, EXP_1D_RUNTIME_REPEAT])
-        exp_1d_runtime = experiments.get_experiment(EXP_1D_RUNTIME)
-        exp_1d_runtime_repeat = experiments.get_experiment(EXP_1D_RUNTIME_REPEAT)
+        exp_1d_runtime = requested_experiments.get(EXP_1D_RUNTIME)
+        exp_1d_runtime_repeat = requested_experiments.get(EXP_1D_RUNTIME_REPEAT)
 
         # Extract checksums, using the output from the second model run
         expected = exp_1d_runtime.extract_checksums(exp_1d_runtime.model.output_1)

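Moving the check_experiment() calls into the requested_experiments fixture is what turns a failed model run into a test setup error instead of a reproducibility failure. A minimal standalone sketch of that pytest behaviour, using hypothetical fixture and test names (not code from this repository):

# Illustrative sketch, not code from this repo: an exception raised in a
# fixture is reported by pytest as a setup ERROR, while an assert in the
# test body is reported as a FAILURE.
import pytest


@pytest.fixture
def prepared_experiment():
    # Stands in for check_experiment() raising on a non-zero model exit status
    raise RuntimeError("model run failed with non-zero exit status")


def test_repro(prepared_experiment):
    # Never reached: pytest marks this test as errored during setup, not failed
    assert True
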
src/model_config_tests/exp_test_helper.py

Lines changed: 42 additions & 16 deletions
@@ -136,9 +136,30 @@ def submit_payu_run(self, n_runs: int = None) -> str:
         # Change to experiment directory and run.
         os.chdir(self.control_path)
 
-        print("Running payu setup and payu sweep commands")
-        sp.run(["payu", "setup", "--lab", str(self.lab_path)], check=True)
-        sp.run(["payu", "sweep", "--lab", str(self.lab_path)], check=True)
+        print("Running payu setup")
+        result = sp.run(
+            ["payu", "setup", "--lab", str(self.lab_path)],
+            capture_output=True,
+            text=True,
+        )
+        if result.returncode != 0:
+            # Add additional error messaging for debugging
+            error_msg = (
+                "Failed to run payu setup:\n"
+                f"Return code: {result.returncode}\n"
+                f"--- stdout ---\n{result.stdout}\n"
+                f"--- stderr ---\n{result.stderr}"
+            )
+            print(error_msg)
+            raise RuntimeError(error_msg)
+
+        print("Running payu sweep")
+        sp.run(
+            ["payu", "sweep", "--lab", str(self.lab_path)],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
 
         run_command = ["payu", "run", "--lab", str(self.lab_path)]
         if n_runs:
@@ -208,7 +229,7 @@ def __init__(
         self.output_path = output_path
         self.keep_archive = keep_archive
         self.experiments = {}
-        self.successful_experiments = []
+        self.experiment_errors = {}
 
     def setup_and_submit(
         self,
@@ -282,22 +303,27 @@ def wait_for_all_experiments(self, catch_errors=True) -> None:
             try:
                 exp.wait_for_payu_run()
                 print(f"Experiment {exp_name} completed successfully")
-                self.successful_experiments.append(exp_name)
             except RuntimeError as e:
+                self.experiment_errors[exp_name] = str(e)
                 if catch_errors:
-                    print(f"Error in experiment {exp_name}: {e}")
+                    print(f"Error running experiment {exp_name}: {e}")
                 else:
-                    raise e
+                    raise
 
-    def check_experiments(self, exp_names=list[str]) -> None:
+    def check_experiment(self, exp_name: str) -> None:
         """
-        Check whether given experiments names have run successfully
+        Check whether given experiment name has run successfully
         """
-        for exp_name in exp_names:
-            # TODO: Is there other useful information to display here?
-            assert (
-                exp_name in self.successful_experiments
-            ), f"There was an error running experiment: {exp_name}"
+        if exp_name in self.experiment_errors:
+            raise RuntimeError(
+                f"There was an error running experiment {exp_name}:"
+                f" {self.experiment_errors[exp_name]}"
+            )
+
+        # Double check if the required experiment output exists
+        exp = self.experiments.get(exp_name)
+        if not exp.model.output_exists():
+            raise RuntimeError(f"Experiment {exp_name} output file does not exist.")
 
 
     def setup_exp(
@@ -519,13 +545,13 @@ def wait_for_qsub_job(
     # Check whether the run job was successful
     exit_status = parse_exit_status_from_file(stdout)
    if exit_status != 0:
-        print(
+        raise RuntimeError(
+            f"Payu {job_type} job failed with exit status {exit_status}:\n"
             f"Job_ID: {job_id}\n"
             f"Output files: {output_files}\n"
             f"--- stdout ---\n{stdout}\n"
             f"--- stderr ---\n{stderr}\n"
         )
-        raise RuntimeError(f"Payu {job_type} job failed with exit status {exit_status}")
 
     return stdout, stderr, output_files
 
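The Experiments class above now records run failures in experiment_errors and only re-raises them when check_experiment is called for that experiment. A stripped-down sketch of the same record-now, raise-on-check pattern, with simplified names (not the actual class from this diff):

# Simplified sketch of deferring errors until a test requests the experiment.
class ExperimentErrors:
    def __init__(self) -> None:
        self.errors: dict[str, str] = {}

    def record(self, exp_name: str, exc: Exception) -> None:
        # Store the message so the remaining experiments can keep running
        self.errors[exp_name] = str(exc)

    def check(self, exp_name: str) -> None:
        # Raised lazily, only when a test actually requests this experiment,
        # so the error surfaces as a test setup error
        if exp_name in self.errors:
            raise RuntimeError(
                f"There was an error running experiment {exp_name}: "
                f"{self.errors[exp_name]}"
            )
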
tests/config_tests/test_test_bit_reproducibility.py

Lines changed: 30 additions & 2 deletions
@@ -213,8 +213,8 @@ def base_test_command(self):
         # Minimal test command
         test_cmd = (
             "model-config-tests -s "
-            # Use -k to select one test
-            f"-k {self.test_name} "
+            # Use -m to specify the test (each repro test has a unique marker)
+            f"-m {self.test_name.removeprefix('test_')} "
             f"--output-path {self.output_path} "
             # Keep archive flag will keep any pre-existing archive for the test
             # and disable the actual 'payu run' steps
@@ -500,3 +500,31 @@ def check_checksum(output_path, checksum_path, model_name, match=True):
         assert test_checksum.read_text() == checksum_path.read_text()
     else:
         assert test_checksum.read_text() != checksum_path.read_text()
+
+
+@pytest.mark.parametrize("fail", [False, True])
+def test_test_repro_determinism(tmp_dir, fail):
+    """Test repro determinism for some example output"""
+    test_name = "test_repro_determinism"
+
+    exp1_name = "exp_1d_runtime"
+    exp2_name = "exp_1d_runtime_repeat"
+
+    # Setup some example files for both experiments
+    exp1_helper = CommonTestHelper(test_name, exp1_name, "access", tmp_dir)
+    exp1_helper.copy_config("release-preindustrial+concentrations")
+    exp1_helper.create_mock_output("output000", modify=False)
+
+    exp2_helper = CommonTestHelper(test_name, exp2_name, "access", tmp_dir)
+    exp2_helper.create_mock_output("output000", modify=fail)
+
+    # Build test command
+    test_cmd = (
+        f"{exp1_helper.base_test_command()} "
+        f"--control-path {exp1_helper.control_path} "
+    )
+
+    # Run test in a subprocess call
+    result = subprocess.run(shlex.split(test_cmd), capture_output=True, text=True)
+    # Check test result
+    assert result.returncode == int(fail)
