Commit 4580b48

davelopez and Copilot committed

Ensure safe rollback on checksum failure during dataset move
Adds cleanup of partially transferred files in the target store when a checksum verification error or other exception occurs during cross-device dataset transfer. Updates tests to verify that failed transfers do not result in data loss and are safely rerunnable.

Co-authored-by: Copilot <copilot@github.com>
1 parent cdd0ab7 commit 4580b48
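
For context, the pattern this commit applies — copy to the target store, verify a checksum, and remove the partially written target copy on any failure so the source remains intact and the move can be re-run — can be sketched standalone as below. This is a simplified illustration using only the Python standard library, not Galaxy's object-store API; the names move_with_verification and ChecksumMismatch are hypothetical.

import hashlib
import os
import shutil


class ChecksumMismatch(Exception):
    """Raised when the copied file's checksum differs from the source's."""


def _sha256(path: str) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()


def move_with_verification(source: str, target: str) -> None:
    # Copy first; the source is never touched until the target is verified.
    shutil.copy2(source, target)
    try:
        if _sha256(source) != _sha256(target):
            raise ChecksumMismatch(f"checksum of {target} does not match {source}")
    except Exception:
        # Roll back: best-effort removal of the partial/corrupt target copy,
        # leaving the source as the single authoritative copy so the move
        # can simply be retried.
        try:
            os.remove(target)
        except OSError:
            pass
        raise
    os.remove(source)  # drop the source only after the target is verified

A mismatch (or any other exception) leaves only the original file on disk, which is the invariant the new integration test below asserts.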

2 files changed: 167 additions & 2 deletions

lib/galaxy/managers/dataset_storage_operations.py

Lines changed: 42 additions & 1 deletion
@@ -993,6 +993,8 @@ def _record_ineligible(
 
     def _execute_dataset_transfer(self, dataset: Dataset, dataset_id: int, quota_delta: int):
         source_proxy = self._dataset_proxy(dataset, str(dataset.object_store_id))
+        target_proxy: Optional[DatasetObjectStoreProxy] = None
+        extra_files_path_name = dataset.extra_files_path_name
         if not self._source_dataset_exists(source_proxy):
             self._record_transfer_failure(
                 dataset_id,
@@ -1004,10 +1006,11 @@ def _execute_dataset_transfer(self, dataset: Dataset, dataset_id: int, quota_del
         try:
             bytes_processed = 0
             if self.storage_operation_manager.requires_data_transfer(dataset, self.run.target_object_store_id):
+                target_proxy = self._dataset_proxy(dataset, self.run.target_object_store_id)
                 bytes_processed = self._copy_dataset_to_target_store(dataset, self.run.target_object_store_id)
                 self._verify_copied_dataset_integrity(dataset, self.run.target_object_store_id)
                 self._finalize_cross_device_move(dataset, self.run.target_object_store_id)
-                self._cleanup_source_dataset_data(source_proxy, dataset.extra_files_path_name)
+                self._cleanup_source_dataset_data(source_proxy, extra_files_path_name)
             else:
                 self.dataset_manager.update_object_store_id(self.trans, dataset, self.run.target_object_store_id)
 
@@ -1017,13 +1020,17 @@ def _execute_dataset_transfer(self, dataset: Dataset, dataset_id: int, quota_del
             self._notify_dataset_update(dataset)
         except ChecksumVerificationError as exc:
             log.warning("Integrity verification failed for run %s dataset %s: %s", self.run.id, dataset.id, exc)
+            if target_proxy is not None:
+                self._cleanup_target_dataset_data(target_proxy, extra_files_path_name)
             self._record_transfer_failure(
                 dataset_id,
                 DatasetStorageOperationFailureReasonCode.checksum_verification_failed,
                 self._CHECKSUM_FAILURE_MESSAGE,
             )
         except Exception:
             log.exception("Storage operation execution error for run %s dataset %s", self.run.id, dataset.id)
+            if target_proxy is not None:
+                self._cleanup_target_dataset_data(target_proxy, extra_files_path_name)
             self._record_transfer_failure(
                 dataset_id,
                 DatasetStorageOperationFailureReasonCode.execution_error,
@@ -1224,6 +1231,40 @@ def _cleanup_source_dataset_data(
                 exc_info=True,
             )
 
+    def _cleanup_target_dataset_data(
+        self,
+        target_proxy: DatasetObjectStoreProxy,
+        extra_files_path_name: Optional[str],
+    ) -> None:
+        try:
+            self.app.object_store.delete(target_proxy)
+        except Exception:
+            log.warning(
+                "Failed to delete target dataset file while rolling back failed storage move for run %s dataset %s",
+                self.run.id,
+                target_proxy.id,
+                exc_info=True,
+            )
+
+        if not extra_files_path_name:
+            return
+
+        try:
+            if self.app.object_store.exists(target_proxy, dir_only=True, extra_dir=extra_files_path_name):
+                self.app.object_store.delete(
+                    target_proxy,
+                    entire_dir=True,
+                    extra_dir=extra_files_path_name,
+                    dir_only=True,
+                )
+        except Exception:
+            log.warning(
+                "Failed to delete target extra files while rolling back failed storage move for run %s dataset %s",
+                self.run.id,
+                target_proxy.id,
+                exc_info=True,
+            )
+
     def _finalize_cross_device_move(self, dataset: Dataset, target_object_store_id: str):
         old_object_store_id = dataset.object_store_id
         quota_source_map = self.app.object_store.get_quota_source_map()

test/integration/objectstore/test_bulk_storage_operations.py

Lines changed: 125 additions & 1 deletion
@@ -25,8 +25,22 @@
 
 import os
 import string
-from typing import Any
+from typing import (
+    Any,
+    cast,
+)
+from unittest.mock import patch
 
+from galaxy.managers.dataset_storage_operations import (
+    DatasetStorageOperationManager,
+    StorageOperationRunExecutor,
+)
+from galaxy.managers.datasets import DatasetManager
+from galaxy.model import (
+    DatasetStorageOperationSnapshot,
+    User,
+)
+from galaxy.model.scoped_session import galaxy_scoped_session
 from galaxy_test.base.decorators import requires_celery
 from galaxy_test.base.populators import (
     DatasetCollectionPopulator,
@@ -160,6 +174,45 @@ def _store_file_counts(self) -> tuple[int, int]:
         separate_path = os.path.join(self.object_stores_parent, "files_separate")
         return files_count(default_path), files_count(separate_path)
 
+    def _execute_snapshot_sync(
+        self,
+        sa_session: galaxy_scoped_session,
+        snapshot: DatasetStorageOperationSnapshot,
+        *,
+        skip_ineligible: bool,
+        force_checksum_mismatch: bool = False,
+    ) -> str:
+        storage_operation_manager = DatasetStorageOperationManager(self._app.object_store)
+        dataset_manager = DatasetManager(self._app)
+        user = sa_session.get(User, snapshot.user_id)
+
+        run, _ = storage_operation_manager.create_run_and_summary(
+            sa_session=sa_session,
+            snapshot=snapshot,
+            skip_ineligible=skip_ineligible,
+        )
+        executor = storage_operation_manager.create_run_executor(
+            sa_session=sa_session,
+            dataset_manager=dataset_manager,
+            app=self._app,
+            run=run,
+            user=user,
+        )
+
+        if force_checksum_mismatch:
+            # Simulate corruption by forcing source/target checksums to differ during verify step.
+            with patch.object(
+                StorageOperationRunExecutor,
+                "_sha256",
+                autospec=True,
+                side_effect=lambda _self, path: "source-hash" if "files_default" in path else "target-hash",
+            ):
+                executor.execute_run(snapshot)
+        else:
+            executor.execute_run(snapshot)
+
+        return self._app.security.encode_id(run.id)
+
     def _item(self, hda_id: str) -> dict[str, Any]:
         return {"id": hda_id, "history_content_type": "dataset"}

@@ -683,3 +736,74 @@ def test_idempotent_reexecution_mixed_state_no_data_mutation(self):
                 "becomes-ineligible\n",
             )
         self._assert_dataset_store_and_content(history_id, control["id"], DEFAULT_OBJECT_STORE_ID, "control\n")
+
+    def test_cross_device_checksum_mismatch_is_safe_and_rerunnable(self):
+        """Checksum mismatch fails safely (no data loss) and the same snapshot can be re-run successfully."""
+        with self.dataset_populator.test_history() as history_id:
+            hda = self.dataset_populator.new_dataset(history_id, content="checksum-guard", wait=True)
+
+            preview = self._preview_move(
+                history_id,
+                SEPARATE_DEVICE_OBJECT_STORE_ID,
+                [self._item(hda["id"])],
+            )
+            self._assert_eligibility(preview, eligible=1, ineligible=0)
+            baseline_default_count, baseline_separate_count = self._store_file_counts()
+
+            # Force checksum mismatch during execution to simulate corruption or other transfer failure.
+            # This uses a synchronous execution of the snapshot to ensure the mismatch occurs in the first run
+            # and allows re-running the same snapshot without needing to wait for a real async run to complete.
+            sa_session = cast(galaxy_scoped_session, self._app.model.session)
+            snapshot_id = self._app.security.decode_id(preview["snapshot_id"])
+            snapshot = sa_session.get(DatasetStorageOperationSnapshot, snapshot_id)
+            assert snapshot is not None
+            first_run_id = self._execute_snapshot_sync(
+                sa_session,
+                snapshot,
+                skip_ineligible=False,
+                force_checksum_mismatch=True,
+            )
+            first_run_status = self.dataset_populator.storage_run_status(history_id, first_run_id)
+            self._assert_run_counts(first_run_status["run"], succeeded=0, failed=1, skipped=0)
+
+            first_run_items = self._run_items(history_id, first_run_id)
+            assert len(first_run_items) == 1
+            assert first_run_items[0]["dataset_id"] == hda["id"]
+            assert first_run_items[0]["state"] == "failed"
+            assert first_run_items[0]["reason_code"] == "checksum_verification_failed"
+
+            # Failed transfer should rollback target writes (no leftover files).
+            failed_default_count, failed_separate_count = self._store_file_counts()
+            assert failed_default_count == baseline_default_count
+            assert failed_separate_count == baseline_separate_count
+
+            # Failure path must keep data readable from source store (no data loss).
+            self._assert_dataset_store_and_content(
+                history_id,
+                hda["id"],
+                DEFAULT_OBJECT_STORE_ID,
+                "checksum-guard\n",
+            )
+
+            # Re-run same snapshot without forced corruption; move should now succeed.
+            second_run_id = self._execute_snapshot_sync(sa_session, snapshot, skip_ineligible=False)
+            second_run_status = self.dataset_populator.storage_run_status(history_id, second_run_id)
+            self._assert_run_counts(second_run_status["run"], succeeded=1, failed=0, skipped=0)
+
+            second_run_items = self._run_items(history_id, second_run_id)
+            assert len(second_run_items) == 1
+            assert second_run_items[0]["dataset_id"] == hda["id"]
+            assert second_run_items[0]["state"] == "succeeded"
+            assert second_run_items[0]["reason_code"] is None
+            assert second_run_items[0]["bytes_processed"] > 0
+
+            succeeded_default_count, succeeded_separate_count = self._store_file_counts()
+            assert succeeded_default_count == baseline_default_count - 1
+            assert succeeded_separate_count == baseline_separate_count + 1
+
+            self._assert_dataset_store_and_content(
+                history_id,
+                hda["id"],
+                SEPARATE_DEVICE_OBJECT_STORE_ID,
+                "checksum-guard\n",
+            )
