Skip to content

Commit 3b233c1

Browse files
DamianSzwichtenberg, dsawczuk-int, and Copilot
authored
Make SFT hardware-agnostic (#749)
Co-authored-by: Sawczuk, Daniel <daniel.sawczuk@intel.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 6c61642 commit 3b233c1

File tree

7 files changed

+187
-116
lines changed

7 files changed

+187
-116
lines changed

apps/sft/main.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -444,7 +444,8 @@ async def train(self) -> None:
444444
# Move tensors to the appropriate device
445445
for k, v in batch.items():
446446
if isinstance(v, torch.Tensor):
447-
batch[k] = v.to("cuda") # TODO: hardcoded for now
447+
# self.device is set up in ForgeEngine
448+
batch[k] = v.to(self.device)
448449

449450
self.train_step(batch)
450451
# self.profiler.step()

src/forge/controller/provisioner.py

Lines changed: 84 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,79 @@
3131
logger.setLevel(logging.DEBUG)
3232

3333

34+
class DeviceProxy:
35+
"""A hardware-agnostic proxy using torch.accelerator.
36+
37+
Handles device counting and environment variable mapping for isolation.
38+
"""
39+
40+
# Mapping of PyTorch backend names to driver isolation variables
41+
_VISIBLE_DEVICES_ENV_MAP: dict[str, str] = {
42+
"cuda": "CUDA_VISIBLE_DEVICES",
43+
"xpu": "ZE_AFFINITY_MASK", # Intel Level Zero
44+
}
45+
46+
@staticmethod
47+
def is_available() -> bool:
48+
"""Check if any accelerator is available."""
49+
return torch.accelerator.is_available()
50+
51+
@staticmethod
52+
def get_device_count() -> int:
53+
"""Returns the number of available accelerator devices."""
54+
if not DeviceProxy.is_available():
55+
return 0
56+
return torch.accelerator.device_count()
57+
58+
@classmethod
59+
def get_visible_devices_env_var(cls) -> str | None:
60+
"""Returns the environment variable name used to mask devices.
61+
62+
Returns None if no accelerator is available or the backend is not supported.
63+
"""
64+
if not cls.is_available():
65+
return None
66+
accelerator = torch.accelerator.current_accelerator()
67+
if accelerator is None:
68+
return None
69+
return cls._VISIBLE_DEVICES_ENV_MAP.get(accelerator.type)
70+
71+
@classmethod
72+
def get_isolation_env_vars(cls, device_ids: list[str]) -> dict[str, str]:
73+
"""Returns environment variables needed to isolate specific device IDs.
74+
75+
Returns an empty dict if no isolation env var is available for this backend.
76+
"""
77+
env_var_name = cls.get_visible_devices_env_var()
78+
if env_var_name is None:
79+
return {}
80+
return {env_var_name: ",".join(device_ids)}
81+
82+
@classmethod
83+
def get_visible_devices_from_env(cls) -> set[int] | None:
84+
"""Parses visible devices from the appropriate environment variable.
85+
86+
Returns None if the variable is not set.
87+
Raises ValueError if the format is invalid.
88+
"""
89+
env_var = cls.get_visible_devices_env_var()
90+
if env_var is None:
91+
return None
92+
93+
env_value = os.environ.get(env_var, None)
94+
if env_value is None or not env_value.strip():
95+
return None
96+
97+
try:
98+
# For Intel Level Zero we support ZE_FLAT_DEVICE_HIERARCHY=flat
99+
return set(int(x.strip()) for x in env_value.split(",") if x.strip())
100+
except ValueError as e:
101+
raise ValueError(
102+
f"Invalid {env_var} format: '{env_value}'. "
103+
f"Expected comma-separated integers (e.g., '0,1,2'). Error: {e}"
104+
) from e
105+
106+
34107
def _get_port() -> str:
35108
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
36109
s.bind(("localhost", 0))
@@ -49,13 +122,8 @@ def get_info(self) -> tuple[str, str]:
49122

50123
@endpoint
def get_gpu_count(self) -> int:
    """Returns the number of accelerator devices available on this host."""
    # Delegate to the hardware-agnostic proxy rather than querying a
    # specific backend directly.
    device_count = DeviceProxy.get_device_count()
    return device_count
59127

60128

61129
class EnvSetter(Actor):
@@ -209,33 +277,15 @@ def __init__(self, cfg: ProvisionerConfig | None = None):
209277
# remove this once this is supported in Monarch.
210278
self._this_host_id = uuid.uuid1()
211279

212-
# For the local host, we may want to set CUDA_VISIBLE_DEVICES
280+
# For the local host, we may want to set device visibility
213281
# for small scale testing. We inherit the environment's
214-
# CUDA_VISIBLE_DEVICES **only for the local host** and not
215-
# for remote hosts.
216-
available_local_devices = None
217-
cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
218-
if cuda_visible_devices is not None and cuda_visible_devices.strip():
219-
try:
220-
available_local_devices = set(
221-
int(x.strip()) for x in cuda_visible_devices.split(",") if x.strip()
222-
)
223-
except ValueError as e:
224-
raise ValueError(
225-
f"Invalid CUDA_VISIBLE_DEVICES format: '{cuda_visible_devices}'. "
226-
f"Expected comma-separated integers (e.g., '0,1,2'). Error: {e}"
227-
) from e
228-
229-
# Get the actual GPU count for the local host
230-
try:
231-
local_gpu_count = torch.cuda.device_count()
232-
except Exception:
233-
# If torch is not available or CUDA is not available, assume no GPUs
234-
local_gpu_count = 0
282+
# device visibility setting **only for the local host**.
283+
available_local_devices = DeviceProxy.get_visible_devices_from_env()
284+
local_device_count = DeviceProxy.get_device_count()
235285

236286
self._host_gpu_map = {
237287
self._this_host_id: GpuManager(
238-
available_local_devices, max_device_count=local_gpu_count
288+
available_local_devices, max_device_count=local_device_count
239289
),
240290
}
241291
self._proc_host_map = {}
@@ -298,7 +348,7 @@ async def get_proc_mesh(
298348
mesh_name: Name of the pre-allocated mesh to use.
299349
Must match a mesh name defined in the launcher config.
300350
with_gpus: Whether to include GPU allocations.
301-
This only adds the CUDA_VISIBLE_DEVICES environment variable.
351+
This only adds the hardware isolation environment variable.
302352
num_hosts: The number of hosts to allocate.
303353
If this is set, a remote allocation is created.
304354
If this is None, it uses the local host.
@@ -356,7 +406,9 @@ async def get_proc_mesh(
356406
# Set the PTD world size
357407
world_size = num_procs * (num_hosts or 1)
358408
env_vars["WORLD_SIZE"] = str(world_size)
359-
env_vars["CUDA_VISIBLE_DEVICES"] = ",".join(gpu_ids)
409+
410+
# Set device isolation using the appropriate environment variable
411+
env_vars.update(DeviceProxy.get_isolation_env_vars(gpu_ids))
360412

361413
# Inherit Forge-relevant environment variables from the system
362414
for env_var in all_env_vars():

src/forge/observability/perf_tracker.py

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,18 @@ class Tracer:
5454
Tracer with multi-step timing and optional memory tracking at start/stop boundaries.
5555
Steps only affect timing; memory is tracked from start() to stop().
5656
57-
Supports non-blocking CUDA timing via CUDA events and background polling threads.
57+
Supports non-blocking accelerator timing via torch events and background polling threads.
5858
Aggregation is handled externally by the metrics system via record_metric.
5959
6060
User must call start() and stop() explicitly.
6161
Supports reuse: after calling stop(), you may call start() again to begin a new timing session.
6262
6363
Local env flag DISABLE_PERF_METRICS can be used to skip all timing operations.
64-
Local env flag METRIC_TIMER_USES_GPU can be used to set CUDA timing.
64+
Local env flag METRIC_TIMER_USES_GPU can be used to set accelerator timing.
6565
6666
Args:
6767
prefix (str): Prefix for metric names, e.g. "my_prefix" -> "{my_prefix}/{step_name}/duration_avg_s".
68-
track_memory (bool): Whether to track CUDA memory usage. Defaults to False.
68+
track_memory (bool): Whether to track accelerator memory usage. Defaults to False.
6969
timer (str): Timing backend; "cpu" (default) or "gpu".
7070
7171
Example:
@@ -138,8 +138,8 @@ def start(self) -> None:
138138
else:
139139
# Env var not set - use the timer parameter
140140
use_gpu = self.time_with_gpu
141-
time_with_gpu_events = use_gpu and torch.cuda.is_available()
142-
self._timer = _TimerCUDA() if time_with_gpu_events else _TimerCPU()
141+
time_with_gpu_events = use_gpu and torch.accelerator.is_available()
142+
self._timer = _TimerGPU() if time_with_gpu_events else _TimerCPU()
143143
self._timer.start()
144144

145145
self._active = True
@@ -176,7 +176,7 @@ def stop(self) -> None:
176176
def _start_memory_tracking(self) -> None:
177177
is_outer_scope = not _is_memory_active()
178178
should_track = (
179-
self.track_memory and is_outer_scope and torch.cuda.is_available()
179+
self.track_memory and is_outer_scope and torch.accelerator.is_available()
180180
)
181181

182182
if self.track_memory and not is_outer_scope:
@@ -185,23 +185,23 @@ def _start_memory_tracking(self) -> None:
185185

186186
if should_track:
187187
_set_memory_active(True)
188-
torch.cuda.reset_peak_memory_stats()
189-
self._start_mem = torch.cuda.memory_allocated()
188+
torch.accelerator.reset_peak_memory_stats()
189+
self._start_mem = torch.accelerator.memory_allocated()
190190
self._memory_started = True
191191

192192
def _stop_memory_tracking(self) -> None:
193193
if not self._memory_started:
194194
return
195195

196-
end_mem = torch.cuda.memory_allocated()
196+
end_mem = torch.accelerator.memory_allocated()
197197
delta = (end_mem - self._start_mem) / 1024**3
198-
peak_mem = torch.cuda.max_memory_allocated() / 1024**3
198+
peak_mem = torch.accelerator.max_memory_allocated() / 1024**3
199199
record_metric(
200200
f"{self.prefix}/memory_delta_end_start_avg_gb", delta, Reduce.MEAN
201201
)
202202
record_metric(f"{self.prefix}/memory_peak_max_gb", peak_mem, Reduce.MAX)
203203
_set_memory_active(False)
204-
torch.cuda.reset_peak_memory_stats()
204+
torch.accelerator.reset_peak_memory_stats()
205205
self._memory_started = False
206206

207207
def _record_timing_metrics(
@@ -258,12 +258,12 @@ def get_all_durations(self) -> tuple[list[tuple[str, float]], float]:
258258
return self._durations[:], stop_step_ms
259259

260260

261-
class _TimerCUDA(_TimerProtocol):
262-
"""CUDA timing backend with non-blocking events and futures.
263-
Uses a thread pool to poll CUDA events asynchronously without blocking the main thread.
261+
class _TimerGPU(_TimerProtocol):
262+
"""Accelerator timing backend with non-blocking events and futures.
263+
Uses a thread pool to poll torch events asynchronously without blocking the main thread.
264264
265265
Example:
266-
timer = _TimerCUDA()
266+
timer = _TimerGPU()
267267
timer.start()
268268
# torch.mm(a, b) # ~100ms GPU
269269
timer.step("matmul")
@@ -272,36 +272,36 @@ class _TimerCUDA(_TimerProtocol):
272272
"""
273273

274274
def __init__(self, max_workers: int = 2) -> None:
275-
if not torch.cuda.is_available():
276-
raise RuntimeError("CUDA is not available for timing")
275+
if not torch.accelerator.is_available():
276+
raise RuntimeError("Accelerator is not available for timing")
277277
self._executor = ThreadPoolExecutor(max_workers=max_workers)
278278
self._futures: list[tuple[str, Future[float], int]] = (
279279
[]
280280
) # (name, future, submission_index)
281281
self._durations: list[tuple[str, float]] = []
282-
self._chain_start: torch.cuda.Event | None = None
282+
self._chain_start: torch.Event | None = None
283283

284284
def start(self) -> None:
285285
"""Call before any steps. Clear state for reuse; record initial event on current stream."""
286286
self._futures.clear()
287287
self._durations.clear()
288-
stream = torch.cuda.current_stream()
289-
start_event = torch.cuda.Event(enable_timing=True)
288+
stream = torch.accelerator.current_stream()
289+
start_event = torch.Event(enable_timing=True)
290290
start_event.record(stream)
291291
self._chain_start = start_event
292292

293293
def step(self, name: str) -> None:
294294
"""Mark the end of a GPU workload segment and start the next, submitting async polling.
295-
Records a CUDA end event on the current stream; a background thread polls completion.
295+
Records a torch end event on the current stream; a background thread polls completion.
296296
297297
Args:
298298
name: Label for this segment's duration
299299
"""
300300
if self._chain_start is None:
301301
raise ValueError("Timer must be started before calling step")
302302

303-
stream = torch.cuda.current_stream()
304-
end_event = torch.cuda.Event(enable_timing=True)
303+
stream = torch.accelerator.current_stream()
304+
end_event = torch.Event(enable_timing=True)
305305
end_event.record(stream)
306306

307307
future = self._executor.submit(self._poll_elapsed, self._chain_start, end_event)
@@ -312,9 +312,7 @@ def step(self, name: str) -> None:
312312

313313
self._chain_start = end_event
314314

315-
def _poll_elapsed(
316-
self, start_event: torch.cuda.Event, end_event: torch.cuda.Event
317-
) -> float:
315+
def _poll_elapsed(self, start_event: torch.Event, end_event: torch.Event) -> float:
318316
"""Compute elapsed time after polling with backoff."""
319317
# Poll until ready
320318
sleep_time = 0.001 # Start at 1ms
@@ -388,8 +386,8 @@ def trace(
388386
389387
Args:
390388
prefix (str): Prefix for metric names
391-
track_memory (bool): Whether to track CUDA memory usage. Defaults to False.
392-
timer (str): Timing backend; "cpu" (default) or "gpu" (requires CUDA).
389+
track_memory (bool): Whether to track memory usage. Defaults to False.
390+
timer (str): Timing backend; "cpu" (default) or "gpu" (requires accelerator support).
393391
394392
Decorator Examples:
395393
@trace("my_prefix", track_memory=True, timer="gpu")

tests/test_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,5 @@ def gpu_test(gpu_count: int = 1):
1414
required amount of GPU is not available
1515
"""
1616
message = f"Not enough GPUs to run the test: requires {gpu_count}"
17-
local_gpu_count: int = torch.cuda.device_count()
17+
local_gpu_count: int = torch.accelerator.device_count()
1818
return pytest.mark.skipif(local_gpu_count < gpu_count, reason=message)

tests/unit_tests/datasets/test_stop_after_one_epoch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def test_epoch_sync_across_ranks(self):
158158

159159
batch_iter = StopAfterOneEpoch(
160160
iter=iter(dataloader),
161-
device=torch.device("cuda"),
161+
device=torch.accelerator.current_accelerator(),
162162
dp_mesh=dp_mesh,
163163
)
164164

0 commit comments

Comments (0)