Skip to content

Commit bdac9ed

Browse files
committed
feat: add support for inputs_embeds in CUDA graph execution

- Introduced `enable_cudagraph_inputs_embeds` configuration option to allow capturing CUDA graphs from `inputs_embeds` instead of `input_ids`, enhancing flexibility in model execution.
- Updated `LLaDA2Model` and `MultiBlockModelRunnerTemplate` to support the new inputs_embeds functionality, improving performance for specific use cases.
- Modified argument parsing and configuration loading to accommodate the new inputs_embeds settings, ensuring seamless integration with existing benchmarks.
1 parent 22a626f commit bdac9ed

8 files changed

Lines changed: 92 additions & 8 deletions

File tree

diffulex/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ class Config:
7777
attn_impl: str = "triton" # "triton" or "naive"
7878
enable_prefill_cudagraph: bool = True
7979
enable_full_static_runner: bool = True
80+
enable_cudagraph_inputs_embeds: bool = False
8081
prefill_cudagraph_max_len: int = 0
8182
enable_torch_compile: bool = True
8283
enable_cudagraph_torch_compile: bool = False

diffulex/layer/embed_head.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,9 @@ def _linear_into_workspace(self, x: torch.Tensor) -> torch.Tensor:
120120
logits.add_(self.bias)
121121
return logits
122122

123-
def forward(self, x: torch.Tensor):
123+
def forward(self, x: torch.Tensor, gather: bool = True):
124124
logits = self._linear_into_workspace(x)
125-
if self.tp_size > 1:
125+
if gather and self.tp_size > 1:
126126
if LM_HEAD_FP32_GATHER:
127127
logits_dtype = logits.dtype
128128
logits = _tp_gather_to_rank0(logits.to(torch.float32), self.tp_group, self.tp_size, self.tp_rank)

diffulex/model/llada2.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,15 @@ def forward(
363363
mask: torch.Tensor | None = None,
364364
) -> torch.Tensor:
365365
hidden_states = self.word_embeddings(input_ids)
366+
return self.forward_inputs_embeds(hidden_states, positions, mask)
367+
368+
def forward_inputs_embeds(
369+
self,
370+
inputs_embeds: torch.Tensor,
371+
positions: torch.Tensor,
372+
mask: torch.Tensor | None = None,
373+
) -> torch.Tensor:
374+
hidden_states = inputs_embeds
366375
hidden_states = self._maybe_apply_token_merging(hidden_states)
367376
for layer in self.layers:
368377
hidden_states = layer(positions, hidden_states, mask)
@@ -497,8 +506,16 @@ def forward(
497506
) -> torch.Tensor:
498507
return self.model(input_ids, positions, mask)
499508

500-
def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
501-
return self.lm_head(hidden_states)
509+
def forward_inputs_embeds(
510+
self,
511+
inputs_embeds: torch.Tensor,
512+
positions: torch.Tensor,
513+
mask: torch.Tensor | None = None,
514+
) -> torch.Tensor:
515+
return self.model.forward_inputs_embeds(inputs_embeds, positions, mask)
516+
517+
def compute_logits(self, hidden_states: torch.Tensor, gather: bool = True) -> torch.Tensor:
518+
return self.lm_head(hidden_states, gather=gather)
502519

503520

504521
SparseMoEBlock = FusedMoE

diffulex/server/args.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class ServerArgs:
3838
max_model_len: int = 2048
3939
enable_prefill_cudagraph: bool = True
4040
enable_full_static_runner: bool = True
41+
enable_cudagraph_inputs_embeds: bool = False
4142
prefill_cudagraph_max_len: int = 0
4243
enable_torch_compile: bool = True
4344
enable_cudagraph_torch_compile: bool = False
@@ -81,6 +82,7 @@ def engine_kwargs(self) -> dict:
8182
"max_model_len": self.max_model_len,
8283
"enable_prefill_cudagraph": self.enable_prefill_cudagraph,
8384
"enable_full_static_runner": self.enable_full_static_runner,
85+
"enable_cudagraph_inputs_embeds": self.enable_cudagraph_inputs_embeds,
8486
"prefill_cudagraph_max_len": self.prefill_cudagraph_max_len,
8587
"enable_torch_compile": self.enable_torch_compile,
8688
"enable_cudagraph_torch_compile": self.enable_cudagraph_torch_compile,
@@ -137,6 +139,7 @@ def build_arg_parser() -> argparse.ArgumentParser:
137139
parser.add_argument("--max-model-len", type=int, default=2048)
138140
parser.add_argument("--disable-prefill-cudagraph", action="store_true")
139141
parser.add_argument("--disable-full-static-runner", action="store_true")
142+
parser.add_argument("--enable-cudagraph-inputs-embeds", action="store_true")
140143
parser.add_argument("--prefill-cudagraph-max-len", type=int, default=0)
141144
parser.add_argument("--disable-torch-compile", action="store_true")
142145
parser.add_argument("--enable-cudagraph-torch-compile", action="store_true")
@@ -189,6 +192,7 @@ def parse_args(argv: Sequence[str] | None = None) -> ServerArgs:
189192
max_model_len=ns.max_model_len,
190193
enable_prefill_cudagraph=not ns.disable_prefill_cudagraph,
191194
enable_full_static_runner=not ns.disable_full_static_runner,
195+
enable_cudagraph_inputs_embeds=ns.enable_cudagraph_inputs_embeds,
192196
prefill_cudagraph_max_len=ns.prefill_cudagraph_max_len,
193197
enable_torch_compile=not ns.disable_torch_compile,
194198
enable_cudagraph_torch_compile=ns.enable_cudagraph_torch_compile,

diffulex/strategy_template/multi_block/engine/model_runner.py

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,16 @@ def _capture_model_forward_graph(
134134
num_tokens: int,
135135
*,
136136
allow_compile: bool = False,
137+
inputs_embeds: torch.Tensor | None = None,
137138
) -> torch.cuda.CUDAGraph:
138139
def run_once() -> None:
139-
outputs[:num_tokens] = self.model(input_ids[:num_tokens], positions[:num_tokens])
140+
if inputs_embeds is None:
141+
outputs[:num_tokens] = self.model(input_ids[:num_tokens], positions[:num_tokens])
142+
else:
143+
outputs[:num_tokens] = self.model.forward_inputs_embeds(
144+
inputs_embeds[:num_tokens],
145+
positions[:num_tokens],
146+
)
140147

141148
stream = self._get_graph_capture_stream()
142149
pool = self._get_graph_pool()
@@ -291,6 +298,16 @@ def _model_hidden_dtype(self) -> torch.dtype:
291298
except StopIteration:
292299
return torch.get_default_dtype()
293300

301+
def _use_cudagraph_inputs_embeds(self) -> bool:
302+
if not bool(getattr(self.config, "enable_cudagraph_inputs_embeds", False)):
303+
return False
304+
if getattr(self.config, "model_name", None) not in {"llada2", "llada2_moe", "llada2_mini"}:
305+
return False
306+
return (
307+
hasattr(self.model, "forward_inputs_embeds")
308+
and hasattr(getattr(self.model, "model", None), "word_embeddings")
309+
)
310+
294311
def _ensure_runtime_static_buffers(
295312
self,
296313
*,
@@ -456,6 +473,12 @@ def _capture_prefill_cudagraph(self, bucket_len: int):
456473
device = self._cuda_graph_device()
457474

458475
input_ids = torch.zeros(bucket_len, dtype=torch.int64, device=device)
476+
use_inputs_embeds = self._use_cudagraph_inputs_embeds()
477+
inputs_embeds = (
478+
torch.zeros(bucket_len, hf_config.hidden_size, dtype=self._model_hidden_dtype(), device=device)
479+
if use_inputs_embeds
480+
else None
481+
)
459482
positions = torch.zeros(bucket_len, dtype=torch.int64, device=device)
460483
slot_mapping = torch.full((bucket_len,), -1, dtype=torch.int32, device=device)
461484
context_lens = torch.zeros(req_capacity, dtype=torch.int32, device=device)
@@ -506,10 +529,18 @@ def _capture_prefill_cudagraph(self, bucket_len: int):
506529
padded_prefix_lens=padded_prefix_lens,
507530
outputs=outputs,
508531
)
532+
if inputs_embeds is not None:
533+
graph_vars["inputs_embeds"] = inputs_embeds
509534
graph_vars.update(self._prefill_graph_extra_vars(bucket_len, device))
510535
self._init_prefill_graph_extra_metadata(attn_metadata, graph_vars, bucket_len)
511536

512-
graph = self._capture_model_forward_graph(input_ids, positions, outputs, bucket_len)
537+
graph = self._capture_model_forward_graph(
538+
input_ids,
539+
positions,
540+
outputs,
541+
bucket_len,
542+
inputs_embeds=inputs_embeds,
543+
)
513544
if self.graph_pool is None:
514545
self.graph_pool = graph.pool()
515546
torch.cuda.synchronize()
@@ -528,14 +559,16 @@ def _copy_common_graph_inputs(
528559
num_reqs: int,
529560
) -> None:
530561
for key, value in graph_vars.items():
531-
if key == "outputs":
562+
if key in ("outputs", "inputs_embeds"):
532563
continue
533564
if key in ("slot_mapping", "page_tables"):
534565
value.fill_(-1)
535566
else:
536567
value.zero_()
537568

538569
graph_vars["input_ids"][:num_tokens] = input_ids
570+
if "inputs_embeds" in graph_vars:
571+
graph_vars["inputs_embeds"][:num_tokens] = self.model.model.word_embeddings(input_ids)
539572
graph_vars["positions"][:num_tokens] = positions
540573
graph_vars["slot_mapping"][:num_tokens] = attn_metadata.slot_mapping
541574
graph_vars["context_lens"][:num_reqs] = attn_metadata.context_lens
@@ -767,6 +800,12 @@ def capture_cudagraph_multi_block(self: ModelRunnerBase):
767800
device = self._cuda_graph_device()
768801

769802
input_ids = torch.zeros(max_num_tokens, dtype=torch.int64, device=device)
803+
use_inputs_embeds = self._use_cudagraph_inputs_embeds()
804+
inputs_embeds = (
805+
torch.zeros(max_num_tokens, hf_config.hidden_size, dtype=self._model_hidden_dtype(), device=device)
806+
if use_inputs_embeds
807+
else None
808+
)
770809
positions = torch.zeros(max_num_tokens, dtype=torch.int64, device=device)
771810
slot_mapping = torch.full((max_num_tokens,), -1, dtype=torch.int32, device=device)
772811
context_lens = torch.zeros(max_num_seqs, dtype=torch.int32, device=device)
@@ -819,7 +858,14 @@ def capture_cudagraph_multi_block(self: ModelRunnerBase):
819858
padded_prefix_lens=padded_prefix_lens[:num_seqs],
820859
)
821860

822-
graph = self._capture_model_forward_graph(input_ids, positions, outputs, num_tokens, allow_compile=True)
861+
graph = self._capture_model_forward_graph(
862+
input_ids,
863+
positions,
864+
outputs,
865+
num_tokens,
866+
allow_compile=True,
867+
inputs_embeds=inputs_embeds[:num_tokens] if inputs_embeds is not None else None,
868+
)
823869
if self.graph_pool is None:
824870
self.graph_pool = graph.pool()
825871
self.graphs[num_tokens] = graph
@@ -840,4 +886,6 @@ def capture_cudagraph_multi_block(self: ModelRunnerBase):
840886
padded_prefix_lens=padded_prefix_lens,
841887
outputs=outputs,
842888
)
889+
if inputs_embeds is not None:
890+
self.graph_vars["inputs_embeds"] = inputs_embeds
843891
reset_warming_up()

diffulex_bench/arg_parser.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,12 @@ def create_argument_parser() -> argparse.ArgumentParser:
421421
default=None,
422422
help="Use the full-static CUDA graph runner for supported multi-block forward passes",
423423
)
424+
parser.add_argument(
425+
"--enable-cudagraph-inputs-embeds",
426+
action=argparse.BooleanOptionalAction,
427+
default=None,
428+
help="For LLaDA2 only: capture CUDA graphs from inputs_embeds instead of input_ids",
429+
)
424430
parser.add_argument(
425431
"--prefill-cudagraph-max-len",
426432
type=int,

diffulex_bench/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ class EngineConfig:
128128
max_num_reqs: int = 128
129129
enable_prefill_cudagraph: bool = True
130130
enable_full_static_runner: bool = True
131+
enable_cudagraph_inputs_embeds: bool = False
131132
prefill_cudagraph_max_len: int = 0
132133
enable_torch_compile: bool = True
133134
enable_cudagraph_torch_compile: bool = False

diffulex_bench/main.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,8 @@ def apply_engine_arg_overrides(engine: EngineConfig) -> None:
458458
config.engine.enable_prefill_cudagraph = bool(args.enable_prefill_cudagraph)
459459
if getattr(args, "enable_full_static_runner", None) is not None:
460460
config.engine.enable_full_static_runner = bool(args.enable_full_static_runner)
461+
if getattr(args, "enable_cudagraph_inputs_embeds", None) is not None:
462+
config.engine.enable_cudagraph_inputs_embeds = bool(args.enable_cudagraph_inputs_embeds)
461463
if (
462464
was_provided("prefill_cudagraph_max_len")
463465
and getattr(args, "prefill_cudagraph_max_len", None) is not None
@@ -534,6 +536,11 @@ def apply_engine_arg_overrides(engine: EngineConfig) -> None:
534536
if getattr(args, "enable_full_static_runner", None) is not None
535537
else True
536538
),
539+
enable_cudagraph_inputs_embeds=(
540+
bool(getattr(args, "enable_cudagraph_inputs_embeds", False))
541+
if getattr(args, "enable_cudagraph_inputs_embeds", None) is not None
542+
else False
543+
),
537544
prefill_cudagraph_max_len=(getattr(args, "prefill_cudagraph_max_len", None) or 0),
538545
enable_torch_compile=(
539546
bool(getattr(args, "enable_torch_compile", True))

0 commit comments

Comments (0)