
Commit 40bb175

ProExpertProg, xinyu-intel, chaojun-zhang, and Luka Govedič authored
[vLLM IR] 1/N Implement IR skeleton and rms_norm op (vllm-project#33825)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
Signed-off-by: chzhang <chaojun.zhang@intel.com>
Signed-off-by: Luka Govedic <luka.govedic@gmail.com>
Co-authored-by: Xinyu Chen <xinyu1.chen@intel.com>
Co-authored-by: Chaojun Zhang <chaojun.zhang@intel.com>
Co-authored-by: Luka Govedič <ProExpertProg@h100-01.nemg-001.lab.rdu2.dc.redhat.com>
1 parent 0fab52f commit 40bb175

49 files changed: 2177 additions & 265 deletions


.buildkite/test_areas/kernels.yaml

Lines changed: 10 additions & 0 deletions
@@ -2,6 +2,16 @@ group: Kernels
 depends_on:
 - image-build
 steps:
+- label: vLLM IR Tests
+  timeout_in_minutes: 10
+  working_dir: "/vllm-workspace/"
+  source_file_dependencies:
+  - vllm/ir
+  - vllm/kernels
+  commands:
+  - pytest -v -s tests/ir
+  - pytest -v -s tests/kernels/ir
+
 - label: Kernels Core Operation Test
   timeout_in_minutes: 75
   source_file_dependencies:

.github/CODEOWNERS

Lines changed: 4 additions & 0 deletions
@@ -13,6 +13,9 @@
 /vllm/model_executor/layers/rotary_embedding.py @vadiklyutiy
 /vllm/model_executor/model_loader @22quinn
 /vllm/model_executor/layers/batch_invariant.py @yewentao256
+/vllm/ir @ProExpertProg
+/vllm/kernels/ @ProExpertProg @tjtanaa
+/vllm/kernels/helion @ProExpertProg @zou3519
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson
@@ -74,6 +77,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
 /tests/evals @mgoin @vadiklyutiy
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/kernels/ir @ProExpertProg @tjtanaa
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety

tests/compile/backend.py

Lines changed: 12 additions & 4 deletions
@@ -8,7 +8,7 @@

 import depyf
 from torch import fx
-from torch._ops import OpOverload
+from torch._ops import OpOverload, OpOverloadPacket
 from torch.fx._utils import lazy_format_graph_code

 from vllm.compilation.passes.fx_utils import find_op_nodes
@@ -90,7 +90,9 @@ def post_pass(self, graph: fx.Graph):
         # assign by reference, will reflect the final state of the graph
         self.final_graph = graph

-    def check_before_ops(self, ops: Sequence[OpOverload], fully_replaced=True):
+    def check_before_ops(
+        self, ops: Sequence[OpOverload | OpOverloadPacket], fully_replaced=True
+    ):
         for op in ops:
             num_pre = len(list(find_op_nodes(op, self.graph_pre_pass)))
             num_post = len(list(find_op_nodes(op, self.graph_post_pass)))
@@ -99,13 +101,19 @@ def check_before_ops(self, ops: Sequence[OpOverload], fully_replaced=True):
         if fully_replaced:
             assert num_post == 0, f"Unexpected op {op.name()} in post-pass graph"

-    def check_after_ops(self, ops: Sequence[OpOverload]):
+    def check_after_ops(self, ops: Sequence[OpOverload | OpOverloadPacket]):
         for op in ops:
             num_pre = len(list(find_op_nodes(op, self.graph_pre_pass)))
             num_post = len(list(find_op_nodes(op, self.graph_post_pass)))
             assert num_pre == 0, f"Unexpected op {op.name()} in pre-pass graph"
             assert num_post > 0, f"Op {op.name()} not found in post-pass graph"

-    def op_count(self, op: OpOverload, before=False) -> int:
+    def op_count(self, op: OpOverload | OpOverloadPacket, before=False) -> int:
         graph = self.graph_pre_pass if before else self.graph_post_pass
         return len(list(find_op_nodes(op, graph)))
+
+    def print_graphs(self):
+        print("=== Graph before custom passes ===")
+        print(self.graph_pre_pass.python_code(root_module="self", verbose=True).src)
+        print("=== Graph after custom passes ===")
+        print(self.graph_post_pass.python_code(root_module="self", verbose=True).src)
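Note: with the widened signatures above, an OpOverloadPacket such as torch.ops.vllm_ir.rms_norm (registered elsewhere in this commit) can be passed to the check helpers directly, without picking a .default overload. A minimal sketch of how that might look in a test; `model`, `x`, and `lowering_pass` are assumptions, set up as in the IR lowering test added later in this commit:

```python
import torch

from tests.compile.backend import TestBackend
from vllm import ir

# Assumed for illustration: `model`, `x`, and `lowering_pass` configured as in
# the IR lowering test added under tests/compile/passes/ir in this commit.
backend = TestBackend(lowering_pass)
compiled = torch.compile(model, backend=backend, fullgraph=True)
with ir.enable_torch_wrap(True):
    compiled(x)

# An OpOverloadPacket is now accepted alongside OpOverload, so the packet
# torch.ops.vllm_ir.rms_norm can be queried without a ".default" suffix.
assert backend.op_count(torch.ops.vllm_ir.rms_norm, before=True) > 0

# The lowering pass is expected to replace every occurrence of the IR op.
backend.check_before_ops([torch.ops.vllm_ir.rms_norm], fully_replaced=True)

# New debugging helper: dump the FX graphs captured before and after the passes.
backend.print_graphs()
```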

tests/compile/fusions_e2e/test_tp1_quant.py

Lines changed: 3 additions & 0 deletions
@@ -99,6 +99,8 @@ def test_tp1_fp8_fusions(
     model_kwargs["hf_overrides"] = hf_overrides(n_layers)
     model_kwargs["load_format"] = "dummy"
     model_kwargs["max_model_len"] = 1024
+    model_kwargs["kernel_config"] = {"enable_flashinfer_autotune": False}
+
     compilation_config = dict(
         use_inductor_graph_partition=inductor_graph_partition,
         custom_ops=custom_ops.split(","),
@@ -166,6 +168,7 @@ def test_tp1_fp4_fusions(
     model_kwargs["hf_overrides"] = hf_overrides(n_layers)
     model_kwargs["load_format"] = "dummy"
     model_kwargs["max_model_len"] = 1024
+    model_kwargs["kernel_config"] = {"enable_flashinfer_autotune": False}

     compilation_config = dict(
         use_inductor_graph_partition=inductor_graph_partition,
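Note: a hedged sketch of what these keyword arguments amount to, assuming (as in other fusion e2e tests) that `model_kwargs` are eventually forwarded to the `vllm.LLM` constructor; the model name below is a placeholder, not the one used by the test:

```python
from vllm import LLM

model_kwargs = {
    "load_format": "dummy",
    "max_model_len": 1024,
    # New in this commit: disable FlashInfer autotuning for these e2e runs.
    "kernel_config": {"enable_flashinfer_autotune": False},
}

# Hypothetical invocation; the actual tests build the engine through their own helpers.
llm = LLM(model="<model-under-test>", **model_kwargs)
```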

tests/compile/fusions_e2e/test_tp2_ar_rms.py

Lines changed: 3 additions & 0 deletions
@@ -68,6 +68,7 @@ def test_tp2_ar_rms_fp8_fusions(
     model_kwargs["hf_overrides"] = hf_overrides(n_layers)
     model_kwargs["load_format"] = "dummy"
     model_kwargs["max_model_len"] = 1024
+    model_kwargs["kernel_config"] = {"enable_flashinfer_autotune": False}

     compilation_config = dict(
         use_inductor_graph_partition=inductor_graph_partition,
@@ -128,6 +129,7 @@ def test_tp2_ar_rms_fp4_fusions(
     model_kwargs["hf_overrides"] = hf_overrides(n_layers)
     model_kwargs["load_format"] = "dummy"
     model_kwargs["max_model_len"] = 1024
+    model_kwargs["kernel_config"] = {"enable_flashinfer_autotune": False}

     compilation_config = dict(
         use_inductor_graph_partition=inductor_graph_partition,
@@ -182,6 +184,7 @@ def test_tp2_ar_rms_fusions(
     model_kwargs["hf_overrides"] = hf_overrides(n_layers)
     model_kwargs["load_format"] = "dummy"
     model_kwargs["max_model_len"] = 1024
+    model_kwargs["kernel_config"] = {"enable_flashinfer_autotune": False}

     compilation_config = dict(
         use_inductor_graph_partition=inductor_graph_partition,

tests/compile/fusions_e2e/test_tp2_async_tp.py

Lines changed: 2 additions & 0 deletions
@@ -58,6 +58,7 @@ def test_tp2_async_tp_fp8_fusions(
     model_kwargs["hf_overrides"] = hf_overrides(n_layers)
     model_kwargs["load_format"] = "dummy"
     model_kwargs["max_model_len"] = 1024
+    model_kwargs["kernel_config"] = {"enable_flashinfer_autotune": False}

     compilation_config = dict(
         use_inductor_graph_partition=inductor_graph_partition,
@@ -121,6 +122,7 @@ def test_tp2_async_tp_fusions(
     model_kwargs["hf_overrides"] = hf_overrides(n_layers)
     model_kwargs["load_format"] = "dummy"
     model_kwargs["max_model_len"] = 1024
+    model_kwargs["kernel_config"] = {"enable_flashinfer_autotune": False}

     compilation_config = dict(
         use_inductor_graph_partition=inductor_graph_partition,

tests/compile/passes/distributed/test_sequence_parallelism.py

Lines changed: 7 additions & 7 deletions
@@ -9,7 +9,6 @@
 from tests.utils import TestFP8Layer, multi_gpu_test
 from vllm.compilation.passes.fusion.rms_quant_fusion import RMSNormQuantFusionPass
 from vllm.compilation.passes.fusion.sequence_parallelism import SequenceParallelismPass
-from vllm.compilation.passes.fx_utils import find_auto_fn
 from vllm.compilation.passes.utility.noop_elimination import NoOpEliminationPass
 from vllm.compilation.passes.utility.post_cleanup import PostCleanupPass
 from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
@@ -86,13 +85,14 @@ def ops_in_model_after(self):
         ]

     def ops_in_model(self):
-        if RMSNorm.enabled():
-            return [
-                torch.ops._C.rms_norm.default,
+        return (
+            [torch.ops.vllm_ir.rms_norm]
+            + [
                 torch.ops._C.fused_add_rms_norm.default,
             ]
-        else:
-            return []
+            if RMSNorm.enabled()
+            else []
+        )


 class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
@@ -321,4 +321,4 @@ def sequence_parallelism_pass_on_test_model(
         assert backend.op_count(op, before=False) == 4

     for op in model.ops_in_model():
-        find_auto_fn(backend.graph_post_pass.nodes, op)
+        assert backend.op_count(op, before=False) > 0

tests/compile/passes/ir/__init__.py

Whitespace-only changes.
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from torch import nn
+
+import vllm.kernels  # noqa: F401 to register kernels
+from vllm import ir
+from vllm.compilation.passes.ir.lowering_pass import (
+    VllmIRLoweringPass,
+)
+from vllm.config import get_current_vllm_config
+from vllm.ir import ops
+from vllm.platforms import current_platform
+
+from ...backend import TestBackend
+
+
+class Model(nn.Module):
+    def __init__(self, hidden_size=16, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hidden_size = hidden_size
+        self.weight = torch.ones(hidden_size, dtype=torch.bfloat16)
+
+    def forward(self, x):
+        x1 = x + 4.0
+        x2 = ops.rms_norm(x1, self.weight, 1e-5)
+        x3 = x2 * 5.0
+        # no weight
+        x4 = ops.rms_norm(x3, None, 1e-5)
+        x5 = x4 / 2.0
+        # dispatch to native due to variance_size parameter
+        x6 = ops.rms_norm(x5, self.weight, 1e-5, self.hidden_size // 2)
+        return x6 + 3.0
+
+
+@pytest.mark.parametrize("rms_provider", ops.rms_norm.supported_providers())
+def test_lowering_rms_norm(rms_provider, default_vllm_config):
+    torch.set_default_device(current_platform.device_type)
+
+    lowering_pass = VllmIRLoweringPass(get_current_vllm_config())
+    backend = TestBackend(lowering_pass)
+    backend_unlowered = TestBackend()
+
+    model = Model()
+    x = torch.randn(8, 16, dtype=torch.bfloat16)
+    with (
+        ops.rms_norm.set_priority([rms_provider, "native"]),
+        ir.enable_torch_wrap(True),
+    ):
+        compiled_model = torch.compile(model, backend=backend, fullgraph=True)
+        compiled_unlowered_model = torch.compile(
+            model, backend=backend_unlowered, fullgraph=True
+        )
+        output = compiled_model(x)
+        output_unlowered = compiled_unlowered_model(x)
+
+    selected = lowering_pass.selected_impls["rms_norm"]
+    assert len(selected) == 3
+    assert selected["rms_norm"] == rms_provider
+    assert selected["rms_norm_1"] == rms_provider
+    assert selected["rms_norm_2"] == "native"
+
+    # Compiled function guards on global value, avoid recompilation
+    with ir.enable_torch_wrap(True):
+        output2 = compiled_model(x)
+
+    torch.testing.assert_close(output_unlowered, output)
+    torch.testing.assert_close(output_unlowered, output2)
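Note: beyond the torch.compile path exercised above, the IR op wrappers appear to be callable eagerly as well. A minimal sketch based only on the call signatures visible in this test; the provider list and the CUDA device are assumptions:

```python
import torch

import vllm.kernels  # noqa: F401  (registers kernel providers, as in the test imports)
from vllm.ir import ops

x = torch.randn(8, 16, dtype=torch.bfloat16, device="cuda")
w = torch.ones(16, dtype=torch.bfloat16, device="cuda")

# Pin a provider order; "native" is the fallback the test relies on.
with ops.rms_norm.set_priority(["native"]):
    y = ops.rms_norm(x, w, 1e-5)           # weighted RMSNorm
    y_no_w = ops.rms_norm(x, None, 1e-5)   # weightless variant
    # Passing a variance_size dispatches to the native implementation,
    # per the comment in the test model above.
    y_partial = ops.rms_norm(x, w, 1e-5, 8)
```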

tests/compile/passes/test_fusion.py

Lines changed: 10 additions & 8 deletions
@@ -6,6 +6,7 @@
 import torch

 import vllm.config
+import vllm.ir.ops
 import vllm.plugins
 from tests.compile.backend import TestBackend
 from tests.utils import TestBlockFP8Layer, TestFP8Layer
@@ -51,7 +52,6 @@

 FP8_DTYPE = current_platform.fp8_dtype()

-RMS_OP = torch.ops._C.rms_norm.default
 RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default

 # Kernel and group_shape combinations: (kernel, group_shape)
@@ -246,10 +246,8 @@ def ops_in_model_after(self):
         ]

     def ops_in_model_before_partial(self):
-        return (
-            [RMS_OP, RMS_ADD_OP]
-            if self.enable_rms_norm_custom_op
-            else [torch.ops.aten.rsqrt]
+        return [torch.ops.vllm_ir.rms_norm] + (
+            [RMS_ADD_OP] if self.enable_rms_norm_custom_op else [torch.ops.aten.rsqrt]
         )

@@ -340,7 +338,10 @@ def test_fusion_rmsnorm_quant(
         ),
     )

-    with vllm.config.set_current_vllm_config(vllm_config):
+    with (
+        vllm.config.set_current_vllm_config(vllm_config),
+        vllm_config.kernel_config.ir_op_priority.set_priority(),
+    ):
         # Setup device before model creation
         torch.set_default_device("cuda")
         torch.set_default_dtype(dtype)
@@ -370,8 +371,9 @@ def test_fusion_rmsnorm_quant(
     # Hence, we check only 2 add nodes are left (final fused rmsnorm add).
     if not enable_rms_norm_custom_op:
         n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g))
-        # 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each)
-        assert n_add_nodes(backend.graph_pre_pass) == 7
+        # rms_norm is IR, not included
+        # 6 = 3x2 (3xRMS_ADD, 2 each)
+        assert n_add_nodes(backend.graph_pre_pass) == 6
         assert n_add_nodes(backend.graph_post_pass) == 2
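Note on the updated add-node arithmetic in the last hunk: with plain rms_norm now captured as a single vllm_ir op, only the three fused_add_rms_norm layers decompose into aten.add pairs before fusion (3 x 2 = 6), and only the final fused rmsnorm plus residual add survives after it (2). A small sketch of the counting helper the test builds inline, assuming find_op_nodes behaves as used above:

```python
import torch

from vllm.compilation.passes.fx_utils import find_op_nodes


def n_add_nodes(graph) -> int:
    # Count aten.add call nodes in an FX graph, mirroring the test's inline lambda.
    return sum(1 for _ in find_op_nodes(torch.ops.aten.add, graph))

# Expected in the test when the rms_norm custom op is disabled:
#   n_add_nodes(backend.graph_pre_pass)  == 6
#   n_add_nodes(backend.graph_post_pass) == 2
```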