Add custom scale support to the SDPA fusion rule

shubhambhokare1 · shubhambhokare1 · commit 5f386b4f7d5f · 2025-04-17T00:01:44.000Z
diff --git a/onnxscript/rewriter/ort_fusions/sdpa.py b/onnxscript/rewriter/ort_fusions/sdpa.py
@@ -13,6 +13,7 @@ def __init__(self, name: str, *, use_mask: bool, pre_scale: bool, use_mul: bool)
         self._use_mask = use_mask
         self._pre_scale = pre_scale
         self._use_mul = use_mul
+        self._custom_scale = False
 
     def pattern(
         self, op, query, key_transposed, value, mask, query_scale, key_scale, qk_scale
@@ -57,34 +58,69 @@ def check(self, op, query, key_transposed, value, mask, query_scale, key_scale,
 
         if self._pre_scale:
             # Check if query_scale and key_scale are scalars == sqrt(expected_scaling_factor)
+            # If they are scalars but != sqrt(expected_scaling_factor), a custom scale is being used.
             sqrt_scaling_factor = math.sqrt(expected_scaling_factor)
-            if not _ir_utils.is_singleton_value(query_scale, sqrt_scaling_factor, rtol=1e-3):
+
+            if _ir_utils.get_singleton_value(query_scale) is None:
                 return check_result.fail(
-                    "Query scale is not a scalar or does not match the expected scaling factor.",
+                    "Query scale is not a scalar.",
                     query_scale,
                 )
-            if not _ir_utils.is_singleton_value(key_scale, sqrt_scaling_factor, rtol=1e-3):
+            if _ir_utils.get_singleton_value(key_scale) is None:
                 return check_result.fail(
-                    "Key scale is not a scalar or does not match the expected scaling factor.",
+                    "Key scale is not a scalar.",
                     key_scale,
                 )
+            # Assign (instead of only ever setting True): the flag lives on the rule
+            # instance, so a True left by an earlier match would otherwise persist.
+            self._custom_scale = not (
+                _ir_utils.is_singleton_value(query_scale, sqrt_scaling_factor, rtol=1e-3)
+                and _ir_utils.is_singleton_value(key_scale, sqrt_scaling_factor, rtol=1e-3)
+            )
         else:
             # Check if qk_scale is a scalar == expected_scaling_factor)
-            if not _ir_utils.is_singleton_value(qk_scale, expected_scaling_factor, rtol=1e-3):
+            # If it is a scalar but != expected_scaling_factor, a custom scale is being used.
+            if _ir_utils.get_singleton_value(qk_scale) is None:
                 return check_result.fail(
-                    "QK scale is not a scalar or does not match the expected scaling factor.",
+                    "QK scale is not a scalar.",
                     qk_scale,
                 )
+            # Assigned unconditionally so the flag is recomputed for every match.
+            self._custom_scale = not _ir_utils.is_singleton_value(
+                qk_scale, expected_scaling_factor, rtol=1e-3
+            )
 
         # check ranks/shapes
 
         return check_result
 
-    def rewrite(self, op, query, key_transposed, value, mask, **_):
+    def rewrite(
+        self, op, query, key_transposed, value, mask, query_scale, key_scale, qk_scale, **_
+    ):
+        """Build the fused SDPA node that replaces the matched subgraph.
+
+        ``mask`` is forwarded only when the rule was created with ``use_mask``.
+        A ``scale`` attribute is emitted only when ``check`` flagged the matched
+        scaling constants as custom; otherwise SDPA is created without one.
+        """
+        sdpa_args = [query, key_transposed, value]
         if self._use_mask:
-            return op.SDPA(query, key_transposed, value, mask, _domain="ai.onnxruntime.fusion")
-        else:
-            return op.SDPA(query, key_transposed, value, _domain="ai.onnxruntime.fusion")
+            sdpa_args.append(mask)
+        if not self._custom_scale:
+            return op.SDPA(*sdpa_args, _domain="ai.onnxruntime.fusion")
+        if self._pre_scale:
+            # Query and key are scaled separately before the MatMul, so the factor
+            # that reaches the attention scores is the product of the two.
+            query_factor = _ir_utils.get_singleton_value(query_scale)
+            key_factor = _ir_utils.get_singleton_value(key_scale)
+            scale = query_factor * key_factor
+        else:
+            scale = _ir_utils.get_singleton_value(qk_scale)
+        if not self._use_mul:
+            # Without use_mul the constant is a Div operand. NOTE(review): assumes
+            # SDPA's ``scale`` attribute multiplies the scores -- confirm with the op.
+            scale = 1.0 / scale
+        return op.SDPA(*sdpa_args, scale=scale, _domain="ai.onnxruntime.fusion")
 
 
 # Rules for SDPA without mask
diff --git a/onnxscript/rewriter/ort_fusions/sdpa_test.py b/onnxscript/rewriter/ort_fusions/sdpa_test.py
@@ -74,6 +74,52 @@ def _unmasked_post_mul_sdpa_script(query, key, value):
     return attn_output
 
 
+@script()  # Custom-scale test graph, no mask: Q and K^T are each divided before the MatMul.
+def _custom_scale_pre_div_sdpa_script(query, key, value):
+    key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])  # swap the last two axes of key
+    divisor = op.Constant(value_float=2.0)  # one constant shared by both divisions
+    scaled_query = op.Div(query, divisor)
+    scaled_key = op.Div(key_transposed, divisor)
+    attn_score = op.MatMul(scaled_query, scaled_key)
+    attn_weight = op.Softmax(attn_score, axis=-1)  # softmax over the last axis
+    attn_output = op.MatMul(attn_weight, value)
+    return attn_output
+
+
+@script()  # Custom-scale test graph, no mask: Q and K^T are each multiplied before the MatMul.
+def _custom_scale_pre_mul_sdpa_script(query, key, value):
+    key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])  # swap the last two axes of key
+    multiplier = op.Constant(value_float=0.5)  # one constant shared by both multiplications
+    scaled_query = op.Mul(query, multiplier)
+    scaled_key = op.Mul(key_transposed, multiplier)
+    attn_score = op.MatMul(scaled_query, scaled_key)
+    attn_weight = op.Softmax(attn_score, axis=-1)  # softmax over the last axis
+    attn_output = op.MatMul(attn_weight, value)
+    return attn_output
+
+
+@script()  # Custom-scale test graph, no mask: the Q @ K^T product is divided after the MatMul.
+def _custom_scale_post_div_sdpa_script(query, key, value):
+    key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])  # swap the last two axes of key
+    divisor = op.Constant(value_float=0.1)  # constant divisor applied to the scores
+    attn_score = op.MatMul(query, key_transposed)
+    scaled_attn_score = op.Div(attn_score, divisor)
+    attn_weight = op.Softmax(scaled_attn_score, axis=-1)  # softmax over the last axis
+    attn_output = op.MatMul(attn_weight, value)
+    return attn_output
+
+
+@script()  # Custom-scale test graph, no mask: the Q @ K^T product is multiplied after the MatMul.
+def _custom_scale_post_mul_sdpa_script(query, key, value):
+    key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])  # swap the last two axes of key
+    multiplier = op.Constant(value_float=0.125)  # constant multiplier applied to the scores
+    attn_score = op.MatMul(query, key_transposed)
+    scaled_attn_score = op.Mul(attn_score, multiplier)
+    attn_weight = op.Softmax(scaled_attn_score, axis=-1)  # softmax over the last axis
+    attn_output = op.MatMul(attn_weight, value)
+    return attn_output
+
+
 @script()
 def _masked_pre_div_sdpa_script(query, key, value, mask):
     key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])
@@ -124,6 +170,56 @@ def _masked_post_mul_sdpa_script(query, key, value, mask):
     return attn_output
 
 
+@script()  # Masked custom-scale graph: Q and K^T are each divided before the MatMul.
+def _masked_custom_scale_pre_div_sdpa_script(query, key, value, mask):
+    key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])
+    divisor = op.Constant(value_float=2.0)
+    scaled_query = op.Div(query, divisor)
+    scaled_key = op.Div(key_transposed, divisor)
+    attn_score = op.MatMul(scaled_query, scaled_key)
+    masked_attn_score = op.Add(attn_score, mask)  # mask is added to the scores before Softmax
+    attn_weight = op.Softmax(masked_attn_score, axis=-1)
+    attn_output = op.MatMul(attn_weight, value)
+    return attn_output
+
+
+@script()  # Masked custom-scale graph: Q and K^T are each multiplied before the MatMul.
+def _masked_custom_scale_pre_mul_sdpa_script(query, key, value, mask):
+    key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])
+    multiplier = op.Constant(value_float=0.5)
+    scaled_query = op.Mul(query, multiplier)
+    scaled_key = op.Mul(key_transposed, multiplier)
+    attn_score = op.MatMul(scaled_query, scaled_key)
+    masked_attn_score = op.Add(attn_score, mask)  # mask is added to the scores before Softmax
+    attn_weight = op.Softmax(masked_attn_score, axis=-1)
+    attn_output = op.MatMul(attn_weight, value)
+    return attn_output
+
+
+@script()  # Masked custom-scale graph: the Q @ K^T product is divided after the MatMul.
+def _masked_custom_scale_post_div_sdpa_script(query, key, value, mask):
+    key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])
+    divisor = op.Constant(value_float=0.1)
+    attn_score = op.MatMul(query, key_transposed)
+    scaled_attn_score = op.Div(attn_score, divisor)
+    masked_attn_score = op.Add(scaled_attn_score, mask)  # mask is added after scaling
+    attn_weight = op.Softmax(masked_attn_score, axis=-1)
+    attn_output = op.MatMul(attn_weight, value)
+    return attn_output
+
+
+@script()  # Masked custom-scale graph: the Q @ K^T product is multiplied after the MatMul.
+def _masked_custom_scale_post_mul_sdpa_script(query, key, value, mask):
+    key_transposed = op.Transpose(key, perm=[0, 1, 3, 2])
+    multiplier = op.Constant(value_float=0.125)
+    attn_score = op.MatMul(query, key_transposed)
+    scaled_attn_score = op.Mul(attn_score, multiplier)
+    masked_attn_score = op.Add(scaled_attn_score, mask)  # mask is added after scaling
+    attn_weight = op.Softmax(masked_attn_score, axis=-1)
+    attn_output = op.MatMul(attn_weight, value)
+    return attn_output
+
+
 class SDPATestCase:
     def __init__(self, script_func):
         self.script_func = script_func
@@ -161,6 +257,14 @@ class TestSDPAFusion(unittest.TestCase):
             ("pre_mul", _masked_pre_mul_sdpa_script),
             ("post_div", _masked_post_div_sdpa_script),
             ("post_mul", _masked_post_mul_sdpa_script),
+            ("custom_scale_post_mul", _custom_scale_post_mul_sdpa_script),
+            ("custom_scale_post_div", _custom_scale_post_div_sdpa_script),
+            ("custom_scale_pre_mul", _custom_scale_pre_mul_sdpa_script),
+            ("custom_scale_pre_div", _custom_scale_pre_div_sdpa_script),
+            ("custom_scale_post_mul_masked", _masked_custom_scale_post_mul_sdpa_script),
+            ("custom_scale_post_div_masked", _masked_custom_scale_post_div_sdpa_script),
+            ("custom_scale_pre_mul_masked", _masked_custom_scale_pre_mul_sdpa_script),
+            ("custom_scale_pre_div_masked", _masked_custom_scale_pre_div_sdpa_script),
         ]
     )
     def test_sdpa_fusion(self, name, script_func):