fix: add explanatory comment for linear_before_reset=1 in aten_gru

Copilot · justinchuby · Copilot · commit 0226bb36fc57 · 2026-03-15T17:08:47.000Z
Co-authored-by: justinchuby &lt;11205048+justinchuby@users.noreply.github.com&gt;
diff --git a/onnxscript/function_libs/torch_lib/ops/core.py b/onnxscript/function_libs/torch_lib/ops/core.py
@@ -4348,6 +4348,11 @@ def aten_gru(
         # Extract hidden_size from hx shape: [num_layers * num_directions, batch, hidden_size]
         hidden_size_attr = hx.shape[2]
 
+        # linear_before_reset=1 matches PyTorch's GRU formulation where the linear
+        # transformation is applied before multiplying by the reset gate:
+        #   ht = g(Xt*(Wh^T) + rt (.) (Ht-1*(Rh^T) + Rbh) + Wbh)
+        # The ONNX default (linear_before_reset=0) uses a different equation and
+        # would produce numerically incorrect results.
         if B is not None:
             Y, Y_h = op.GRU(
                 current_input,