Commit cd039ff

Support per-expert MoE checkpoints in qwen3_5_moe.sanitize
Qwen/Qwen3.6-35B-A3B-FP8 ships expert MLPs as one tensor per expert per projection:

    model.language_model.layers.{L}.mlp.experts.{E}.{gate,up,down}_proj.weight

The bf16 master Qwen/Qwen3.6-35B-A3B is already pre-stacked (experts.gate_up_proj / experts.down_proj) and loads via the existing combined-format branch unchanged; only the FP8 release lands per-expert, likely because Qwen's FP8 quantization pipeline runs per expert and the artifact is not re-stacked after quantization. Loading Qwen/Qwen3.6-35B-A3B-FP8 today fails strict load_weights with thousands of unexpected keys (30720 = 256 experts x 40 layers x 3 projections).

Add a second branch in Qwen3_5MoeModel.sanitize that detects the per-expert layout and stacks the per-expert tensors along axis 0 into the same switch_mlp.* form the combined-format branch produces. Pre-stacked checkpoints (mlx-community redistributions, Qwen3.5-MoE, Qwen3.6 bf16 MoE) take the original branch unchanged. A defensive contiguity check raises ValueError on non-contiguous expert indices (e.g. a checkpoint shipping experts {0, 1, 3} but skipping 2), so a malformed checkpoint fails loudly rather than silently dropping experts.

Tests in tests/test_models.py:

- test_qwen3_5_moe_per_expert_weights_stack_to_switch_mlp (positive)
- test_qwen3_5_moe_per_expert_gap_raises (defensive)
- test_qwen3_5_moe_combined_format_still_splits_to_switch_mlp (regression)

Signed-off-by: Shivendra Dayal <sdayal@gmail.com>
1 parent ed1fca4 commit cd039ff
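For reference, a minimal sketch of the stacking the new branch performs, on toy tensors rather than the real checkpoint (the 2-expert count, 6x4 shapes, and variable names below are illustrative only, not taken from the model):

import mlx.core as mx

# Per-expert layout as shipped by the FP8 release (illustrative sizes).
prefix = "model.language_model.layers.0.mlp"
weights = {
    f"{prefix}.experts.0.gate_proj.weight": mx.zeros((6, 4)),
    f"{prefix}.experts.1.gate_proj.weight": mx.ones((6, 4)),
}

# sanitize stacks the per-expert tensors along axis 0 into the same
# (num_experts, out_features, in_features) shape the combined-format
# branch produces for the switch_mlp.* keys.
stacked = mx.stack(
    [weights[f"{prefix}.experts.{e}.gate_proj.weight"] for e in range(2)]
)
print(stacked.shape)  # (2, 6, 4)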

2 files changed

Lines changed: 219 additions & 0 deletions

mlx_lm/models/qwen3_5_moe.py

Lines changed: 41 additions & 0 deletions
@@ -2,6 +2,8 @@
 
 from dataclasses import dataclass
 
+import mlx.core as mx
+
 from .base import BaseModelArgs
 from .qwen3_5 import Model as Qwen3_5Model
 
@@ -48,5 +50,44 @@ def sanitize(self, weights):
                 new_weights[f"{prefix}.switch_mlp.down_proj.weight"] = new_weights.pop(
                     f"{prefix}.experts.down_proj"
                 )
+            elif f"{prefix}.experts.0.gate_proj.weight" in new_weights:
+                # Per-expert layout (Qwen/Qwen3.6-35B-A3B-FP8): one tensor per
+                # expert per projection. The bf16 master Qwen/Qwen3.6-35B-A3B
+                # is already pre-stacked and falls through the combined-format
+                # branch above unchanged. Collect all matching keys for this
+                # layer, validate the index range is contiguous from 0, then
+                # stack along axis 0 into the same shape the combined-format
+                # branch produces.
+                experts_prefix = f"{prefix}.experts."
+                gate_suffix = ".gate_proj.weight"
+                indices = set()
+                for key in new_weights:
+                    if key.startswith(experts_prefix) and key.endswith(gate_suffix):
+                        tail = key[len(experts_prefix) : -len(gate_suffix)]
+                        if tail.isdigit():
+                            indices.add(int(tail))
+                expected = set(range(len(indices)))
+                if indices != expected:
+                    missing = sorted(expected - indices)
+                    extra = sorted(indices - expected)
+                    raise ValueError(
+                        f"Per-expert MoE weights at {prefix}.experts have "
+                        f"non-contiguous indices: missing={missing}, "
+                        f"unexpected={extra}."
+                    )
+                gates, ups, downs = [], [], []
+                for e in range(len(indices)):
+                    gates.append(
+                        new_weights.pop(f"{prefix}.experts.{e}.gate_proj.weight")
+                    )
+                    ups.append(
+                        new_weights.pop(f"{prefix}.experts.{e}.up_proj.weight")
+                    )
+                    downs.append(
+                        new_weights.pop(f"{prefix}.experts.{e}.down_proj.weight")
+                    )
+                new_weights[f"{prefix}.switch_mlp.gate_proj.weight"] = mx.stack(gates)
+                new_weights[f"{prefix}.switch_mlp.up_proj.weight"] = mx.stack(ups)
+                new_weights[f"{prefix}.switch_mlp.down_proj.weight"] = mx.stack(downs)
 
         return self.language_model.sanitize(new_weights)
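With the per-expert branch in place, a strict load of the FP8 release should no longer report unexpected keys. A minimal smoke test, assuming mlx_lm's top-level load/generate API and that the FP8 checkpoint is otherwise supported by the installed version:

from mlx_lm import load, generate

# Illustrative only: model id taken from the commit message above.
model, tokenizer = load("Qwen/Qwen3.6-35B-A3B-FP8")
print(generate(model, tokenizer, prompt="Hello", max_tokens=8))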

tests/test_models.py

Lines changed: 178 additions & 0 deletions
@@ -649,6 +649,184 @@ def test_qwen3_5_family_convert_then_load_norm_not_shift_twice(self):
             mx.array_equal(loaded[mlx_norm_key], converted[mlx_norm_key])
         )
 
+    def test_qwen3_5_moe_per_expert_weights_stack_to_switch_mlp(self):
+        # Qwen/Qwen3.6-35B-A3B-FP8 ships expert MLPs as one tensor per expert
+        # per projection rather than as a single combined
+        # ``experts.gate_up_proj`` / ``experts.down_proj`` tensor (the bf16
+        # master Qwen/Qwen3.6-35B-A3B is already pre-stacked). The sanitize
+        # step must stack the per-expert tensors into the combined
+        # ``switch_mlp.*`` form that ``Qwen3_5MoeSparseMlp.load_weights``
+        # expects.
+        from mlx_lm.models import qwen3_5_moe
+
+        text_config = {
+            "model_type": "qwen3_5_moe_text",
+            "hidden_size": 4,
+            "intermediate_size": 8,
+            "num_hidden_layers": 1,
+            "num_attention_heads": 1,
+            "num_key_value_heads": 1,
+            "rms_norm_eps": 1e-5,
+            "vocab_size": 16,
+            "linear_num_value_heads": 1,
+            "linear_num_key_heads": 1,
+            "linear_key_head_dim": 4,
+            "linear_value_head_dim": 4,
+            "linear_conv_kernel_dim": 1,
+            "full_attention_interval": 1,
+            "tie_word_embeddings": False,
+            "max_position_embeddings": 16,
+            "num_experts": 2,
+            "num_experts_per_tok": 1,
+            "moe_intermediate_size": 6,
+            "shared_expert_intermediate_size": 6,
+        }
+        args = qwen3_5_moe.ModelArgs.from_dict(
+            {"model_type": "qwen3_5_moe", "text_config": text_config}
+        )
+        model = qwen3_5_moe.Model(args)
+
+        prefix = "model.language_model.layers.0.mlp.experts"
+        per_expert = {
+            f"{prefix}.0.gate_proj.weight": mx.full((6, 4), 1.0),
+            f"{prefix}.0.up_proj.weight": mx.full((6, 4), 2.0),
+            f"{prefix}.0.down_proj.weight": mx.full((4, 6), 3.0),
+            f"{prefix}.1.gate_proj.weight": mx.full((6, 4), 4.0),
+            f"{prefix}.1.up_proj.weight": mx.full((6, 4), 5.0),
+            f"{prefix}.1.down_proj.weight": mx.full((4, 6), 6.0),
+        }
+        sanitized = model.sanitize(dict(per_expert))
+
+        out_prefix = "language_model.model.layers.0.mlp.switch_mlp"
+        gate_key = f"{out_prefix}.gate_proj.weight"
+        up_key = f"{out_prefix}.up_proj.weight"
+        down_key = f"{out_prefix}.down_proj.weight"
+
+        self.assertIn(gate_key, sanitized)
+        self.assertIn(up_key, sanitized)
+        self.assertIn(down_key, sanitized)
+        self.assertEqual(sanitized[gate_key].shape, (2, 6, 4))
+        self.assertEqual(sanitized[up_key].shape, (2, 6, 4))
+        self.assertEqual(sanitized[down_key].shape, (2, 4, 6))
+        # Per-expert keys must not leak through.
+        self.assertFalse(any(".experts.0." in k for k in sanitized))
+        self.assertFalse(any(".experts.1." in k for k in sanitized))
+        # Stacking preserves per-expert content along axis 0.
+        self.assertTrue(mx.array_equal(sanitized[gate_key][0], per_expert[f"{prefix}.0.gate_proj.weight"]))
+        self.assertTrue(mx.array_equal(sanitized[gate_key][1], per_expert[f"{prefix}.1.gate_proj.weight"]))
+        self.assertTrue(mx.array_equal(sanitized[down_key][1], per_expert[f"{prefix}.1.down_proj.weight"]))
+
+    def test_qwen3_5_moe_per_expert_gap_raises(self):
+        # Defensive: if a per-expert checkpoint has non-contiguous expert indices
+        # (e.g. ships 0, 1, 3 but not 2), sanitize must fail loudly rather than
+        # silently dropping expert 3 — which would replicate the strict-load
+        # failure this branch is meant to fix.
+        from mlx_lm.models import qwen3_5_moe
+
+        text_config = {
+            "model_type": "qwen3_5_moe_text",
+            "hidden_size": 4,
+            "intermediate_size": 8,
+            "num_hidden_layers": 1,
+            "num_attention_heads": 1,
+            "num_key_value_heads": 1,
+            "rms_norm_eps": 1e-5,
+            "vocab_size": 16,
+            "linear_num_value_heads": 1,
+            "linear_num_key_heads": 1,
+            "linear_key_head_dim": 4,
+            "linear_value_head_dim": 4,
+            "linear_conv_kernel_dim": 1,
+            "full_attention_interval": 1,
+            "tie_word_embeddings": False,
+            "max_position_embeddings": 16,
+            "num_experts": 3,
+            "num_experts_per_tok": 1,
+            "moe_intermediate_size": 6,
+            "shared_expert_intermediate_size": 6,
+        }
+        args = qwen3_5_moe.ModelArgs.from_dict(
+            {"model_type": "qwen3_5_moe", "text_config": text_config}
+        )
+        model = qwen3_5_moe.Model(args)
+
+        prefix = "model.language_model.layers.0.mlp.experts"
+        # Index 2 deliberately missing.
+        gapped = {
+            f"{prefix}.0.gate_proj.weight": mx.zeros((6, 4)),
+            f"{prefix}.0.up_proj.weight": mx.zeros((6, 4)),
+            f"{prefix}.0.down_proj.weight": mx.zeros((4, 6)),
+            f"{prefix}.1.gate_proj.weight": mx.zeros((6, 4)),
+            f"{prefix}.1.up_proj.weight": mx.zeros((6, 4)),
+            f"{prefix}.1.down_proj.weight": mx.zeros((4, 6)),
+            f"{prefix}.3.gate_proj.weight": mx.zeros((6, 4)),
+            f"{prefix}.3.up_proj.weight": mx.zeros((6, 4)),
+            f"{prefix}.3.down_proj.weight": mx.zeros((4, 6)),
+        }
+        with self.assertRaisesRegex(ValueError, "non-contiguous"):
+            model.sanitize(dict(gapped))
+
+    def test_qwen3_5_moe_combined_format_still_splits_to_switch_mlp(self):
+        # Regression guard: pre-stacked checkpoints (e.g. mlx-community Qwen3.5
+        # / 3.6 redistributions) ship ``experts.gate_up_proj`` /
+        # ``experts.down_proj`` as combined tensors. Sanitize must still split
+        # them into ``switch_mlp.{gate,up,down}_proj.weight`` via the original
+        # branch, untouched by the new per-expert path.
+        from mlx_lm.models import qwen3_5_moe
+
+        text_config = {
+            "model_type": "qwen3_5_moe_text",
+            "hidden_size": 4,
+            "intermediate_size": 8,
+            "num_hidden_layers": 1,
+            "num_attention_heads": 1,
+            "num_key_value_heads": 1,
+            "rms_norm_eps": 1e-5,
+            "vocab_size": 16,
+            "linear_num_value_heads": 1,
+            "linear_num_key_heads": 1,
+            "linear_key_head_dim": 4,
+            "linear_value_head_dim": 4,
+            "linear_conv_kernel_dim": 1,
+            "full_attention_interval": 1,
+            "tie_word_embeddings": False,
+            "max_position_embeddings": 16,
+            "num_experts": 2,
+            "num_experts_per_tok": 1,
+            "moe_intermediate_size": 6,
+            "shared_expert_intermediate_size": 6,
+        }
+        args = qwen3_5_moe.ModelArgs.from_dict(
+            {"model_type": "qwen3_5_moe", "text_config": text_config}
+        )
+        model = qwen3_5_moe.Model(args)
+
+        # Pre-stacked: gate_up has shape (num_experts, 2*intermediate, hidden);
+        # down has shape (num_experts, hidden, intermediate).
+        gate_up = mx.arange(2 * 12 * 4, dtype=mx.float32).reshape(2, 12, 4)
+        down = mx.arange(2 * 4 * 6, dtype=mx.float32).reshape(2, 4, 6)
+        sanitized = model.sanitize(
+            {
+                "model.language_model.layers.0.mlp.experts.gate_up_proj": gate_up,
+                "model.language_model.layers.0.mlp.experts.down_proj": down,
+            }
+        )
+
+        out_prefix = "language_model.model.layers.0.mlp.switch_mlp"
+        self.assertEqual(sanitized[f"{out_prefix}.gate_proj.weight"].shape, (2, 6, 4))
+        self.assertEqual(sanitized[f"{out_prefix}.up_proj.weight"].shape, (2, 6, 4))
+        self.assertEqual(sanitized[f"{out_prefix}.down_proj.weight"].shape, (2, 4, 6))
+        self.assertTrue(
+            mx.array_equal(sanitized[f"{out_prefix}.gate_proj.weight"], gate_up[:, :6, :])
+        )
+        self.assertTrue(
+            mx.array_equal(sanitized[f"{out_prefix}.up_proj.weight"], gate_up[:, 6:, :])
+        )
+        self.assertTrue(mx.array_equal(sanitized[f"{out_prefix}.down_proj.weight"], down))
+        # Combined keys must not leak through after split.
+        self.assertNotIn("language_model.model.layers.0.mlp.experts.gate_up_proj", sanitized)
+        self.assertNotIn("language_model.model.layers.0.mlp.experts.down_proj", sanitized)
+
     def test_gemma4_convert_then_load_keeps_language_model_prefix(self):
         from mlx_lm.models import gemma4
 
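The three new tests can be run on their own with unittest's -k filter from the repository root, e.g. python -m unittest tests.test_models -k qwen3_5_moe (assuming the test file is importable as tests.test_models).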