uxlfoundation
diff --git a/src/common/matmul.cpp
Lines changed: 2 additions & 1 deletion b/src/common/matmul.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/src/common/matmul_pd.hpp
Lines changed: 28 additions & 7 deletions b/src/common/matmul_pd.hpp
Lines changed: 28 additions & 7 deletions
diff --git a/src/cpu/aarch64/matmul/jit_int8_kernel_types.hpp
Lines changed: 8 additions & 1 deletion b/src/cpu/aarch64/matmul/jit_int8_kernel_types.hpp
Lines changed: 8 additions & 1 deletion
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2019 Intel Corporation
+* Copyright 2026 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -322,7 +323,7 @@ status_t matmul_attr_check(const matmul_desc_t &desc, const engine_t *engine,
             const int mask_src = sc.get_mask(DNNL_ARG_SRC);
 
             VCHECK_MATMUL_UNIMPL(
-                    utils::one_of(mask_src, 0, src_qmask_K,
+                    utils::one_of(mask_src, 0, src_qmask_M, src_qmask_K,
                             src_qmask_M + src_qmask_K, full_tensor_mask),
                     VERBOSE_UNSUPPORTED_SCALES_CFG);
 
 
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2019 Intel Corporation
+* Copyright 2026 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,6 +19,7 @@
 #define COMMON_MATMUL_PD_HPP
 
 #include <assert.h>
+#include <map>
 
 #include "oneapi/dnnl/dnnl.h"
 
@@ -199,10 +201,20 @@ struct matmul_pd_t : public primitive_desc_t {
     virtual bool attr_scales_ok(const std::vector<int> &supported_args
             = {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST},
             const std::vector<int> &supported_qmodes
-            = {quantization_mode::static_sazp}) const {
+            = {quantization_mode::static_sazp},
+            const std::map<int, std::vector<int>> &extra_masks = {}) const {
         const auto &scales = attr()->scales_;
         if (scales.has_default_values()) return true;
 
+        const auto extra_mask_ok = [&](int arg, int mask) {
+            const auto it = extra_masks.find(arg);
+            if (it != extra_masks.end()) {
+                for (const auto &extra_mask : it->second)
+                    if (mask == extra_mask) return true;
+            }
+            return false;
+        };
+
         bool ok = scales.has_default_values(supported_args);
         for (int arg : supported_args) {
             if (scales.has_default_values(arg)) { continue; }
@@ -238,22 +250,31 @@ struct matmul_pd_t : public primitive_desc_t {
                                     (mask & wei_qmask_K()), is_decompression);
                 }
             } else if (arg == DNNL_ARG_SRC) {
+                // Masks supported by every implementation; masks specific to
+                // one implementation can be passed through `extra_masks`.
                 ok = ok
-                        && utils::one_of(mask, 0, src_qmask_K(),
-                                src_qmask_M() + src_qmask_K(),
-                                full_tensor_mask());
+                        && (utils::one_of(mask, 0, src_qmask_K(),
+                                    src_qmask_M() + src_qmask_K(),
+                                    full_tensor_mask())
+                                || extra_mask_ok(arg, mask));
                 ok = ok
                         && IMPLICATION((mask & src_qmask_K()),
                                 !scales.get(arg).has_default_groups());
                 ok = ok
                         && IMPLICATION(!scales.get(arg).has_default_groups(),
                                 scales.get_group(arg, 0)
                                         && K() % scales.get_group(arg, 1) == 0);
+                ok = ok
+                        && IMPLICATION(mask == src_qmask_M(),
+                                scales.get(arg).has_default_groups());
             } else if (arg == DNNL_ARG_DST) {
+                // Masks supported by every implementation; masks specific to
+                // one implementation can be passed through `extra_masks`.
                 ok = ok
-                        && utils::one_of(mask, 0, dst_qmask_N(),
-                                dst_qmask_M() + dst_qmask_N(),
-                                full_tensor_mask());
+                        && (utils::one_of(mask, 0, dst_qmask_N(),
+                                    dst_qmask_M() + dst_qmask_N(),
+                                    full_tensor_mask())
+                                || extra_mask_ok(arg, mask));
                 ok = ok
                         && IMPLICATION(!scales.get(arg).has_default_groups(),
                                 (M() % scales.get_group(arg, -2)) == 0
 
@@ -66,12 +66,17 @@ struct brg_int8_t {
     int na, nb;
     int m_tail, n_tail, k_tail;
     int is_m_tail, is_k_tail, is_n_tail, is_zp_cal;
+    data_type_t dst_dt;
+    const int acc_dt_sz = sizeof(float);
     int dst_dt_sz;
     bool is_s8, is_u8_s8;
     bool is_bias;
     bool with_scales;
+    bool with_src_scales;
+    bool with_wei_scales;
     bool with_dst_scales;
     bool is_oc_scales;
+    bool is_per_m_scales = false;
     jit_int8_broadcast_t zp_type_a = jit_int8_broadcast_t::none;
     jit_int8_broadcast_t zp_type_b = jit_int8_broadcast_t::none;
     jit_int8_broadcast_t zp_type_c = jit_int8_broadcast_t::none;
@@ -83,8 +88,10 @@ struct brg_int8_t {
 
 struct call_params_t {
     const uint8_t *src, *wei;
-    float *dst;
+    uint8_t *dst;
     const float *bias, *scales, *dst_scales;
+    const float *src_scales; // optional per-row src logical-M scales
+    const float *wei_scales; // optional kernel-ready weight scales
     dim_t M, K, N;
     char *buf_B_ptr_;
     int *na, *nb;