
Commit 919f3ea (parent fc7b333), authored by aneesh-dbylwu-amzn and Yaliang Wu

Add NONE pooling mode to support pre-pooled model outputs (#4710)

The documentation states that sentence-transformer models traced with the sentence-transformers library include post-processing, implying no additional pooling is needed. However, the code always applies MEAN pooling by default, even for models that already provide a pre-pooled sentence_embedding output. This change adds NONE as a pooling option so that models with pre-computed sentence embeddings can use those outputs directly, without redundant pooling computation.

Changes:
- Add NONE to the PoolingMode enum in BaseModelConfig
- Update ONNXSentenceTransformerTextEmbeddingTranslator to use the second output (sentence_embedding) when pooling_mode is NONE
- Update HuggingfaceTextEmbeddingTranslator to support NONE pooling, with fallback logic for various output formats
- Add unit tests for NONE pooling in both ONNX and TorchScript
- Update documentation with a NONE pooling description
- Add a release notes entry

Resolves #4708

Signed-off-by: Aneesh Nema <aneesh.nema@databricks.com>
Signed-off-by: Yaliang Wu <ylwu@amazon.com>
Co-authored-by: Yaliang Wu <ylwu@amazon.com>
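The redundancy the commit fixes is easiest to see with a toy example. The sketch below is plain Java, not the ml-commons code (names like `meanPool` are illustrative): it shows what MEAN pooling computes from a token-embedding matrix, and why a model that already emits a pooled `sentence_embedding` vector needs no further pooling under the new `none` mode.

```java
import java.util.Arrays;

public class PoolingSketch {

    // Mean pooling: average the token embeddings [tokens][dim],
    // skipping padding positions flagged by the attention mask.
    static float[] meanPool(float[][] tokenEmbeddings, int[] attentionMask) {
        int dim = tokenEmbeddings[0].length;
        float[] pooled = new float[dim];
        int count = 0;
        for (int t = 0; t < tokenEmbeddings.length; t++) {
            if (attentionMask[t] == 0) {
                continue; // padding token, ignore
            }
            for (int d = 0; d < dim; d++) {
                pooled[d] += tokenEmbeddings[t][d];
            }
            count++;
        }
        for (int d = 0; d < dim; d++) {
            pooled[d] /= count;
        }
        return pooled;
    }

    public static void main(String[] args) {
        // token_embeddings: 3 tokens of dimension 2; the last token is padding.
        float[][] tokens = { { 1f, 2f }, { 3f, 4f }, { 9f, 9f } };
        int[] mask = { 1, 1, 0 };
        System.out.println(Arrays.toString(meanPool(tokens, mask))); // [2.0, 3.0]

        // A traced sentence-transformers model may also emit this already-pooled
        // vector; `none` returns it untouched instead of re-pooling it.
        float[] sentenceEmbedding = { 2f, 3f };
        System.out.println(Arrays.toString(sentenceEmbedding)); // [2.0, 3.0]
    }
}
```

Applying `meanPool` a second time to the 1-D `sentenceEmbedding` would not even type-check here, which is the shape mismatch the NONE mode sidesteps.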

File tree: 6 files changed, +70 −9 lines

common/src/main/java/org/opensearch/ml/common/model/BaseModelConfig.java (2 additions, 1 deletion)

@@ -247,7 +247,8 @@ public enum PoolingMode {
         MAX("max"),
         WEIGHTED_MEAN("weightedmean"),
         CLS("cls"),
-        LAST_TOKEN("lasttoken");
+        LAST_TOKEN("lasttoken"),
+        NONE("none");

         private String name;

docs/model_serving_framework/text_embedding_model_examples.md (3 additions, 2 deletions)

@@ -295,7 +295,7 @@ POST /_plugins/_ml/models/zwla5YUB1qmVrJFlwzXJ/_unload
 Without [`sentence-transformers`](https://pypi.org/project/sentence-transformers/) installed, you can trace this model `AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')`.
 But model traced this way doesn't include post-processing. So user have to specify post-process logic with `pooling_mode` and `normalize_result`.

-Supported pooling method: `mean`, `mean_sqrt_len`, `max`, `weightedmean`, `cls`, `lasttoken`.
+Supported pooling method: `mean`, `mean_sqrt_len`, `max`, `weightedmean`, `cls`, `lasttoken`, `none`.

 The only difference is the uploading model input, for load/predict/profile/unload model, you can refer to ["1.1 trace sentence transformers model"](#11-trace-sentence-transformers-model).

@@ -322,7 +322,7 @@ POST /_plugins/_ml/models/_upload
 User can export Pytorch model to ONNX, then upload and run it with ml-commons APIs.
 This example ONNX model also needs to specify post-process logic with `pooling_mode` and `normalize_result`.

-Supported pooling method: `mean`, `mean_sqrt_len`, `max`, `weightedmean`, `cls`, `lasttoken`.
+Supported pooling method: `mean`, `mean_sqrt_len`, `max`, `weightedmean`, `cls`, `lasttoken`, `none`.

 ### Pooling Methods

@@ -334,6 +334,7 @@ Supported pooling method: `mean`, `mean_sqrt_len`, `max`, `weightedmean`, `cls`,
 | `weightedmean` | Weighted average where later tokens have higher weights |
 | `cls` | Uses the first token (CLS token) embedding |
 | `lasttoken` | Uses the last non-padding token's embedding. Useful for decoder-only models where the final token captures cumulative context |
+| `none` | Uses pre-pooled output from model directly without additional pooling computation. Use when model already provides pooled embeddings (e.g., `sentence_embedding` or `pooler_output`). Avoids redundant pooling and matches original model behavior |

 The only difference is the uploading model input, for load/predict/profile/unload model, you can refer to ["1.1 trace sentence transformers model"](#11-trace-sentence-transformers-model).
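As a rough illustration of some rows in the pooling table above, here is a sketch on plain float arrays (the real translators operate on DJL `NDArray`s; these helpers are illustrative, not the actual implementation):

```java
import java.util.Arrays;

public class PoolingModes {

    // `cls`: uses the first token (CLS token) embedding.
    static float[] clsPool(float[][] tokens) {
        return tokens[0];
    }

    // `lasttoken`: uses the last non-padding token's embedding.
    static float[] lastTokenPool(float[][] tokens, int[] attentionMask) {
        int last = 0;
        for (int t = 0; t < attentionMask.length; t++) {
            if (attentionMask[t] == 1) {
                last = t;
            }
        }
        return tokens[last];
    }

    // `none`: pass the model's pre-pooled output through unchanged.
    static float[] nonePool(float[] prePooled) {
        return prePooled;
    }

    public static void main(String[] args) {
        float[][] tokens = { { 1f, 0f }, { 2f, 2f }, { 9f, 9f } }; // last row is padding
        int[] mask = { 1, 1, 0 };
        System.out.println(Arrays.toString(clsPool(tokens)));             // [1.0, 0.0]
        System.out.println(Arrays.toString(lastTokenPool(tokens, mask))); // [2.0, 2.0]
        System.out.println(Arrays.toString(nonePool(new float[] { 5f, 5f }))); // [5.0, 5.0]
    }
}
```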

ml-algorithms/src/main/java/org/opensearch/ml/engine/algorithms/text_embedding/HuggingfaceTextEmbeddingTranslator.java (29 additions, 5 deletions)

@@ -78,9 +78,29 @@ public NDList processInput(TranslatorContext ctx, String input) {
     /** {@inheritDoc} */
     @Override
     public float[] processOutput(TranslatorContext ctx, NDList list) {
-        NDArray embeddings = list.get("last_hidden_state");
-        if (embeddings == null) {
-            embeddings = list.get(0);
+        NDArray embeddings;
+
+        // NONE pooling mode uses pre-pooled output directly if available
+        if ("none".equals(pooling)) {
+            // Try to get pre-pooled output (sentence_embedding, pooler_output, etc.)
+            embeddings = list.get("sentence_embedding");
+            if (embeddings == null) {
+                embeddings = list.get("pooler_output");
+            }
+            if (embeddings == null && list.size() > 1) {
+                // Use second output if available
+                embeddings = list.get(1);
+            }
+            if (embeddings == null) {
+                // Fallback to first output
+                embeddings = list.get(0);
+            }
+        } else {
+            // For other pooling modes, use last_hidden_state or first output
+            embeddings = list.get("last_hidden_state");
+            if (embeddings == null) {
+                embeddings = list.get(0);
+            }
         }
         Encoding encoding = (Encoding) ctx.getAttachment("encoding");
         long[] attentionMask = encoding.getAttentionMask();

@@ -105,6 +125,9 @@ public float[] processOutput(TranslatorContext ctx, NDList list) {
             case "lasttoken":
                 embeddings = lastTokenPool(embeddings, inputAttentionMask);
                 break;
+            case "none":
+                // No pooling - use pre-pooled output as-is
+                break;
             default:
                 throw new AssertionError("Unexpected pooling model: " + pooling);
         }

@@ -232,9 +255,10 @@ public HuggingfaceTextEmbeddingTranslator.Builder optPoolingMode(String poolingM
             && !"cls".equals(poolingMode)
             && !"mean_sqrt_len".equals(poolingMode)
             && !"weightedmean".equals(poolingMode)
-            && !"lasttoken".equals(poolingMode)) {
+            && !"lasttoken".equals(poolingMode)
+            && !"none".equals(poolingMode)) {
             throw new IllegalArgumentException(
-                "Invalid pooling model, must be one of [mean, max, cls, mean_sqrt_len, weightedmean, lasttoken]."
+                "Invalid pooling model, must be one of [mean, max, cls, mean_sqrt_len, weightedmean, lasttoken, none]."
             );
         }
         this.pooling = poolingMode;
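The fallback chain this translator gains can be mirrored outside DJL with a plain map standing in for the named `NDList` (a sketch; `selectEmbeddings` and the `Map` representation are illustrative, not the real API):

```java
import java.util.LinkedHashMap;
import java.util.Map;

public class OutputSelection {

    // Stand-in for the translator's NDList lookup: model outputs by name,
    // in declaration order. Mirrors the precedence used under NONE pooling:
    // sentence_embedding -> pooler_output -> second output -> first output.
    static float[] selectEmbeddings(Map<String, float[]> outputs, String pooling) {
        float[][] byIndex = outputs.values().toArray(new float[0][]);
        if ("none".equals(pooling)) {
            if (outputs.containsKey("sentence_embedding")) {
                return outputs.get("sentence_embedding");
            }
            if (outputs.containsKey("pooler_output")) {
                return outputs.get("pooler_output");
            }
            if (byIndex.length > 1) {
                return byIndex[1]; // second output if available
            }
            return byIndex[0]; // fallback to first output
        }
        // Other pooling modes: last_hidden_state or first output
        if (outputs.containsKey("last_hidden_state")) {
            return outputs.get("last_hidden_state");
        }
        return byIndex[0];
    }

    public static void main(String[] args) {
        Map<String, float[]> outputs = new LinkedHashMap<>();
        outputs.put("token_embeddings", new float[] { 1f, 2f });
        outputs.put("sentence_embedding", new float[] { 0.5f, 0.5f });
        // "none" picks the pre-pooled sentence_embedding, not token_embeddings.
        System.out.println(selectEmbeddings(outputs, "none")[0]); // 0.5
    }
}
```

The ordering matters: named lookups are tried before positional ones, so models that label their pooled output are preferred over the positional "second output" heuristic.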

ml-algorithms/src/main/java/org/opensearch/ml/engine/algorithms/text_embedding/ONNXSentenceTransformerTextEmbeddingTranslator.java (13 additions, 1 deletion)

@@ -87,7 +87,16 @@ public NDList processInput(TranslatorContext ctx, Input input) {
     /** {@inheritDoc} */
     @Override
    public Output processOutput(TranslatorContext ctx, NDList list) {
-        NDArray embeddings = list.get(0);
+        NDArray embeddings;
+
+        // NONE pooling mode uses pre-pooled output directly if available
+        if (this.poolingMode == TextEmbeddingModelConfig.PoolingMode.NONE && list.size() > 1) {
+            // Use the second output (sentence_embedding) which is pre-pooled
+            embeddings = list.get(1);
+        } else {
+            // Use first output (token_embeddings) for explicit pooling
+            embeddings = list.get(0);
+        }
         int shapeLength = embeddings.getShape().getShape().length;
         if (shapeLength == 3) {
             embeddings = embeddings.get(0);

@@ -115,6 +124,9 @@ public Output processOutput(TranslatorContext ctx, NDList list) {
             case LAST_TOKEN:
                 embeddings = lastTokenPool(embeddings, inputAttentionMask);
                 break;
+            case NONE:
+                // No pooling - use pre-pooled output as-is
+                break;
             default:
                 throw new IllegalArgumentException("Unsupported pooling method");
         }
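A quick sanity check for which output is which: the token-level output keeps a token axis, while the pre-pooled one does not. This is a hedged sketch under assumed shape conventions (a traced sentence-transformers model typically emits `token_embeddings` as `[batch, tokens, dim]` and `sentence_embedding` as `[batch, dim]`); the helper name is illustrative and not part of the translator.

```java
public class ShapeCheck {

    // token_embeddings is typically [batch, tokens, dim]; a pre-pooled
    // sentence_embedding collapses the token axis down to [batch, dim].
    static boolean looksPrePooled(long[] shape) {
        return shape.length == 2;
    }

    public static void main(String[] args) {
        System.out.println(looksPrePooled(new long[] { 1, 384 }));      // true
        System.out.println(looksPrePooled(new long[] { 1, 128, 384 })); // false
    }
}
```

This also explains the translator's `shapeLength == 3` branch above: a 3-D output still carries a batch axis that must be stripped before (or instead of) pooling.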

ml-algorithms/src/test/java/org/opensearch/ml/engine/algorithms/text_embedding/TextEmbeddingDenseModelTest.java (22 additions, 0 deletions)

@@ -225,6 +225,17 @@ public void initModel_predict_ONNX_LastTokenPooling() throws URISyntaxException
         initModel_predict_HuggingfaceModel(modelFile, modelType, poolingMode, normalize, modelMaxLength, modelFormat, dimension);
     }

+    @Test
+    public void initModel_predict_ONNX_NonePooling() throws URISyntaxException {
+        String modelFile = "all-MiniLM-L6-v2_onnx.zip";
+        String modelType = "bert";
+        TextEmbeddingModelConfig.PoolingMode poolingMode = TextEmbeddingModelConfig.PoolingMode.NONE;
+        boolean normalize = true;
+        int modelMaxLength = 512;
+        MLModelFormat modelFormat = MLModelFormat.ONNX;
+        initModel_predict_HuggingfaceModel(modelFile, modelType, poolingMode, normalize, modelMaxLength, modelFormat, dimension);
+    }
+
     @Test
     public void initModel_predict_TorchScript_Huggingface_LastTokenPooling() throws URISyntaxException {
         String modelFile = "all-MiniLM-L6-v2_torchscript_huggingface.zip";

@@ -236,6 +247,17 @@ public void initModel_predict_TorchScript_Huggingface_LastTokenPooling() throws
         initModel_predict_HuggingfaceModel(modelFile, modelType, poolingMode, normalize, modelMaxLength, modelFormat, dimension);
     }

+    @Test
+    public void initModel_predict_TorchScript_Huggingface_NonePooling() throws URISyntaxException {
+        String modelFile = "all-MiniLM-L6-v2_torchscript_huggingface.zip";
+        String modelType = "bert";
+        TextEmbeddingModelConfig.PoolingMode poolingMode = TextEmbeddingModelConfig.PoolingMode.NONE;
+        boolean normalize = true;
+        int modelMaxLength = 512;
+        MLModelFormat modelFormat = MLModelFormat.TORCH_SCRIPT;
+        initModel_predict_HuggingfaceModel(modelFile, modelType, poolingMode, normalize, modelMaxLength, modelFormat, dimension);
+    }
+
     private void initModel_predict_HuggingfaceModel(
         String modelFile,
         String modelType,
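Beyond these end-to-end tests, the builder-side check extended in the Huggingface translator amounts to membership in a fixed set of names. A set-based sketch of the same rule (illustrative; not the actual builder code, which chains `!equals` comparisons):

```java
import java.util.Set;

public class PoolingModeValidation {

    // The accepted pooling names after this change.
    static final Set<String> ALLOWED =
        Set.of("mean", "max", "cls", "mean_sqrt_len", "weightedmean", "lasttoken", "none");

    static void validate(String poolingMode) {
        if (!ALLOWED.contains(poolingMode)) {
            throw new IllegalArgumentException(
                "Invalid pooling model, must be one of [mean, max, cls, mean_sqrt_len, weightedmean, lasttoken, none].");
        }
    }

    public static void main(String[] args) {
        validate("none"); // accepted after this change
        try {
            validate("median");
        } catch (IllegalArgumentException e) {
            System.out.println("rejected: median");
        }
    }
}
```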

release-notes/opensearch-ml-commons.release-notes-3.4.0.0.md (1 addition, 0 deletions)

@@ -9,6 +9,7 @@ Compatible with OpenSearch and OpenSearch Dashboards version 3.4.0
 * allow higher maximum number of batch inference job tasks ([#4474](https://github.com/opensearch-project/ml-commons/pull/4474))

 ### Bug Fixes
+* Add NONE pooling mode to support pre-pooled model outputs, fixing bug where MEAN pooling was applied by default ([#4708](https://github.com/opensearch-project/ml-commons/issues/4708))
 * Fix agent type update ([#4341](https://github.com/opensearch-project/ml-commons/pull/4341))
 * Handle edge case of empty values of tool configs ([#4479](https://github.com/opensearch-project/ml-commons/pull/4479))
 * Fix OpenAI RAG integration tests: Replace Wikimedia image URL with Unsplash ([#4472](https://github.com/opensearch-project/ml-commons/pull/4472))
