Skip to content

Commit 0f3f883

Browse files
committed
fix(embed): mark all tokens for output to suppress llama.cpp 'overriding' warning (#2208)
1 parent f774690 commit 0f3f883

1 file changed

Lines changed: 2 additions & 1 deletion

File tree

llama_cpp/llama.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1040,7 +1040,8 @@ def embed(
1040 1040
1041 1041          # get pooling information
1042 1042          pooling_type = self.pooling_type()
1043      -        logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
     1043 +        # All tokens need outputs for embeddings; llama.cpp otherwise logs an "overriding" warning per input.
     1044 +        logits_all = True
1044 1045
1045 1046          if self.context_params.embeddings is False:
1046 1047              raise RuntimeError(

0 commit comments

Comments (0)