
Commit 1b5fade

fix(embed): mark all tokens as output to suppress llama.cpp "overriding" INFO
Force logits_all=True in Llama.embed() so the per-token batch.logits[i] flags are all set, regardless of pooling type. Previously, when pooling != NONE, add_sequence flipped most tokens to logits[i]=False, and llama.cpp printed "init: embeddings required but some input tokens were not marked as outputs -> overriding" once per embed input and silently overrode the flags.

Pooling type only changes how per-token outputs are read back in decode_batch (llama_get_embeddings vs llama_get_embeddings_seq), not whether they are produced, so this aligns the per-token flags with what llama.cpp already needed and removes the noisy per-input override message.

Fixes #2208.
1 parent f774690 commit 1b5fade
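
A minimal sketch of the flag change described above. mark_outputs is a hypothetical helper, not the library's API; it simplifies the add_sequence behavior the commit message refers to (the final token is assumed to always be flagged, with logits_all controlling the rest):

# Hypothetical helper illustrating the per-token output flags; a simplified
# model of _LlamaBatch.add_sequence, not the actual bindings.
def mark_outputs(n_tokens: int, logits_all: bool) -> list:
    # The last token is flagged unconditionally; logits_all controls whether
    # the preceding tokens are also marked as outputs.
    return [logits_all or i == n_tokens - 1 for i in range(n_tokens)]

# Before this commit, pooled embedding (pooling != NONE) passed logits_all=False:
print(mark_outputs(4, logits_all=False))  # [False, False, False, True] -> llama.cpp overrides
# After, embed() always passes logits_all=True, matching what llama.cpp needs:
print(mark_outputs(4, logits_all=True))   # [True, True, True, True]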

1 file changed: llama_cpp/llama.py (7 additions & 1 deletion)
@@ -1040,7 +1040,13 @@ def embed(
 
         # get pooling information
         pooling_type = self.pooling_type()
-        logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
+        # In embedding mode every input token must be marked as an output, regardless of
+        # pooling type. llama.cpp would otherwise override per-token `logits[i]` and emit
+        # "embeddings required but some input tokens were not marked as outputs ->
+        # overriding" once per input. Pooling NONE vs MEAN/CLS only changes how the
+        # per-token outputs are read back (see decode_batch below), not whether they are
+        # produced. See abetlen/llama-cpp-python#2208.
+        logits_all = True
 
         if self.context_params.embeddings is False:
             raise RuntimeError(
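
A hedged usage sketch of the high-level embedding API after this fix; the model path is a placeholder, and the pooling behavior follows the commit message:

# Usage sketch ("model.gguf" is a placeholder; embedding=True enables embedding mode).
from llama_cpp import Llama

llm = Llama(model_path="model.gguf", embedding=True)

# With a pooled model (MEAN/CLS), embed() returns one vector per input string;
# with LLAMA_POOLING_TYPE_NONE it returns one vector per token. Either way the
# per-token output flags are now set up front, so llama.cpp no longer prints
# the "... not marked as outputs -> overriding" INFO line for each input.
vec = llm.embed("hello world")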

0 commit comments