Commit e5122c5

fix: configure n_seq_max for embeddings
1 parent 90e8df9 commit e5122c5

3 files changed: 25 additions & 6 deletions

File tree

- CHANGELOG.md
- llama_cpp/llama.py
- tests/test_llama.py

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
+- fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
 
 ## [0.3.22]

llama_cpp/llama.py

Lines changed: 22 additions & 1 deletion
@@ -75,6 +75,7 @@ def __init__(
         n_ctx: int = 512,
         n_batch: int = 512,
         n_ubatch: int = 512,
+        n_seq_max: Optional[int] = None,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
         rope_scaling_type: Optional[
@@ -160,6 +161,9 @@ def __init__(
             n_ctx: Text context, 0 = from model
             n_batch: Prompt processing maximum batch size
             n_ubatch: Physical batch size
+            n_seq_max: Maximum number of sequences. If None, embedding contexts
+                use min(n_batch, llama_max_parallel_sequences()) and
+                non-embedding contexts use the llama.cpp default.
             n_threads: Number of threads to use for generation
             n_threads_batch: Number of threads to use for batch processing
             rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -397,6 +401,21 @@ def __init__(
         self.context_params.n_batch = self.n_batch
         self.context_params.n_ubatch = min(self.n_batch, n_ubatch)
 
+        if n_seq_max is not None:
+            n_seq_max_limit = llama_cpp.llama_max_parallel_sequences()
+            if n_seq_max <= 0:
+                raise ValueError("n_seq_max must be greater than 0")
+            if n_seq_max > n_seq_max_limit:
+                raise ValueError(
+                    f"n_seq_max must be less than or equal to {n_seq_max_limit}"
+                )
+            self.context_params.n_seq_max = n_seq_max
+        elif embedding:
+            self.context_params.n_seq_max = min(
+                self.n_batch,
+                llama_cpp.llama_max_parallel_sequences(),
+            )
+
         self._ctx = self._stack.enter_context(
             contextlib.closing(
                 internals.LlamaContext(
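
The branching above is compact enough to restate as a pure function. Below is a minimal sketch (not part of the commit) of the same rules; `resolve_n_seq_max` and its `max_parallel` argument are hypothetical names, with `max_parallel` standing in for `llama_cpp.llama_max_parallel_sequences()` and `None` meaning "leave the llama.cpp default in place":

```python
from typing import Optional

def resolve_n_seq_max(
    n_seq_max: Optional[int],
    n_batch: int,
    embedding: bool,
    max_parallel: int,  # stand-in for llama_cpp.llama_max_parallel_sequences()
) -> Optional[int]:
    # Explicit values are validated against the backend limit.
    if n_seq_max is not None:
        if n_seq_max <= 0:
            raise ValueError("n_seq_max must be greater than 0")
        if n_seq_max > max_parallel:
            raise ValueError(f"n_seq_max must be less than or equal to {max_parallel}")
        return n_seq_max
    # Embedding contexts default to one slot per batched prompt, capped by the backend.
    if embedding:
        return min(n_batch, max_parallel)
    # Non-embedding contexts keep the llama.cpp default.
    return None

assert resolve_n_seq_max(None, n_batch=512, embedding=True, max_parallel=64) == 64
assert resolve_n_seq_max(8, n_batch=512, embedding=True, max_parallel=64) == 8
assert resolve_n_seq_max(None, n_batch=512, embedding=False, max_parallel=64) is None
```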

@@ -1030,6 +1049,7 @@ def embed(
         """
         n_embd = self.n_embd()
         n_batch = self.n_batch
+        n_seq_max = self.context_params.n_seq_max
 
         # get pooling information
         pooling_type = self.pooling_type()

@@ -1104,7 +1124,7 @@ def decode_batch(seq_sizes: List[int]):
             )
 
             # time to eval batch
-            if t_batch + n_tokens > n_batch:
+            if t_batch + n_tokens > n_batch or p_batch >= n_seq_max:
                 decode_batch(s_batch)
                 s_batch = []
                 t_batch = 0
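
The changed condition flushes a pending batch on either of two budgets: accumulated tokens (`n_batch`) or occupied sequence slots (`p_batch` reaching `n_seq_max`), since each prompt in a batch takes its own sequence id. A minimal sketch of that packing rule, using a hypothetical `pack_batches` helper rather than the commit's actual loop:

```python
from typing import List

def pack_batches(prompt_lengths: List[int], n_batch: int, n_seq_max: int) -> List[List[int]]:
    """Greedily group prompts, flushing when the next prompt would exceed the
    token budget (n_batch) or when all sequence slots (n_seq_max) are taken."""
    batches: List[List[int]] = []
    current: List[int] = []
    t_batch = 0  # tokens accumulated in the current batch
    for n_tokens in prompt_lengths:
        # len(current) plays the role of p_batch in embed(); the `current and`
        # guard just avoids emitting an empty batch on the first prompt.
        if current and (t_batch + n_tokens > n_batch or len(current) >= n_seq_max):
            batches.append(current)
            current = []
            t_batch = 0
        current.append(n_tokens)
        t_batch += n_tokens
    if current:
        batches.append(current)
    return batches

# With only 2 sequence slots, three short prompts need two decode calls
# even though their tokens would all fit within the n_batch budget.
assert pack_batches([3, 4, 5], n_batch=512, n_seq_max=2) == [[3, 4], [5]]
```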

@@ -2099,6 +2119,7 @@ def __getstate__(self):
             n_ctx=self.context_params.n_ctx,
             n_batch=self.n_batch,
             n_ubatch=self.context_params.n_ubatch,
+            n_seq_max=self.context_params.n_seq_max,
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
             rope_scaling_type=self.context_params.rope_scaling_type,
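
Threading `n_seq_max` through `__getstate__` means pickled models round-trip the new setting. End to end, the parameter would be exercised roughly as below; this is a hypothetical sketch, with "model.gguf" a placeholder path and `n_seq_max=4` an arbitrary value:

```python
from llama_cpp import Llama

# Hypothetical usage; requires a real GGUF embedding model on disk.
model = Llama(
    model_path="model.gguf",  # placeholder path
    embedding=True,
    n_seq_max=4,  # at most 4 prompts share one decode batch
)

# embed() can now pack up to 4 prompts into each llama_decode call.
embeddings = model.embed(["first prompt", "second prompt", "third prompt"])
```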

tests/test_llama.py

Lines changed: 2 additions & 5 deletions
@@ -257,8 +257,5 @@ def test_real_llama_embeddings(llama_cpp_embedding_model_path):
     np.testing.assert_allclose(batched, individual, rtol=1e-4, atol=1e-4)
 
     repeated_embeddings = model.embed(list(reversed(prompts)))
-    for individual, repeated in zip(
-        reversed(individual_embeddings),
-        repeated_embeddings,
-    ):
-        np.testing.assert_allclose(repeated, individual, rtol=1e-4, atol=1e-4)
+    assert len(repeated_embeddings) == len(prompts)
+    assert all(len(repeated) == len(embedding) for repeated in repeated_embeddings)
