#***********************************************************************************
# KV cache ("34 prefix-match hit") issue on second dialog turn with the
# llama-cpp-python bindings
# Model: https://huggingface.co/ggml-org/Nemotron-Nano-3-30B-A3B-GGUF
#***********************************************************************************

# Two-turn dialogs work in llama-cli - just to make (pretty) sure the bug is
# not in llama.cpp itself:
!echo "Hello!\nJust wanted to say hello.\n/exit\n" | ./llama.cpp/build/bin/llama-cli -m "Nemotron-Nano-3-30B-A3B-Q4_K_M.gguf"

#************************************************************************************
# Minimal example to reproduce the error
# The error occurs regardless of whether the model runs on GPU or CPU (n_gpu_layers=0)
#************************************************************************************
from llama_cpp import Llama

llm = Llama(
    # https://huggingface.co/ggml-org/Nemotron-Nano-3-30B-A3B-GGUF
    model_path="Nemotron-Nano-3-30B-A3B-Q4_K_M.gguf",
    n_gpu_layers=100,
    n_ctx=48 * 1024,
    verbose=False,
)

# First turn - works
output = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": "Hello"},
    ],
    seed=1,
)
print(output['choices'][0]['message']['content'])

# Second turn - fails
output = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hello! How can I assist you today?"},
        {"role": "user", "content": "Hello"},
    ],
    seed=1,
)
print(output['choices'][0]['message']['content'])
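#************************************************************************************
# Background: why only the *second* call can fail
# llama-cpp-python keeps the tokens of the previous call and, on the next call,
# reuses the longest shared token prefix from the KV cache ("prefix-match hit").
# A rough sketch of that matching step (my simplification for illustration,
# not the actual library source):
#************************************************************************************
def longest_shared_prefix(prev_tokens: list[int], new_tokens: list[int]) -> int:
    """Count how many leading tokens two tokenized prompts share."""
    n = 0
    for a, b in zip(prev_tokens, new_tokens):
        if a != b:
            break
        n += 1
    return n

# Both turns render to prompts that start with the same "user: Hello" part,
# so only the tokens after that shared prefix get (re)evaluated - hence the
# "34 prefix-match hit, remaining 820 prompt tokens to eval" message below.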
2011 """ 2012 handler = ( 2013 self.chat_handler 2014 or self._chat_handlers.get(self.chat_format) 2015 or llama_chat_format.get_chat_completion_handler(self.chat_format) 2016 ) -> 2017 return handler( 2018 llama=self, 2019 messages=messages, 2020 functions=functions, 2021 function_call=function_call, 2022 tools=tools, 2023 tool_choice=tool_choice, 2024 temperature=temperature, 2025 top_p=top_p, 2026 top_k=top_k, 2027 min_p=min_p, 2028 typical_p=typical_p, 2029 logprobs=logprobs, 2030 top_logprobs=top_logprobs, 2031 stream=stream, 2032 stop=stop, 2033 seed=seed, 2034 response_format=response_format, 2035 max_tokens=max_tokens, 2036 presence_penalty=presence_penalty, 2037 frequency_penalty=frequency_penalty, 2038 repeat_penalty=repeat_penalty, 2039 tfs_z=tfs_z, 2040 mirostat_mode=mirostat_mode, 2041 mirostat_tau=mirostat_tau, 2042 mirostat_eta=mirostat_eta, 2043 model=model, 2044 logits_processor=logits_processor, 2045 grammar=grammar, 2046 logit_bias=logit_bias, 2047 ) File /usr/local/lib/python3.11/dist-packages/llama_cpp/llama_chat_format.py:669, in chat_formatter_to_chat_completion_handler..chat_completion_handler(llama, messages, functions, function_call, tools, tool_choice, temperature, top_p, top_k, min_p, typical_p, stream, stop, seed, response_format, max_tokens, presence_penalty, frequency_penalty, repeat_penalty, tfs_z, mirostat_mode, mirostat_tau, mirostat_eta, model, logits_processor, grammar, logit_bias, logprobs, top_logprobs, **kwargs) 664 print(str(e), file=sys.stderr) 665 grammar = llama_grammar.LlamaGrammar.from_string( 666 llama_grammar.JSON_GBNF, verbose=llama.verbose 667 ) --> 669 completion_or_chunks = llama.create_completion( 670 prompt=prompt, 671 temperature=temperature, 672 top_p=top_p, 673 top_k=top_k, 674 min_p=min_p, 675 typical_p=typical_p, 676 logprobs=top_logprobs if logprobs else None, 677 stream=stream, 678 stop=stop, 679 seed=seed, 680 max_tokens=max_tokens, 681 presence_penalty=presence_penalty, 682 frequency_penalty=frequency_penalty, 683 repeat_penalty=repeat_penalty, 684 tfs_z=tfs_z, 685 mirostat_mode=mirostat_mode, 686 mirostat_tau=mirostat_tau, 687 mirostat_eta=mirostat_eta, 688 model=model, 689 logits_processor=logits_processor, 690 stopping_criteria=stopping_criteria, 691 grammar=grammar, 692 logit_bias=logit_bias, 693 ) 694 if tool is not None: 695 tool_name = tool["function"]["name"] File /usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py:1851, in Llama.create_completion(self, prompt, suffix, max_tokens, temperature, top_p, min_p, typical_p, logprobs, echo, stop, frequency_penalty, presence_penalty, repeat_penalty, top_k, stream, seed, tfs_z, mirostat_mode, mirostat_tau, mirostat_eta, model, stopping_criteria, logits_processor, grammar, logit_bias) 1849 chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks 1850 return chunks -> 1851 completion: Completion = next(completion_or_chunks) # type: ignore 1852 return completion File /usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py:1336, in Llama._create_completion(self, prompt, suffix, max_tokens, temperature, top_p, min_p, typical_p, logprobs, echo, stop, frequency_penalty, presence_penalty, repeat_penalty, top_k, stream, seed, tfs_z, mirostat_mode, mirostat_tau, mirostat_eta, model, stopping_criteria, logits_processor, grammar, logit_bias) 1334 finish_reason = "length" 1335 multibyte_fix = 0 -> 1336 for token in self.generate( 1337 prompt_tokens, 1338 top_k=top_k, 1339 top_p=top_p, 1340 min_p=min_p, 1341 typical_p=typical_p, 1342 temp=temperature, 1343 tfs_z=tfs_z, 
#************************************************************************************
# This is the error I get on the second dialog turn in my larger application
#************************************************************************************
Llama.generate: 34 prefix-match hit, remaining 820 prompt tokens to eval
init: the tokens of sequence 0 in the input batch have inconsistent sequence positions:
 - the last position stored in the memory module of the context (i.e. the KV cache) for sequence 0 is X = 767
 - the tokens for sequence 0 in the input batch have a starting position of Y = 34
 it is required that the sequence positions remain consecutive: Y = X + 1
decode: failed to initialize batch
llama_decode: failed to decode, ret = -1
Exception ignored in thread started by: <...>
Traceback (most recent call last):
  File "/tmp/ipykernel_163/1651786471.py", line 1013, in on_prompt1
  File "/usr/local/lib/python3.11/dist-packages/llama_cpp/llama_chat_format.py", line 321, in _convert_text_completion_chunks_to_chat
    for i, chunk in enumerate(chunks):
  File "/usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py", line 1336, in _create_completion
    for token in self.generate(
  File "/usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py", line 923, in generate
    self.eval(tokens)
  File "/usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py", line 657, in eval
    self._ctx.decode(self._batch)
  File "/usr/local/lib/python3.11/dist-packages/llama_cpp/_internals.py", line 327, in decode
    raise RuntimeError(f"llama_decode returned {return_code}")
RuntimeError: llama_decode returned -1
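#************************************************************************************
# What the numbers mean: per the message above, llama.cpp requires the new
# batch for a sequence to continue directly after the last cached position
# (Y = X + 1). A toy version of that check (my hypothetical simplification,
# not the real init logic):
#************************************************************************************
def check_batch_start(last_cached_pos: int, batch_start_pos: int) -> None:
    if batch_start_pos != last_cached_pos + 1:
        raise RuntimeError(
            f"inconsistent positions: cache ends at X = {last_cached_pos}, "
            f"batch starts at Y = {batch_start_pos}, need Y = X + 1"
        )

try:
    check_batch_start(last_cached_pos=767, batch_start_pos=34)
except RuntimeError as e:
    print(e)  # reproduces the X = 767 / Y = 34 complaint from the log

#************************************************************************************
# Possible workaround (an untested sketch, not a verified fix): discard the
# cached state between turns so every call is evaluated from position 0.
# Llama.reset() is part of llama-cpp-python's public API; whether resetting
# is sufficient for this model is an assumption on my part.
#************************************************************************************
def chat_without_cache_reuse(llm, messages, **kwargs):
    llm.reset()  # rewind the internal token counter so no prefix can match
    return llm.create_chat_completion(messages=messages, **kwargs)

# Usage: chat_without_cache_reuse(llm, messages=[...], seed=1)
# This trades away the prefix-cache speedup, so it is a stopgap at best.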