#***********************************************************************************
# KV Cache ("34 prefix-match hit") issue on second dialog turn with llama python bindings 
# Model https://huggingface.co/ggml-org/Nemotron-Nano-3-30B-A3B-GGUF
#***********************************************************************************

# Two turn dialogs Work in llama-cli - just to make (pretty) sure that it isn't an llama_cpp error
 
!echo "Hello!\nJust wanted to say hello.\n/exit\n" | ./llama.cpp/build/bin/llama-cli -m "Nemotron-Nano-3-30B-A3B-Q4_K_M.gguf"
 

#************************************************************************************
# Minimum Example to reproduce error
#   produces error regardless of whether on GPU or CPU (n_gpu_layers=0)
#************************************************************************************ 

from llama_cpp import Llama

llm = Llama(
  #https://huggingface.co/ggml-org/Nemotron-Nano-3-30B-A3B-GGUF
  model_path="Nemotron-Nano-3-30B-A3B-Q4_K_M.gguf", n_gpu_layers=100,
  n_ctx=48*1024,
  verbose=False
) 
 
if True:
    output=llm.create_chat_completion(
        messages = [
            {"role": "user","content": "Hello"}
        ],
        seed=1,   
    )
    print(output['choices'][0]['message']['content'])
    
output=llm.create_chat_completion(
    messages = [
        {"role": "user","content": "Hello"},
        {"role": "assisstent","content": "Hello! How can I assist you today?"},
        {"role": "user","content": "Hello"},
    ],
    seed=1,   
)
print(output['choices'][0]['message']['content'])
 
#************************************************************************************ 
# Error Output
#************************************************************************************  
 
# First turn works:

We need to respond. The user says "Hello". Probably a greeting. We can respond politely.
</think>
Hello! How can I assist you today?

# But second turn (only if second turn - works as first turn) produces 
 
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[2], line 9
      2     output=llm.create_chat_completion(
      3         messages = [
      4             {"role": "user","content": "Hello"}
      5         ],
      6         seed=1,   
      7     )
      8     print(output['choices'][0]['message']['content'])
----> 9 output=llm.create_chat_completion(
     10     messages = [
     11         {"role": "user","content": "Hello"},
     12         {"role": "assisstent","content": 'We need to respond. The user says "Hello". Probably a greeting. We can respond politely.</think>Hello! How can I assist you today?'},
     13         {"role": "user","content": "Hello"},
     14     ],
     15     seed=1,   
     16 )
     17 print(output['choices'][0]['message']['content'])

File /usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py:2017, in Llama.create_chat_completion(self, messages, functions, function_call, tools, tool_choice, temperature, top_p, top_k, min_p, typical_p, stream, stop, seed, response_format, max_tokens, presence_penalty, frequency_penalty, repeat_penalty, tfs_z, mirostat_mode, mirostat_tau, mirostat_eta, model, logits_processor, grammar, logit_bias, logprobs, top_logprobs)
   1979 """Generate a chat completion from a list of messages.
   1980 
   1981 Args:
   (...)   2010     Generated chat completion or a stream of chat completion chunks.
   2011 """
   2012 handler = (
   2013     self.chat_handler
   2014     or self._chat_handlers.get(self.chat_format)
   2015     or llama_chat_format.get_chat_completion_handler(self.chat_format)
   2016 )
-> 2017 return handler(
   2018     llama=self,
   2019     messages=messages,
   2020     functions=functions,
   2021     function_call=function_call,
   2022     tools=tools,
   2023     tool_choice=tool_choice,
   2024     temperature=temperature,
   2025     top_p=top_p,
   2026     top_k=top_k,
   2027     min_p=min_p,
   2028     typical_p=typical_p,
   2029     logprobs=logprobs,
   2030     top_logprobs=top_logprobs,
   2031     stream=stream,
   2032     stop=stop,
   2033     seed=seed,
   2034     response_format=response_format,
   2035     max_tokens=max_tokens,
   2036     presence_penalty=presence_penalty,
   2037     frequency_penalty=frequency_penalty,
   2038     repeat_penalty=repeat_penalty,
   2039     tfs_z=tfs_z,
   2040     mirostat_mode=mirostat_mode,
   2041     mirostat_tau=mirostat_tau,
   2042     mirostat_eta=mirostat_eta,
   2043     model=model,
   2044     logits_processor=logits_processor,
   2045     grammar=grammar,
   2046     logit_bias=logit_bias,
   2047 )

File /usr/local/lib/python3.11/dist-packages/llama_cpp/llama_chat_format.py:669, in chat_formatter_to_chat_completion_handler.<locals>.chat_completion_handler(llama, messages, functions, function_call, tools, tool_choice, temperature, top_p, top_k, min_p, typical_p, stream, stop, seed, response_format, max_tokens, presence_penalty, frequency_penalty, repeat_penalty, tfs_z, mirostat_mode, mirostat_tau, mirostat_eta, model, logits_processor, grammar, logit_bias, logprobs, top_logprobs, **kwargs)
    664             print(str(e), file=sys.stderr)
    665         grammar = llama_grammar.LlamaGrammar.from_string(
    666             llama_grammar.JSON_GBNF, verbose=llama.verbose
    667         )
--> 669 completion_or_chunks = llama.create_completion(
    670     prompt=prompt,
    671     temperature=temperature,
    672     top_p=top_p,
    673     top_k=top_k,
    674     min_p=min_p,
    675     typical_p=typical_p,
    676     logprobs=top_logprobs if logprobs else None,
    677     stream=stream,
    678     stop=stop,
    679     seed=seed,
    680     max_tokens=max_tokens,
    681     presence_penalty=presence_penalty,
    682     frequency_penalty=frequency_penalty,
    683     repeat_penalty=repeat_penalty,
    684     tfs_z=tfs_z,
    685     mirostat_mode=mirostat_mode,
    686     mirostat_tau=mirostat_tau,
    687     mirostat_eta=mirostat_eta,
    688     model=model,
    689     logits_processor=logits_processor,
    690     stopping_criteria=stopping_criteria,
    691     grammar=grammar,
    692     logit_bias=logit_bias,
    693 )
    694 if tool is not None:
    695     tool_name = tool["function"]["name"]

File /usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py:1851, in Llama.create_completion(self, prompt, suffix, max_tokens, temperature, top_p, min_p, typical_p, logprobs, echo, stop, frequency_penalty, presence_penalty, repeat_penalty, top_k, stream, seed, tfs_z, mirostat_mode, mirostat_tau, mirostat_eta, model, stopping_criteria, logits_processor, grammar, logit_bias)
   1849     chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks
   1850     return chunks
-> 1851 completion: Completion = next(completion_or_chunks)  # type: ignore
   1852 return completion

File /usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py:1336, in Llama._create_completion(self, prompt, suffix, max_tokens, temperature, top_p, min_p, typical_p, logprobs, echo, stop, frequency_penalty, presence_penalty, repeat_penalty, top_k, stream, seed, tfs_z, mirostat_mode, mirostat_tau, mirostat_eta, model, stopping_criteria, logits_processor, grammar, logit_bias)
   1334 finish_reason = "length"
   1335 multibyte_fix = 0
-> 1336 for token in self.generate(
   1337     prompt_tokens,
   1338     top_k=top_k,
   1339     top_p=top_p,
   1340     min_p=min_p,
   1341     typical_p=typical_p,
   1342     temp=temperature,
   1343     tfs_z=tfs_z,
   1344     mirostat_mode=mirostat_mode,
   1345     mirostat_tau=mirostat_tau,
   1346     mirostat_eta=mirostat_eta,
   1347     frequency_penalty=frequency_penalty,
   1348     presence_penalty=presence_penalty,
   1349     repeat_penalty=repeat_penalty,
   1350     stopping_criteria=stopping_criteria,
   1351     logits_processor=logits_processor,
   1352     grammar=grammar,
   1353 ):
   1354     if llama_cpp.llama_token_is_eog(self._model.vocab, token):
   1355         text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)

File /usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py:923, in Llama.generate(self, tokens, top_k, top_p, min_p, typical_p, temp, repeat_penalty, reset, frequency_penalty, presence_penalty, tfs_z, mirostat_mode, mirostat_tau, mirostat_eta, penalize_nl, logits_processor, stopping_criteria, grammar)
    921 # Eval and sample
    922 while True:
--> 923     self.eval(tokens)
    924     while sample_idx < self.n_tokens:
    925         token = self.sample(
    926             top_k=top_k,
    927             top_p=top_p,
   (...)    941             idx=sample_idx,
    942         )

File /usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py:657, in Llama.eval(self, tokens)
    653 n_tokens = len(batch)
    654 self._batch.set_batch(
    655     batch=batch, n_past=n_past, logits_all=self._logits_all
    656 )
--> 657 self._ctx.decode(self._batch)
    658 # Save tokens
    659 self.input_ids[n_past : n_past + n_tokens] = batch

File /usr/local/lib/python3.11/dist-packages/llama_cpp/_internals.py:327, in LlamaContext.decode(self, batch)
    322 return_code = llama_cpp.llama_decode(
    323     self.ctx,
    324     batch.batch,
    325 )
    326 if return_code != 0:
--> 327     raise RuntimeError(f"llama_decode returned {return_code}")

RuntimeError: llama_decode returned -1 
 
 
 
#************************************************************************************
#This is the error on second dialog turn I get in a my larger application
#************************************************************************************
 
Llama.generate: 34 prefix-match hit, remaining 820 prompt tokens to eval
init: the tokens of sequence 0 in the input batch have inconsistent sequence positions:
 - the last position stored in the memory module of the context (i.e. the KV cache) for sequence 0 is X = 767
 - the tokens for sequence 0 in the input batch have a starting position of Y = 34
 it is required that the sequence positions remain consecutive: Y = X + 1
decode: failed to initialize batch
llama_decode: failed to decode, ret = -1
Exception ignored in thread started by: <bound method chatbot.on_prompt1 of <__main__.chatbot object at 0x7f96641b9ed0>>
Traceback (most recent call last):
  File "/tmp/ipykernel_163/1651786471.py", line 1013, in on_prompt1
  File "/usr/local/lib/python3.11/dist-packages/llama_cpp/llama_chat_format.py", line 321, in _convert_text_completion_chunks_to_chat
    for i, chunk in enumerate(chunks):
  File "/usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py", line 1336, in _create_completion
    for token in self.generate(
  File "/usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py", line 923, in generate
    self.eval(tokens)
  File "/usr/local/lib/python3.11/dist-packages/llama_cpp/llama.py", line 657, in eval
    self._ctx.decode(self._batch)
  File "/usr/local/lib/python3.11/dist-packages/llama_cpp/_internals.py", line 327, in decode
    raise RuntimeError(f"llama_decode returned {return_code}")
RuntimeError: llama_decode returned -1