Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

- feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings
- fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
- fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls

Expand Down
88 changes: 88 additions & 0 deletions llama_cpp/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ def _warn_deprecated(symbol: str, hint: str) -> None:
llama_token_p = ctypes.POINTER(llama_token)  # llama_token * — pointer to a token buffer
# typedef int32_t llama_seq_id;
llama_seq_id = ctypes.c_int32  # identifies one sequence within a context
# typedef uint32_t llama_state_seq_flags;
# Bitmask controlling per-sequence state save/load behavior; valid bits are
# the LLAMA_STATE_SEQ_FLAGS_* constants declared elsewhere in this module.
llama_state_seq_flags = ctypes.c_uint32


# enum llama_vocab_type {
Expand Down Expand Up @@ -2835,6 +2837,92 @@ def llama_state_seq_load_file(
) -> int: ...


# for backwards-compat
# define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
# Deprecated alias: upstream renamed SWA_ONLY to PARTIAL_ONLY, so both names
# deliberately share the value 1.
LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1

# work only with partial states, such as SWA KV cache or recurrent cache
# (e.g. Mamba)
# define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 1

# keeps the tensor data on device buffers
# (i.e. not accessible in host memory, but faster save/load)
# define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2
LLAMA_STATE_SEQ_FLAGS_ON_DEVICE = 2


# LLAMA_API size_t llama_state_seq_get_size_ext(
# struct llama_context * ctx,
# llama_seq_id seq_id,
# llama_state_seq_flags flags);
@ctypes_function(
    "llama_state_seq_get_size_ext",
    [llama_context_p_ctypes, llama_seq_id, llama_state_seq_flags],
    ctypes.c_size_t,
)
def llama_state_seq_get_size_ext(
    ctx: llama_context_p,
    seq_id: llama_seq_id,
    flags: llama_state_seq_flags,
    /,
) -> int:
    """Return the buffer size (in bytes) needed to serialize the state of a
    single sequence, taking the given LLAMA_STATE_SEQ_FLAGS_* flags into
    account.

    Flags-aware extension of ``llama_state_seq_get_size``. The body is a
    typing stub: the real call is dispatched to the native library by the
    ``@ctypes_function`` decorator above.
    """
    ...


# LLAMA_API size_t llama_state_seq_get_data_ext(
# struct llama_context * ctx,
# uint8_t * dst,
# size_t size,
# llama_seq_id seq_id,
# llama_state_seq_flags flags);
@ctypes_function(
    "llama_state_seq_get_data_ext",
    [
        llama_context_p_ctypes,
        ctypes.POINTER(ctypes.c_uint8),
        ctypes.c_size_t,
        llama_seq_id,
        llama_state_seq_flags,
    ],
    ctypes.c_size_t,
)
def llama_state_seq_get_data_ext(
    ctx: llama_context_p,
    dst: CtypesArray[ctypes.c_uint8],
    size: Union[ctypes.c_size_t, int],
    seq_id: llama_seq_id,
    flags: llama_state_seq_flags,
    /,
) -> int:
    """Copy the state of sequence ``seq_id`` into ``dst`` (a caller-allocated
    byte buffer of ``size`` bytes), honoring the given flags, and return the
    number of bytes written.

    ``dst`` should be sized via ``llama_state_seq_get_size_ext`` with the
    same flags. Typing stub — the call is dispatched to the native library by
    ``@ctypes_function``.
    """
    ...


# LLAMA_API size_t llama_state_seq_set_data_ext(
# struct llama_context * ctx,
# const uint8_t * src,
# size_t size,
# llama_seq_id dest_seq_id,
# llama_state_seq_flags flags);
@ctypes_function(
    "llama_state_seq_set_data_ext",
    [
        llama_context_p_ctypes,
        ctypes.POINTER(ctypes.c_uint8),
        ctypes.c_size_t,
        llama_seq_id,
        llama_state_seq_flags,
    ],
    ctypes.c_size_t,
)
def llama_state_seq_set_data_ext(
    ctx: llama_context_p,
    src: CtypesArray[ctypes.c_uint8],
    size: Union[ctypes.c_size_t, int],
    dest_seq_id: llama_seq_id,
    flags: llama_state_seq_flags,
    /,
) -> int:
    """Restore previously saved sequence state from ``src`` (``size`` bytes,
    as produced by ``llama_state_seq_get_data_ext``) into sequence
    ``dest_seq_id``, honoring the given flags, and return the number of bytes
    read.

    Typing stub — the call is dispatched to the native library by
    ``@ctypes_function``.
    """
    ...


# //
# // Decoding
# //
Expand Down
2 changes: 1 addition & 1 deletion vendor/llama.cpp
Loading