Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings
- feat: Update llama.cpp to ggml-org/llama.cpp@b9a2170fc and sync Python bindings
- chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034
- fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217

Expand Down
21 changes: 21 additions & 0 deletions llama_cpp/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,14 @@ def _warn_deprecated(symbol: str, hint: str) -> None:
LLAMA_SPLIT_MODE_TENSOR = 3


# enum llama_context_type {
# LLAMA_CONTEXT_TYPE_DEFAULT = 0,
# LLAMA_CONTEXT_TYPE_MTP = 1,
# };
# Python mirrors of the C `llama_context_type` enumerators listed above.
# These are the integer values stored in `llama_context_params.ctx_type`.
LLAMA_CONTEXT_TYPE_DEFAULT = 0
LLAMA_CONTEXT_TYPE_MTP = 1


# typedef struct llama_token_data {
# llama_token id; // token id
# float logit; // log-odds of the token
Expand Down Expand Up @@ -894,9 +902,11 @@ class llama_sampler_seq_config(ctypes.Structure):
# uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
# uint32_t n_ubatch; // physical maximum batch size
# uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
# uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
# int32_t n_threads; // number of threads to use for generation
# int32_t n_threads_batch; // number of threads to use for batch processing

# enum llama_context_type ctx_type; // set the context type (e.g. MTP)
# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
# enum llama_attention_type attention_type; // attention type to use for embeddings
Expand Down Expand Up @@ -947,8 +957,10 @@ class llama_context_params(ctypes.Structure):
n_batch (int): logical maximum batch size that can be submitted to llama_decode
n_ubatch (int): physical maximum batch size
n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models)
n_rs_seq (int): number of recurrent-state snapshots per sequence for rollback
n_threads (int): number of threads to use for generation
n_threads_batch (int): number of threads to use for batch processing
ctx_type (int): context type, from `enum llama_context_type`
rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
attention_type (int): attention type to use for embeddings
Expand Down Expand Up @@ -982,8 +994,10 @@ class llama_context_params(ctypes.Structure):
n_batch: int
n_ubatch: int
n_seq_max: int
n_rs_seq: int
n_threads: int
n_threads_batch: int
ctx_type: int
rope_scaling_type: int
pooling_type: int
attention_type: int
Expand Down Expand Up @@ -1016,8 +1030,10 @@ class llama_context_params(ctypes.Structure):
("n_batch", ctypes.c_uint32),
("n_ubatch", ctypes.c_uint32),
("n_seq_max", ctypes.c_uint32),
("n_rs_seq", ctypes.c_uint32),
("n_threads", ctypes.c_int32),
("n_threads_batch", ctypes.c_int32),
("ctx_type", ctypes.c_int),
("rope_scaling_type", ctypes.c_int),
("pooling_type", ctypes.c_int),
("attention_type", ctypes.c_int),
Expand Down Expand Up @@ -1591,6 +1607,11 @@ def llama_n_ubatch(ctx: llama_context_p, /) -> int: ...
def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...


# LLAMA_API uint32_t llama_n_rs_seq (const struct llama_context * ctx);
@ctypes_function("llama_n_rs_seq", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_rs_seq(ctx: llama_context_p, /) -> int:
    """Binding for the C function `llama_n_rs_seq`, declared to return a `uint32_t`.

    NOTE(review): presumably reports the `n_rs_seq` value of `ctx`, i.e. the
    number of recurrent-state snapshots kept per sequence for rollback
    (0 = no rollback, per the `llama_context_params` field comment) --
    confirm against llama.h.

    Args:
        ctx: llama context pointer passed through to the C function.
    """
    ...


# DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
@ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_ctx_train(model: llama_model_p, /) -> int:
    """Binding for the C function `llama_n_ctx_train`, declared to return an `int32_t`.

    Deprecated in the C API: the header says to use `llama_model_n_ctx_train`
    instead.

    Args:
        model: llama model pointer passed through to the C function.
    """
    ...
Expand Down
2 changes: 1 addition & 1 deletion vendor/llama.cpp
Submodule llama.cpp updated 843 files
Loading