diff --git a/CHANGELOG.md b/CHANGELOG.md index 36e4fa168..18c6af161 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- feat: Update llama.cpp to ggerganov/llama.cpp@91e84fed6 and sync Python bindings +- feat: Update llama.cpp to ggml-org/llama.cpp@b9a2170fc and sync Python bindings - chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217 diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index a9c32a15b..6560b5178 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -516,6 +516,14 @@ def _warn_deprecated(symbol: str, hint: str) -> None: LLAMA_SPLIT_MODE_TENSOR = 3 +# enum llama_context_type { +# LLAMA_CONTEXT_TYPE_DEFAULT = 0, +# LLAMA_CONTEXT_TYPE_MTP = 1, +# }; +LLAMA_CONTEXT_TYPE_DEFAULT = 0 +LLAMA_CONTEXT_TYPE_MTP = 1 + + # typedef struct llama_token_data { # llama_token id; // token id # float logit; // log-odds of the token @@ -894,9 +902,11 @@ class llama_sampler_seq_config(ctypes.Structure): # uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode # uint32_t n_ubatch; // physical maximum batch size # uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) +# uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] # int32_t n_threads; // number of threads to use for generation # int32_t n_threads_batch; // number of threads to use for batch processing +# enum llama_context_type ctx_type; // set the context type (e.g. MTP) # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id # enum llama_attention_type attention_type; // attention type to use for embeddings @@ -947,8 +957,10 @@ class llama_context_params(ctypes.Structure): n_batch (int): logical maximum batch size that can be submitted to llama_decode n_ubatch (int): physical maximum batch size n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models) + n_rs_seq (int): number of recurrent-state snapshots per sequence for rollback n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for batch processing + ctx_type (int): context type, from `enum llama_context_type` rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) attention_type (int): attention type to use for embeddings @@ -982,8 +994,10 @@ class llama_context_params(ctypes.Structure): n_batch: int n_ubatch: int n_seq_max: int + n_rs_seq: int n_threads: int n_threads_batch: int + ctx_type: int rope_scaling_type: int pooling_type: int attention_type: int @@ -1016,8 +1030,10 @@ class llama_context_params(ctypes.Structure): ("n_batch", ctypes.c_uint32), ("n_ubatch", ctypes.c_uint32), ("n_seq_max", ctypes.c_uint32), + ("n_rs_seq", ctypes.c_uint32), ("n_threads", ctypes.c_int32), ("n_threads_batch", ctypes.c_int32), + ("ctx_type", ctypes.c_int), ("rope_scaling_type", ctypes.c_int), ("pooling_type", ctypes.c_int), ("attention_type", ctypes.c_int), @@ -1591,6 +1607,11 @@ def llama_n_ubatch(ctx: llama_context_p, /) -> int: ... def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... +# LLAMA_API uint32_t llama_n_rs_seq (const struct llama_context * ctx); +@ctypes_function("llama_n_rs_seq", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_rs_seq(ctx: llama_context_p, /) -> int: ... + + # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_ctx_train(model: llama_model_p, /) -> int: ... diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 91e84fed6..b9a2170fc 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 91e84fed64329cd96202d68220724a1d92f5ec1f +Subproject commit b9a2170fce1f3f33cb4934b34efecb806bbbb348