GGML_ABORT("fatal error");
}
+
+bool llama_hparams::has_kv(uint32_t il) const {
+ if (n_layer_kv_from_start >= 0) {
+ if (il < (uint32_t) n_layer_kv_from_start) {
+ return true;
+ }
+
+ return false;
+ }
+
+ // by default, all layers have a KV cache
+ return true;
+}
+
+uint32_t llama_hparams::n_layer_kv() const {
+ uint32_t res = 0;
+
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ if (has_kv(il)) {
+ res++;
+ }
+ }
+
+ return res;
+}
uint32_t n_embd;
uint32_t n_embd_features = 0;
uint32_t n_layer;
+ int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
uint32_t n_rot;
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_pos_per_embd() const;
bool is_swa(uint32_t il) const;
+
+ bool has_kv(uint32_t il) const;
+
+ // number of layers for which has_kv() returns true
+ uint32_t n_layer_kv() const;
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_ubatch,
- uint32_t n_pad) : hparams(model.hparams), unified(unified) {
- llama_kv_cache::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
- llama_kv_cache::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
+ uint32_t n_pad,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {
+
+ // chain filters: combine the caller-provided filter (if any) with the non-SWA/SWA split
+ const layer_filter_cb filter_base = [&](int32_t il) {
+ if (filter && !filter(il)) {
+ return false;
+ }
+
+ return !model.hparams.is_swa(il);
+ };
+
+ const layer_filter_cb filter_swa = [&](int32_t il) {
+ if (filter && !filter(il)) {
+ return false;
+ }
+
+ return model.hparams.is_swa(il);
+ };
const uint32_t size_base = kv_size;
LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
kv_base = std::make_unique<llama_kv_cache>(
- model, std::move(filter_base), type_k, type_v,
+ model, type_k, type_v,
v_trans, offload, unified, size_base, n_seq_max, n_pad,
- 0, LLAMA_SWA_TYPE_NONE);
+ 0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
kv_swa = std::make_unique<llama_kv_cache>(
- model, std::move(filter_swa), type_k, type_v,
+ model, type_k, type_v,
v_trans, offload, unified, size_swa, n_seq_max, n_pad,
- hparams.n_swa, hparams.swa_type);
+ hparams.n_swa, hparams.swa_type, filter_swa, reuse);
}
void llama_kv_cache_iswa::clear(bool data) {
bool v_trans,
bool offload,
bool swa_full,
- bool ,
+ bool unified,
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_ubatch,
- uint32_t n_pad);
+ uint32_t n_pad,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse);
~llama_kv_cache_iswa() = default;
//
llama_kv_cache::llama_kv_cache(
- const llama_model & model,
- layer_filter_cb && filter,
- ggml_type type_k,
- ggml_type type_v,
- bool v_trans,
- bool offload,
- bool unified,
- uint32_t kv_size,
- uint32_t n_seq_max,
- uint32_t n_pad,
- uint32_t n_swa,
- llama_swa_type swa_type) :
+ const llama_model & model,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ bool unified,
+ uint32_t kv_size,
+ uint32_t n_seq_max,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse) :
model(model), hparams(model.hparams), v_trans(v_trans),
n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
GGML_ASSERT(kv_size % n_pad == 0);
- // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
- auto n_layer_cache = hparams.n_layer;
- if (model.arch == LLM_ARCH_GEMMA3N) {
- n_layer_cache = 20;
- }
- if (model.arch == LLM_ARCH_GLM4_MOE) {
- // GLM-4.5: Only process up to last layer, skip final NextN layer
- n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
- }
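+ // only layers that actually have a KV cache are counted here - this is used to size the ggml contexts below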
+ const uint32_t n_layer_kv = hparams.n_layer_kv();
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
ggml_init_params params = {
- /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_cache*ggml_tensor_overhead()),
+ /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
__func__, hparams.n_embd_v_gqa_max());
}
- for (uint32_t il = 0; il < n_layer_cache; il++) {
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
+ if (!hparams.has_kv(il)) {
+ LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
+ continue;
+ }
+
if (filter && !filter(il)) {
- LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+ LLAMA_LOG_DEBUG("%s: layer %3d: filtered\n", __func__, il);
continue;
}
layers.push_back({ il, k, v, k_stream, v_stream, });
}
- // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
- if (model.arch == LLM_ARCH_GEMMA3N) {
- LLAMA_LOG_DEBUG("%s: GEMMA3N: reuse layers [%d, %d]\n", __func__, n_layer_cache, hparams.n_layer - 1);
+ if (reuse) {
+ LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);
- for (uint32_t il = n_layer_cache; il < hparams.n_layer; il++) {
- if (filter && !filter(il)) {
- LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
+ const int32_t il_reuse = reuse(il);
+
+ if (il_reuse < 0) {
+ LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
continue;
}
- const bool is_swa = hparams.is_swa(il);
- const uint32_t il_reuse = n_layer_cache - (is_swa ? 2 : 1);
+ if (filter && !filter(il)) {
+ LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
+ continue;
+ }
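+ // the reused layer must itself have a KV cache entry created in the loop above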
GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end());
+
map_layer_ids[il] = map_layer_ids[il_reuse];
- LLAMA_LOG_DEBUG("%s: layer %3d: reuse layer %d, isw = %d\n", __func__, il, il_reuse, is_swa);
+ LLAMA_LOG_DEBUG("%s: - layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, hparams.is_swa(il));
}
}
public:
static uint32_t get_padding(const llama_cparams & cparams);
- // this callback is used to filter out layers that should not be included in the cache
- using layer_filter_cb = std::function<bool(int32_t il)>;
-
struct stream_copy_info {
bool empty() const {
assert(ssrc.size() == sdst.size());
using slot_info_vec_t = std::vector<slot_info>;
llama_kv_cache(
- const llama_model & model,
- layer_filter_cb && filter,
- ggml_type type_k,
- ggml_type type_v,
- bool v_trans,
- bool offload,
- bool unified,
- uint32_t kv_size,
- uint32_t n_seq_max,
- uint32_t n_pad,
- uint32_t n_swa,
- llama_swa_type swa_type);
+ const llama_model & model,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ bool unified,
+ uint32_t kv_size,
+ uint32_t n_seq_max,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse);
~llama_kv_cache() = default;
//
llama_memory_hybrid::llama_memory_hybrid(
- const llama_model & model,
- /* attn */
- ggml_type type_k,
- ggml_type type_v,
- bool v_trans,
- uint32_t kv_size,
- uint32_t n_pad,
- uint32_t n_swa,
- llama_swa_type swa_type,
- /* recurrent */
- ggml_type type_r,
- ggml_type type_s,
- uint32_t rs_size,
- /* common */
- uint32_t n_seq_max,
- bool offload,
- bool unified,
- /* layer filters */
- layer_filter_cb && filter_attn,
- layer_filter_cb && filter_recr) :
+ const llama_model & model,
+ /* attn */
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ uint32_t kv_size,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ /* recurrent */
+ ggml_type type_r,
+ ggml_type type_s,
+ uint32_t rs_size,
+ /* common */
+ uint32_t n_seq_max,
+ bool offload,
+ bool unified,
+ /* layer filters */
+ const layer_filter_cb & filter_attn,
+ const layer_filter_cb & filter_recr) :
hparams(model.hparams),
mem_attn(new llama_kv_cache(
model,
- filter_attn == nullptr ?
- [&](int32_t il) { return !hparams.is_recurrent(il); }
- : filter_attn,
type_k,
type_v,
v_trans,
n_seq_max,
n_pad,
n_swa,
- swa_type
+ swa_type,
+ filter_attn == nullptr ?
+ [&](int32_t il) { return !hparams.is_recurrent(il); }
+ : filter_attn,
+ nullptr
)),
mem_recr(new llama_memory_recurrent(
model,
- filter_recr == nullptr ?
- [&](int32_t il) { return hparams.is_recurrent(il); }
- : filter_recr,
type_r,
type_s,
offload,
rs_size,
- n_seq_max
+ n_seq_max,
+ filter_recr == nullptr ?
+ [&](int32_t il) { return hparams.is_recurrent(il); }
+ : filter_recr
)) {}
llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
class llama_memory_hybrid : public llama_memory_i {
public:
-
- // this callback is used to filter out layers that should not be included in the cache
- using layer_filter_cb = std::function<bool(int32_t il)>;
-
llama_memory_hybrid(
const llama_model & model,
/* attn */
- ggml_type type_k,
- ggml_type type_v,
- bool v_trans,
- uint32_t kv_size,
- uint32_t n_pad,
- uint32_t n_swa,
- llama_swa_type swa_type,
- /* recurrent */
- ggml_type type_r,
- ggml_type type_s,
- uint32_t rs_size,
- /* common */
- uint32_t n_seq_max,
- bool offload,
- bool unified,
- /* layer filters */
- layer_filter_cb && filter_attn = nullptr,
- layer_filter_cb && filter_recr = nullptr);
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ uint32_t kv_size,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ /* recurrent */
+ ggml_type type_r,
+ ggml_type type_s,
+ uint32_t rs_size,
+ /* common */
+ uint32_t n_seq_max,
+ bool offload,
+ bool unified,
+ /* layer filters */
+ const layer_filter_cb & filter_attn = nullptr,
+ const layer_filter_cb & filter_recr = nullptr);
~llama_memory_hybrid() = default;
//
llama_memory_recurrent::llama_memory_recurrent(
- const llama_model & model,
- layer_filter_cb && filter,
- ggml_type type_r,
- ggml_type type_s,
- bool offload,
- uint32_t mem_size,
- uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
+ const llama_model & model,
+ ggml_type type_r,
+ ggml_type type_s,
+ bool offload,
+ uint32_t mem_size,
+ uint32_t n_seq_max,
+ const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
const int32_t n_layer = hparams.n_layer;
head = 0;
// see the implementation of llama_kv_cache_context_i for an example how to do it
class llama_memory_recurrent : public llama_memory_i {
public:
-
- // this callback is used to filter out layers that should not be included in the cache
- using layer_filter_cb = std::function<bool(int32_t il)>;
-
llama_memory_recurrent(
- const llama_model & model,
- layer_filter_cb && filter,
- ggml_type type_r,
- ggml_type type_s,
- bool offload,
- uint32_t mem_size,
- uint32_t n_seq_max);
+ const llama_model & model,
+ ggml_type type_r,
+ ggml_type type_s,
+ bool offload,
+ uint32_t mem_size,
+ uint32_t n_seq_max,
+ const layer_filter_cb & filter);
~llama_memory_recurrent() = default;
#include "llama.h"
#include <memory>
+#include <functional>
struct llama_ubatch;
// general concept of LLM memory
// the KV cache is a type of LLM memory, but there can be other types
struct llama_memory_i {
+ // this callback is used to filter out layers that should not be included in the cache
+ using layer_filter_cb = std::function<bool(int32_t il)>;
+
+ // this callback is used to specify which layers should reuse memory from other layers
+ // it returns the index of the layer to reuse, or a negative value if layer il should not reuse memory
+ using layer_reuse_cb = std::function<int32_t(int32_t il)>;
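+
+ // for example, a (hypothetical) model where every layer from 20 onwards reuses the memory of layer 19 could pass:
+ //
+ //   layer_reuse_cb reuse = [](int32_t il) { return il >= 20 ? 19 : -1; };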
+
virtual ~llama_memory_i() = default;
// split the input batch into a set of ubatches and verify that they can fit into the cache
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(5);
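+ // only the first 20 layers have their own KV cache; the remaining layers reuse it (see the reuse callback set up when the memory is created)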
+ hparams.n_layer_kv_from_start = 20;
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
hparams.f_attention_scale = 1.0f;
// Expert gating function (GLM-4.5 uses sigmoid)
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
- hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
}
// NextN/MTP parameters
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+ // the trailing NextN/MTP layers do not have a KV cache of their own
+ // TODO: revisit this when MTP is implemented
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
switch (hparams.n_layer) {
case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
const int64_t n_embd_altup;
const int64_t n_altup;
const int i_altup_act;
- const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
const int n_layer_sparsity = 10; // number of layers using activation sparsity
const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
for (int il = 0; il < n_layer; ++il) {
// this block is made to be closely resemble Gemma3p5DecoderLayer on python code
- const bool has_kv = (il < n_layer_kv);
-
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
// self-attention
- if (has_kv) {
+ if (hparams.has_kv(il)) {
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
} else {
- // no KV layers
+ // reuse KV cache of earlier layers
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
if (llm_arch_is_recurrent(arch)) {
res = new llama_memory_recurrent(
*this,
- nullptr,
GGML_TYPE_F32,
GGML_TYPE_F32,
cparams.offload_kqv,
std::max((uint32_t) 1, cparams.n_seq_max),
- cparams.n_seq_max);
+ cparams.n_seq_max,
+ nullptr);
} else if (llm_arch_is_hybrid(arch)) {
const auto padding = llama_kv_cache::get_padding(cparams);
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+ llama_memory_i::layer_reuse_cb reuse = nullptr;
+
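+ // Gemma3n: layers beyond the first n_layer_kv_from_start do not have their own KV cache;
+ // SWA layers reuse the last SWA layer in the cached range, full-attention layers reuse the last full-attention layer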
+ if (arch == LLM_ARCH_GEMMA3N) {
+ reuse = [&](int32_t il) {
+ if (il >= (int32_t) hparams.n_layer_kv_from_start) {
+ return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
+ }
+
+ return -1;
+ };
+ }
+
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
GGML_ASSERT(hparams.is_swa_any());
n_ctx_per_stream,
cparams.n_seq_max,
cparams.n_ubatch,
- padding);
+ padding,
+ nullptr,
+ reuse);
} else {
GGML_ASSERT(!hparams.is_swa_any());
res = new llama_kv_cache(
*this,
- nullptr,
params.type_k,
params.type_v,
!cparams.flash_attn,
cparams.n_seq_max,
padding,
hparams.n_swa,
- hparams.swa_type);
+ hparams.swa_type,
+ nullptr,
+ nullptr);
}
}
}