llama-kv-cache-iswa.cpp
llama-memory-recurrent.cpp
llama-memory-hybrid.cpp
+ llama-memory-hybrid-iswa.cpp
llama-memory.cpp
llama-mmap.cpp
llama-model-loader.cpp
return nullptr;
}
-static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
- llama_model & model = adapter.model;
-
ggml_context * ctx_init;
gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
}
}
- // update number of nodes used
- model.n_lora_nodes += adapter.get_n_nodes();
+ // register adapter with model
+ model.loras.insert(&adapter);
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
- llama_adapter_lora * adapter = new llama_adapter_lora(*model);
+ llama_adapter_lora * adapter = new llama_adapter_lora();
try {
- llama_adapter_lora_init_impl(path_lora, *adapter);
+ llama_adapter_lora_init_impl(*model, path_lora, *adapter);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
-void llama_adapter_lora_free(llama_adapter_lora * adapter) {
- // update number of nodes used
- GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
- adapter->model.n_lora_nodes -= adapter->get_n_nodes();
-
- delete adapter;
+void llama_adapter_lora_free(llama_adapter_lora *) {
+ // deprecated and now a no-op: adapters are freed by llama_model's destructor
}
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
};
struct llama_adapter_lora {
- llama_model & model;
-
// map tensor name to lora_a_b
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;
- llama_adapter_lora(llama_model & model) : model(model) {}
+ llama_adapter_lora() = default;
~llama_adapter_lora() = default;
llama_adapter_lora_weight * get_weight(ggml_tensor * w);
{ LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" },
{ LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_EXAONE4, "exaone4" },
+ { LLM_ARCH_EXAONE_MOE, "exaone-moe" },
{ LLM_ARCH_RWKV6, "rwkv6" },
{ LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
{ LLM_ARCH_RWKV7, "rwkv7" },
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_POST_NORM,
};
+ case LLM_ARCH_EXAONE_MOE:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ LLM_TENSOR_NEXTN_EH_PROJ,
+ LLM_TENSOR_NEXTN_EMBED_TOKENS,
+ LLM_TENSOR_NEXTN_ENORM,
+ LLM_TENSOR_NEXTN_HNORM,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+ };
case LLM_ARCH_RWKV6:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_ARCH_NEMOTRON_H_MOE,
LLM_ARCH_EXAONE,
LLM_ARCH_EXAONE4,
+ LLM_ARCH_EXAONE_MOE,
LLM_ARCH_RWKV6,
LLM_ARCH_RWKV6QWEN2,
LLM_ARCH_RWKV7,
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
{ "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 },
+ { "exaone-moe", LLM_CHAT_TEMPLATE_EXAONE_MOE },
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
} else if (tmpl_contains("[gMASK]<sop>")) {
return LLM_CHAT_TEMPLATE_CHATGLM_4;
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
+ if (tmpl_contains("<|tool_declare|>")) {
+ return LLM_CHAT_TEMPLATE_EXAONE_MOE;
+ }
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
return LLM_CHAT_TEMPLATE_GLMEDGE;
if (add_ass) {
ss << "[|assistant|]";
}
+ } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_MOE) {
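+ // each turn is rendered as "<|role|>\n{content}<|endofturn|>\n"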
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << "<|system|>\n" << trim(message->content) << "<|endofturn|>\n";
+ } else if (role == "user") {
+ ss << "<|user|>\n" << trim(message->content) << "<|endofturn|>\n";
+ } else if (role == "assistant") {
+ ss << "<|assistant|>\n" << trim(message->content) << "<|endofturn|>\n";
+ } else if (role == "tool") {
+ ss << "<|tool|>\n" << trim(message->content) << "<|endofturn|>\n";
+ }
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
} else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
// this template requires the model to have "\n\n" as EOT token
for (size_t i = 0; i < chat.size(); i++) {
LLM_CHAT_TEMPLATE_MINICPM,
LLM_CHAT_TEMPLATE_EXAONE_3,
LLM_CHAT_TEMPLATE_EXAONE_4,
+ LLM_CHAT_TEMPLATE_EXAONE_MOE,
LLM_CHAT_TEMPLATE_RWKV_WORLD,
LLM_CHAT_TEMPLATE_GRANITE,
LLM_CHAT_TEMPLATE_GIGACHAT,
}
cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ cparams.auto_fa = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
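+ // when auto, the final flash_attn value is resolved during the first sched_reserve()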
// with causal attention, the batch size is limited by the context size
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
cparams.op_offload = params.op_offload;
cparams.kv_unified = params.kv_unified;
+ // initialized later
+ cparams.pipeline_parallel = false;
+
{
const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
// graph outputs buffer
{
- // resized during inference when a batch uses more outputs
- // Create a dummy batch for initialization.
- llama_batch dummy_batch = {};
- dummy_batch.n_tokens = 0;
- if (output_reserve(params.n_seq_max, dummy_batch) < params.n_seq_max) {
+ if (output_reserve(params.n_seq_max) < params.n_seq_max) {
throw std::runtime_error("failed to reserve initial output buffer");
}
LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
- const uint32_t n_seqs = cparams.n_seq_max;
- const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
- const size_t max_nodes = this->graph_max_nodes(n_tokens);
-
- LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
-
- gf_res_prev.reset(new llm_graph_result(max_nodes));
- gf_res_reserve.reset(new llm_graph_result(max_nodes));
-
// TODO: move these checks to ggml_backend_sched
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
}
}
- sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.op_offload));
+ cparams.pipeline_parallel = pipeline_parallel;
- if (pipeline_parallel) {
- LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
+ if (cparams.pipeline_parallel) {
+ LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
}
- llama_memory_context_ptr mctx;
- if (memory) {
- LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
- mctx = memory->init_full();
- if (!mctx) {
- throw std::runtime_error("failed to initialize memory module");
+ sched_reserve();
+
+ if (!cparams.flash_attn) {
+ if (ggml_is_quantized(params.type_v)) {
+ throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
}
}
+ }
- cross.v_embd.clear();
-
- // avoid reserving graphs with zero outputs - assume one output per sequence
- n_outputs = n_seqs;
-
- LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
+ // Initialize the full vocabulary token ids for backend samplers.
+ {
+ const int n_vocab = model.vocab.n_tokens();
- // resolve automatic Flash Attention use
- if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
- auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
- if (!gf) {
- throw std::runtime_error("failed to split graph for Flash Attention check");
- }
+ sampling.token_ids_full_vocab.resize(n_vocab);
+ for (int i = 0; i < n_vocab; ++i) {
+ sampling.token_ids_full_vocab[i] = i;
+ }
+ }
+}
- const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
- bool fa_device_mismatch = false;
- for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
- ggml_tensor * n = ggml_graph_node(gf, i);
- if (n->op != GGML_OP_FLASH_ATTN_EXT) {
- continue;
- }
- ggml_backend_dev_t device_fa = ggml_backend_get_device(
- ggml_backend_sched_get_tensor_backend(sched.get(), n));
+llama_context::~llama_context() {
+ if (!model.hparams.no_alloc) {
+ for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+ ggml_backend_t backend = backend_ptrs[i];
+ ggml_backend_buffer_type_t buft = backend_buft[i];
- // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
- GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
- const int il = std::stoi(n->name + prefix_len);
- ggml_backend_dev_t device_kv = model.dev_layer(il);
- if (device_fa != device_kv) {
- LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
- "is assigned to device %s (usually due to missing support)\n",
- __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
- // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
- fa_device_mismatch = true;
- break;
- }
- }
- if (fa_device_mismatch) {
- cparams.flash_attn = false;
- LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
- if (ggml_is_quantized(params.type_v)) {
- throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
- }
+ const size_t size_exp = backend_buf_exp_size[i];
+ const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ if (size_exp == size_act) {
+ LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+ __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
} else {
- cparams.flash_attn = true;
- LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
+ LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+ __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
}
}
+ }
+ ggml_opt_free(opt_ctx);
+}
- // reserve worst-case graph
- int n_splits_pp = -1;
- int n_nodes_pp = -1;
+void llama_context::sched_reserve() {
+ if (!sched_need_reserve) {
+ return;
+ }
- int n_splits_tg = -1;
- int n_nodes_tg = -1;
+ sched_need_reserve = false;
- // reserve pp (prompt processing) graph first so that buffers are only allocated once
- {
- auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
- model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
- if (!gf) {
- if (pipeline_parallel) {
- LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
- sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
- gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
- }
- if (!gf) {
- throw std::runtime_error("failed to allocate compute pp buffers");
- }
- }
+ LLAMA_LOG_INFO("%s: reserving ...\n", __func__);
+
+ synchronize();
+
+ const int64_t t_start_us = ggml_time_us();
+
+ const uint32_t n_seqs = cparams.n_seq_max;
+ const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+ const size_t max_nodes = this->graph_max_nodes(n_tokens);
+
+ LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
+
+ gf_res_prev.reset(new llm_graph_result(max_nodes));
+ gf_res_reserve.reset(new llm_graph_result(max_nodes));
- n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
- n_nodes_pp = ggml_graph_n_nodes(gf);
+ sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, cparams.pipeline_parallel, cparams.op_offload));
+
+ llama_memory_context_ptr mctx;
+ if (memory) {
+ LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
+ mctx = memory->init_full();
+ if (!mctx) {
+ throw std::runtime_error("failed to initialize memory module");
}
+ }
- // reserve with tg (token generation) graph to get the number of splits and nodes
- {
- auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
- if (!gf) {
- throw std::runtime_error("failed to allocate compute tg buffers");
- }
+ // avoid reserving graphs with zero outputs - assume one output per sequence
+ const int n_outputs = n_seqs;
+
+ LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
- n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
- n_nodes_tg = ggml_graph_n_nodes(gf);
+ // resolve automatic Flash Attention use
+ if (cparams.auto_fa) {
+ auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
+ if (!gf) {
+ throw std::runtime_error("failed to split graph for Flash Attention check");
}
- // reserve again with pp graph to avoid ggml-alloc reallocations during inference
- {
- // TODO: not sure if the following graph would be worster case for multi-stream KV caches:
- //
- // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
- //
- auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
- if (!gf) {
- throw std::runtime_error("failed to allocate compute pp buffers");
+ const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
+ bool fa_device_mismatch = false;
+ for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+ ggml_tensor * n = ggml_graph_node(gf, i);
+ if (n->op != GGML_OP_FLASH_ATTN_EXT) {
+ continue;
+ }
+ ggml_backend_dev_t device_fa = ggml_backend_get_device(
+ ggml_backend_sched_get_tensor_backend(sched.get(), n));
+
+ // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
+ GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
+ const int il = std::stoi(n->name + prefix_len);
+ ggml_backend_dev_t device_kv = model.dev_layer(il);
+ if (device_fa != device_kv) {
+ LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
+ "is assigned to device %s (usually due to missing support)\n",
+ __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
+ // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
+ fa_device_mismatch = true;
+ break;
}
}
+ if (fa_device_mismatch) {
+ cparams.flash_attn = false;
+ LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
+ } else {
+ cparams.flash_attn = true;
+ LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
+ }
- for (size_t i = 0; i < backend_ptrs.size(); ++i) {
- ggml_backend_t backend = backend_ptrs[i];
- ggml_backend_buffer_type_t buft = backend_buft[i];
- if (!model.hparams.no_alloc) {
- backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ cparams.auto_fa = false;
+ }
+
+ // reserve worst-case graph
+ int n_splits_pp = -1;
+ int n_nodes_pp = -1;
+
+ int n_splits_tg = -1;
+ int n_nodes_tg = -1;
+
+ // reserve pp (prompt processing) graph first so that buffers are only allocated once
+ {
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+ model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
+ if (!gf) {
+ if (cparams.pipeline_parallel) {
+ LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+ cparams.pipeline_parallel = false;
+ sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
+ gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
}
- if (backend_buf_exp_size[i] > 1) {
- LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
- ggml_backend_buft_name(buft),
- backend_buf_exp_size[i] / 1024.0 / 1024.0);
+ if (!gf) {
+ throw std::runtime_error("failed to allocate compute pp buffers");
}
}
- if (n_nodes_pp == n_nodes_tg) {
- LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
- } else {
- LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
- }
+ n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
+ n_nodes_pp = ggml_graph_n_nodes(gf);
+ }
- if (n_splits_pp == n_splits_tg) {
- LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
- } else {
- LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
+ // reserve with tg (token generation) graph to get the number of splits and nodes
+ {
+ auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
+ if (!gf) {
+ throw std::runtime_error("failed to allocate compute tg buffers");
}
+
+ n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
+ n_nodes_tg = ggml_graph_n_nodes(gf);
}
- // Initialize the full vocabulary token ids for backend samplers.
+ // reserve again with pp graph to avoid ggml-alloc reallocations during inference
{
- const int n_vocab = model.vocab.n_tokens();
+ // TODO: not sure if the following graph would be the worst case for multi-stream KV caches:
+ //
+ // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
+ //
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
+ if (!gf) {
+ throw std::runtime_error("failed to allocate compute pp buffers");
+ }
+ }
- sampling.token_ids_full_vocab.resize(n_vocab);
- for (int i = 0; i < n_vocab; ++i) {
- sampling.token_ids_full_vocab[i] = i;
+ for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+ ggml_backend_t backend = backend_ptrs[i];
+ ggml_backend_buffer_type_t buft = backend_buft[i];
+ if (!model.hparams.no_alloc) {
+ backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ }
+ if (backend_buf_exp_size[i] > 1) {
+ LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+ ggml_backend_buft_name(buft),
+ backend_buf_exp_size[i] / 1024.0 / 1024.0);
}
}
-}
-llama_context::~llama_context() {
- if (!model.hparams.no_alloc) {
- for (size_t i = 0; i < backend_ptrs.size(); ++i) {
- ggml_backend_t backend = backend_ptrs[i];
- ggml_backend_buffer_type_t buft = backend_buft[i];
+ if (n_nodes_pp == n_nodes_tg) {
+ LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
+ } else {
+ LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
+ }
- const size_t size_exp = backend_buf_exp_size[i];
- const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
- if (size_exp == size_act) {
- LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
- __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
- } else {
- LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
- __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
- }
- }
+ if (n_splits_pp == n_splits_tg) {
+ LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
+ } else {
+ LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
}
- ggml_opt_free(opt_ctx);
+
+ const int64_t t_end_us = ggml_time_us();
+
+ LLAMA_LOG_INFO("%s: reserve took %.2f ms, sched copies = %d\n",
+ __func__, (t_end_us - t_start_us)/1000.0, ggml_backend_sched_get_n_copies(sched.get()));
}
void llama_context::synchronize() {
+ if (!sched) {
+ return;
+ }
+
ggml_backend_sched_synchronize(sched.get());
// FIXME: if multiple single tokens are evaluated without a synchronization,
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
}
- const uint32_t n_embd_out = model.hparams.get_n_embd_out();
+ const uint32_t n_embd_out = model.hparams.n_embd_out();
return embd + j*n_embd_out;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
cparams.embeddings = value;
+
+ // TODO: not sure yet if we want to reserve here
+ //sched_need_reserve = true;
}
void llama_context::set_causal_attn(bool value) {
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+ if (cparams.causal_attn == value) {
+ return;
+ }
+
cparams.causal_attn = value;
+
+ sched_need_reserve = true;
}
void llama_context::set_warmup(bool value) {
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+ if (cparams.warmup == value) {
+ return;
+ }
+
cparams.warmup = value;
+
+ // warmup runs usually use small batches, so there is no need to reserve
+ //sched_need_reserve = true;
}
bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
+ if (!sampler && sampling.samplers.count(seq_id) == 0) {
+ return true;
+ }
+
LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
const bool can_offload =
sampling.samplers[seq_id] = sampler;
+ sched_need_reserve = true;
+
return true;
}
if (sampler && !can_offload) {
LLAMA_LOG_WARN("%s: sampler '%s' for seq_id = %d, cannot be offloaded to the backend\n", __func__, llama_sampler_name(sampler), seq_id);
+ if (sampling.samplers.count(seq_id) > 0) {
+ sched_need_reserve = true;
+ }
+
sampling.samplers.erase(seq_id);
return false;
sampling.samplers.erase(seq_id);
+ sched_need_reserve = true;
+
return true;
}
float scale) {
LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale);
+ if (auto it = loras.find(adapter); it != loras.end()) {
+ if (it->second == scale) {
+ return;
+ }
+ }
+
loras[adapter] = scale;
+
+ sched_need_reserve = true;
}
bool llama_context::rm_adapter_lora(
llama_adapter_lora * adapter) {
LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter);
- auto pos = loras.find(adapter);
- if (pos != loras.end()) {
- loras.erase(pos);
+ auto it = loras.find(adapter);
+ if (it != loras.end()) {
+ loras.erase(it);
+
+ sched_need_reserve = true;
+
return true;
}
void llama_context::clear_adapter_lora() {
LLAMA_LOG_DEBUG("%s: call\n", __func__);
+ if (loras.empty()) {
+ return;
+ }
+
loras.clear();
+
+ sched_need_reserve = true;
}
bool llama_context::apply_adapter_cvec(
int32_t il_end) {
LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
+ // TODO: should we reserve?
+
return cvec.apply(model, data, len, n_embd, il_start, il_end);
}
// TODO: this clear of the buffer can easily be forgotten - need something better
embd_seq.clear();
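+ // apply any pending scheduler re-reservation (e.g. after lora, sampler or attention changes)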
+ sched_reserve();
+
n_queued_tokens += n_tokens;
// reserve output buffer
- if (output_reserve(n_tokens, batch_inp) < n_tokens) {
+ if (output_reserve(n_tokens) < n_tokens) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
return -2;
};
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
// extract logits
- if (logits && t_logits) {
+ if (logits && t_logits) {
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
GGML_ASSERT(backend_res != nullptr);
GGML_ASSERT(logits != nullptr);
{
// extract token embeddings
GGML_ASSERT(embd != nullptr);
- const uint32_t n_embd_out = hparams.get_n_embd_out();
+ const uint32_t n_embd_out = hparams.n_embd_out();
GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float));
}
}
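+// returns true if any output token in the ubatch belongs to a sequence that has no
+// backend sampler, i.e. raw logits still need to be copied to the CPU logits buffer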
+static bool needs_raw_logits(const llama_ubatch & ubatch, const std::map<llama_seq_id, llama_sampler *> & samplers) {
+ for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
+ if (!ubatch.output[i]) {
+ continue;
+ }
+
+ // Check if the output token has at least one sequence without a backend sampler.
+ for (int32_t j = 0; j < ubatch.n_seq_id[i]; ++j) {
+ llama_seq_id seq_id = ubatch.seq_id[i][j];
+ if (samplers.find(seq_id) == samplers.end()) {
+ return true;
+ }
+ }
+ }
+ return false; // all sequences use backend sampling
+}
+
int llama_context::decode(const llama_batch & batch_inp) {
GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
embd_seq.clear();
output_swaps.clear();
+ sched_reserve();
+
bool did_optimize = false;
// handle any pending shifts/copies
}
// reserve output buffer
- if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
+ if (output_reserve(n_outputs_all) < n_outputs_all) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
return -2;
};
}
// extract logits
- // For multi-sequence batches that mix backend samplers and CPU sampler
- // this is currently inefficient as we copy all logits even for the
- // backend sampled tokens.
- if (logits && t_logits && n_outputs > 0) {
+ if (logits && t_logits && n_outputs > 0 && needs_raw_logits(ubatch, sampling.samplers)) {
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
GGML_ASSERT(backend_res != nullptr);
GGML_ASSERT(logits != nullptr);
{
// extract token embeddings
GGML_ASSERT(embd != nullptr);
- const uint32_t n_embd_out = hparams.get_n_embd_out();
+ const uint32_t n_embd_out = hparams.n_embd_out();
float * embd_out = embd + n_outputs_prev*n_embd_out;
if (n_outputs) {
}
}
- // This flag indicates whether a backend sampler has actually sampled a specific
- // token, or if it has produced probabilites. If true, we can skip the normal copying of logits and embeddings.
- const bool has_sampled = !res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty();
-
- if (has_samplers && has_sampled) {
+ // Copy backend sampling output if this ubatch produced any sampling tensors.
+ if (has_samplers && (!res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty())) {
const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);
const auto stride = n_vocab;
// output
//
-uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & batch) {
+uint32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
const auto & vocab = model.vocab;
const auto n_batch = cparams.n_batch;
const auto n_vocab = vocab.n_tokens();
- const auto n_embd_out = hparams.get_n_embd_out();
+ const auto n_embd_out = hparams.n_embd_out();
bool has_logits = true;
bool has_embd = cparams.embeddings;
has_embd = true;
}
- // Check which sampling modes are needed for the current batch.
- // TODO: avoid this branching by working with the worst-case
- bool has_sampling = false;
- bool cpu_logits = false;
-
- if (batch.logits) {
- for (int32_t i = 0; i < batch.n_tokens; i++) {
- if (!batch.logits[i]) {
- continue;
- }
- for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
- llama_seq_id seq_id = batch.seq_id[i][j];
- if (sampling.samplers.find(seq_id) != sampling.samplers.end()) {
- has_sampling = true;
- } else {
- cpu_logits = true;
- }
- }
- }
- } else {
- // When batch.logits is nullptr (when loading state with a dummy batch),
- // allocate CPU logits.
- cpu_logits = true;
- }
size_t backend_float_count = 0;
size_t backend_token_count = 0;
- // Allocate CPU logits buffer only if needed by sequences in this batch
- logits_size = (has_logits && cpu_logits) ? n_vocab*n_outputs_max : 0;
+ logits_size = has_logits ? n_vocab*n_outputs_max : 0;
embd_size = has_embd ? n_embd_out*n_outputs_max : 0;
- // TODO: avoid this branching by working with the worst-case
- if (!has_sampling) {
- sampling.logits_size = 0;
- sampling.probs_size = 0;
- sampling.sampled_size = 0;
- sampling.candidates_size = 0;
- } else {
+ // Allocate backend sampling output buffers if there are backend samplers configured.
+ const bool has_sampling = !sampling.samplers.empty();
+ if (has_sampling) {
sampling.logits_size = n_vocab*n_outputs_max;
sampling.probs_size = n_vocab*n_outputs_max;
sampling.sampled_size = n_outputs_max;
size_t offset = 0;
uint8_t * base = (uint8_t *) output_base;
- logits = (has_logits && cpu_logits) ? output_base : nullptr;
+ logits = has_logits ? output_base : nullptr;
offset += logits_size * sizeof(float);
embd = has_embd ? (float *) (base + offset) : nullptr;
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
}
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
- res += model.n_lora_nodes;
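+ // sum the extra graph nodes contributed by each adapter registered with the model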
+ for (const auto & lora : model.loras) {
+ res += lora->get_n_nodes();
+ }
return res;
}
ggml_set_name(cur, name);
}
- if (!cparams.offload_kqv) {
- if (strcmp(name, "kqv_merged_cont") == 0) {
- // all nodes between the KV store and the attention output are run on the CPU
- ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu);
- }
- }
-
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
}
}
+ // [TAG_CONTEXT_STATE_LOGITS]
// write logits
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
auto n_outputs = this->n_outputs;
io.read_to(&n_outputs, sizeof(n_outputs));
- // Create a dummy batch for state loading.
- llama_batch dummy_batch = {};
- dummy_batch.n_tokens = 0;
- if (n_outputs > output_reserve(n_outputs, dummy_batch)) {
+ if (n_outputs > output_reserve(n_outputs)) {
throw std::runtime_error("could not reserve outputs");
}
}
// reserve output buffer
- if (output_reserve(n_outputs_all, balloc->get_batch()) < n_outputs_all) {
+ if (output_reserve(n_outputs_all) < n_outputs_all) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
GGML_ABORT("TODO: handle this error");
};
};
ctx_compute_opt = ggml_init(params);
}
- ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits());
+ ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_inp_tokens(), res->get_logits());
ggml_opt_alloc(opt_ctx, train);
res->set_inputs(&ubatch);
~llama_context();
+ // reserve a new backend scheduler (if needed)
+ // for example, when:
+ // - changing loras
+ // - changing samplers
+ // - changing attention type
+ // - etc.
+ void sched_reserve();
+
void synchronize();
const llama_model & get_model() const;
// Make sure enough space is available for outputs.
// Returns max number of outputs for which space was reserved.
- uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch);
+ uint32_t output_reserve(int32_t n_outputs);
void output_reorder();
ggml_backend_sched_ptr sched;
+ bool sched_need_reserve = true;
+
ggml_backend_t backend_cpu = nullptr;
std::vector<ggml_backend_ptr> backends;
bool causal_attn;
bool offload_kqv;
bool flash_attn;
+ bool auto_fa;
bool no_perf;
bool warmup;
bool op_offload;
bool kv_unified;
+ bool pipeline_parallel;
enum llama_pooling_type pooling_type;
#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h"
+#include "llama-memory-hybrid-iswa.h"
#include "llama-memory-recurrent.h"
#include <cassert>
}
if (ubatch->embd) {
- const int64_t n_embd = embd->ne[0];
+ GGML_ASSERT(n_embd == embd->ne[0]);
+
const int64_t n_tokens = ubatch->n_tokens;
ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
bool res = true;
- res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
- res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);
+ res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
+ res &= (!params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);
return res;
}
int32_t * data = (int32_t *) pos_bucket->data;
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- for (int i = 0; i < n_tokens; ++i) {
- data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
- }
+ for (int j = 0; j < n_tokens; ++j) {
+ for (int i = 0; i < n_tokens; ++i) {
+ data[j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
}
}
}
const int64_t n_tokens = ubatch->n_tokens;
const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
- for (int h = 0; h < 1; ++h) {
- for (int i1 = 0; i1 < n_tokens; ++i1) {
- const llama_seq_id s1 = ubatch->seq_id[i1][0];
- const llama_pos p1 = ubatch->pos[i1];
+ for (int i1 = 0; i1 < n_tokens; ++i1) {
+ const llama_seq_id s1 = ubatch->seq_id[i1][0];
+ const llama_pos p1 = ubatch->pos[i1];
- const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
+ const uint64_t idst = i1*n_kv;
- for (int i0 = 0; i0 < n_tokens; ++i0) {
- const llama_seq_id s0 = ubatch->seq_id[i0][0];
- const llama_pos p0 = ubatch->pos[i0];
+ for (int i0 = 0; i0 < n_tokens; ++i0) {
+ const llama_seq_id s0 = ubatch->seq_id[i0][0];
+ const llama_pos p0 = ubatch->pos[i0];
- // mask different sequences
- if (s0 != s1) {
- continue;
- }
-
- // mask future tokens
- if (cparams.causal_attn && p0 > p1) {
- continue;
- }
+ // mask different sequences
+ if (s0 != s1) {
+ continue;
+ }
- // apply SWA if any
- if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
- continue;
- }
+ // mask future tokens
+ if (cparams.causal_attn && p0 > p1) {
+ continue;
+ }
- data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
+ // apply SWA if any
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+ continue;
}
+
+ data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
}
}
};
return res;
}
+void llm_graph_input_attn_k::set_input(const llama_ubatch * ubatch) {
+ mctx->set_input_k_idxs(self_k_idxs, ubatch);
+
+ mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+}
+
+bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
+ const auto * mctx = static_cast<const llama_kv_cache_context *>(params.mctx);
+
+ this->mctx = mctx;
+
+ bool res = true;
+
+ res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+
+ res &= self_kq_mask->ne[0] == mctx->get_n_kv();
+ res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
+
+ return res;
+}
+
void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
float * data = (float *) cross_kq_mask->data;
- for (int h = 0; h < 1; ++h) {
- for (int i = 0; i < n_tokens; ++i) {
- for (int j = 0; j < n_enc; ++j) {
- float f = -INFINITY;
+ for (int i = 0; i < n_tokens; ++i) {
+ for (int j = 0; j < n_enc; ++j) {
+ float f = -INFINITY;
- for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
- const llama_seq_id seq_id = ubatch->seq_id[i][s];
+ for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+ const llama_seq_id seq_id = ubatch->seq_id[i][s];
- if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
- f = 0.0f;
- }
+ if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
+ f = 0.0f;
}
-
- data[h*(n_enc*n_tokens) + i*n_enc + j] = f;
}
- }
- for (int i = n_tokens; i < n_tokens; ++i) {
- for (int j = 0; j < n_enc; ++j) {
- data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
- }
+ data[i*n_enc + j] = f;
}
}
}
return res;
}
+void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
+ const auto * attn_ctx = mctx->get_attn();
+
+ // base tensors may not be allocated if there are no non-SWA attention layers
+ if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
+ attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
+ attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+
+ attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+ }
+
+ // swa tensors may not be allocated if there are no SWA attention layers
+ if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
+ attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch);
+ attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
+
+ attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
+ }
+
+ const int64_t n_rs = mctx->get_recr()->get_n_rs();
+
+ if (inp_rs->s_copy) {
+ GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
+ int32_t * data = (int32_t *) inp_rs->s_copy->data;
+
+ // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
+ for (uint32_t i = 0; i < n_rs; ++i) {
+ data[i] = mctx->get_recr()->s_copy(i);
+ }
+ }
+}
+
+bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params) {
+ const auto * mctx = static_cast<const llama_memory_hybrid_iswa_context *>(params.mctx);
+
+ this->mctx = mctx;
+
+ bool res = true;
+
+ const auto * attn_ctx = mctx->get_attn();
+
+ // base tensors may not be allocated if there are no non-SWA attention layers
+ if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
+ res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
+ //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+ res &= inp_attn->self_kq_mask->ne[0] == attn_ctx->get_base()->get_n_kv();
+ res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
+ }
+
+ // swa tensors may not be allocated if there are no SWA attention layers
+ if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
+ res &= inp_attn->self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
+ //res &= inp_attn->self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+ res &= inp_attn->self_kq_mask_swa->ne[0] == attn_ctx->get_swa()->get_n_kv();
+ res &= inp_attn->self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
+ }
+
+ res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+
+ res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+
+ res &= inp_rs->head == mctx->get_recr()->get_head();
+ res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
+
+ return res;
+}
+
void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
// set the inputs only for the active samplers in the current ubatch
std::unordered_set<llama_seq_id> active_samplers;
}
void llm_graph_result::reset() {
- t_tokens = nullptr;
+ t_inp_tokens = nullptr;
+ t_inp_embd = nullptr;
t_logits = nullptr;
t_embd = nullptr;
t_embd_pooled = nullptr;
// input embeddings with optional lora
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
- const int64_t n_embd = hparams.n_embd_inp();
+ const int64_t n_embd_inp = hparams.n_embd_inp();
+ const int64_t n_embd = hparams.n_embd;
+
+ assert(n_embd_inp >= n_embd);
+
+ auto inp = std::make_unique<llm_graph_input_embd>(n_embd_inp);
- auto inp = std::make_unique<llm_graph_input_embd>();
+ inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+ cb(inp->tokens, "inp_tokens", -1);
+ ggml_set_input(inp->tokens);
+ res->t_inp_tokens = inp->tokens;
- ggml_tensor * cur = nullptr;
+ inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, ubatch.n_tokens);
+ cb(inp->embd, "inp_embd", -1);
+ ggml_set_input(inp->embd);
- if (ubatch.token) {
- inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
- //cb(inp->tokens, "inp_tokens", -1);
- ggml_set_input(inp->tokens);
- res->t_tokens = inp->tokens;
+ // select one of the 2 inputs, based on the batch contents
+ // ref: https://github.com/ggml-org/llama.cpp/pull/18550
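+ // both paths are always constructed so the graph topology does not depend on the
+ // batch contents, which allows reusing the graph across token and embedding batches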
+ std::array<ggml_tensor *, 2> inps;
+
+ // token embeddings path (ubatch.token != nullptr)
+ {
+ auto & cur = inps[0];
cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
cur = ggml_add(ctx0, cur, inpL_delta);
}
- } else {
- inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
- ggml_set_input(inp->embd);
+
+ if (n_embd_inp != n_embd) {
+ cur = ggml_pad(ctx0, cur, n_embd_inp - n_embd, 0, 0, 0);
+ }
+ }
+
+ // vector embeddings path (ubatch.embd != nullptr)
+ {
+ auto & cur = inps[1];
cur = inp->embd;
}
+ assert(ggml_are_same_shape (inps[0], inps[1]));
+ assert(ggml_are_same_stride(inps[0], inps[1]));
+
+ ggml_tensor * cur = ggml_build_forward_select(gf, inps.data(), inps.size(), ubatch.token ? 0 : 1);
+
+ if (n_embd_inp != n_embd) {
+ cur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0);
+ }
+
+ res->t_inp_embd = cur;
+
// For Granite architecture
if (hparams.f_embedding_scale != 0.0f) {
cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
}
- cb(cur, "inp_embd", -1);
+ cb(cur, "embd", -1);
res->add_input(std::move(inp));
//}
const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
- const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
+ const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
ggml_set_input(cur);
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * sinks,
- ggml_tensor * v_mla,
+ ggml_tensor * v_mla, // TODO: remove
float kq_scale,
int il) const {
+ GGML_ASSERT(v_mla == nullptr);
+
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
// expand k later to enable rope fusion which directly writes into k-v cache
return cur;
}
+static std::unique_ptr<llm_graph_input_attn_k> build_attn_inp_k_impl(
+ ggml_context * ctx0,
+ const llama_ubatch & ubatch,
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ const llama_kv_cache_context * mctx_cur) {
+
+ auto inp = std::make_unique<llm_graph_input_attn_k>(hparams, cparams, mctx_cur);
+
+ {
+ GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
+
+ const auto n_kv = mctx_cur->get_n_kv();
+ const auto n_tokens = ubatch.n_tokens;
+ const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+
+ inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
+
+ inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ ggml_set_input(inp->self_kq_mask);
+
+ inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+ }
+
+ return inp;
+}
+
+llm_graph_input_attn_k * llm_graph_context::build_attn_inp_k() const {
+ const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
+
+ auto inp = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
+
+ return (llm_graph_input_attn_k *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_attn(
+ llm_graph_input_attn_k * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur,
+ ggml_tensor * k_cur,
+ ggml_tensor * v_cur,
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks,
+ ggml_tensor * v_mla,
+ float kq_scale,
+ int il) const {
+ // these nodes are added to the graph together so that they are not reordered
+ // by doing so, the number of splits in the graph is reduced
+ // expand k later to enable rope fusion which directly writes into k-v cache
+ ggml_build_forward_expand(gf, q_cur);
+ ggml_build_forward_expand(gf, v_cur);
+ ggml_build_forward_expand(gf, k_cur);
+
+ const auto * mctx_cur = inp->mctx;
+
+ // store to KV cache
+ {
+ const auto & k_idxs = inp->get_k_idxs();
+
+ ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
+ }
+
+ const auto & kq_mask = inp->get_kq_mask();
+
+ ggml_tensor * q = q_cur;
+ ggml_tensor * k = mctx_cur->get_k(ctx0, il);
+ ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
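+ // V-less cache: there is no separate V buffer - V is read as a view of the
+ // first v_cur->ne[0] rows of the cached K tensor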
+
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
+ cb(cur, "kqv_out", il);
+
+ if (wo) {
+ cur = build_lora_mm(wo, cur);
+ if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
+ // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
+ ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+ }
+ }
+
+ if (wo_b) {
+ cur = ggml_add(ctx0, cur, wo_b);
+ }
+
+ return cur;
+}
+
ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo,
return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
}
+llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() const {
+ const auto * mctx_cur = static_cast<const llama_memory_hybrid_iswa_context *>(mctx);
+
+ auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
+
+ // build iswa attention input
+ const auto * attn_ctx = mctx_cur->get_attn();
+
+ auto inp_attn = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, attn_ctx);
+
+ const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+
+ {
+ const auto n_kv = attn_ctx->get_base()->get_n_kv();
+
+ inp_attn->self_k_idxs = attn_ctx->get_base()->build_input_k_idxs(ctx0, ubatch);
+ inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
+
+ inp_attn->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ ggml_set_input(inp_attn->self_kq_mask);
+
+ inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
+ }
+
+ {
+ const auto n_kv = attn_ctx->get_swa()->get_n_kv();
+
+ inp_attn->self_k_idxs_swa = attn_ctx->get_swa()->build_input_k_idxs(ctx0, ubatch);
+ inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
+
+ inp_attn->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+ ggml_set_input(inp_attn->self_kq_mask_swa);
+
+ inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
+ }
+
+ auto inp = std::make_unique<llm_graph_input_mem_hybrid_iswa>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
+
+ return (llm_graph_input_mem_hybrid_iswa *) res->add_input(std::move(inp));
+}
+
void llm_graph_context::build_dense_out(
ggml_tensor * dense_2,
ggml_tensor * dense_3) const {
class llama_kv_cache_iswa_context;
class llama_memory_recurrent_context;
class llama_memory_hybrid_context;
+class llama_memory_hybrid_iswa_context;
// certain models (typically multi-modal) can produce different types of graphs
enum llm_graph_type {
class llm_graph_input_embd : public llm_graph_input_i {
public:
- llm_graph_input_embd() = default;
+ llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
virtual ~llm_graph_input_embd() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
+
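+ // expected width of the embd input, set from hparams.n_embd_inp() when the input is created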
+ const int64_t n_embd = 0;
};
class llm_graph_input_pos : public llm_graph_input_i {
const llama_kv_cache_context * mctx;
};
+// V-less input for the KV cache
+// ref: https://github.com/ggml-org/llama.cpp/pull/19067
+class llm_graph_input_attn_k : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_k(
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ const llama_kv_cache_context * mctx) :
+ hparams(hparams),
+ cparams(cparams),
+ mctx(mctx) {
+ }
+ ~llm_graph_input_attn_k() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+ ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
+
+ const llama_hparams hparams;
+ const llama_cparams cparams;
+
+ const llama_kv_cache_context * mctx;
+};
+
class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
public:
llm_graph_input_attn_kv_iswa(
const llama_memory_hybrid_context * mctx;
};
+class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i {
+public:
+ llm_graph_input_mem_hybrid_iswa(
+ const llama_cparams & cparams,
+ std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn,
+ std::unique_ptr<llm_graph_input_rs> inp_rs,
+ const llama_memory_hybrid_iswa_context * mctx) :
+ inp_attn(std::move(inp_attn)),
+ inp_rs(std::move(inp_rs)),
+ cparams(cparams),
+ mctx(mctx) { }
+ virtual ~llm_graph_input_mem_hybrid_iswa() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn;
+ std::unique_ptr<llm_graph_input_rs> inp_rs;
+
+ llm_graph_input_attn_kv_iswa * get_attn() const { return inp_attn.get(); }
+ llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
+
+ const llama_cparams cparams;
+
+ const llama_memory_hybrid_iswa_context * mctx;
+};
+
class llm_graph_input_sampling : public llm_graph_input_i {
public:
llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
virtual ~llm_graph_result() = default;
- ggml_tensor * get_tokens() const { return t_tokens; }
+ ggml_tensor * get_inp_tokens() const { return t_inp_tokens; }
ggml_tensor * get_logits() const { return t_logits; }
ggml_tensor * get_embd() const { return t_embd; }
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
void set_params(const llm_graph_params & params);
// important graph nodes
- ggml_tensor * t_tokens = nullptr;
+ ggml_tensor * t_inp_tokens = nullptr;
+ ggml_tensor * t_inp_embd = nullptr; // [n_embd_inp, n_tokens]
ggml_tensor * t_logits = nullptr;
ggml_tensor * t_embd = nullptr;
ggml_tensor * t_embd_pooled = nullptr;
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove
+ float kq_scale,
+ int il) const;
+
+ llm_graph_input_attn_k * build_attn_inp_k() const;
+
+ ggml_tensor * build_attn(
+ llm_graph_input_attn_k * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;
llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
+ llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const;
+
//
// pooling
//
return n_embd_inp;
}
-uint32_t llama_hparams::get_n_embd_out() const {
- return n_embd_out > 0 ? n_embd_out : n_embd;
+uint32_t llama_hparams::n_embd_out() const {
+ return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
}
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
GGML_ABORT("fatal error");
}
+bool llama_hparams::is_mla() const {
+ assert((n_embd_head_k_mla_impl == 0 && n_embd_head_v_mla_impl == 0) ||
+ (n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0));
+
+ return n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0;
+}
+
+uint32_t llama_hparams::n_embd_head_k_mla() const {
+ return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
+}
+
+uint32_t llama_hparams::n_embd_head_v_mla() const {
+ return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
+}
+
bool llama_hparams::has_kv(uint32_t il) const {
if (n_layer_kv_from_start >= 0) {
if (il < (uint32_t) n_layer_kv_from_start) {
return res;
}
-bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
- assert(p0 >= 0 && p1 >= 0);
-
- switch (swa_type) {
- case LLAMA_SWA_TYPE_NONE:
- {
- } break;
- case LLAMA_SWA_TYPE_STANDARD:
- {
- if (p1 - p0 >= (int32_t) n_swa) {
- return true;
- }
- } break;
- case LLAMA_SWA_TYPE_CHUNKED:
- {
- const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
-
- if (p0 < pos_chunk_start) {
- return true;
- }
- } break;
- case LLAMA_SWA_TYPE_SYMMETRIC:
- {
- const int32_t half_n_swa = (int32_t) n_swa / 2;
- const int32_t pos_diff = p1 - p0;
-
- // Mask if outside the symmetric window
- if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
- return true;
- }
- } break;
- }
-
- return false;
-}
-
bool llama_hparams::use_mrope() const {
return rope_sections[0] > 0 && rope_sections[1] > 0;
}
#include "llama.h"
#include <array>
+#include <cassert>
// bump if necessary
#define LLAMA_MAX_LAYERS 512
uint32_t n_rel_attn_bkts = 0;
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
- uint32_t n_embd_head_k_mla = 0;
- uint32_t n_embd_head_v_mla = 0;
+ uint32_t n_embd_head_k_mla_impl = 0;
+ uint32_t n_embd_head_v_mla_impl = 0;
// for WavTokenizer
struct llama_hparams_posnet posnet;
uint32_t n_cls_out = 1;
// output embedding dimension (0 = use n_embd)
- uint32_t n_embd_out = 0;
+ uint32_t n_embd_out_impl = 0;
// llama4 smallthinker
uint32_t n_moe_layer_step = 0;
uint32_t n_embd_inp() const;
// dimension of output embeddings
- uint32_t get_n_embd_out() const;
+ uint32_t n_embd_out() const;
// dimension of key embeddings across all k-v heads
uint32_t n_embd_k_gqa(uint32_t il = 0) const;
bool is_swa(uint32_t il) const;
+ // note: MLA is currently supported only when either all or none of the layers use it
+ bool is_mla() const;
+
+ uint32_t n_embd_head_k_mla() const;
+ uint32_t n_embd_head_v_mla() const;
+
bool has_kv(uint32_t il) const;
// number of layers for which has_kv() returns true
uint32_t n_layer_kv() const;
// note that this function uses different SWA parameters from those in the hparams
+ // note: inlined on purpose for performance reasons
// TODO: think of a better place for this function
// TODO: pack the SWA params in a struct?
- static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
+ static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
+ assert(p0 >= 0 && p1 >= 0);
+
+ switch (swa_type) {
+ case LLAMA_SWA_TYPE_NONE:
+ {
+ } break;
+ case LLAMA_SWA_TYPE_STANDARD:
+ {
+ if (p1 - p0 >= (int32_t) n_swa) {
+ return true;
+ }
+ } break;
+ case LLAMA_SWA_TYPE_CHUNKED:
+ {
+ const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
+
+ if (p0 < pos_chunk_start) {
+ return true;
+ }
+ } break;
+ case LLAMA_SWA_TYPE_SYMMETRIC:
+ {
+ const int32_t half_n_swa = (int32_t) n_swa / 2;
+ const int32_t pos_diff = p1 - p0;
+
+ // Mask if outside the symmetric window
+ if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
+ return true;
+ }
+ } break;
+ }
+
+ return false;
+ }
+
bool use_mrope() const;
};
__func__, hparams.n_embd_v_gqa_max());
}
+ const bool is_mla = hparams.is_mla();
+
for (uint32_t il = 0; il < hparams.n_layer; il++) {
if (!hparams.has_kv(il)) {
LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
throw std::runtime_error("failed to create ggml context for kv cache");
}
- ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
- ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
+ const bool has_k = true;
+ const bool has_v = !is_mla;
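+ // with MLA the V data is recovered from the K cache at graph build time, so no V tensor is allocated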
+
+ ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr;
+ ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr;
- ggml_format_name(k, "cache_k_l%d", il);
- ggml_format_name(v, "cache_v_l%d", il);
+ if (has_k) { ggml_format_name(k, "cache_k_l%d", il); }
+ if (has_v) { ggml_format_name(v, "cache_v_l%d", il); }
std::vector<ggml_tensor *> k_stream;
std::vector<ggml_tensor *> v_stream;
for (uint32_t s = 0; s < n_stream; ++s) {
- k_stream.push_back(ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]));
- v_stream.push_back(ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]));
+ k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]) : nullptr);
+ v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]) : nullptr);
}
map_layer_ids[il] = layers.size();
const auto & layer = layers[il];
ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]);
- ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
+
+ if (layer.v_stream[ssrc]) {
+ ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
+ }
}
}
}
const llama_seq_id seq_id_cell = cells.seq_get(idx);
// SWA mask
- if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
can_use = true;
}
}
}
}
-void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
- const uint32_t n_tokens = ubatch->n_tokens;
+struct args_set_input_kq_mask {
+ const llama_hparams & hparams;
+ const llama_ubatch * ubatch;
- GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
- float * data = (float *) dst->data;
+ const std::vector<llama_kv_cells> & v_cells;
+ const std::vector<uint32_t> & seq_to_stream;
- const int64_t n_kv = dst->ne[0];
- const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
+ uint32_t n_swa;
+ llama_swa_type swa_type;
- GGML_ASSERT(n_tokens%n_stream == 0);
+ int64_t n_kv;
+ int64_t n_stream;
+ int64_t n_tps;
+};
- // n_tps == n_tokens_per_stream
- const int64_t n_tps = n_tokens/n_stream;
+template<bool causal, bool swa, bool is_2d, bool alibi>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+ //const auto & hparams = args.hparams;
+ const auto & ubatch = args.ubatch;
- std::fill(data, data + ggml_nelements(dst), -INFINITY);
-
- // Use only the previous KV cells of the correct sequence for each token of the ubatch.
- // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
- // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
- // Causal mask:
- // xxx-------
- // xxxx------
- // xxxxx-----
- // Non-causal mask:
- // xxxxx-----
- // xxxxx-----
- // xxxxx-----
- // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
- // TODO: optimize this section
- for (uint32_t h = 0; h < 1; ++h) {
- for (uint32_t s = 0; s < n_stream; ++s) {
- for (uint32_t ii = 0; ii < n_tps; ++ii) {
- const uint32_t i = s*n_tps + ii;
+ const auto & v_cells = args.v_cells;
+ const auto & seq_to_stream = args.seq_to_stream;
+
+ const uint32_t n_swa = args.n_swa;
+ const llama_swa_type swa_type = args.swa_type;
- const llama_seq_id seq_id = ubatch->seq_id[i][0];
+ const int64_t n_kv = args.n_kv;
+ const int64_t n_stream = args.n_stream;
+ const int64_t n_tps = args.n_tps;
- const auto & cells = v_cells[seq_to_stream[seq_id]];
+ // the min position in the batch for each sequence
+ llama_pos seq_pos_min[LLAMA_MAX_SEQ];
+ std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);
- const llama_pos p1 = ubatch->pos[i];
+ for (uint32_t i = 0; i < ubatch->n_tokens; ++i) {
+ const llama_seq_id seq_id = ubatch->seq_id[i][0];
- // for M-RoPE
- const bool is_2d = ubatch->is_pos_2d();
- const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
- const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
+ seq_pos_min[seq_id] = std::min(seq_pos_min[seq_id], ubatch->pos[i]);
+ }
- const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
+ for (uint32_t s = 0; s < n_stream; ++s) {
+ // bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
+ std::unordered_map<llama_seq_id, uint32_t> seq_srct;
+ std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
- for (uint32_t j = 0; j < n_kv; ++j) {
- if (cells.is_empty(j)) {
- continue;
- }
+ for (uint32_t ii = 0; ii < n_tps; ++ii) {
+ const uint32_t i = s*n_tps + ii;
+
+ const llama_seq_id seq_id = ubatch->seq_id[i][0];
+
+ const auto & cells = v_cells.at(seq_to_stream[seq_id]);
+
+ llama_pos p0 = -1;
+ const llama_pos p1 = ubatch->pos[i];
+
+ // for M-RoPE
+ const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
+ const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
+
+ const uint64_t idst = n_kv*i;
+
+ // for tokens of the same sequence, the mask is mostly the same, so we can reuse it
+ // the only cells that can change are the ones whose positions are close to the
+ // positions of the tokens in the batch (i.e. due to causal masking, SWA, etc.)
+ // keep track of those cells and shortcut the loop to save time
+ // note: this optimization is not compatible with Alibi position encoding
+ // ref: https://github.com/ggml-org/llama.cpp/pull/18842
+ bool prev = false;
- // mask the token if not the same sequence
- if (!cells.seq_has(j, seq_id)) {
- continue;
+ auto & idxs = seq_idxs[seq_id];
+
+ if (!alibi) {
+ if (seq_srct.find(seq_id) != seq_srct.end()) {
+ const uint32_t srct = seq_srct[seq_id];
+
+ const uint64_t idst_prev = n_kv*srct;
+
+ std::copy(data + idst_prev, data + idst_prev + n_kv, data + idst);
+
+ prev = true;
+ } else {
+ idxs.clear();
+ idxs.reserve(ubatch->n_tokens + n_swa + 32);
+
+ seq_srct[seq_id] = i;
+ }
+ }
+
+ for (uint32_t jj = 0; jj < n_kv; ++jj) {
+ uint32_t j = jj;
+
+ // we have an existing mask for this sequence -> update just the cells recorded in seq_idxs
+ if (!alibi) {
+ if (prev) {
+ if (jj >= idxs.size()) {
+ break;
+ }
+
+ j = idxs[jj];
}
+ }
+
+ if (cells.is_empty(j)) {
+ goto skip;
+ }
+
+ // mask the token if not the same sequence
+ if (!cells.seq_has(j, seq_id)) {
+ goto skip;
+ }
+
+ p0 = cells.pos_get(j);
- const llama_pos p0 = cells.pos_get(j);
+ if (!alibi) {
+ if (!prev) {
+ // record all cells for which: p0 >= seq_pos_min[seq_id] - n_swa - 32
+ if (p0 + (int32_t) (n_swa + 32) >= seq_pos_min[seq_id]) {
+ idxs.push_back(j);
+ }
+ }
+ }
+ if (causal) {
// mask future tokens
- if (causal_attn && p0 > p1) {
- continue;
+ if (p0 > p1) {
+ goto skip;
}
// M-RoPE causal mask
- if (causal_attn && is_2d && p0 == p1) {
- const auto & p0_ext = cells.ext_get(j);
- if (p0_ext.is_2d_gt(p1_x, p1_y)) {
- continue;
+ if (is_2d) {
+ if (p0 == p1) {
+ const auto & p0_ext = cells.ext_get(j);
+
+ if (p0_ext.is_2d_gt(p1_x, p1_y)) {
+ goto skip;
+ }
}
}
+ }
- // apply SWA if any
- if (is_masked_swa(p0, p1)) {
- continue;
+ // apply SWA if any
+ if (swa) {
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+ goto skip;
}
+ }
- data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
+ if (alibi) {
+ data[idst + j] = -std::abs(p0 - p1);
+ } else {
+ data[idst + j] = 0.0f;
}
+
+ continue;
+skip:
+ data[idst + j] = -INFINITY;
}
}
}
}
+template<bool causal, bool swa, bool is_2d>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+ const bool alibi = args.hparams.use_alibi;
+ if (alibi) {
+ set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
+ } else {
+ set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
+ }
+}
+
+template<bool causal, bool swa>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+ const bool is_2d = args.ubatch->is_pos_2d();
+ if (is_2d) {
+ set_input_kq_mask_impl<causal, swa, true> (args, data);
+ } else {
+ set_input_kq_mask_impl<causal, swa, false>(args, data);
+ }
+}
+
+template<bool causal>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+ const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
+ if (swa) {
+ set_input_kq_mask_impl<causal, true> (args, data);
+ } else {
+ set_input_kq_mask_impl<causal, false>(args, data);
+ }
+}
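// [editor sketch] the dispatch chain above peels one runtime boolean at a time into a template
// parameter so the innermost loop compiles with every branch resolved. A minimal sketch of the
// same pattern, with hypothetical names:
#include <cstdio>

template<bool causal>
static void kernel() {
    if (causal) { // resolved at compile time inside each instantiation
        std::printf("causal specialization\n");
    } else {
        std::printf("non-causal specialization\n");
    }
}

static void dispatch(bool causal) {
    if (causal) {
        kernel<true>();  // the branch inside kernel<true>() folds away
    } else {
        kernel<false>();
    }
}

int main() {
    dispatch(true);
    dispatch(false);
    return 0;
}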
+
+void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+ const uint32_t n_tokens = ubatch->n_tokens;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+ float * data = (float *) dst->data;
+
+ const int64_t n_kv = dst->ne[0];
+ const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
+
+ GGML_ASSERT(n_tokens%n_stream == 0);
+
+ // n_tps == n_tokens_per_stream
+ const int64_t n_tps = n_tokens/n_stream;
+
+ //const int64_t t_start = ggml_time_us();
+
+ const args_set_input_kq_mask args = {
+ /*.hparams =*/ hparams,
+ /*.ubatch =*/ ubatch,
+ /*.v_cells =*/ v_cells,
+ /*.seq_to_stream =*/ seq_to_stream,
+ /*.n_swa =*/ n_swa,
+ /*.swa_type =*/ swa_type,
+ /*.n_kv =*/ n_kv,
+ /*.n_stream =*/ n_stream,
+ /*.n_tps =*/ n_tps,
+ };
+
+ if (causal_attn) {
+ set_input_kq_mask_impl<true> (args, data);
+ } else {
+ set_input_kq_mask_impl<false>(args, data);
+ }
+
+ //const int64_t t_end = ggml_time_us();
+
+ //LLAMA_LOG_ERROR("%s: kq mask time: %0.3f ms\n", __func__, (t_end - t_start)/1000.0);
+}
+
void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
const int64_t n_tokens = ubatch->n_tokens;
size_t size_v_bytes = 0;
for (const auto & layer : layers) {
- size_v_bytes += ggml_nbytes(layer.v);
+ size_v_bytes += layer.v ? ggml_nbytes(layer.v) : 0;
}
return size_v_bytes;
const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v;
+ const auto & n_rot = hparams.n_rot;
+
+ const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
+
auto inp = std::make_unique<llm_graph_input_k_shift>(this);
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
ggml_tensor * k =
ggml_view_3d(ctx, layer.k,
- n_embd_head_k, n_head_kv, get_size()*n_stream,
+ n_rot, n_head_kv, get_size()*n_stream,
ggml_row_size(layer.k->type, n_embd_head_k),
ggml_row_size(layer.k->type, n_embd_k_gqa),
- 0);
+ ggml_row_size(layer.k->type, n_embd_nope));
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
return gf;
}
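// [editor sketch] the byte-offset arithmetic behind the K-shift view above, with hypothetical
// sizes: for an MLA cache the per-head K layout is [ nope | rope ] and only the trailing n_rot
// elements are rotated, so the view starts ggml_row_size(type, n_embd_nope) bytes into each head.
#include <cassert>
#include <cstdint>

int main() {
    const int64_t n_embd_head_k = 192;                   // hypothetical MLA head size
    const int64_t n_rot         = 64;                    // hypothetical rotated portion
    const int64_t n_embd_nope   = n_embd_head_k - n_rot; // 128 non-rotated elements

    const int64_t offs_f32 = n_embd_nope * (int64_t) sizeof(float); // row size for an f32 cache
    assert(offs_f32 == 512); // the rope view skips 512 bytes per head
    return 0;
}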
-bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
- return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
-}
-
void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
GGML_UNUSED(flags);
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
auto * v = layer.v_stream[cr.strm];
+ if (!v) {
+ continue;
+ }
// Write value type
const int32_t v_type_i = (int32_t) v->type;
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
auto * v = layer.v_stream[cr.strm];
+ if (!v) {
+ continue;
+ }
// Write value type
const int32_t v_type_i = (int32_t) v->type;
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
auto * v = layer.v_stream[strm];
+ if (!v) {
+ continue;
+ }
// Read type of value
int32_t v_type_i_ref;
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
auto * v = layer.v_stream[strm];
+ if (!v) {
+ continue;
+ }
// Read type of value
int32_t v_type_i_ref;
size_t size_k_bytes() const;
size_t size_v_bytes() const;
- bool is_masked_swa(llama_pos p0, llama_pos p1) const;
-
ggml_tensor * build_rope_shift(
const llama_cparams & cparams,
ggml_context * ctx,
--- /dev/null
+#include "llama-memory-hybrid-iswa.h"
+
+#include "llama-impl.h"
+#include "llama-model.h"
+#include "llama-context.h"
+
+//
+// llama_memory_hybrid_iswa
+//
+
+llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
+ const llama_model & model,
+ /* attn */
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool swa_full,
+ uint32_t kv_size,
+ uint32_t n_ubatch,
+ uint32_t n_pad,
+ /* recurrent */
+ ggml_type type_r,
+ ggml_type type_s,
+ uint32_t rs_size,
+ /* common */
+ uint32_t n_seq_max,
+ bool offload,
+ bool unified,
+ /* layer filters */
+ const layer_filter_cb & filter_attn,
+ const layer_filter_cb & filter_recr) :
+ hparams(model.hparams),
+ mem_attn(new llama_kv_cache_iswa(
+ model,
+ type_k,
+ type_v,
+ v_trans,
+ offload,
+ swa_full,
+ unified,
+ kv_size,
+ n_seq_max,
+ n_ubatch,
+ n_pad,
+ filter_attn == nullptr ?
+ [&](int32_t il) { return !hparams.is_recurrent(il); }
+ : filter_attn,
+ nullptr
+ )),
+ mem_recr(new llama_memory_recurrent(
+ model,
+ type_r,
+ type_s,
+ offload,
+ rs_size,
+ n_seq_max,
+ filter_recr == nullptr ?
+ [&](int32_t il) { return hparams.is_recurrent(il); }
+ : filter_recr
+ )) {}
+
+llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+ do {
+ balloc.split_reset();
+
+ // follow the recurrent pattern for creating the ubatch splits
+ std::vector<llama_ubatch> ubatches;
+
+ while (true) {
+ llama_ubatch ubatch;
+
+ if (embd_all) {
+ // if all tokens are output, split by sequence
+ ubatch = balloc.split_seq(n_ubatch);
+ } else {
+ // TODO: non-sequential equal split can be done if using unified KV cache
+ // for simplicity, we always use sequential equal split for now
+ ubatch = balloc.split_equal(n_ubatch, true);
+ }
+
+ if (ubatch.n_tokens == 0) {
+ break;
+ }
+
+ ubatches.push_back(std::move(ubatch)); // NOLINT
+ }
+
+ if (balloc.get_n_used() < balloc.get_n_tokens()) {
+ // failed to find a suitable split
+ break;
+ }
+
+ // prepare the recurrent batches first
+ if (!mem_recr->prepare(ubatches)) {
+ // TODO: will the recurrent cache be in an undefined state at this point?
+ LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
+ return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+ }
+
+ // prepare the attention cache (iswa version returns both base and swa slot infos)
+ auto sinfos_base = mem_attn->get_base()->prepare(ubatches);
+ if (sinfos_base.empty()) {
+ LLAMA_LOG_ERROR("%s: failed to prepare attention base ubatches\n", __func__);
+ return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+ }
+
+ auto sinfos_swa = mem_attn->get_swa()->prepare(ubatches);
+ if (sinfos_swa.empty()) {
+ LLAMA_LOG_ERROR("%s: failed to prepare attention swa ubatches\n", __func__);
+ return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+ }
+
+ return std::make_unique<llama_memory_hybrid_iswa_context>(
+ this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
+ } while(false);
+
+ return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+}
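// [editor sketch] init_batch() above uses the do { ... } while (false) idiom: any failed step
// breaks past the success path to a single shared failure return. A minimal sketch with
// hypothetical names:
#include <cstdio>

static bool step_a() { return true; }
static bool step_b() { return false; } // simulate a failing preparation step

static const char * run() {
    do {
        if (!step_a()) break;
        if (!step_b()) break; // fails -> fall through to the error return
        return "success";
    } while (false);
    return "failed to prepare";
}

int main() {
    std::printf("%s\n", run());
    return 0;
}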
+
+llama_memory_context_ptr llama_memory_hybrid_iswa::init_full() {
+ return std::make_unique<llama_memory_hybrid_iswa_context>(this);
+}
+
+llama_memory_context_ptr llama_memory_hybrid_iswa::init_update(llama_context * lctx, bool optimize) {
+ return std::make_unique<llama_memory_hybrid_iswa_context>(this, lctx, optimize);
+}
+
+bool llama_memory_hybrid_iswa::get_can_shift() const {
+ // shifting is trivial for the recurrent cache, so only the attention cache matters
+ return mem_attn->get_can_shift();
+}
+
+void llama_memory_hybrid_iswa::clear(bool data) {
+ mem_attn->clear(data);
+ mem_recr->clear(data);
+}
+
+bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ // Try removing from the recurrent cache first since it may fail. If it does
+ // fail, the cache will not have been mutated.
+ if (!mem_recr->seq_rm(seq_id, p0, p1)) {
+ return false;
+ }
+ return mem_attn->seq_rm(seq_id, p0, p1);
+}
+
+void llama_memory_hybrid_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+ mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_memory_hybrid_iswa::seq_keep(llama_seq_id seq_id) {
+ mem_attn->seq_keep(seq_id);
+ mem_recr->seq_keep(seq_id);
+}
+
+void llama_memory_hybrid_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+ mem_attn->seq_add(seq_id, p0, p1, shift);
+ mem_recr->seq_add(seq_id, p0, p1, shift);
+}
+
+void llama_memory_hybrid_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+ mem_attn->seq_div(seq_id, p0, p1, d);
+ mem_recr->seq_div(seq_id, p0, p1, d);
+}
+
+llama_pos llama_memory_hybrid_iswa::seq_pos_min(llama_seq_id seq_id) const {
+ // the min of the total cache is the max of the two caches' min values
+ return std::max(mem_attn->seq_pos_min(seq_id), mem_recr->seq_pos_min(seq_id));
+}
+
+llama_pos llama_memory_hybrid_iswa::seq_pos_max(llama_seq_id seq_id) const {
+ // the max of the total cache is the min of the two caches' max values
+ return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
+}
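// [editor sketch] a worked example of the interval reasoning in the two functions above, with
// assumed values: if the attention cache holds positions [5, 100] for a sequence and the
// recurrent cache holds [0, 90], the hybrid cache can only serve the intersection [5, 90],
// hence max() of the minima and min() of the maxima.
#include <algorithm>
#include <cassert>

int main() {
    const int attn_min = 5, attn_max = 100;
    const int recr_min = 0, recr_max = 90;
    assert(std::max(attn_min, recr_min) == 5);  // combined seq_pos_min
    assert(std::min(attn_max, recr_max) == 90); // combined seq_pos_max
    return 0;
}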
+
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid_iswa::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+ for (const auto & buft_size : mem_recr->memory_breakdown()) {
+ mb[buft_size.first] += buft_size.second;
+ }
+ return mb;
+}
+
+void llama_memory_hybrid_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+ mem_attn->state_write(io, seq_id, flags);
+ mem_recr->state_write(io, seq_id, flags);
+}
+
+void llama_memory_hybrid_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+ mem_attn->state_read(io, seq_id, flags);
+ mem_recr->state_read(io, seq_id, flags);
+}
+
+llama_kv_cache_iswa * llama_memory_hybrid_iswa::get_mem_attn() const {
+ return mem_attn.get();
+}
+
+llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const {
+ return mem_recr.get();
+}
+
+//
+// llama_memory_hybrid_iswa_context
+//
+
+llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_status status) : status(status) {}
+
+llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem) :
+ ctx_attn(mem->get_mem_attn()->init_full()),
+ ctx_recr(mem->get_mem_recr()->init_full()),
+ status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
+}
+
+llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
+ llama_memory_hybrid_iswa * mem,
+ llama_context * lctx,
+ bool optimize) :
+ ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
+ ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
+ status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
+}
+
+llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
+ llama_memory_hybrid_iswa * mem,
+ slot_info_vec_t sinfos_base,
+ slot_info_vec_t sinfos_swa,
+ std::vector<llama_ubatch> ubatches) :
+ ubatches(std::move(ubatches)),
+ // note: here we copy the ubatches. not sure if this is ideal
+ ctx_attn(new llama_kv_cache_iswa_context(mem->get_mem_attn(), std::move(sinfos_base), std::move(sinfos_swa), this->ubatches)),
+ ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
+ status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
+}
+
+bool llama_memory_hybrid_iswa_context::next() {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+ ctx_attn->next();
+ ctx_recr->next();
+
+ if (++i_next >= ubatches.size()) {
+ return false;
+ }
+
+ return true;
+}
+
+bool llama_memory_hybrid_iswa_context::apply() {
+ assert(!llama_memory_status_is_fail(status));
+
+ bool res = true;
+
+ res = res & ctx_attn->apply();
+ res = res & ctx_recr->apply();
+
+ return res;
+}
+
+llama_memory_status llama_memory_hybrid_iswa_context::get_status() const {
+ return status;
+}
+
+const llama_ubatch & llama_memory_hybrid_iswa_context::get_ubatch() const {
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+ return ubatches[i_next];
+}
+
+const llama_kv_cache_iswa_context * llama_memory_hybrid_iswa_context::get_attn() const {
+ return static_cast<const llama_kv_cache_iswa_context *>(ctx_attn.get());
+}
+
+const llama_memory_recurrent_context * llama_memory_hybrid_iswa_context::get_recr() const {
+ return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
+}
--- /dev/null
+#pragma once
+
+#include "llama-batch.h"
+#include "llama-graph.h"
+#include "llama-kv-cache-iswa.h"
+#include "llama-memory.h"
+#include "llama-memory-recurrent.h"
+
+#include <memory>
+#include <vector>
+
+//
+// llama_memory_hybrid_iswa
+//
+
+// utilizes instances of llama_memory_recurrent and llama_kv_cache_iswa to
+// support models where each layer may be either attention-based (with SWA support) or recurrent
+
+class llama_memory_hybrid_iswa : public llama_memory_i {
+public:
+ llama_memory_hybrid_iswa(
+ const llama_model & model,
+ /* attn */
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool swa_full,
+ uint32_t kv_size,
+ uint32_t n_ubatch,
+ uint32_t n_pad,
+ /* recurrent */
+ ggml_type type_r,
+ ggml_type type_s,
+ uint32_t rs_size,
+ /* common */
+ uint32_t n_seq_max,
+ bool offload,
+ bool unified,
+ /* layer filters */
+ const layer_filter_cb & filter_attn = nullptr,
+ const layer_filter_cb & filter_recr = nullptr);
+
+ ~llama_memory_hybrid_iswa() = default;
+
+ //
+ // llama_memory_i
+ //
+
+ llama_memory_context_ptr init_batch(
+ llama_batch_allocr & balloc,
+ uint32_t n_ubatch,
+ bool embd_all) override;
+
+ llama_memory_context_ptr init_full() override;
+
+ llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
+
+ bool get_can_shift() const override;
+
+ void clear(bool data) override;
+
+ bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+ void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+ void seq_keep(llama_seq_id seq_id) override;
+ void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
+ void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+
+ llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+ llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+ std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
+ // state write/load
+
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
+
+ //
+ // llama_memory_hybrid_iswa specific API
+ //
+
+ llama_kv_cache_iswa * get_mem_attn() const;
+ llama_memory_recurrent * get_mem_recr() const;
+
+private:
+ const llama_hparams & hparams;
+
+ const std::unique_ptr<llama_kv_cache_iswa> mem_attn;
+ const std::unique_ptr<llama_memory_recurrent> mem_recr;
+};
+
+class llama_memory_hybrid_iswa_context : public llama_memory_context_i {
+public:
+ using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
+
+ // init failure
+ explicit llama_memory_hybrid_iswa_context(llama_memory_status status);
+
+ // init full
+ explicit llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem);
+
+ // init update
+ explicit llama_memory_hybrid_iswa_context(
+ llama_memory_hybrid_iswa * mem,
+ llama_context * lctx,
+ bool optimize);
+
+ // init success
+ llama_memory_hybrid_iswa_context(
+ llama_memory_hybrid_iswa * mem,
+ slot_info_vec_t sinfos_base,
+ slot_info_vec_t sinfos_swa,
+ std::vector<llama_ubatch> ubatches);
+
+ ~llama_memory_hybrid_iswa_context() = default;
+
+ bool next() override;
+ bool apply() override;
+
+ llama_memory_status get_status() const override;
+ const llama_ubatch & get_ubatch() const override;
+
+ //
+ // llama_memory_hybrid_iswa_context
+ //
+
+ const llama_kv_cache_iswa_context * get_attn() const;
+ const llama_memory_recurrent_context * get_recr() const;
+
+private:
+ // the index of the next ubatch to process
+ size_t i_next = 0;
+
+ std::vector<llama_ubatch> ubatches;
+
+ const llama_memory_context_ptr ctx_attn;
+ const llama_memory_context_ptr ctx_recr;
+
+ const llama_memory_status status;
+};
}
errno = 0;
if (fd == -1) {
- std::size_t ret = std::fread(ptr, len, 1, fp);
+ const size_t curr_off = tell();
+ const size_t to_read = std::min(len, size - curr_off);
+
+ std::size_t ret = std::fread(ptr, to_read, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
- if (ret != 1) {
+ if (to_read > 0 && ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}
} else {
continue; // Interrupted by signal, retry
}
// Fallback to std::fread in case the DMA controller cannot access the buffer
- if (errno == EFAULT) {
+ if (errno == EFAULT || errno == EINVAL) {
+ LLAMA_LOG_WARN("%s: falling back to buffered I/O due to %s\n", __func__, strerror(errno));
auto curr_off = tell();
close(fd);
fd = -1;
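// [editor sketch] a hedged sketch of the fallback condition above (POSIX-only; the error values
// come from the diff, the helper name is hypothetical): O_DIRECT reads commonly fail with EINVAL
// on misaligned buffers/offsets and with EFAULT when DMA cannot reach the buffer, so both
// trigger a reopen in buffered mode.
#include <cerrno>
#include <cstdio>

static bool should_fall_back_to_buffered(int err) {
    return err == EFAULT || err == EINVAL;
}

int main() {
    std::printf("EINVAL falls back: %d\n", should_fall_back_to_buffered(EINVAL) ? 1 : 0);
    std::printf("EBADF  falls back: %d\n", should_fall_back_to_buffered(EBADF)  ? 1 : 0);
    return 0;
}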
#ifdef _WIN32
return _fileno(pimpl->fp);
#else
+ if (pimpl->fd != -1) {
+ return pimpl->fd;
+ }
#if defined(fileno)
return fileno(pimpl->fp);
#else
char* errmsg = std::strerror(errno);
bool suggest = (errno == ENOMEM);
-#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
- // visionOS/tvOS dont't support RLIMIT_MEMLOCK
- // Skip resource limit checks on visionOS/tvOS
+#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__)
+ // visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK
+ // Skip resource limit checks on these platforms
suggest = false;
#else
struct rlimit lock_limit;
#include "ggml.h"
+#include <algorithm>
#include <array>
#include <cinttypes>
#include <cstring>
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
switch (arr_info.gt) {
+ case GGUF_TYPE_BOOL:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
(std::is_same<T, uint32_t>::value)); break;
result[i] = value;
}
} else {
- std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
+ if (arr_info.gt == GGUF_TYPE_BOOL) {
+ std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
+ return static_cast<T>(x);
+ });
+ } else {
+ std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
+ }
}
return true;
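// [editor sketch] why GGUF_TYPE_BOOL gets its own path above (the buffer contents here are an
// assumption for illustration): the raw array holds 1-byte bools, so copying it as if it were
// int32_t would misread the data; std::transform widens each element instead.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    const bool raw[4] = { true, false, true, true }; // as stored in the file
    std::vector<int32_t> result(4);
    std::transform(raw, raw + 4, result.begin(), [](bool x) {
        return static_cast<int32_t>(x); // widen 1-byte bool to the requested element type
    });
    assert(result[0] == 1 && result[1] == 0 && result[2] == 1 && result[3] == 1);
    return 0;
}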
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
contexts.emplace_back(ctx);
- use_direct_io = use_direct_io && files.back()->has_direct_io();
+ if (use_mmap && use_direct_io) {
+ if (files.back()->has_direct_io()) {
+ LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+ use_mmap = false;
+ } else {
+ LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
+ use_direct_io = false;
- // Disable mmap in case Direct I/O is enabled and available
- if (use_direct_io && use_mmap) {
- use_mmap = false;
- LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+ // reopen file using std::fopen for mmap
+ files.pop_back();
+ files.emplace_back(new llama_file(fname.c_str(), "rb", false));
+ }
}
// Save tensors data offset of the main file.
add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
- if (hparams.n_embd_out > 0) {
- add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out);
+ if (hparams.n_embd_out_impl > 0) {
+ add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl);
}
add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h"
+#include "llama-memory-hybrid-iswa.h"
#include "llama-memory-recurrent.h"
#include "ggml-cpp.h"
llama_mlocks mlock_bufs;
llama_mlocks mlock_mmaps;
- // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
+ // contexts where the model tensors metadata is stored as well as the corresponding buffers:
std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
buft_list_t cpu_buft_list;
pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
}
-llama_model::~llama_model() = default;
+llama_model::~llama_model() {
+ for (auto * lora : loras) {
+ delete lora;
+ }
+}
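// [editor sketch] the ownership model introduced by the destructor above, reduced to a sketch
// with hypothetical types: adapters register themselves with the model at load time and are
// deleted centrally by the model's destructor, which is why llama_adapter_lora_free() became a
// deprecated no-op.
#include <unordered_set>

struct adapter {};

struct model {
    std::unordered_set<adapter *> loras;
    ~model() {
        for (auto * a : loras) {
            delete a; // centralized cleanup replaces per-adapter free()
        }
    }
};

int main() {
    model m;
    m.loras.insert(new adapter()); // registration, as in llama_adapter_lora_init_impl
    return 0;
} // adapters destroyed here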
void llama_model::load_stats(llama_model_loader & ml) {
pimpl->n_elements = ml.n_elements;
ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
- ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false);
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
case LLM_ARCH_DEEPSEEK2:
{
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
- bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
+ const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
if (!is_lite) {
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
}
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
// for compatibility with existing DeepSeek V2 and V2.5 GGUFs
// that have no expert_gating_func model parameter set
- hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+ if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
+ // GLM 4.7 Lite
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ } else {
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+ }
}
if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
switch (hparams.n_layer) {
case 27: type = LLM_TYPE_16B; break;
+ case 47: type = LLM_TYPE_30B_A3B; break;
case 60: type = LLM_TYPE_236B; break;
case 61: type = LLM_TYPE_671B; break;
default: type = LLM_TYPE_UNKNOWN;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_EXAONE_MOE:
+ {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.n_swa = 128;
+ hparams.set_swa_pattern(4);
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_30B_A3B; break;
+ case 48:
+ case 49: type = LLM_TYPE_235B_A22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
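// [editor sketch] the assumed effect of set_swa_pattern(4) above: three sliding-window layers
// followed by one full-attention layer, repeating. The predicate below mirrors that assumption;
// the authoritative convention lives in llama_hparams::set_swa_pattern and should be checked there.
#include <cassert>

static bool is_swa_layer(int il, int n_pattern) {
    return il % n_pattern < (n_pattern - 1); // layers 0,1,2 SWA; layer 3 full; repeat
}

int main() {
    assert( is_swa_layer(0, 4));
    assert( is_swa_layer(2, 4));
    assert(!is_swa_layer(3, 4));
    assert( is_swa_layer(4, 4)); // pattern repeats every 4 layers
    return 0;
}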
case LLM_ARCH_RWKV6:
case LLM_ARCH_RWKV6QWEN2:
{
} break;
case LLM_ARCH_DEEPSEEK2:
{
- // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
- const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
-
- const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+ const bool is_mla = hparams.is_mla();
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
- const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
- const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+ const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+ const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
const int64_t n_embd_head_qk_rope = hparams.n_rot;
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- if (!is_lite) {
+ if (q_lora_rank > 0) {
layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
}
layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
- if (!is_lite) {
+ if (q_lora_rank > 0) {
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
} else {
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
}
} break;
+ case LLM_ARCH_EXAONE_MOE:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert = hparams.n_expert;
+ const int64_t n_expert_used = hparams.n_expert_used;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp;
+ const int64_t head_dim = hparams.n_embd_head_k;
+ const int64_t n_qo_dim = n_head * head_dim;
+ const int64_t n_kv_dim = n_head_kv * head_dim;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // fallback: tie the output head to the token embeddings
+ // (the first create_tensor must be TENSOR_NOT_REQUIRED for this branch to be reachable)
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ int flags = 0;
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ // skip all tensors in the NextN layers
+ flags |= TENSOR_SKIP;
+ }
+
+ auto & layer = layers[i];
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, flags);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, flags);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, flags);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+ // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
+ if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
+ } else {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
+
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+ }
+
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags);
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags);
+
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
+ }
+ }
+ } break;
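// [editor sketch] the tensor-flag composition used throughout the loader case above, with
// placeholder flag values (not the real llama.cpp constants): per-layer flags are OR-ed together
// so a NextN layer's tensors can be both skipped and optional at the same time.
#include <cassert>

enum : int {
    TENSOR_NOT_REQUIRED = 1 << 0, // hypothetical values for illustration
    TENSOR_DUPLICATED   = 1 << 1,
    TENSOR_SKIP         = 1 << 2,
};

int main() {
    int flags = 0;
    const bool is_nextn_layer = true; // pretend this layer is an MTP layer
    if (is_nextn_layer) {
        flags |= TENSOR_SKIP;
    }
    const int head_flags = flags | TENSOR_NOT_REQUIRED;
    assert(head_flags & TENSOR_SKIP);
    assert(head_flags & TENSOR_NOT_REQUIRED);
    return 0;
}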
case LLM_ARCH_RWKV6:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
}
// for LFM2-ColBert-350M
- dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
+ dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
} break;
case LLM_ARCH_SMALLTHINKER:
{
};
// hparams
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
- LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
- LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
+ LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+ LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
if (!hparams.vocab_only) {
- LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
- LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
- LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
- LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
- LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
- LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
- LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
- LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
- LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
- LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
- LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
- LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
- LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
- LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
- LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
- LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
- LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
- LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
- LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
- LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
- LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
- LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
- LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
- LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
- LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
- LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
- LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
- LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
- LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+ LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+ LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+ LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+ LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
+ LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
+ LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
+ LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
+ LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
+ LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
+ LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
+ LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+ LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
+ LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
+ LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
+ LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
+ LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
+ LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
+ LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
- LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
- LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+ LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
+ LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
}
- LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
- LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
- LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
+ LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
+ LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
// MRoPE (Multi-axis Rotary Position Embedding) sections
if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
- LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
+ LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
}
if (!classifier_labels.empty()) {
- LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
+ LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
size_t i = 0;
for (auto label : classifier_labels) {
- LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
+ LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
}
}
}
arch == LLM_ARCH_QWEN3NEXT ||
arch == LLM_ARCH_NEMOTRON_H ||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
- LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
- LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
- LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
- LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
- LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
- LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
+ LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
+ LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
+ LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
+ LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
+ LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
+ LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
}
- LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
if (pimpl->n_elements >= 1e12) {
- LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
+ LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
} else if (pimpl->n_elements >= 1e9) {
- LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
} else if (pimpl->n_elements >= 1e6) {
- LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
+ LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
} else {
- LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
+ LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
}
// general kv
- LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
if (arch == LLM_ARCH_DEEPSEEK) {
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
}
if (arch == LLM_ARCH_DEEPSEEK2) {
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
- LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
- LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
- LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
- LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+ LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
+ LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
}
if (arch == LLM_ARCH_QWEN2MOE) {
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
}
if (arch == LLM_ARCH_MINICPM ||
arch == LLM_ARCH_GRANITE_MOE ||
arch == LLM_ARCH_GRANITE_HYBRID ||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
- LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
- LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
- LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+ LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+ LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+ LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
if (arch == LLM_ARCH_BAILINGMOE) {
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
}
if (arch == LLM_ARCH_BAILINGMOE2) {
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
- LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+ LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
}
if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
}
if (arch == LLM_ARCH_GROVEMOE) {
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
- LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
- LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
- LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
+ LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
+ LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
}
vocab.print_info();
};
}
- res = new llama_memory_hybrid(
- /* model */ *this,
- /* attn_type_k */ params.type_k,
- /* attn_type_v */ params.type_v,
- /* attn_v_trans */ !cparams.flash_attn,
- /* attn_kv_size */ cparams.n_ctx,
- /* attn_n_pad */ 1,
- /* attn_n_swa */ hparams.n_swa,
- /* attn_swa_type */ hparams.swa_type,
- /* recurrent_type_k */ GGML_TYPE_F32,
- /* recurrent_type_v */ GGML_TYPE_F32,
- /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
- /* n_seq_max */ cparams.n_seq_max,
- /* offload */ cparams.offload_kqv,
- /* unified */ cparams.kv_unified,
- /* filter_attn */ std::move(filter_attn),
- /* filter_recr */ std::move(filter_recr));
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ // Use hybrid-iswa for hybrid models with SWA
+ res = new llama_memory_hybrid_iswa(
+ /* model */ *this,
+ /* attn_type_k */ params.type_k,
+ /* attn_type_v */ params.type_v,
+ /* attn_v_trans */ !cparams.flash_attn,
+ /* attn_swa_full */ params.swa_full,
+ /* attn_kv_size */ cparams.n_ctx,
+ /* attn_n_ubatch */ cparams.n_ubatch,
+ /* attn_n_pad */ 1,
+ /* recurrent_type_r */ GGML_TYPE_F32,
+ /* recurrent_type_s */ GGML_TYPE_F32,
+ /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+ /* n_seq_max */ cparams.n_seq_max,
+ /* offload */ cparams.offload_kqv,
+ /* unified */ cparams.kv_unified,
+ /* filter_attn */ std::move(filter_attn),
+ /* filter_recr */ std::move(filter_recr));
+ } else {
+ res = new llama_memory_hybrid(
+ /* model */ *this,
+ /* attn_type_k */ params.type_k,
+ /* attn_type_v */ params.type_v,
+ /* attn_v_trans */ !cparams.flash_attn,
+ /* attn_kv_size */ cparams.n_ctx,
+ /* attn_n_pad */ 1,
+ /* attn_n_swa */ hparams.n_swa,
+ /* attn_swa_type */ hparams.swa_type,
+ /* recurrent_type_k */ GGML_TYPE_F32,
+ /* recurrent_type_v */ GGML_TYPE_F32,
+ /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+ /* n_seq_max */ cparams.n_seq_max,
+ /* offload */ cparams.offload_kqv,
+ /* unified */ cparams.kv_unified,
+ /* filter_attn */ std::move(filter_attn),
+ /* filter_recr */ std::move(filter_recr));
+ }
} else {
llama_memory_i::layer_reuse_cb reuse = nullptr;
llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
}
} break;
+ case LLM_ARCH_EXAONE_MOE:
+ {
+ llm = std::make_unique<llm_build_exaone_moe>(*this, params);
+ } break;
case LLM_ARCH_RWKV6:
{
llm = std::make_unique<llm_build_rwkv6>(*this, params);
/*.kv_overrides =*/ nullptr,
/*.vocab_only =*/ false,
/*.use_mmap =*/ true,
- /*.use_direct_io =*/ true,
+ /*.use_direct_io =*/ false,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
}
int32_t llama_model_n_embd_out(const llama_model * model) {
- return model->hparams.get_n_embd_out();
+ return model->hparams.n_embd_out();
}
int32_t llama_model_n_layer(const llama_model * model) {
case LLM_ARCH_NEMOTRON:
case LLM_ARCH_EXAONE:
case LLM_ARCH_EXAONE4:
+ case LLM_ARCH_EXAONE_MOE:
case LLM_ARCH_MINICPM3:
case LLM_ARCH_BAILINGMOE2:
case LLM_ARCH_DOTS1:
#include <memory>
#include <string>
#include <unordered_map>
+#include <unordered_set>
#include <vector>
struct llama_cparams;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
- // for keeping track of extra nodes used by lora adapters
- uint32_t n_lora_nodes = 0;
+ // for keeping track of associated LoRA adapters
+ std::unordered_set<llama_adapter_lora *> loras;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
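With the n_lora_nodes counter gone, adapter lifetime is expressed through the `loras` set above: the model owns whatever is registered in it. A minimal sketch of the lifetime this implies (the destructor body is an assumption for illustration, not the actual llama_model code):

struct llama_model_sketch {
    std::unordered_set<llama_adapter_lora *> loras;

    // hypothetical: mirrors the documented behavior that adapters are freed
    // together with the associated model, which is also why
    // llama_adapter_lora_free() becomes a deprecated no-op
    ~llama_model_sketch() {
        for (llama_adapter_lora * lora : loras) {
            delete lora;
        }
    }
};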
++qs.i_ffn_up;
}
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- //}
- // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
- //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- //}
- // This can be used to reduce the size of the Q5_K_S model.
- // The associated PPL increase is fully in line with the size reduction
- //else {
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
- //}
- bool convert_incompatible_tensor = false;
- {
- const int64_t nx = tensor->ne[0];
- const int64_t ny = tensor->ne[1];
- const int64_t qk_k = ggml_blck_size(new_type);
-
- if (nx % qk_k != 0) {
- LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
- convert_incompatible_tensor = true;
- } else {
- ++qs.n_k_quantized;
- }
- }
-
- if (convert_incompatible_tensor) {
- switch (new_type) {
- case GGML_TYPE_TQ1_0:
- case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ3_S:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_Q2_K:
- case GGML_TYPE_Q3_K:
- case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
- case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
- case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
- case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
- default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
- }
- if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
- new_type = GGML_TYPE_F16;
- }
- LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
- ++qs.n_fallback;
- }
-
return new_type;
}
}
std::vector<std::string> splits = {};
- llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+ llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params());
// get more optimal quantization type based on the tensor shape, layer, etc.
if (!params->pure && ggml_is_quantized(default_type)) {
- int fallback = qs.n_fallback;
- new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
- // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
- if (params->tensor_types && qs.n_fallback - fallback == 0) {
+ // if the user provided tensor types - use those
+ bool manual = false;
+ if (params->tensor_types) {
const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
const std::string tensor_name(tensor->name);
for (const auto & [tname, qtype] : tensor_types) {
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
if (qtype != new_type) {
- LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+ LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
+ new_type = qtype; // if two or more types are specified for the same tensor, the first matching override wins
+ manual = true;
+ break;
}
}
}
}
+
+ // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
+ if (!manual) {
+ new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+ }
+
+ // incompatible tensor shapes are handled here - fall back to a compatible type
+ {
+ bool convert_incompatible_tensor = false;
+
+ const int64_t nx = tensor->ne[0];
+ const int64_t ny = tensor->ne[1];
+ const int64_t qk_k = ggml_blck_size(new_type);
+
+ if (nx % qk_k != 0) {
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
+ convert_incompatible_tensor = true;
+ } else {
+ ++qs.n_k_quantized;
+ }
+
+ if (convert_incompatible_tensor) {
+ switch (new_type) {
+ case GGML_TYPE_TQ1_0:
+ case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
+ case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ3_S:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+ case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+ case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+ case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+ default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+ }
+ if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+ new_type = GGML_TYPE_F16;
+ }
+ LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+ ++qs.n_fallback;
+ }
+ }
}
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
new_type = params->token_embedding_type;
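The divisibility check above exists because quantized types pack each row into fixed-size blocks. A minimal standalone sketch of the same test, assuming only ggml.h:

#include "ggml.h"

// a row can be quantized to `type` only if its length is a whole number of blocks:
// k-quants use 256-element blocks, the fallback types IQ4_NL/Q5_0/Q8_0 use 32,
// and F16 uses 1 -- which is why F16 is the fallback of last resort above
static bool row_is_quantizable(int64_t ne0, enum ggml_type type) {
    return ne0 % ggml_blck_size(type) == 0;
}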
mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32));
mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]);
- // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
- // top_p_bias = (mask * 1e9f) - 1e9f.
- // So entries in the mask that we want to discard will become -1e9f, and
- // others will be 0 (meaning that will not effect the logits).
- const float large_val = 1e9f;
- struct ggml_tensor * top_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
+ // Apply -INFINITY bias for masked-out tokens
+ // log(1) = 0 (keep), log(0) = -INF (discard)
+ struct ggml_tensor * top_p_bias = ggml_log(ctx, mask);
ggml_set_name(top_p_bias, "top_p_bias");
data->logits = ggml_add(ctx, sorted_logits, top_p_bias);
struct ggml_tensor * mask = ggml_step(ctx, sub);
ggml_set_name(mask, "min_p_mask");
- // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
- // min_p_bias = (mask * 1e9f) - 1e9f.
- // So entries in the mask that we want to discard will become -1e9f, and
- // others will be 0 (meaning that will not effect the logits).
- const float large_val = 1e9f;
- struct ggml_tensor * min_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
+ // Apply -INFINITY bias for masked-out tokens
+ // log(1) = 0 (keep), log(0) = -INF (discard)
+ struct ggml_tensor * min_p_bias = ggml_log(ctx, mask);
ggml_set_name(min_p_bias, "min_p_bias");
- // Add the min_p bias to the logits.
data->logits = ggml_add(ctx, data->logits, min_p_bias);
ggml_set_name(data->logits, "min_p_logits");
return result;
}
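Both hunks above use the same identity: adding the elementwise log of a {0,1} mask either leaves a logit unchanged or drives it to exactly -INFINITY, which later samplers can detect (see the -INFINITY check in the adaptive-p transform below). A scalar sketch of the trick, not part of the graph code:

#include <cmath>

// mask == 1.0f -> logit + log(1) = logit (keep)
// mask == 0.0f -> logit + log(0) = -inf  (discard)
static float masked_logit(float logit, float mask) {
    return logit + std::log(mask);
}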
+// adaptive-p sampler state
+//
+// maintains an exponential moving average of the *ORIGINAL* probabilities
+// of selected tokens, used to compute an adapted target at each sampling step.
+//
+// see llama.h for a full description of the sampler
+//
+// ref: https://github.com/ggml-org/llama.cpp/pull/17927
+//
+struct llama_sampler_adaptive_p {
+ const float target; // target probability (0.0 - 1.0; negative = disabled)
+ const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99)
+ const uint32_t seed; // original RNG seed
+ uint32_t seed_cur; // actual RNG seed
+ std::mt19937 rng; // RNG state
+ float weighted_sum; // sum(p_i * decay^i)
+ float total_weight; // sum(decay^i), converges to 1/(1-decay)
+ std::vector<float> original_probs; // pre-transform probs, cached for EMA update
+ llama_token pending_token_id; // token ID of selected token
+ int32_t pending_token_idx; // index of orig. prob. of selected token in original_probs
+};
+
+// adaptive probability transformation constants
+static constexpr float DISTRIBUTION_WIDTH = 0.3f;
+static constexpr float PEAK_LOGIT_VALUE = 5.0f;
+static constexpr float SHARPNESS = 10.0f;
+static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH;
+
+static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) {
+ return "adaptive-p";
+}
+
+static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
+
+ llama_sampler_softmax_impl(cur_p, false);
+
+ if (ctx->target < 0.0f) {
+ // at negative target values, adaptive-p is a no-op
+ // we simply sample from the existing distribution
+ cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
+ return;
+ }
+
+ // store the original probabilities
+ ctx->original_probs.resize(cur_p->size);
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ ctx->original_probs[i] = cur_p->data[i].p;
+ }
+
+ // using the EMA, compute the adapted target probability for the current sampling step
+ auto target = std::clamp(ctx->target, 0.0f, 1.0f);
+ float adapted_target = std::clamp(
+ ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight),
+ 0.0f, 1.0f
+ );
+
+ // adaptive probability transform
+ //
+ // quadratic near target for fine differentiation, transitioning to linear decay in the
+ // tails. unbounded negative logits ensure proper suppression of far-from-target tokens
+ // after the softmax.
+ //
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ if (cur_p->data[i].logit == -INFINITY) {
+ // don't transform logits that are -INFINITY
+ // (as masked out by e.g. min-p and top-p when using backend sampling)
+ continue;
+ }
+ float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH);
+ cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist);
+ }
+
+ // softmax and sample from the transformed distribution
+ llama_sampler_softmax_impl(cur_p, false);
+ const int idx = llama_sample_dist(cur_p, ctx->rng);
+ cur_p->selected = idx;
+
+ // store the selected token ID for acceptance later
+ ctx->pending_token_id = cur_p->data[idx].id;
+ ctx->pending_token_idx = idx;
+}
+
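The transform is easier to see in isolation: with the constants above, it is quadratic near the adapted target and asymptotically linear in the tails. A standalone sketch of the same expression:

// dist = |p - adapted_target| / DISTRIBUTION_WIDTH
// near the target (dist << 1): logit ~= 5 - 10*dist*dist (fine differentiation)
// far from it     (dist >> 1): logit ~= 5 - 10*dist      (unbounded linear decay)
static float adaptive_p_transform(float dist) {
    return PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist);
}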
+static void llama_sampler_adaptive_p_accept(struct llama_sampler * smpl, llama_token token) {
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
+ if (ctx->pending_token_id == token) {
+ GGML_ASSERT(ctx->pending_token_id != LLAMA_TOKEN_NULL);
+ GGML_ASSERT(ctx->pending_token_idx != -1);
+ // update EMA with the original probability of the selected token
+ ctx->weighted_sum = ctx->original_probs[ctx->pending_token_idx] + ctx->decay * ctx->weighted_sum;
+ ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
+ }
+ ctx->pending_token_id = LLAMA_TOKEN_NULL;
+ ctx->pending_token_idx = -1;
+}
+
+static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
+ // ctx->target and ctx->decay never change after init, so it's safe to keep them as is.
+ // original_probs is completely overwritten on every call to _apply, so we only need to
+ // reset the EMA state, the pending token, and the RNG.
+ ctx->weighted_sum = ctx->target / (1.0f - ctx->decay);
+ ctx->total_weight = 1.0f / (1.0f - ctx->decay);
+ ctx->pending_token_id = LLAMA_TOKEN_NULL;
+ ctx->pending_token_idx = -1;
+ ctx->seed_cur = get_rng_seed(ctx->seed);
+ ctx->rng.seed(ctx->seed_cur);
+}
+
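A worked example of the EMA bookkeeping with hypothetical numbers, showing why the reset state makes the first adapted target equal the configured target:

float target = 0.6f, decay = 0.9f;

float ws = target / (1.0f - decay); // 6.0  -- weighted_sum after reset
float tw = 1.0f   / (1.0f - decay); // 10.0 -- total_weight after reset

float adapted = 2.0f*target - ws/tw; // 2*0.6 - 0.6 = 0.6 -> first step aims exactly at target

// accept a token whose ORIGINAL probability was 0.8
ws = 0.8f + decay*ws; // 6.2
tw = 1.0f + decay*tw; // 10.0 -- total_weight is already at its fixed point

adapted = 2.0f*target - ws/tw; // 2*0.6 - 0.62 = 0.58 -> next step aims lower to compensate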
+static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx;
+ auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed);
+ auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx;
+
+ // copy the state (target, decay, and seed are already set by init)
+ result_ctx->rng = ctx->rng;
+ result_ctx->weighted_sum = ctx->weighted_sum;
+ result_ctx->total_weight = ctx->total_weight;
+ result_ctx->original_probs = ctx->original_probs;
+ result_ctx->pending_token_id = ctx->pending_token_id;
+ result_ctx->pending_token_idx = ctx->pending_token_idx;
+
+ return result;
+}
+
+static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_adaptive_p *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_adaptive_p_i = {
+ /* .name = */ llama_sampler_adaptive_p_name,
+ /* .accept = */ llama_sampler_adaptive_p_accept,
+ /* .apply = */ llama_sampler_adaptive_p_apply,
+ /* .reset = */ llama_sampler_adaptive_p_reset,
+ /* .clone = */ llama_sampler_adaptive_p_clone,
+ /* .free = */ llama_sampler_adaptive_p_free,
+ /* .backend_init = */ nullptr,
+ /* .backend_accept = */ nullptr,
+ /* .backend_apply = */ nullptr,
+ /* .backend_set_input = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_adaptive_p(
+ float target,
+ float decay,
+ uint32_t seed
+) {
+ auto seed_cur = get_rng_seed(seed);
+ float clamped_decay = std::clamp(decay, 0.0f, 0.99f);
+ return llama_sampler_init(
+ /* .iface = */ &llama_sampler_adaptive_p_i,
+ /* .ctx = */ new llama_sampler_adaptive_p {
+ /* .target = */ target,
+ /* .decay = */ clamped_decay,
+ /* .seed = */ seed,
+ /* .seed_cur = */ seed_cur,
+ /* .rng = */ std::mt19937(seed_cur),
+ /* .weighted_sum = */ target / (1.0f - clamped_decay),
+ /* .total_weight = */ 1.0f / (1.0f - clamped_decay),
+ /* .original_probs = */ {},
+ /* .pending_token_id = */ LLAMA_TOKEN_NULL,
+ /* .pending_token_idx = */ -1
+ }
+ );
+}
+
// logit-bias
struct llama_sampler_logit_bias : public llama_sampler_backend {
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
+ case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+ };
+ break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
} else if (
tokenizer_pre == "exaone4") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+ } else if (
+ tokenizer_pre == "exaone-moe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE;
} else if (
tokenizer_pre == "chameleon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
auto & attr = id_to_token[t.second].attr;
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
+ LLAMA_LOG_WARN("%s: setting token '%s' (%d) attribute to USER_DEFINED (%u), old attributes: %u\n",
+ __func__, t.first.c_str(), t.second, LLAMA_TOKEN_ATTR_USER_DEFINED, attr);
+
+ attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
}
}
special_eog_ids.erase(end_id);
auto & attr = id_to_token[end_id].attr;
- attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
+ attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
}
}
void llama_vocab::impl::print_info() const {
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
- LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
- LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
+ LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
// special tokens
- if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
- if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
- if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
- if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
- if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
- if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
- if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
- if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
-
- if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
-
- if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
- if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
- if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
- if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
- if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
- if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
+ if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
+ if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
+ if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
+ if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
+ if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
+ if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
+ if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
+ if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
+
+ if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
+
+ if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
+ if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
+ if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
for (const auto & id : special_eog_ids) {
- LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
+ LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
}
- LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
+ LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
}
llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
+ LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
};
struct LLM_KV;
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
}
} else {
- LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
- __func__, hp_nct, n_ctx_min);
+ if (n_ctx_min == UINT32_MAX) {
+ LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
+ } else {
+ LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
+ __func__, hp_nct, n_ctx_min);
+ }
}
} else {
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
// model split
//
-int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
+int32_t llama_split_path(
+ char * split_path,
+ size_t maxlen,
+ const char * path_prefix,
+ int32_t split_no,
+ int32_t split_count) {
+
static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
- if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
- return strlen(split_path);
+
+ const int written = snprintf(
+ split_path,
+ maxlen,
+ SPLIT_PATH_FORMAT,
+ path_prefix,
+ split_no + 1,
+ split_count
+ );
+
+ if (written < 0 || (size_t) written >= maxlen) {
+ return 0;
}
- return 0;
+
+ return (int32_t) written;
}
-int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
- std::string str_split_path(split_path);
+int32_t llama_split_prefix(
+ char * split_prefix,
+ size_t maxlen,
+ const char * split_path,
+ int32_t split_no,
+ int32_t split_count) {
+
+ const std::string str_split_path(split_path);
+
char postfix[32];
- snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
- std::string str_postfix(postfix);
-
- // check if split_prefix ends with postfix
- int size_prefix = str_split_path.size() - str_postfix.size();
- if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
- snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
- return size_prefix;
+ snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);
+
+ const std::string str_postfix(postfix);
+ if (str_split_path.size() <= str_postfix.size()) {
+ return 0;
+ }
+
+ const size_t size_prefix = str_split_path.size() - str_postfix.size();
+
+ if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) {
+ const size_t copy_len = std::min(size_prefix + 1, maxlen);
+ snprintf(split_prefix, copy_len, "%s", split_path);
+
+ return (int32_t) size_prefix;
}
return 0;
// Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
- bool use_direct_io; // use direct io, takes precedence over use_mmap
+ bool use_direct_io; // use direct io, takes precedence over use_mmap when supported
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
// - returns true if the parameters could be successfully modified to fit device memory
// - this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_default_model_params are modified
+ // with the exception of the context size, which is modified if and only if it is equal to 0
LLAMA_API enum llama_params_fit_status llama_params_fit(
const char * path_model,
struct llama_model_params * mparams,
// Manually free a LoRA adapter
// NOTE: loaded adapters will be freed when the associated model is deleted
- LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
+ LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
+ "adapters are now freed together with the associated model");
// Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
// [EXPERIMENTAL]
// attach a sampler to the context
// note: prefer initializing the context with llama_context_params.samplers when possible
- // note: changing the samplers of a context can cause graph reallocations and degraded performance
LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
// mirror of llama_sampler_i:
const char ** seq_breakers,
size_t num_breakers);
+ /// adaptive-p: select tokens near a configurable target probability over time.
+ ///
+ /// the adaptive-p sampler transforms the token probability distribution to favor tokens
+ /// that fall near a user-configurable probability target.
+ ///
+ /// internally, the sampler maintains an exponential moving average (EMA) of the *ORIGINAL*
+ /// probabilities of the tokens selected at each sampling step, and uses it to compute an
+ /// adapted target probability for the next step, thus maintaining the desired target
+ /// probability over time.
+ ///
+ /// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
+ /// in the sampler chain (like mirostat, dist, greedy).
+ ///
+ /// only mild truncation before this sampler is recommended. we suggest applying min-p
+ /// before adaptive-p as the only other active sampler in the chain.
+ ///
+ /// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
+ /// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
+ /// @param seed RNG seed
+ ///
+ /// ref: https://github.com/ggml-org/llama.cpp/pull/17927
+ ///
+ LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
+ float target,
+ float decay,
+ uint32_t seed);
+
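Given the recommendation above, a typical chain pairs a mild min-p truncation with adaptive-p at the end. A sketch using the existing sampler-chain API (the parameter values are illustrative, not recommendations):

llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

// mild truncation first
llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.05f, 1));

// adaptive-p selects the token, so it must be the last sampler in the chain
llama_sampler_chain_add(chain, llama_sampler_init_adaptive_p(0.6f, 0.9f, LLAMA_DEFAULT_SEED));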
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
int32_t n_vocab,
int32_t n_logit_bias,
/// @details Build a split GGUF final path for this chunk.
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 1, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" (split_no is zero-based)
// Returns the split_path length.
- LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+ LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count);
/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
/// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 1, 4) => split_prefix = "/models/ggml-model-q4_0"
// Returns the split_prefix length.
- LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+ LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count);
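A round-trip sketch of the two calls; note that split_no is zero-based while the generated file names are one-based:

char split_path[128];
char split_prefix[128];

// 0-based split_no = 1 -> "-00002-of-00004.gguf"
llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 1, 4);
// split_path == "/models/ggml-model-q4_0-00002-of-00004.gguf"

llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 1, 4);
// split_prefix == "/models/ggml-model-q4_0"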
// Print system information
LLAMA_API const char * llama_print_system_info(void);
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
- bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
-
- const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+ const bool is_mla = hparams.is_mla();
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
- const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
- const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
+ const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
const int64_t n_embd_head_qk_rope = hparams.n_rot;
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv();
+ auto * inp_attn_kv = !is_mla ? build_attn_inp_kv() : nullptr;
+ auto * inp_attn_k = is_mla ? build_attn_inp_k() : nullptr;
ggml_tensor * inp_out_ids = build_inp_out_ids();
// self_attention
{
ggml_tensor * q = NULL;
+
+ const bool is_lite = model.layers[il].wq;
+
if (!is_lite) {
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
cb(q, "q", il);
- // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
- // note: rope must go first for in-place context shifting in build_rope_shift()
- ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
+ // {kv_lora_rank + n_embd_head_qk_rope, n_head, n_tokens}
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0);
cb(Qcur, "Qcur", il);
kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
cb(kv_cmpr, "kv_cmpr_reshape", il);
- // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
- ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
+ // {kv_lora_rank + n_embd_head_qk_rope, 1, n_tokens}
+ ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0);
cb(Kcur, "Kcur", il);
// {kv_lora_rank, 1, n_tokens}
}
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
- cur = build_attn(inp_attn,
+ cur = build_attn(inp_attn_k,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
} else {
Vcur = ggml_cont(ctx0, Vcur);
cb(Vcur, "Vcur_cont", il);
- // note: rope must go first for in-place context shifting in build_rope_shift()
- ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_nope, q_pe, 0);
cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
+ ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
cb(Kcur, "Kcur", il);
if (inp_attn_scale) {
}
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
- cur = build_attn(inp_attn,
+ cur = build_attn(inp_attn_kv,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
--- /dev/null
+#include "models.h"
+
+llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_k;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn_iswa = build_attn_inp_kv_iswa();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+ for (int il = 0; il < n_transformer_layers; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // use RoPE for SWA layers
+ const bool is_local_layer = hparams.is_swa(il);
+
+ // norm
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+
+ if (is_local_layer) {
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
+ freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+ }
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn_iswa,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ cb(cur, "attn_out", il);
+ }
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // norm
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ // dense branch
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ true, hparams.expert_weights_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // FFN shared expert
+ {
+ ggml_tensor * ffn_shexp =
+ build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ cur = inpL;
+
+ // final norm
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
// equivalent to get_per_layer_inputs() in python code
// output shape: [n_embd_altup, n_layer, n_tokens]
ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
- auto inp = std::make_unique<llm_graph_input_embd>();
+ auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
ggml_tensor * inp_per_layer;
if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
ggml_set_input(inp->tokens);
- res->t_tokens = inp->tokens;
+ res->t_inp_tokens = inp->tokens;
inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
res->add_input(std::move(inp));
} else {
// Vision embedding path: use padding token (ID=0) embedding
+ // TODO: verify if this is the correct behavior in transformers implementation
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
- // Extract and dequantize padding token embedding (column 0)
- ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
- ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size);
- inp_per_layer = ggml_cpy(ctx0, padding_q, padding_f32);
+ // Extract and dequantize padding token embedding (row 0)
+ ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
+ inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
// Reshape to [n_embd_altup, n_layer, 1]
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
ggml_tensor * cur;
llm_build_exaone(const llama_model & model, const llm_graph_params & params);
};
+struct llm_build_exaone_moe : public llm_graph_context {
+ llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params);
+};
+
struct llm_build_falcon : public llm_graph_context {
llm_build_falcon(const llama_model & model, const llm_graph_params & params);
};
const llama_model & model,
const int64_t n_embd_head,
const int il) {
- // compute Q and K and (optionally) RoPE them
+ // compute Q and K
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
ggml_tensor * cur;
llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
- const int64_t n_embd = hparams.n_embd;
+
+ const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
- std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
-
- if (ubatch.embd) {
- // Image input: split main embd and deepstack embds
- ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
- for (size_t i = 0; i < n_deepstack_layers; i++) {
- deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
- }
- inpL = inpL_main;
- }
-
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
- if (ubatch.embd && (size_t)il < n_deepstack_layers) {
- cur = ggml_add(ctx0, cur, deepstack_features[il]);
+ if (il < (int) n_deepstack_layers) {
+ ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float));
+ cur = ggml_add(ctx0, cur, ds);
cb(cur, "deepstack_out", il);
}
llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
- const int64_t n_embd = hparams.n_embd;
+
+ const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
- std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
-
- if (ubatch.embd) {
- // Image input: split main embd and deepstack embds
- ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
- for (size_t i = 0; i < n_deepstack_layers; i++) {
- deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
- }
- inpL = inpL_main;
- }
-
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
- if (ubatch.embd && (size_t)il < n_deepstack_layers) {
- cur = ggml_add(ctx0, cur, deepstack_features[il]);
+ if (il < (int) n_deepstack_layers) {
+ ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float));
+ cur = ggml_add(ctx0, cur, ds);
cb(cur, "deepstack_out", il);
}