#include <map>
#include <set>
+#include <vector>
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize
{ LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
{ LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
+ { LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" },
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
{ LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
+ { LLM_KV_ATTENTION_KEY_LENGTH_SWA, "%s.attention.key_length_swa" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH_SWA, "%s.attention.value_length_swa" },
{ LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" },
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+ { LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+ { LLM_TENSOR_FFN_LATENT_DOWN, "blk.%d.ffn_latent_down" },
+ { LLM_TENSOR_FFN_LATENT_UP, "blk.%d.ffn_latent_up" },
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_CLS_OUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
+ LLM_TENSOR_FFN_LATENT_DOWN,
+ LLM_TENSOR_FFN_LATENT_UP,
// MoE shared expert layer
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
{LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+ // Nemotron 3 Super
+ {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
return name;
}
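+// return every architecture registered in LLM_ARCH_NAMES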
+std::vector<llm_arch> llm_arch_all() {
+ std::vector<llm_arch> ret;
+ ret.reserve(LLM_ARCH_NAMES.size());
+ for (const auto & [arch, _] : LLM_ARCH_NAMES) {
+ ret.push_back(arch);
+ }
+ return ret;
+}
+
const char * llm_arch_name(llm_arch arch) {
auto it = LLM_ARCH_NAMES.find(arch);
if (it == LLM_ARCH_NAMES.end()) {
#include <string>
#include <set>
+#include <vector>
//
// gguf constants (sync with gguf.py)
LLM_KV_EXPERT_GROUP_SCALE,
LLM_KV_EXPERTS_PER_GROUP,
LLM_KV_MOE_EVERY_N_LAYERS,
+ LLM_KV_MOE_LATENT_SIZE,
LLM_KV_NEXTN_PREDICT_LAYERS,
LLM_KV_NUM_DEEPSTACK_LAYERS,
LLM_KV_POOLING_TYPE,
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+ LLM_KV_ATTENTION_KEY_LENGTH_SWA,
+ LLM_KV_ATTENTION_VALUE_LENGTH_SWA,
LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
LLM_KV_ATTENTION_INDEXER_TOP_K,
LLM_KV_ROPE_DIMENSION_COUNT,
+ LLM_KV_ROPE_DIMENSION_COUNT_SWA,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_FREQ_BASE_SWA,
LLM_TENSOR_FFN_GATE_CHEXPS,
LLM_TENSOR_FFN_UP_CHEXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
+ LLM_TENSOR_FFN_LATENT_DOWN,
+ LLM_TENSOR_FFN_LATENT_UP,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_LAYER_OUT_NORM,
ggml_op op;
};
+std::vector<llm_arch> llm_arch_all();
+
const char * llm_arch_name(llm_arch arch);
llm_arch llm_arch_from_string(const std::string & name);
clear();
split_reset();
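+    // each token can carry more than one position (e.g. M-RoPE), so the pos buffer holds n_pos_per_embd entries per token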
+ const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd;
+
auto udata = std::make_shared<llama_ubatch::data_t>();
udata->token .resize(n_tokens);
udata->embd .clear();
- udata->pos .resize(n_tokens);
+ udata->pos .resize(n_pos_all);
udata->n_seq_id .resize(n_tokens);
udata->seq_id .resize(n_tokens);
udata->seq_id_unq.resize(0);
#include "llama-memory.h"
#include "llama-mmap.h"
#include "llama-model.h"
+#include "llama-ext.h"
#include <cinttypes>
#include <cmath>
cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
cparams.auto_fa = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
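+    // fused Gated Delta Net: start enabled and resolve actual per-device support during graph reservation below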
+ cparams.fused_gdn_ar = true;
+ cparams.fused_gdn_ch = true;
+ cparams.auto_fgdn = true;
+
// with causal attention, the batch size is limited by the context size
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
cparams.op_offload = params.op_offload;
cparams.kv_unified = params.kv_unified;
- // intialized later
+ // initialized later
cparams.pipeline_parallel = false;
{
if (cparams.pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
+
+ if (!graph_reuse_disable) {
+ // TODO: figure out a way to make graph reuse work with pipeline parallelism
+ // ref: https://github.com/ggml-org/llama.cpp/pull/20463
+ LLAMA_LOG_WARN("%s: graph reuse is currently not compatible with pipeline parallelism - disabling\n", __func__);
+
+ graph_reuse_disable = true;
+ }
}
sched_reserve();
if (cparams.auto_fa) {
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
if (!gf) {
- throw std::runtime_error("failed to split graph for Flash Attention check");
+ throw std::runtime_error("failed to reserve graph for Flash Attention check");
}
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
continue;
}
- ggml_backend_dev_t device_fa = ggml_backend_get_device(
- ggml_backend_sched_get_tensor_backend(sched.get(), n));
+ ggml_backend_dev_t device_fa = ggml_backend_get_device(ggml_backend_sched_get_tensor_backend(sched.get(), n));
// TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
break;
}
}
+
if (fa_device_mismatch) {
cparams.flash_attn = false;
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
cparams.auto_fa = false;
}
+ if (cparams.auto_fgdn) {
+ LLAMA_LOG_INFO("%s: resolving fused Gated Delta Net support:\n", __func__);
+
+ if (cparams.fused_gdn_ar) {
+ auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
+ if (!gf) {
+ throw std::runtime_error("failed to reserve graph for fused Gated Delta Net check (autoregressive)");
+ }
+
+ const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FGDN_AR) + 1;
+ bool gdn_device_mismatch = false;
+ for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+ ggml_tensor * n = ggml_graph_node(gf, i);
+ if (n->op != GGML_OP_GATED_DELTA_NET) {
+ continue;
+ }
+ ggml_backend_dev_t device_gdn = ggml_backend_get_device(ggml_backend_sched_get_tensor_backend(sched.get(), n));
+
+ GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FGDN_AR "-", prefix_len) == 0);
+ const int il = std::stoi(n->name + prefix_len);
+ ggml_backend_dev_t device_kv = model.dev_layer(il);
+ if (device_gdn != device_kv) {
+ LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the fused Gated Delta Net tensor "
+ "is assigned to device %s (usually due to missing support)\n",
+ __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_gdn));
+ gdn_device_mismatch = true;
+ break;
+ }
+ }
+
+ if (gdn_device_mismatch) {
+ cparams.fused_gdn_ar = false;
+ LLAMA_LOG_WARN("%s: fused Gated Delta Net (autoregressive) not supported, set to disabled\n", __func__);
+ } else {
+ LLAMA_LOG_INFO("%s: fused Gated Delta Net (autoregressive) enabled\n", __func__);
+ }
+ }
+
+ if (cparams.fused_gdn_ch) {
+            // use more than one token in the batch per sequence in order to take the chunked path
+ // note: n_outputs must match n_tokens for embedding models with mean/rank pooling,
+ // because build_pooling creates inp_mean with shape [n_tokens, n_seqs] and multiplies
+ // it with t_embd which is reduced to [n_outputs, ...] via out_ids. if n_outputs != n_tokens,
+ // the ggml_mul_mat assertion fails. this matches the pp reservation below (line ~553).
+ const uint32_t n_tokens_ch = 16*n_seqs;
+ auto * gf = graph_reserve(n_tokens_ch, n_seqs, n_tokens_ch, mctx.get(), true);
+ if (!gf) {
+ throw std::runtime_error("failed to reserve graph for fused Gated Delta Net check (chunked)");
+ }
+
+ const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FGDN_CH) + 1;
+ bool gdn_device_mismatch = false;
+ for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+ ggml_tensor * n = ggml_graph_node(gf, i);
+ if (n->op != GGML_OP_GATED_DELTA_NET) {
+ continue;
+ }
+ ggml_backend_dev_t device_gdn = ggml_backend_get_device(ggml_backend_sched_get_tensor_backend(sched.get(), n));
+
+ GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FGDN_CH "-", prefix_len) == 0);
+ const int il = std::stoi(n->name + prefix_len);
+ ggml_backend_dev_t device_kv = model.dev_layer(il);
+ if (device_gdn != device_kv) {
+ LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the fused Gated Delta Net tensor "
+ "is assigned to device %s (usually due to missing support)\n",
+ __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_gdn));
+ gdn_device_mismatch = true;
+ break;
+ }
+ }
+
+ if (gdn_device_mismatch) {
+ cparams.fused_gdn_ch = false;
+ LLAMA_LOG_WARN("%s: fused Gated Delta Net (chunked) not supported, set to disabled\n", __func__);
+ } else {
+ LLAMA_LOG_INFO("%s: fused Gated Delta Net (chunked) enabled\n", __func__);
+ }
+ }
+
+ cparams.auto_fgdn = false;
+ }
+
// reserve worst-case graph
int n_splits_pp = -1;
int n_nodes_pp = -1;
bool llama_context::adapters_lora_are_same(llama_adapter_lora ** adapters, size_t n_adapters, float * scales) {
LLAMA_LOG_DEBUG("%s: adapters = %p\n", __func__, (void *) adapters);
- if (n_adapters != loras->size()) {
- return false;
- }
+ // Adapters with a zero scale are never added to `loras`, so also ignore them for the comparison.
+ size_t n_non_zero = 0;
for (size_t i = 0; i < n_adapters; i ++) {
+ if (scales[i] == 0.0f) {
+ continue;
+ }
+ n_non_zero++;
+
auto it = loras->find(adapters[i]);
if (it == loras->end() || it->second != scales[i]) {
}
}
+ if (n_non_zero != loras->size()) {
+ return false;
+ }
+
return true;
}
{
//const auto t_start_us = ggml_time_us();
+ // FIXME this call causes a crash if any model inputs were not used in the graph and were therefore not allocated
res->set_inputs(&ubatch);
//LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
ggml_backend_sched_reset(sched.get());
- // when the scheduler is reset, we cannnot reuse the old graph, so we reset the previous graph result to prevent that
+ // when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that
gf_res_prev->reset();
// store the n_outputs as it is, and restore it afterwards
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
const uint32_t blck_size = ggml_blck_size(params.type_k);
- if (model->hparams.n_embd_head_k % blck_size != 0) {
- LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
- __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
- return nullptr;
+ for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
+ if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
+ LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
+ __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
+ return nullptr;
+ }
}
}
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
const uint32_t blck_size = ggml_blck_size(params.type_v);
- if (model->hparams.n_embd_head_v % blck_size != 0) {
- LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
- __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
- return nullptr;
+ for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
+ if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
+ LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
+ __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
+ return nullptr;
+ }
}
}
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
}
+struct ggml_cgraph * llama_graph_reserve(
+ struct llama_context * ctx,
+ uint32_t n_tokens,
+ uint32_t n_seqs,
+ uint32_t n_outputs) {
+ auto * memory = ctx->get_memory();
+ llama_memory_context_ptr mctx;
+ if (memory) {
+ mctx = memory->init_full();
+ }
+ return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
+}
+
// llama adapter API
int32_t llama_set_adapters_lora(
bool offload_kqv;
bool flash_attn;
bool auto_fa;
+ bool fused_gdn_ar; // use fused gated delta net (autoregressive)
+ bool fused_gdn_ch; // use fused gated delta net (chunked)
+ bool auto_fgdn;
bool no_perf;
bool warmup;
bool op_offload;
--- /dev/null
+#pragma once
+
+#include "llama-context.h"
+#include "ggml.h"
+#include "stdint.h"
+
+// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
+LLAMA_API struct ggml_cgraph * llama_graph_reserve(
+ struct llama_context * ctx,
+ uint32_t n_tokens,
+ uint32_t n_seqs,
+ uint32_t n_outputs);
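+//
+// usage sketch (illustrative, not part of the API): reserve a worst-case graph and walk its nodes;
+// `ctx` is assumed to be a llama_context created by the caller
+//
+//     ggml_cgraph * gf = llama_graph_reserve(ctx, /*n_tokens=*/512, /*n_seqs=*/1, /*n_outputs=*/512);
+//     for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+//         ggml_tensor * node = ggml_graph_node(gf, i);
+//         // inspect node->op, node->name, the assigned backend, etc.
+//     }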
throw std::runtime_error(std::string("expecting an int at ") + pos);
}
const char * int_end = parse_int(pos);
- uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
+ uint64_t min_times = std::stoull(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
uint64_t max_times = UINT64_MAX; // default: no max limit
if (is_digit_char(*pos)) {
const char * int_end = parse_int(pos);
- max_times = std::stoul(std::string(pos, int_end - pos));
+ max_times = std::stoull(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
}
// if there is a grammar, parse it
// rules will be empty (default) if there are parse errors
if (!parser.parse(grammar_str) || parser.rules.empty()) {
- fprintf(stderr, "%s: failed to parse grammar\n", __func__);
+ LLAMA_LOG_ERROR("failed to parse grammar\n");
return nullptr;
}
- // Ensure that there is a "root" node.
- if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
- fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
+ // Ensure that the grammar contains the start symbol
+ if (parser.symbol_ids.find(grammar_root) == parser.symbol_ids.end()) {
+ LLAMA_LOG_ERROR("grammar does not contain a '%s' symbol\n", grammar_root);
return nullptr;
}
continue;
}
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
- LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
+ LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu\n", i);
return nullptr;
}
}
const bool last = (
cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
- (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
+ (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && (arch == LLM_ARCH_QWEN3 || arch == LLM_ARCH_QWEN3VL)) // qwen3 reranking & embedding models use last token
);
for (int i = 0; i < n_tokens; ++i) {
float * data = (float *) cross_kq_mask->data;
for (int i = 0; i < n_tokens; ++i) {
+ GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first");
for (int j = 0; j < n_enc; ++j) {
float f = -INFINITY;
ubatch (params.ubatch),
n_embd (hparams.n_embd),
n_layer (hparams.n_layer),
- n_rot (hparams.n_rot),
+ n_rot (hparams.n_rot()),
n_ctx (cparams.n_ctx),
n_head (hparams.n_head()),
n_head_kv (hparams.n_head_kv()),
- n_embd_head_k (hparams.n_embd_head_k),
+ n_embd_head_k (hparams.n_embd_head_k()),
n_embd_k_gqa (hparams.n_embd_k_gqa()),
- n_embd_head_v (hparams.n_embd_head_v),
+ n_embd_head_v (hparams.n_embd_head_v()),
n_embd_v_gqa (hparams.n_embd_v_gqa()),
n_expert (hparams.n_expert),
n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
ggml_tensor * llm_graph_context::build_lora_mm(
ggml_tensor * w,
- ggml_tensor * cur) const {
+ ggml_tensor * cur,
+ ggml_tensor * w_s) const {
ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
for (const auto & lora : *loras) {
res = ggml_add(ctx0, res, ab_cur);
}
+ if (w_s) {
+ res = ggml_mul(ctx0, res, w_s);
+ }
+
return res;
}
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in,
- ggml_tensor * gate_up_exps) const {
+ ggml_tensor * gate_up_exps,
+ ggml_tensor * up_exps_s,
+ ggml_tensor * gate_exps_s,
+ ggml_tensor * down_exps_s) const {
return build_moe_ffn(
cur,
gate_inp, /* gate_inp_b */ nullptr,
n_expert_used,
type_op,
norm_w,
- scale_w,
w_scale,
gating_op,
il,
probs_in,
- gate_up_exps
+ gate_up_exps,
+ /* gate_up_exps_b */ nullptr,
+ up_exps_s,
+ gate_exps_s,
+ down_exps_s
);
}
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in,
ggml_tensor * gate_up_exps,
- ggml_tensor * gate_up_exps_b) const {
+ ggml_tensor * gate_up_exps_b,
+ ggml_tensor * up_exps_s,
+ ggml_tensor * gate_exps_s,
+ ggml_tensor * down_exps_s) const {
const int64_t n_embd = cur->ne[0];
const int64_t n_tokens = cur->ne[1];
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
}
- if (scale_w) {
+ if (w_scale != 0.0f && w_scale != 1.0f) {
weights = ggml_scale(ctx0, weights, w_scale);
cb(weights, "ffn_moe_weights_scaled", il);
}
cb(gate_up, "ffn_moe_gate_up_biased", il);
}
+ // apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused)
+ if (up_exps_s) {
+ ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
+ s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
+ s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
+ gate_up = ggml_mul(ctx0, gate_up, s);
+ cb(gate_up, "ffn_moe_gate_up_scaled", il);
+ }
+
const int64_t n_ff = gate_up->ne[0] / 2;
cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
cb(cur, "ffn_moe_gate", il);
cb(up, "ffn_moe_up_biased", il);
}
+ // apply per-expert scale2 to up
+ if (up_exps_s) {
+ ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
+ s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
+ s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
+ up = ggml_mul(ctx0, up, s);
+ cb(up, "ffn_moe_up_scaled", il);
+ }
+
if (gate_exps) {
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(cur, "ffn_moe_gate", il);
cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
cb(cur, "ffn_moe_gate_biased", il);
}
+
+ // apply per-expert scale2 to gate
+ if (gate_exps_s) {
+ ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1);
+ s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
+ s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
+ cur = ggml_mul(ctx0, cur, s);
+ cb(cur, "ffn_moe_gate_scaled", il);
+ }
}
const bool has_gate = gate_exps || gate_up_exps;
cb(experts, "ffn_moe_down_biased", il);
}
+ // apply per-expert scale2 to down
+ if (down_exps_s) {
+ ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1);
+ s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
+ s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
+ experts = ggml_mul(ctx0, experts, s);
+ cb(experts, "ffn_moe_down_scaled", il);
+ }
+
if (!weight_before_ffn) {
experts = ggml_mul(ctx0, experts, weights);
cb(cur, "ffn_moe_weighted", il);
// this needs to be 1x1xN for broadcasting
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
ggml_set_input(cur);
+ ggml_set_name(cur, "attn_scale");
res->add_input(std::move(inp));
ggml_tensor * llm_graph_context::build_inp_out_ids() const {
// note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
// but this would make the graph topology depend on the number of output tokens, which can interfere with
- // features that require constant topology such as pipline parallelism
+ // features that require constant topology such as pipeline parallelism
// ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
//if (n_outputs < n_tokens) {
// return nullptr;
if (v_mla) {
#if 0
// v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
- // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
+ // However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient.
cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
cur = ggml_mul_mat(ctx0, v_mla, cur);
#else
}
// softmax for qwen3 reranker
- if (arch == LLM_ARCH_QWEN3) {
+ if (arch == LLM_ARCH_QWEN3 || arch == LLM_ARCH_QWEN3VL) {
cur = ggml_soft_max(ctx0, cur);
}
} break;
ggml_tensor * cur,
int il) const;
- // do mat_mul, while optionally apply lora
+    // do mat_mul, while optionally applying lora and per-tensor scale
ggml_tensor * build_lora_mm(
ggml_tensor * w,
- ggml_tensor * cur) const;
+ ggml_tensor * cur,
+ ggml_tensor * w_s = nullptr) const;
// do mat_mul_id, while optionally applying lora
ggml_tensor * build_lora_mm_id(
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in = nullptr,
- ggml_tensor * gate_up_exps = nullptr) const;
+ ggml_tensor * gate_up_exps = nullptr,
+ ggml_tensor * up_exps_s = nullptr,
+ ggml_tensor * gate_exps_s = nullptr,
+ ggml_tensor * down_exps_s = nullptr) const;
ggml_tensor * build_moe_ffn(
ggml_tensor * cur,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
- bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in = nullptr,
ggml_tensor * gate_up_exps = nullptr,
- ggml_tensor * gate_up_exps_b = nullptr) const;
+ ggml_tensor * gate_up_exps_b = nullptr,
+ ggml_tensor * up_exps_s = nullptr,
+ ggml_tensor * gate_exps_s = nullptr,
+ ggml_tensor * down_exps_s = nullptr) const;
//
// inputs
return n_head/n_head_kv;
}
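+// RoPE dimension count for layer il; SWA layers can use a different value than full-attention layers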
+uint32_t llama_hparams::n_rot(uint32_t il) const {
+ if (il < n_layer) {
+ return is_swa(il) ? n_rot_swa : n_rot_full;
+ }
+
+ GGML_ABORT("fatal error");
+}
+
uint32_t llama_hparams::n_embd_inp() const {
uint32_t n_embd_inp = n_embd;
return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
}
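+// per-layer head sizes; SWA layers can use different K/V head dimensions than full-attention layers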
+uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
+ if (il < n_layer) {
+ return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
+ }
+
+ GGML_ABORT("fatal error");
+}
+
+uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
+ if (il < n_layer) {
+ return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
+ }
+
+ GGML_ABORT("fatal error");
+}
+
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il);
- return n_embd_head_k * n_head_kv;
+ return n_embd_head_k(il) * n_head_kv;
}
uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il);
- return n_embd_head_v * n_head_kv;
+ return n_embd_head_v(il) * n_head_kv;
}
bool llama_hparams::is_n_embd_k_gqa_variable() const {
}
uint32_t llama_hparams::n_embd_head_k_mla() const {
- return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
+ return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k();
}
uint32_t llama_hparams::n_embd_head_v_mla() const {
- return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
+ return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v();
}
bool llama_hparams::has_kv(uint32_t il) const {
uint32_t n_embd;
uint32_t n_layer;
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
- uint32_t n_rot;
- uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
- uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
+ // different head size for full_attention and SWA layers
+ uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+ uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
+ uint32_t n_embd_head_k_swa;
+ uint32_t n_embd_head_v_swa;
+
+ // different RoPE dimensions for full_attention and SWA layers
+ uint32_t n_rot_full;
+ uint32_t n_rot_swa;
+
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla_impl = 0;
uint32_t n_embd_head_v_mla_impl = 0;
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0;
+ uint32_t moe_latent_size = 0;
uint32_t nextn_predict_layers = 0;
float f_norm_eps;
uint32_t n_gqa(uint32_t il = 0) const;
+ uint32_t n_rot(uint32_t il = 0) const;
+
// dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const;
// dimension of output embeddings
uint32_t n_embd_out() const;
+ // dimension of key/value embeddings for each head (per layer)
+ uint32_t n_embd_head_k(uint32_t il = 0) const;
+ uint32_t n_embd_head_v(uint32_t il = 0) const;
+
// dimension of key embeddings across all k-v heads
uint32_t n_embd_k_gqa(uint32_t il = 0) const;
std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
char buf[256];
- snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
+ snprintf(buf, sizeof(buf), "%6" PRId64, ne.at(0));
for (size_t i = 1; i < ne.size(); i++) {
- snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, ne.at(i));
}
return buf;
}
std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
-#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
+#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
+#define LLAMA_TENSOR_NAME_FGDN_AR "__fgdn_ar__"
+#define LLAMA_TENSOR_NAME_FGDN_CH "__fgdn_ch__"
break;
}
- // remeber the position that we found
+ // remember the position that we found
res.push_back(sinfo_new);
// store the old state of the cells in the recovery stack
const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
return ggml_view_4d(ctx, k,
- hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns,
- ggml_row_size(k->type, hparams.n_embd_head_k),
+ hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kv, ns,
+ ggml_row_size(k->type, hparams.n_embd_head_k(il)),
ggml_row_size(k->type, n_embd_k_gqa),
ggml_row_size(k->type, n_embd_k_gqa*kv_size),
ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
if (!v_trans) {
// note: v->nb[1] <= v->nb[2]
return ggml_view_4d(ctx, v,
- hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
- ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
+ hparams.n_embd_head_v(il), hparams.n_head_kv(il), n_kv, ns,
+ ggml_row_size(v->type, hparams.n_embd_head_v(il)), // v->nb[1]
ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
// note: v->nb[1] > v->nb[2]
return ggml_view_4d(ctx, v,
- n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
- ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
+ n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v(il), ns,
+ ggml_row_size(v->type, kv_size*hparams.n_embd_head_v(il)), // v->nb[1]
ggml_row_size(v->type, kv_size), // v->nb[2]
ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
}
for (uint32_t s = 0; s < n_stream; ++s) {
- // bookeeping of the KQ mask cells that could change for other tokens of the same sequence
+ // bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
std::unordered_map<llama_seq_id, uint32_t> seq_srct;
std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
- float freq_scale) const {
+ float freq_scale,
+ uint32_t il) const {
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
- const auto & n_rot = hparams.n_rot;
+ const auto & n_rot = hparams.n_rot(il);
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
// @ngxson : this is a workaround
// for M-RoPE, we want to rotate the whole vector when doing KV shift
auto * ctx = res->get_ctx();
auto * gf = res->get_gf();
- const auto & n_embd_head_k = hparams.n_embd_head_k;
- //const auto & n_embd_head_v = hparams.n_embd_head_v;
-
- const auto & n_rot = hparams.n_rot;
-
- const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
-
auto inp = std::make_unique<llm_graph_input_k_shift>(this);
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
const int64_t n_head_kv = hparams.n_head_kv(il);
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+ const auto n_rot = hparams.n_rot(il);
+ const auto n_embd_head_k = hparams.n_embd_head_k(il);
+ const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
+
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_row_size(layer.k->type, n_embd_k_gqa),
ggml_row_size(layer.k->type, n_embd_nope));
- ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
+ ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);
ggml_build_forward_expand(gf, cur);
}
io.write(&pos, sizeof(pos));
io.write(&n_seq_id, sizeof(n_seq_id));
- // TODO: we also need to save llama_kv_cell_ext when apply_ubatch() support loading it
- // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
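+        // when each embedding has more than one position (M-RoPE), also persist the per-cell extended position data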
+ if (hparams.n_pos_per_embd() > 1) {
+ const llama_kv_cell_ext ext = cells.ext_get(i);
+ io.write(&ext, sizeof(ext));
+ }
for (const auto & seq_id : seq_ids) {
io.write(&seq_id, sizeof(seq_id));
return false;
}
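+        // the extra M-RoPE positions are restored into the second and third n_tokens-sized blocks of ubatch.pos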
+ if (hparams.n_pos_per_embd() > 1) {
+ llama_kv_cell_ext ext;
+ io.read_to(&ext, sizeof(ext));
+
+ ubatch.pos[i + ubatch.n_tokens] = ext.y;
+ ubatch.pos[i + ubatch.n_tokens*2] = ext.x;
+ }
+
// read the sequence id, but directly discard it - we will use dest_seq_id instead
{
llama_seq_id seq_id;
cells.pos_set(i, pos);
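+        // restore the per-cell extended position data (M-RoPE)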
+ if (hparams.n_pos_per_embd() > 1) {
+ llama_kv_cell_ext ext;
+ io.read_to(&ext, sizeof(ext));
+ cells.ext_set(i, ext);
+ }
+
for (uint32_t j = 0; j < n_seq_id; ++j) {
llama_seq_id seq_id;
io.read_to(&seq_id, sizeof(seq_id));
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
- float freq_scale) const;
+ float freq_scale,
+ uint32_t il) const;
ggml_cgraph * build_graph_shift(
llm_graph_result * res,
#include "llama-model-loader.h"
+#include "ggml-alloc.h"
#include "ggml.h"
+#include "gguf.h"
+#include "llama-hparams.h"
#include <algorithm>
#include <array>
#include <cinttypes>
+#include <cstdint>
#include <cstring>
#include <future>
+#include <regex>
static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
+ case LLAMA_FTYPE_MOSTLY_NVFP4: return "NVFP4";
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
- const int kid = gguf_find_key(meta.get(), key.c_str());
+ const int kid = gguf_find_key(metadata, key.c_str());
if (kid < 0) {
if (required) {
}
struct GGUFMeta::ArrayInfo arr_info =
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
result = arr_info.length;
template<typename T>
bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
- const gguf_context * ctx = meta.get();
+ const gguf_context * ctx = metadata;
const int kid = gguf_find_key(ctx, key.c_str());
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
template<typename T, size_t N_MAX>
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
- const gguf_context * ctx = meta.get();
+ const gguf_context * ctx = metadata;
const int kid = gguf_find_key(ctx, key.c_str());
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
const struct llama_model_kv_override * override =
it != kv_overrides.end() ? &it->second : nullptr;
- const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
+ const bool found = GGUFMeta::GKV<T>::set(metadata, key, result, override);
if (required && !found) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
// get array of n <= N_MAX elements, or a single element repeated n times
template<typename T, size_t N_MAX>
bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
- const int kid = gguf_find_key(meta.get(), key.c_str());
+ const int kid = gguf_find_key(metadata, key.c_str());
if (kid < 0) {
if (required) {
throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
}
- if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
+ if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {
struct GGUFMeta::ArrayInfo arr_info =
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
if (n != arr_info.length) {
throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
const std::string key = llm_kv(kid);
- const int id = gguf_find_key(meta.get(), key.c_str());
+ const int id = gguf_find_key(metadata, key.c_str());
if (id < 0) {
if (required) {
}
// throw an error if the type is an array
- if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
+ if (gguf_get_kv_type(metadata, id) == GGUF_TYPE_ARRAY) {
if (required) {
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
}
llama_model_loader::llama_model_loader(
+ struct gguf_context * meta,
+ llama_model_set_tensor_data_t set_tensor_data,
+ void * set_tensor_data_ud,
const std::string & fname,
std::vector<std::string> & splits,
bool use_mmap,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
- const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
+ const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
+ : metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
int trace = 0;
if (getenv("LLAMA_TRACE")) {
trace = atoi(getenv("LLAMA_TRACE"));
tensor_buft_overrides = param_tensor_buft_overrides_p;
- // Load the main GGUF
- struct ggml_context * ctx = NULL;
- struct gguf_init_params params = {
- /*.no_alloc = */ true,
- /*.ctx = */ &ctx,
- };
-
- meta.reset(gguf_init_from_file(fname.c_str(), params));
- if (!meta) {
- throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
- }
-
- get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
- llm_kv = LLM_KV(llm_arch_from_string(arch_name));
-
- files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
- contexts.emplace_back(ctx);
+ if (!fname.empty()) {
+ // Load the main GGUF
+ struct ggml_context * ctx = NULL;
+ struct gguf_init_params params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
- if (use_mmap && use_direct_io) {
- if (files.back()->has_direct_io()) {
- LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
- use_mmap = false;
- } else {
- LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
- use_direct_io = false;
-
- // reopen file using std::fopen for mmap
- files.pop_back();
- files.emplace_back(new llama_file(fname.c_str(), "rb", false));
+ metadata_ptr.reset(gguf_init_from_file(fname.c_str(), params));
+ metadata = metadata_ptr.get();
+ if (metadata == nullptr) {
+ throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
}
- }
- // Save tensors data offset of the main file.
- // For subsidiary files, `meta` tensor data offset must not be used,
- // so we build a unified tensors index for weights.
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- std::string tensor_name = std::string(cur->name);
- // make sure there is no duplicated tensor names
- if (weights_map.find(tensor_name) != weights_map.end()) {
- throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
- }
- n_elements += ggml_nelements(cur);
- n_bytes += ggml_nbytes(cur);
- weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
- }
- uint16_t n_split = 0;
- get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+ get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+ llm_kv = LLM_KV(llm_arch_from_string(arch_name));
- // Load additional GGML contexts
- if (n_split > 1) {
- // make sure the main file is loaded first
- uint16_t idx = 0;
- const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
- get_key(kv_split_no, idx);
- if (idx != 0) {
- throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
- }
+ files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
+ contexts.emplace_back(ctx);
- // generate list of splits if needed
- if (splits.empty()) {
- splits = llama_get_list_splits(fname, idx, n_split);
- }
+ if (use_mmap && use_direct_io) {
+ if (files.back()->has_direct_io()) {
+ LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+ use_mmap = false;
+ } else {
+ LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
+ use_direct_io = false;
- // in case user give a custom list of splits, check if it matches the expected number
- if (n_split != (uint16_t)splits.size()) {
- throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
+ // reopen file using std::fopen for mmap
+ files.pop_back();
+ files.emplace_back(new llama_file(fname.c_str(), "rb", false));
+ }
}
- if (trace > 0) {
- LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
- }
+ // Save tensors data offset of the main file.
+        // For subsidiary files, the `metadata` tensor data offset must not be used,
+ // so we build a unified tensors index for weights.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ std::string tensor_name = std::string(cur->name);
+ // make sure there is no duplicated tensor names
+ if (weights_map.find(tensor_name) != weights_map.end()) {
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+ }
+ n_elements += ggml_nelements(cur);
+ n_bytes += ggml_nbytes(cur);
+ weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
+ }
+ uint16_t n_split = 0;
+ get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+
+ // Load additional GGML contexts
+ if (n_split > 1) {
+ // make sure the main file is loaded first
+ uint16_t idx = 0;
+ const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
+ get_key(kv_split_no, idx);
+ if (idx != 0) {
+ throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
+ }
- // load other splits
- for (idx = 1; idx < n_split; idx++) {
- const char * fname_split = splits[idx].c_str();
+ // generate list of splits if needed
+ if (splits.empty()) {
+ splits = llama_get_list_splits(fname, idx, n_split);
+ }
- struct gguf_init_params split_params = {
- /*.no_alloc = */ true,
- /*.ctx = */ &ctx,
- };
- gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
- if (!ctx_gguf) {
- throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
+            // in case the user gives a custom list of splits, check if it matches the expected number
+ if (n_split != (uint16_t)splits.size()) {
+ throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
}
- // check idx
- {
- const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
- if (kid < 0) {
- throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
+ if (trace > 0) {
+ LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
+ }
+
+ // load other splits
+ for (idx = 1; idx < n_split; idx++) {
+ const char * fname_split = splits[idx].c_str();
+
+ struct gguf_init_params split_params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
+ gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
+ if (!ctx_gguf) {
+ throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
}
- int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
- if (idx_gguf != idx) {
- throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
+
+ // check idx
+ {
+ const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
+ if (kid < 0) {
+ throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
+ }
+ int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
+ if (idx_gguf != idx) {
+ throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
+ }
}
- }
- files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
- contexts.emplace_back(ctx);
+ files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
+ contexts.emplace_back(ctx);
- // Save tensors data offset info of the shard.
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- std::string tensor_name = std::string(cur->name);
- // make sure there is no duplicated tensor names
- if (weights_map.find(tensor_name) != weights_map.end()) {
- throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+ // Save tensors data offset info of the shard.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ std::string tensor_name = std::string(cur->name);
+ // make sure there is no duplicated tensor names
+ if (weights_map.find(tensor_name) != weights_map.end()) {
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+ }
+ n_elements += ggml_nelements(cur);
+ n_bytes += ggml_nbytes(cur);
+ weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
}
- n_elements += ggml_nelements(cur);
- n_bytes += ggml_nbytes(cur);
- weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
}
- }
- get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+ get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
- // sanity check
- {
- const int n_tensors_loaded = (int) weights_map.size();
- if (n_tensors != n_tensors_loaded) {
- throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+ // sanity check
+ {
+ const int n_tensors_loaded = (int) weights_map.size();
+ if (n_tensors != n_tensors_loaded) {
+ throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+ }
}
- }
- LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+ LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+ }
+ } else {
+ get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+ llm_kv = LLM_KV(llm_arch_from_string(arch_name));
}
- n_kv = gguf_get_n_kv(meta.get());
+ n_kv = gguf_get_n_kv(metadata);
n_tensors = weights_map.size();
- fver = (enum llama_fver) gguf_get_version(meta.get());
+ fver = (enum llama_fver) gguf_get_version(metadata);
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
+ case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
default:
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
- const char * name = gguf_get_key(meta.get(), i);
- const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
+ const char * name = gguf_get_key(metadata, i);
+ const enum gguf_type type = gguf_get_kv_type(metadata, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
- ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
+ ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(metadata, i)), gguf_get_arr_n(metadata, i))
: gguf_type_name(type);
- std::string value = gguf_kv_to_str(meta.get(), i);
+ std::string value = gguf_kv_to_str(metadata, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
return cur;
}
-struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
- LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
- const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
+// checks if the weight tensor can be used with the specified buffer type and device
+static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
+ GGML_ASSERT(w != nullptr);
+
+ if (op == GGML_OP_NONE) {
+ return true;
+ }
+
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ ggml_context_ptr ctx_ptr { ggml_init(params) };
+ if (!ctx_ptr) {
+ throw std::runtime_error(format("failed to create ggml context"));
+ }
+ ggml_context * ctx = ctx_ptr.get();
+
+ ggml_tensor * op_tensor = nullptr;
+
+ switch (op) {
+ case GGML_OP_GET_ROWS:
+ {
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+ op_tensor = ggml_get_rows(ctx, w, b);
+ } break;
+ case GGML_OP_MUL_MAT:
+ {
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
+ op_tensor = ggml_mul_mat(ctx, w, b);
+ } break;
+ case GGML_OP_MUL_MAT_ID:
+ {
+ const int n_expert_used = hparams.n_expert_used;
+ GGML_ASSERT(n_expert_used > 0);
+ ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+ ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+ op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
+ } break;
+ case GGML_OP_ADD:
+ {
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+ op_tensor = ggml_add(ctx, a, w);
+ } break;
+ case GGML_OP_ADD_ID:
+ {
+ const int n_expert_used = hparams.n_expert_used;
+ GGML_ASSERT(n_expert_used > 0);
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+ ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+ op_tensor = ggml_add_id(ctx, a, w, c);
+ } break;
+ case GGML_OP_MUL:
+ {
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+ op_tensor = ggml_mul(ctx, a, w);
+ } break;
+ case GGML_OP_DIV:
+ {
+ ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
+ op_tensor = ggml_div(ctx, a, w);
+ } break;
+ case GGML_OP_ROPE:
+ {
+ const int n_embd_head = hparams.n_embd_head_v();
+ const int n_head = hparams.n_head();
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+ op_tensor = ggml_rope_ext(
+ ctx, a, b, w,
+ 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+ );
+
+ } break;
+ case GGML_OP_SSM_CONV:
+ {
+ const int64_t n_seq_tokens = 512;
+ const int64_t n_seqs = 3;
+ ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
+ op_tensor = ggml_ssm_conv(ctx, conv_x, w);
+ } break;
+ case GGML_OP_SSM_SCAN:
+ {
+ // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
+ const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
+ const int64_t n_head = w->ne[1];
+ const int64_t head_dim = hparams.ssm_d_inner / n_head;
+ const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
+ const int64_t n_seq_tokens = 512;
+ const int64_t n_seqs = 3;
+ ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
+ ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
+ ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
+ ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+ ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+ ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
+ op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
+ } break;
+ case GGML_OP_RWKV_WKV6:
+ {
+ // FIXME
+ const int64_t S = 123;
+ const int64_t H = 123;
+ const int64_t n_tokens = 123;
+ const int64_t n_seqs = 123;
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * tf = w;
+ ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
+ op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
+ } break;
+ case GGML_OP_IM2COL:
+ {
+ const int n_embd_inp = hparams.n_embd_inp();
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
+ op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
+ } break;
+ case GGML_OP_SCALE:
+ {
+ op_tensor = ggml_scale(ctx, w, 1.0f);
+ } break;
+ default:
+ GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
+ }
+
+ // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
+ GGML_ASSERT(w->buffer == nullptr);
+ w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+ ggml_backend_buffer_free(w->buffer);
+ w->buffer = nullptr;
+
+ return op_supported;
+}
+
+// find the first buffer type in the list that can use the tensor
+static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t * buft_list) {
+ GGML_ASSERT(!buft_list->empty());
+ for (const auto & cur : *buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
+ return cur_buft;
+ }
+ }
+
+ return nullptr;
+}
+
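+// create a tensor described by tn/ne, choosing the ggml context and buffer type based on the layer the tensor
+// belongs to and the provided buffer type lists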
+struct ggml_tensor * llama_model_loader::create_tensor(
+ const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
+ const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) {
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+ auto it = ctx_map.find(buft);
+ if (it == ctx_map.end()) {
+ // one ggml context per buffer type
+ int max_n_tensors = n_tensors;
+ max_n_tensors += 1; // duplicated output tensor
+ max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors
+ if (files.empty()) {
+ max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses
+ }
+ const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
+
+ ggml_init_params params = {
+ /*.mem_size =*/ ctx_size,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context * ctx = ggml_init(params);
+ if (!ctx) {
+ throw std::runtime_error(format("failed to create ggml context"));
+ }
+
+ ctx_map.emplace(buft, ctx);
+
+ return ctx;
+ }
+ return it->second.get();
+ };
+
+ auto buft_for_tensor = [&](ggml_tensor * t_meta) -> ggml_backend_buffer_type_t {
+ if (!t_meta) {
+ if (flags & TENSOR_NOT_REQUIRED) {
+ return nullptr;
+ }
+ throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
+ }
+
+ // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
+ // the tensor is duplicated
+ // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
+ llm_tensor tn_tensor = tn.tensor;
+ if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && (flags & TENSOR_DUPLICATED)) {
+ tn_tensor = LLM_TENSOR_OUTPUT;
+ }
+
+ llm_tensor_info info;
+ try {
+ info = llm_tensor_info_for(tn_tensor);
+ } catch (const std::out_of_range & e) {
+ throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
+ }
+
+ // skip unused tensors
+ if (info.op == GGML_OP_NONE || (flags & TENSOR_SKIP)) {
+ const size_t nbytes = ggml_nbytes(t_meta);
+ LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
+
+ size_data -= nbytes;
+ n_created++;
+
+ return nullptr;
+ }
+
+ // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
+ ggml_op op;
+ bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
+ if (bias) {
+ if (info.op == GGML_OP_MUL_MAT_ID) {
+ op = GGML_OP_ADD_ID;
+ } else {
+ op = GGML_OP_ADD;
+ }
+ } else {
+ op = info.op;
+ }
+
+ // sanity checks
+ if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
+ if (tn.bid != -1) {
+ GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
+ }
+ } else {
+ if (tn.bid == -1) {
+ GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
+ }
+ }
+
+ // select the buffer type for this tensor
+ const buft_list_t * buft_list;
+ switch (info.layer) {
+ case LLM_TENSOR_LAYER_INPUT:
+ buft_list = buft_list_input;
+ break;
+ case LLM_TENSOR_LAYER_OUTPUT:
+ buft_list = buft_list_output;
+ break;
+ case LLM_TENSOR_LAYER_REPEATING:
+ GGML_ASSERT(buft_list_layer != nullptr);
+ buft_list = buft_list_layer;
+ break;
+ default:
+ GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
+ }
+
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ // check overrides
+ if (tensor_buft_overrides) {
+ std::string tensor_name = tn.str();
+ for (const auto * overrides = tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
+ std::regex pattern(overrides->pattern);
+ if (std::regex_search(tensor_name, pattern)) {
+ if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+ // when overriding to a CPU buffer, consider the extra buffer types
+ buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
+ } else {
+ buft = overrides->buft;
+ }
+
+ LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
+ tensor_name.c_str(),
+ ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
+ ggml_backend_buft_name(buft));
+ break;
+ }
+ }
+ }
+
+ if (!buft) {
+ buft = select_weight_buft(hparams, t_meta, op, buft_list);
+ if (!buft) {
+ throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+ }
+ }
+
+ // avoid using a host buffer when using mmap
+ auto * buft_dev = ggml_backend_buft_get_device(buft);
+ if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!cpu_dev) {
+ throw std::runtime_error("no CPU backend found");
+ }
+ buft = ggml_backend_dev_buffer_type(cpu_dev);
+ }
+
+ if (buft != buft_list->front().second) {
+ if (n_tensors_moved == 0) {
+ first_tensor_moved_name = t_meta->name;
+ first_tensor_moved_type_name = ggml_type_name(t_meta->type);
+ first_moved_from_buft = buft_list->front().second;
+ first_moved_to_buft = buft;
+ }
+ n_tensors_moved++;
+ }
+
+ return buft;
+ };
+
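+ // no backing files: construct the tensor directly from the GGUF metadata (or the requested shape), with no data to load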
+ if (files.empty()) {
+ if (flags & TENSOR_SKIP_IF_VIRTUAL) {
+ return nullptr;
+ }
+ ggml_type type = GGML_TYPE_F32;
+ const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str());
+ if (tid != -1) {
+ type = gguf_get_tensor_type(metadata, tid);
+ }
+
+ // for tensors that are not required, some of the dimensions can be invalid:
+ if (flags & TENSOR_NOT_REQUIRED) {
+ for (size_t dim = 0; dim < ne.size(); dim++) {
+ if (ne.begin()[dim] <= 0) {
+ return nullptr;
+ }
+ }
+ }
+
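+ // build a temporary metadata-only tensor describing the requested shape and type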
+ ggml_tensor t_meta;
+ memset(&t_meta, 0, sizeof(ggml_tensor));
+ t_meta.type = type;
+ for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) {
+ t_meta.ne[dim] = dim < ne.size() ? ne.begin()[dim] : 1;
+ GGML_ASSERT(t_meta.ne[dim] >= 1);
+ t_meta.nb[dim] = dim == 0 ? ggml_type_size(type) : t_meta.ne[dim-1]*t_meta.nb[dim-1];
+ GGML_ASSERT(t_meta.nb[dim] >= 1);
+ }
+ ggml_set_name(&t_meta, tn.str().c_str());
+
+ ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta);
+ GGML_ASSERT(buft != nullptr);
+ ggml_context * ctx = ctx_for_buft(buft);
+ ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta);
+ ggml_set_name(ret, tn.str().c_str());
+ return ret;
+ }
+
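+ // regular path: the tensor metadata comes from the model file(s)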
+ ggml_tensor * t_meta = get_tensor_meta(tn.str().c_str());
+ ggml_backend_buffer_type_t buft = buft_for_tensor(t_meta);
+ if (buft == nullptr) {
+ return nullptr; // a null buft means the tensor is optional or skipped; return a null ggml_tensor *
+ }
+ ggml_context * ctx = ctx_for_buft(buft);
+
+ // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
+ if (flags & TENSOR_DUPLICATED) {
+ ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
+ if (t) {
+ return t;
+ }
+ }
+
+ LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, tn.str().c_str());
+ const struct ggml_tensor * cur = check_tensor_dims(tn.str(), ne, !(flags & TENSOR_NOT_REQUIRED));
if (cur == NULL) {
return NULL;
}
- bool duplicated = flags & TENSOR_DUPLICATED;
+ const bool duplicated = flags & TENSOR_DUPLICATED;
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
ggml_set_name(tensor, ggml_get_name(cur));
}
return tensor;
-
}
struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
if (n_created != n_tensors) {
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
}
+ if (n_tensors_moved > 0) {
+ LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",
+ __func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1,
+ ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
+ }
}
void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
llama_mlocks * lmlocks,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
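+ // without files there is no data to read from disk; let the set_tensor_data callback populate each tensor instead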
+ if (files.empty()) {
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+ set_tensor_data(t, set_tensor_data_ud);
+ }
+ return true;
+ }
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
std::vector<no_init<uint8_t>> read_buf;
#include "llama-impl.h"
#include "llama-arch.h"
+#include "llama-hparams.h"
#include "llama-mmap.h"
#include "ggml-cpp.h"
#include <cstddef>
+#include <cstring>
#include <map>
#include <stdexcept>
#include <unordered_map>
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+// lists of buffer types used for each layer
+using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
+
enum llama_fver {
GGUF_FILE_VERSION_V1 = 1,
GGUF_FILE_VERSION_V2 = 2,
}
};
- static const int TENSOR_NOT_REQUIRED = 1 << 0;
- static const int TENSOR_DUPLICATED = 1 << 1;
- static const int TENSOR_SKIP = 1 << 2;
+ static const int TENSOR_NOT_REQUIRED = 1 << 0;
+ static const int TENSOR_DUPLICATED = 1 << 1;
+ static const int TENSOR_SKIP = 1 << 2;
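+ // skip creating the tensor when the model is loaded without files (metadata-only / virtual mode)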
+ static const int TENSOR_SKIP_IF_VIRTUAL = 1 << 3;
int n_kv = 0;
int n_tensors = 0;
std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
const llama_model_tensor_buft_override * tensor_buft_overrides;
- gguf_context_ptr meta;
+ gguf_context_ptr metadata_ptr;
+ struct gguf_context * metadata; // either metadata_ptr.get() or externally set
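+ // optional callback (with user data) used to fill tensors when loading without files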
+ llama_model_set_tensor_data_t set_tensor_data;
+ void * set_tensor_data_ud;
std::vector<ggml_context_ptr> contexts;
std::string arch_name;
size_t size_data = 0;
std::vector<std::pair<size_t, size_t>> mmaps_used;
+ // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+ struct ggml_backend_buft_comparator {
+ bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+ return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+ }
+ };
+
+ std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
+ // track tensors that had to be moved for debugging:
+ size_t n_tensors_moved = 0;
+ std::string first_tensor_moved_name;
+ std::string first_tensor_moved_type_name;
+ ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
+ ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
+
llama_model_loader(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data,
+ void * set_tensor_data_ud,
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);
+ struct ggml_tensor * create_tensor(
+ const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
+ const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
#include "llama-model.h"
#include "llama-vocab.h"
+#include <cstdint>
#include <string>
-llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
- gguf_ctx = gguf_init_empty();
-}
+llama_model_saver::llama_model_saver(const struct llama_model * model) :
+ gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {}
+
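+// construct around an externally provided gguf context; a context created here is freed in the destructor, an external one is not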
+llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) :
+ gguf_ctx(gguf_ctx == nullptr ? gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {}
llama_model_saver::~llama_model_saver() {
- gguf_free(gguf_ctx);
+ if (gguf_ctx_owned) {
+ gguf_free(gguf_ctx);
+ }
}
void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
template <typename Container>
void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
- const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
+ GGML_ASSERT(model != nullptr || !per_layer);
+ const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size();
GGML_ASSERT(n_values <= value.size());
if (n_values == 0) {
GGML_ABORT("fatal error");
}
}
+// instantiate for external usage:
+template void llama_model_saver::add_kv<std::vector<uint32_t>>(const enum llm_kv, const std::vector<uint32_t> &, const bool);
void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
std::vector<const char *> tmp(value.size());
}
void llama_model_saver::add_kv_from_model() {
- const llama_hparams & hparams = model.hparams;
- const llama_vocab & vocab = model.vocab;
+ const llama_hparams & hparams = model->hparams;
+ const llama_vocab & vocab = model->vocab;
const int32_t n_vocab = vocab.n_tokens();
std::vector<std::string> tokens(n_vocab);
std::vector<float> scores(n_vocab);
std::vector<int32_t> token_types(n_vocab);
- for (int32_t id = 0; id < n_vocab; ++id) {
- const llama_vocab::token_data & token_data = vocab.get_token_data(id);
-
- tokens[id] = token_data.text;
- scores[id] = token_data.score;
-
- switch(token_data.attr) {
- case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
- case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
- case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
- case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
- case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
- case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
- case LLAMA_TOKEN_ATTR_UNDEFINED:
- default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
+ if (vocab.get_type() != LLAMA_VOCAB_TYPE_NONE) {
+ for (int32_t id = 0; id < n_vocab; ++id) {
+ const llama_vocab::token_data & token_data = vocab.get_token_data(id);
+
+ tokens[id] = token_data.text;
+ scores[id] = token_data.score;
+
+ switch(token_data.attr) {
+ case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
+ case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
+ case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
+ case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
+ case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
+ case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
+ case LLAMA_TOKEN_ATTR_UNDEFINED:
+ default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
+ }
}
}
// add_kv(LLM_KV_GENERAL_TYPE, ???);
- add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
+ add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name());
// add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
// add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
- add_kv(LLM_KV_GENERAL_NAME, model.name);
+ add_kv(LLM_KV_GENERAL_NAME, model->name);
// add_kv(LLM_KV_GENERAL_AUTHOR, ???);
// add_kv(LLM_KV_GENERAL_VERSION, ???);
// add_kv(LLM_KV_GENERAL_URL, ???);
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
- add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
- add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
+ add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full);
+ add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full);
+ add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
+ add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
- add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
+ add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full);
+ add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa);
add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
// add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
}
void llama_model_saver::add_tensors_from_model() {
- if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
- add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
+ if (std::string(model->output->name) != std::string(model->tok_embd->name)) {
+ add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output
}
- add_tensor(model.type_embd);
- add_tensor(model.pos_embd);
- add_tensor(model.tok_norm);
- add_tensor(model.tok_norm_b);
- add_tensor(model.output_norm);
- add_tensor(model.output_norm_b);
- add_tensor(model.output);
- add_tensor(model.output_b);
- add_tensor(model.output_norm_enc);
- add_tensor(model.cls);
- add_tensor(model.cls_b);
- add_tensor(model.cls_out);
- add_tensor(model.cls_out_b);
- add_tensor(model.cls_norm);
-
- for (const struct llama_layer & layer : model.layers) {
+ add_tensor(model->type_embd);
+ add_tensor(model->pos_embd);
+ add_tensor(model->tok_norm);
+ add_tensor(model->tok_norm_b);
+ add_tensor(model->output_norm);
+ add_tensor(model->output_norm_b);
+ add_tensor(model->output);
+ add_tensor(model->output_b);
+ add_tensor(model->output_norm_enc);
+ add_tensor(model->cls);
+ add_tensor(model->cls_b);
+ add_tensor(model->cls_out);
+ add_tensor(model->cls_out_b);
+ add_tensor(model->cls_norm);
+
+ for (const struct llama_layer & layer : model->layers) {
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
}
#pragma once
+#include "gguf.h"
#include "llama.h"
#include "llama-arch.h"
struct llama_model_saver {
struct gguf_context * gguf_ctx = nullptr;
- const struct llama_model & model;
+ const bool gguf_ctx_owned;
+ const struct llama_model * model;
const struct LLM_KV llm_kv;
- llama_model_saver(const struct llama_model & model);
+ llama_model_saver(const struct llama_model * model);
+ llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx);
~llama_model_saver();
void add_kv(enum llm_kv key, uint32_t value);
#include "llama-model.h"
+#include "ggml.h"
#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-cparams.h"
#include <algorithm>
#include <cassert>
#include <cfloat>
+#include <cstdint>
#include <cstring>
#include <cmath>
#include <functional>
case LLM_TYPE_0_3B: return "0.3B";
case LLM_TYPE_0_5B: return "0.5B";
case LLM_TYPE_0_6B: return "0.6B";
+ case LLM_TYPE_0_8B: return "0.8B";
case LLM_TYPE_1B: return "1B";
case LLM_TYPE_1_2B: return "1.2B";
case LLM_TYPE_1_3B: return "1.3B";
case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_102B_A12B: return "102B.A12B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
+ case LLM_TYPE_120B_A12B: return "120B.A12B";
+ case LLM_TYPE_122B_A10B: return "122B.A10B";
case LLM_TYPE_196B_A11B: return "196B.A11B";
case LLM_TYPE_230B_A10B: return "230B.A10B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
case LLM_TYPE_300B_A47B: return "300B.A47B";
case LLM_TYPE_310B_A15B: return "310B.A15B";
case LLM_TYPE_355B_A32B: return "355B.A32B";
+ case LLM_TYPE_397B_A17B: return "397B.A17B";
case LLM_TYPE_744B_A40B: return "744B.A40B";
case LLM_TYPE_E2B: return "E2B";
case LLM_TYPE_E4B: return "E4B";
return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}
-// checks if the weight tensor can be used with the specified buffer type and device
-static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
- GGML_ASSERT(w != nullptr);
-
- if (op == GGML_OP_NONE) {
- return true;
- }
-
- ggml_init_params params = {
- /*.mem_size =*/ ggml_tensor_overhead()*8,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
- ggml_context_ptr ctx_ptr { ggml_init(params) };
- if (!ctx_ptr) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
- ggml_context * ctx = ctx_ptr.get();
-
- ggml_tensor * op_tensor = nullptr;
-
- switch (op) {
- case GGML_OP_GET_ROWS:
- {
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
- op_tensor = ggml_get_rows(ctx, w, b);
- } break;
- case GGML_OP_MUL_MAT:
- {
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
- op_tensor = ggml_mul_mat(ctx, w, b);
- } break;
- case GGML_OP_MUL_MAT_ID:
- {
- int n_expert_used = hparams.n_expert_used;
- ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
- ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
- op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
- } break;
- case GGML_OP_ADD:
- {
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
- op_tensor = ggml_add(ctx, a, w);
- } break;
- case GGML_OP_ADD_ID:
- {
- int n_expert_used = hparams.n_expert_used;
- ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
- ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
- op_tensor = ggml_add_id(ctx, a, w, c);
- } break;
- case GGML_OP_MUL:
- {
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
- op_tensor = ggml_mul(ctx, a, w);
- } break;
- case GGML_OP_DIV:
- {
- ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
- op_tensor = ggml_div(ctx, a, w);
- } break;
- case GGML_OP_ROPE:
- {
- int n_embd_head = hparams.n_embd_head_v;
- int n_head = hparams.n_head();
- ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
- op_tensor = ggml_rope_ext(
- ctx, a, b, w,
- 0, 0, 0, 0, 0,
- 0, 0, 0, 0
- );
-
- } break;
- case GGML_OP_SSM_CONV:
- {
- const int64_t n_seq_tokens = 512;
- const int64_t n_seqs = 3;
- ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
- op_tensor = ggml_ssm_conv(ctx, conv_x, w);
- } break;
- case GGML_OP_SSM_SCAN:
- {
- // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
- const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
- const int64_t n_head = w->ne[1];
- const int64_t head_dim = hparams.ssm_d_inner / n_head;
- const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
- const int64_t n_seq_tokens = 512;
- const int64_t n_seqs = 3;
- ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
- ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
- ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
- ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
- ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
- ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
- op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
- } break;
- case GGML_OP_RWKV_WKV6:
- {
- // FIXME
- const int64_t S = 123;
- const int64_t H = 123;
- const int64_t n_tokens = 123;
- const int64_t n_seqs = 123;
- ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * tf = w;
- ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
- op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
- } break;
- case GGML_OP_IM2COL:
- {
- const int n_embd_inp = hparams.n_embd_inp();
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
- op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
- } break;
- case GGML_OP_SCALE:
- {
- op_tensor = ggml_scale(ctx, w, 1.0f);
- } break;
- default:
- GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
- }
-
- // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
- GGML_ASSERT(w->buffer == nullptr);
- w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
- ggml_backend_buffer_free(w->buffer);
- w->buffer = nullptr;
-
- return op_supported;
-}
-
-// lists of buffer types used for each layer
-using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
-
-// find the first buffer type in the list that can use the tensor
-static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
- GGML_ASSERT(!buft_list.empty());
- for (const auto & cur : buft_list) {
- ggml_backend_dev_t cur_dev = cur.first;
- ggml_backend_buffer_type_t cur_buft = cur.second;
- if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
- return cur_buft;
- }
- }
-
- return nullptr;
-}
-
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
buft_list_t buft_list;
}
void llama_model::load_hparams(llama_model_loader & ml) {
- const gguf_context * ctx = ml.meta.get();
+ const gguf_context * ctx = ml.metadata;
// get metadata as string
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
// gpt-neox n_rot = rotary_pct * (n_embd / n_head)
// gpt-j n_rot = rotary_dim
- hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+ hparams.n_embd_head_k_full = hparams.n_embd / hparams.n_head();
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full, false);
- hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+ hparams.n_embd_head_v_full = hparams.n_embd / hparams.n_head();
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full, false);
// sanity check for n_rot (optional)
- hparams.n_rot = hparams.n_embd_head_k;
+ hparams.n_rot_full = hparams.n_embd_head_k_full;
- ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full, false);
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
- if (hparams.n_rot != hparams.n_embd_head_k) {
- throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
+ if (hparams.n_rot_full != hparams.n_embd_head_k_full) {
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot_full, hparams.n_embd_head_k_full));
}
}
} else {
- hparams.n_rot = 0;
- hparams.n_embd_head_k = 0;
- hparams.n_embd_head_v = 0;
+ hparams.n_rot_full = 0;
+ hparams.n_embd_head_k_full = 0;
+ hparams.n_embd_head_v_full = 0;
+ }
+
+ // head size and n_rot for SWA layers
+ {
+ hparams.n_embd_head_k_swa = hparams.n_embd_head_k_full;
+ hparams.n_embd_head_v_swa = hparams.n_embd_head_v_full;
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa, false);
+
+ hparams.n_rot_swa = hparams.n_rot_full;
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
}
// for differentiating model types
hparams.n_attn_temp_floor_scale = 8192;
hparams.f_attn_temp_scale = 0.1f;
hparams.f_attn_temp_offset = 1.0f;
- hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
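+ // the default period can be overridden by the GGUF sliding-window pattern key when present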
+ uint32_t swa_period = 4; // pattern: 3 chunked - 1 full
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
case LLM_ARCH_AFMOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
// Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
if (hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
case LLM_ARCH_BERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
{
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
- uint32_t swa_period = 3;
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
-
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ uint32_t swa_period = 3;
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period, true);
} else {
}
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case LLM_ARCH_JINA_BERT_V2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
hparams.f_max_alibi_bias = 8.0f;
case LLM_ARCH_JINA_BERT_V3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case LLM_ARCH_NOMIC_BERT_MOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
case LLM_ARCH_NEO_BERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
if (hparams.n_layer == 28) {
type = LLM_TYPE_250M;
case LLM_ARCH_EUROBERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
if (hparams.n_layer == 12) {
type = LLM_TYPE_SMALL; // 0.2B
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
- ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
switch (hparams.n_layer) {
case 32: type = LLM_TYPE_7B; break;
break;
default: type = LLM_TYPE_UNKNOWN;
}
-
- // Load attention parameters
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
} break;
case LLM_ARCH_PLAMO3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
- uint32_t swa_period = 8;
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ uint32_t swa_period = 8;
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period);
} else {
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096; // default value of gemma 2
- hparams.set_swa_pattern(2);
+ uint32_t swa_period = 2;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.attn_soft_cap = true;
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
hparams.f_attention_scale = type == LLM_TYPE_27B
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
- : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
} break;
case LLM_ARCH_GEMMA3:
{
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(6);
+ uint32_t swa_period = 6;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
hparams.f_attention_scale = type == LLM_TYPE_27B
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
- : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
} break;
case LLM_ARCH_GEMMA3N:
{
+ uint32_t swa_period = 5;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(5);
+ hparams.set_swa_pattern(swa_period);
hparams.n_layer_kv_from_start = 20;
hparams.f_attention_scale = 1.0f;
case LLM_ARCH_GEMMA_EMBEDDING:
{
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
- hparams.set_swa_pattern(6);
+ uint32_t swa_period = 6;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.causal_attn = false; // embeddings do not use causal attention
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
//applied only if model converted with --sentence-transformers-dense-modules
ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
case 24: type = LLM_TYPE_0_3B; break;
default: type = LLM_TYPE_UNKNOWN;
}
- hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
} break;
case LLM_ARCH_STARCODER2:
}
switch (hparams.n_layer) {
- // TODO: Jamba layers are a bit heterogenous, so naming this is hard.
+ // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
case 12: // 900M 8x???M
case 32: // 51B 16x?B
default: type = LLM_TYPE_UNKNOWN;
} break;
case LLM_ARCH_COMMAND_R:
{
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
case 40: type = LLM_TYPE_35B; break;
case LLM_ARCH_COHERE2:
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
case LLM_ARCH_DEEPSEEK:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
switch (hparams.n_ff_exp) {
case 1408: type = LLM_TYPE_16B; break;
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
if (!is_lite) {
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
}
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
// Expert gating function (GLM-4.5 uses sigmoid)
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
// deepseek MLA parameters
case LLM_ARCH_JAIS:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
switch (hparams.n_layer) {
case 24: type = LLM_TYPE_1_3B; break;
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_MOE_LATENT_SIZE, hparams.moe_latent_size, false);
switch (hparams.n_layer) {
case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
case 56: type = LLM_TYPE_9B; break;
+ case 88: type = LLM_TYPE_120B_A12B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
if (hparams.n_layer == 64) { // 32B
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 128;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
- ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
- ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
- ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, false);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
+ ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, false);
// Granite uses rope_finetuned as a switch for rope, so default to true
bool rope_finetuned = true;
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
- ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
+ ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false);
switch (hparams.n_layer) {
case 32: type = LLM_TYPE_7B; break;
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
} break;
case LLM_ARCH_BAILINGMOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
switch (hparams.n_layer) {
case LLM_ARCH_BAILINGMOE2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
case LLM_ARCH_DOTS1:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
switch (hparams.n_layer) {
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
}
switch (hparams.n_layer) {
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
switch (hparams.n_layer) {
case 32: type = LLM_TYPE_A13B; break;
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(2);
+ uint32_t swa_period = 2;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
{
ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
- hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.n_swa = 4096;
- hparams.set_swa_pattern(4, true);
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.n_swa = 4096;
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period, true);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
case LLM_ARCH_GROVEMOE:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
+ ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp, false);
ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
}
switch (hparams.n_layer) {
- case 24: type = LLM_TYPE_2B; break;
+ case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break;
+ case 32: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_9B; break;
+ case 64: type = LLM_TYPE_27B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
}
switch (hparams.n_layer) {
- case 28: type = LLM_TYPE_35B_A3B; break;
- case 48: type = LLM_TYPE_80B_A3B; break;
+ case 40: type = LLM_TYPE_35B_A3B; break;
+ case 48: type = LLM_TYPE_122B_A10B; break;
+ case 60: type = LLM_TYPE_397B_A17B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
switch (hparams.n_layer) {
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl);
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl);
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
- ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda);
// MoE parameters - Kimi uses moe_intermediate_size = 1024
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
switch (hparams.n_layer) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ // full_attention layers use only half of the RoPE dimensions
+ hparams.n_rot_full = hparams.n_rot_full / 2;
+
// MoE + SWA parameters
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
}
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
default: type = LLM_TYPE_UNKNOWN;
}
} break;
- default: throw std::runtime_error("unsupported model architecture");
+ default: throw std::runtime_error("unsupported model architecture: " + arch_name());
}
pimpl->n_bytes = ml.n_bytes;
// assign the output layer
pimpl->dev_output = get_layer_buft_list(n_layer);
- // one ggml context per buffer type
- int max_n_tensors = ml.n_tensors;
- max_n_tensors += 1; // duplicated output tensor
- max_n_tensors += n_layer*2; // duplicated rope freq tensors
- const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
-
- // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
- struct ggml_backend_buft_comparator {
- bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
- return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
- }
- };
- std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
-
- auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
- auto it = ctx_map.find(buft);
- if (it == ctx_map.end()) {
- ggml_init_params params = {
- /*.mem_size =*/ ctx_size,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
-
- ggml_context * ctx = ggml_init(params);
- if (!ctx) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
-
- ctx_map.emplace(buft, ctx);
-
- return ctx;
- }
- return it->second.get();
- };
-
- const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
- const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
- const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
+ const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
+ const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+ const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
+ const auto TENSOR_SKIP_IF_VIRTUAL = llama_model_loader::TENSOR_SKIP_IF_VIRTUAL;
// create tensors for the weights
{
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
- const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k();
+ const int64_t n_embd_head_v = hparams.n_embd_head_v();
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_token_types = vocab.n_token_types();
- const int64_t n_rot = hparams.n_rot;
+ const int64_t n_rot = hparams.n_rot();
const int64_t n_expert = hparams.n_expert;
const int64_t n_expert_used = hparams.n_expert_used;
const int64_t n_ctx_train = hparams.n_ctx_train;
throw std::runtime_error("model has expert layers but no expert layers are used");
}
- int n_moved_tensors = 0;
- ggml_tensor * first_moved_tensor = nullptr;
- ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
- ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
-
auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
- ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
-
- if (!t_meta) {
- if (flags & TENSOR_NOT_REQUIRED) {
- return nullptr;
- }
- throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
- }
-
- // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
- // the tensor is duplicated
- // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
- llm_tensor tn_tensor = tn.tensor;
- if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
- tn_tensor = LLM_TENSOR_OUTPUT;
- }
-
- llm_tensor_info info;
- try {
- info = llm_tensor_info_for(tn_tensor);
- } catch (const std::out_of_range & e) {
- throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
- }
-
- // skip unused tensors
- if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
- const size_t nbytes = ggml_nbytes(t_meta);
- LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
-
- ml.size_data -= nbytes;
- ml.n_created++;
-
- return nullptr;
- }
-
- // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
- ggml_op op;
- bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
- if (bias) {
- if (info.op == GGML_OP_MUL_MAT_ID) {
- op = GGML_OP_ADD_ID;
- } else {
- op = GGML_OP_ADD;
- }
- } else {
- op = info.op;
- }
-
- // sanity checks
- if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
- if (tn.bid != -1) {
- GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
- }
- } else {
- if (tn.bid == -1) {
- GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
- }
- }
-
- // select the buffer type for this tensor
- buft_list_t * buft_list;
- switch (info.layer) {
- case LLM_TENSOR_LAYER_INPUT:
- buft_list = pimpl->dev_input.buft_list;
- break;
- case LLM_TENSOR_LAYER_OUTPUT:
- buft_list = pimpl->dev_output.buft_list;
- break;
- case LLM_TENSOR_LAYER_REPEATING:
- buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
- break;
- default:
- GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
- }
-
- ggml_backend_buffer_type_t buft = nullptr;
-
- // check overrides
- if (ml.tensor_buft_overrides) {
- std::string tensor_name = tn.str();
- for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
- std::regex pattern(overrides->pattern);
- if (std::regex_search(tensor_name, pattern)) {
- if (overrides->buft == ggml_backend_cpu_buffer_type()) {
- // when overriding to a CPU buffer, consider the extra buffer types
- buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
- } else {
- buft = overrides->buft;
- }
-
- LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
- tensor_name.c_str(),
- ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
- ggml_backend_buft_name(buft));
- break;
- }
- }
- }
-
- if (!buft) {
- buft = select_weight_buft(hparams, t_meta, op, *buft_list);
- if (!buft) {
- throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
- }
- }
-
- // avoid using a host buffer when using mmap
- auto * buft_dev = ggml_backend_buft_get_device(buft);
- if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- if (!cpu_dev) {
- throw std::runtime_error("no CPU backend found");
- }
- buft = ggml_backend_dev_buffer_type(cpu_dev);
- }
-
- if (buft != buft_list->front().second) {
- n_moved_tensors++;
- if (!first_moved_tensor) {
- first_moved_tensor = t_meta;
- first_moved_from_buft = buft_list->front().second;
- first_moved_to_buft = buft;
- }
- }
-
- ggml_context * ctx = ctx_for_buft(buft);
-
- // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
- if (flags & TENSOR_DUPLICATED) {
- ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
- if (t) {
- return t;
- }
- }
- return ml.create_tensor(ctx, tn, ne, flags);
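+ // buffer-type selection and context management now live in the loader; just forward the relevant buft lists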
+ const buft_list_t * buft_list_layer = tn.bid == -1 ? nullptr : pimpl->dev_layer.at(tn.bid).buft_list;
+ return ml.create_tensor(
+ hparams, &pimpl->cpu_buft_list, pimpl->dev_input.buft_list, pimpl->dev_output.buft_list, buft_list_layer,
+ tn, ne, flags);
};
layers.resize(n_layer);
} break;
case LLM_ARCH_LLAMA4:
{
+ if (n_expert == 0) {
+ throw std::runtime_error(arch_name() + " model cannot have zero experts");
+ }
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
}
for (int i = 0; i < n_layer; ++i) {
- bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
+ const bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
auto & layer = layers[i];
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
if (is_moe_layer) {
- int n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_ff_exp = hparams.n_ff_exp;
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
} break;
case LLM_ARCH_MINICPM3:
{
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t kv_lora_rank = hparams.n_lora_kv;
case LLM_ARCH_GROK:
{
if (n_expert == 0) {
- throw std::runtime_error("Grok model cannot have zero experts");
+ throw std::runtime_error(arch_name() + " model cannot have zero experts");
}
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_JINA_BERT_V3:
{
+ if (n_token_types == 0) {
+ throw std::runtime_error(arch_name() + " model needs to define token type count");
+ }
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ // FIXME test-llama-archs crashes if q_norm is created
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
// attention parameters
- const uint32_t qk_dim = hparams.n_embd_head_k;
- const uint32_t v_dim = hparams.n_embd_head_v;
+ const uint32_t qk_dim = hparams.n_embd_head_k();
+ const uint32_t v_dim = hparams.n_embd_head_v();
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
} break;
case LLM_ARCH_PLAMO3:
{
- const int64_t head_dim_q = hparams.n_embd_head_k;
- const int64_t head_dim_v = hparams.n_embd_head_v;
+ const int64_t head_dim_q = hparams.n_embd_head_k();
+ const int64_t head_dim_v = hparams.n_embd_head_v();
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
} break;
case LLM_ARCH_SEED_OSS:
{
- const uint32_t head_dim = hparams.n_embd_head_k;
+ const uint32_t head_dim = hparams.n_embd_head_k();
const int64_t n_qo_dim = n_head * head_dim;
const int64_t n_kv_dim = n_head_kv * head_dim;
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
+ GGML_ASSERT(n_embd_head_qk_nope >= 1);
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t kv_lora_rank = hparams.n_lora_kv;
} break;
case LLM_ARCH_PLM:
{
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const int64_t kv_lora_rank = hparams.n_lora_kv;
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
} break;
case LLM_ARCH_T5:
layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
// this tensor seems to be unused in HF transformers implementation
- layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+ layer.attn_rel_b_cross = create_tensor(
+ tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t n_ssm_head = hparams.ssm_dt_rank;
const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+ const int64_t moe_n_embd = hparams.moe_latent_size > 0 ? hparams.moe_latent_size : n_embd;
// embeddings
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
// MoE branch
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_latent_down = create_tensor(tn(LLM_TENSOR_FFN_LATENT_DOWN, "weight", i), {n_embd, moe_n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_latent_up = create_tensor(tn(LLM_TENSOR_FFN_LATENT_UP, "weight", i), {moe_n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, moe_n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {moe_n_embd, n_ff_exp, n_expert}, 0);
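// Editor's note: an illustrative sketch (not part of this change) of what the latent
// projection implies for the graph; moe_ffn() below is a stand-in for the existing
// expert evaluation. When moe_latent_size > 0 the experts run in a reduced latent space:
//
//   ggml_tensor * latent = ggml_mul_mat(ctx, layer.ffn_latent_down, cur); // {n_embd} -> {moe_n_embd}
//   latent = moe_ffn(latent, layer.ffn_up_exps, layer.ffn_down_exps /*, ... */); // experts in latent space
//   cur = ggml_mul_mat(ctx, layer.ffn_latent_up, latent); // {moe_n_embd} -> {n_embd}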
// Shared expert branch
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
const int64_t n_ff_exp = hparams.n_ff_exp;
const int64_t n_expert = hparams.n_expert;
const int64_t n_expert_used = hparams.n_expert_used;
- const int64_t n_ff_shexp = hparams.n_ff_shexp;
- const int64_t head_dim = hparams.n_embd_head_k;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp;
+ const int64_t head_dim = hparams.n_embd_head_k();
const int64_t n_qo_dim = n_head * head_dim;
const int64_t n_kv_dim = n_head_kv * head_dim;
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
+ const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
}
} break;
case LLM_ARCH_HUNYUAN_DENSE:
const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
const int64_t ssm_d_conv = hparams.ssm_d_conv;
- // Try loading KDA specific tensors (using SSM_ prefix)
- // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
- // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
- layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
- if (!layer.ssm_q_conv) {
- layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED);
- }
+ if (hparams.is_recurrent(i)) {
+ // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
+ // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
+ layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
+ if (!layer.ssm_q_conv) {
+ layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
+ }
- if (layer.ssm_q_conv) {
// KDA Layer - Conv1d weights may be 3D or 4D
layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
if (!layer.ssm_k_conv) {
// Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
// Note: hparams.n_rot may be 72 (from conversion) but actual is 64
- const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim
+ const int64_t qk_rope_head_dim = hparams.n_rot(); // From config: qk_rope_head_dim
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
// Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED);
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
+ {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
if (!layer.wkv_b) { // MLA KV cache enabled
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0);
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
} break;
case LLM_ARCH_QWEN3NEXT:
{
+ if (n_expert == 0) {
+ throw std::runtime_error(arch_name() + " model cannot have zero experts");
+ }
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
// output
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
+ const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
// Shared experts
layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
}
} break;
case LLM_ARCH_QWEN35MOE:
// ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
uint32_t n_rot_max = 0;
for (int i = 0; i < n_layer; ++i) {
- n_rot_max = std::max(n_rot_max, hparams.n_rot);
+ n_rot_max = std::max(n_rot_max, hparams.n_rot(i));
}
if (n_rot_max == 0) {
n_rot_max = n_rot;
throw std::runtime_error("unknown architecture");
}
- if (n_moved_tensors > 0) {
- LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
- __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
- ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
+ // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2)
+ // this avoids having to add scale loading to every architecture
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // attention weight scales (per-tensor, shape {1})
+ if (!layer.wq_s && layer.wq) {
+ layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.wk_s && layer.wk) {
+ layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.wv_s && layer.wv) {
+ layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.wo_s && layer.wo) {
+ layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.wqkv_s && layer.wqkv) {
+ layer.wqkv_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.wqkv_gate_s && layer.wqkv_gate) {
+ layer.wqkv_gate_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+
+ // dense FFN weight scales (per-tensor, shape {1})
+ if (!layer.ffn_gate_s && layer.ffn_gate) {
+ layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_down_s && layer.ffn_down) {
+ layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_up_s && layer.ffn_up) {
+ layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_gate_shexp_s && layer.ffn_gate_shexp) {
+ layer.ffn_gate_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_down_shexp_s && layer.ffn_down_shexp) {
+ layer.ffn_down_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_up_shexp_s && layer.ffn_up_shexp) {
+ layer.ffn_up_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+
+ // MoE expert weight scales (per-expert, shape {n_expert})
+ if (!layer.ffn_gate_exps_s && layer.ffn_gate_exps) {
+ layer.ffn_gate_exps_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_down_exps_s && layer.ffn_down_exps) {
+ layer.ffn_down_exps_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ffn_up_exps_s && layer.ffn_up_exps) {
+ layer.ffn_up_exps_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+ }
+
+ // recurrent / linear-attention weight scales (per-tensor, shape {1})
+ if (!layer.ssm_out_s && layer.ssm_out) {
+ layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ssm_alpha_s && layer.ssm_alpha) {
+ layer.ssm_alpha_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ if (!layer.ssm_beta_s && layer.ssm_beta) {
+ layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
}
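// Editor's note: an illustrative sketch (not part of this change) of how a loaded
// per-tensor ".scale" is typically consumed, assuming the usual ggml broadcast rules:
//
//   ggml_tensor * cur = ggml_mul_mat(ctx, layer.wq, inp); // quantized matmul
//   if (layer.wq_s) {
//       cur = ggml_mul(ctx, cur, layer.wq_s); // apply the stored scale (shape {1})
//   }
//
// The graph code that applies these scales is outside the hunks shown here.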
}
// create the backend buffers
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
- ctx_buf_maps.reserve(ctx_map.size());
+ ctx_buf_maps.reserve(ml.ctx_map.size());
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
- const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+ const size_t n_max_backend_buffer = ml.ctx_map.size() * ml.files.size();
pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
- for (auto & [buft, ctx_ptr] : ctx_map) {
+ for (auto & [buft, ctx_ptr] : ml.ctx_map) {
ggml_context * ctx = ctx_ptr.get();
// skip contexts without tensors
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full);
LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
- LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
- LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full);
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full);
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+ LLAMA_LOG_INFO("%s: n_embd_head_k_swa = %u\n", __func__, hparams.n_embd_head_k_swa);
+ LLAMA_LOG_INFO("%s: n_embd_head_v_swa = %u\n", __func__, hparams.n_embd_head_v_swa);
+ LLAMA_LOG_INFO("%s: n_rot_swa = %u\n", __func__, hparams.n_rot_swa);
}
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
LLM_TYPE_0_3B,
LLM_TYPE_0_5B,
LLM_TYPE_0_6B,
+ LLM_TYPE_0_8B,
LLM_TYPE_1B,
LLM_TYPE_1_2B,
LLM_TYPE_1_3B,
LLM_TYPE_100B_A6B,
LLM_TYPE_102B_A12B, // Solar-Open
LLM_TYPE_106B_A12B, // GLM-4.5-Air
+ LLM_TYPE_120B_A12B, // Nemotron 3 Super
+ LLM_TYPE_122B_A10B, // Qwen3.5
LLM_TYPE_196B_A11B, // Step3.5-Flash
LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big
LLM_TYPE_310B_A15B, // MiMo-V2-Flash
LLM_TYPE_355B_A32B, // GLM-4.5
+ LLM_TYPE_397B_A17B, // Qwen3.5
LLM_TYPE_744B_A40B, // GLM-5
LLM_TYPE_E2B,
LLM_TYPE_E4B,
struct ggml_tensor * ffn_up_exps_b = nullptr;
struct ggml_tensor * ffn_gate_up_exps_b = nullptr;
+ // ff MoE per-expert weight scales (NVFP4 scale2, one value per expert)
+ struct ggml_tensor * ffn_gate_exps_s = nullptr;
+ struct ggml_tensor * ffn_down_exps_s = nullptr;
+ struct ggml_tensor * ffn_up_exps_s = nullptr;
+
+ // ff MoE latent proj
+ struct ggml_tensor * ffn_latent_down = nullptr;
+ struct ggml_tensor * ffn_latent_up = nullptr;
+
// ff shared expert (shexp)
struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
struct ggml_tensor * ffn_gate_shexp = nullptr;
struct ggml_tensor * rope_freqs = nullptr;
// bitnet scale
- struct ggml_tensor * wq_scale = nullptr;
- struct ggml_tensor * wk_scale = nullptr;
- struct ggml_tensor * wv_scale = nullptr;
- struct ggml_tensor * wo_scale = nullptr;
- struct ggml_tensor * ffn_gate_scale = nullptr;
- struct ggml_tensor * ffn_up_scale = nullptr;
- struct ggml_tensor * ffn_down_scale = nullptr;
+ struct ggml_tensor * wq_s = nullptr;
+ struct ggml_tensor * wk_s = nullptr;
+ struct ggml_tensor * wv_s = nullptr;
+ struct ggml_tensor * wo_s = nullptr;
+ struct ggml_tensor * wqkv_s = nullptr;
+ struct ggml_tensor * wqkv_gate_s = nullptr;
+ struct ggml_tensor * ffn_gate_s = nullptr;
+ struct ggml_tensor * ffn_up_s = nullptr;
+ struct ggml_tensor * ffn_down_s = nullptr;
+ struct ggml_tensor * ffn_gate_shexp_s = nullptr;
+ struct ggml_tensor * ffn_up_shexp_s = nullptr;
+ struct ggml_tensor * ffn_down_shexp_s = nullptr;
+ struct ggml_tensor * ssm_out_s = nullptr;
+ struct ggml_tensor * ssm_alpha_s = nullptr;
+ struct ggml_tensor * ssm_beta_s = nullptr;
// altup & laurel
struct ggml_tensor * per_layer_inp_gate = nullptr;
-#include "llama-quant.h"
+#include "llama.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-model-loader.h"
-#include <algorithm>
#include <cmath>
#include <cstring>
+#include <string>
#include <cinttypes>
#include <fstream>
#include <mutex>
#include <thread>
#include <unordered_map>
-// Quantization types. Changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
+// result of parsing --tensor-type option
+// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
+struct tensor_type_option {
std::string name;
- ggml_type quant = GGML_TYPE_COUNT;
+ ggml_type type = GGML_TYPE_COUNT;
+};
+
+// tensor categorization - used to avoid repeated string matching in quantization logic.
+// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
+enum class tensor_category {
+ TOKEN_EMBD,
+ ATTENTION_Q,
+ ATTENTION_V,
+ ATTENTION_K,
+ ATTENTION_QKV,
+ ATTENTION_KV_B,
+ ATTENTION_OUTPUT,
+ FFN_UP,
+ FFN_GATE,
+ FFN_DOWN,
+ OUTPUT,
+ OTHER
};
static void zeros(std::ofstream & file, size_t n) {
return orig_name;
}
-static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
+static std::string remap_imatrix(const std::string & orig_name, const std::map<int, std::string> & mapped) {
if (mapped.empty()) {
return orig_name;
}
return orig_name;
}
+//
+// helper functions for tensor name matching
+//
+
+static bool tensor_name_match_token_embd(const char * tensor_name) {
+ return std::strcmp(tensor_name, "token_embd.weight") == 0 ||
+ std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0;
+}
+
+static bool tensor_name_match_output_weight(const char * tensor_name) {
+ return std::strcmp(tensor_name, "output.weight") == 0;
+}
+
+//
+// tensor categorization for quantization
+//
+// (this is different from LLM_TN - we want broad categories, not specific tensor names per arch)
+//
+
+static tensor_category tensor_get_category(const std::string & tensor_name) {
+ if (tensor_name_match_output_weight(tensor_name.c_str())) {
+ return tensor_category::OUTPUT;
+ }
+ if (tensor_name_match_token_embd(tensor_name.c_str())) {
+ return tensor_category::TOKEN_EMBD;
+ }
+ if (tensor_name.find("attn_qkv.weight") != std::string::npos) {
+ return tensor_category::ATTENTION_QKV;
+ }
+ if (tensor_name.find("attn_kv_b.weight") != std::string::npos) {
+ return tensor_category::ATTENTION_KV_B;
+ }
+ if (tensor_name.find("attn_v.weight") != std::string::npos) {
+ return tensor_category::ATTENTION_V;
+ }
+ if (tensor_name.find("attn_k.weight") != std::string::npos) {
+ return tensor_category::ATTENTION_K;
+ }
+ if (tensor_name.find("attn_q.weight") != std::string::npos) {
+ return tensor_category::ATTENTION_Q;
+ }
+ if (tensor_name.find("attn_output.weight") != std::string::npos) {
+ return tensor_category::ATTENTION_OUTPUT;
+ }
+ if (tensor_name.find("ffn_up") != std::string::npos) {
+ return tensor_category::FFN_UP;
+ }
+ if (tensor_name.find("ffn_gate") != std::string::npos) {
+ return tensor_category::FFN_GATE;
+ }
+ if (tensor_name.find("ffn_down") != std::string::npos) {
+ return tensor_category::FFN_DOWN;
+ }
+ return tensor_category::OTHER;
+}
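// Editor's illustration of the mapping above (check order matters, e.g. attn_qkv
// is tested before attn_q/attn_k/attn_v):
//
//   tensor_get_category("output.weight")              == tensor_category::OUTPUT
//   tensor_get_category("blk.3.attn_qkv.weight")      == tensor_category::ATTENTION_QKV
//   tensor_get_category("blk.7.ffn_down_exps.weight") == tensor_category::FFN_DOWN  // substring match covers expert variants
//   tensor_get_category("blk.0.attn_norm.weight")     == tensor_category::OTHER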
+
+// check if category is for attention-v-like tensors (more sensitive to quantization)
+static bool category_is_attn_v(tensor_category cat) {
+ return cat == tensor_category::ATTENTION_V ||
+ cat == tensor_category::ATTENTION_QKV ||
+ cat == tensor_category::ATTENTION_KV_B;
+}
+
+//
+// quantization state
+//
+
struct quantize_state_impl {
const llama_model & model;
const llama_model_quantize_params * params;
int i_ffn_gate = 0;
int i_ffn_up = 0;
- int n_k_quantized = 0;
int n_fallback = 0;
bool has_imatrix = false;
- // used to figure out if a model shares tok_embd with the output weight
- bool has_output = false;
+ // used to figure out if a model has tied embeddings (tok_embd shares weights with output)
+ bool has_tied_embeddings = true; // assume tied until we see output.weight
+
+ // tensor type override patterns (compiled once, reused for every tensor)
+ std::vector<std::pair<std::regex, ggml_type>> tensor_type_patterns;
+
+ quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params):
+ model(model), params(params)
+ {
+ // compile regex patterns once - they are expensive
+ if (params->tensor_types) {
+ const auto & tensor_types = *static_cast<const std::vector<tensor_type_option> *>(params->tensor_types);
+ for (const auto & [tname, qtype] : tensor_types) {
+ tensor_type_patterns.emplace_back(std::regex(tname), qtype);
+ }
+ }
+ }
+};
- quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
- : model(model)
- , params(params)
- {}
+// per-tensor metadata, computed in the preliminary loop and used in the main loop
+struct tensor_metadata {
+ ggml_type target_type;
+ tensor_category category;
+ std::string remapped_imatrix_name;
+ bool allows_quantization;
+ bool requires_imatrix;
};
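// Editor's sketch (not part of this change) of the two-pass flow this struct enables;
// the real loops appear further down in llama_model_quantize_impl():
//
//   std::vector<tensor_metadata> metadata(tensors.size());
//   // pass 1: categorize each tensor, pick its target type, note imatrix requirements
//   // pass 2: quantize using the cached metadata[i] - no repeated string matching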
+//
+// dequantization
+//
+
static void llama_tensor_dequantize_impl(
ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
workers.clear();
}
-static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+//
+// do we allow this tensor to be quantized?
+//
+
+static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) {
+ // trivial checks first -- no string ops needed
+ if (params->only_copy) return false;
+
+ // quantize only 2D and 3D tensors (experts)
+ if (ggml_n_dims(tensor) < 2) return false;
+
+ const std::string name = ggml_get_name(tensor);
+
+ // This used to be a regex, but <regex> has an extreme cost to compile times.
+ bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
+
+ // do not quantize norm tensors
+ quantize &= name.find("_norm.weight") == std::string::npos;
+
+ quantize &= params->quantize_output_tensor || name != "output.weight";
+
+ // do not quantize expert gating tensors
+ // NOTE: can't use LLM_TN here because the layer number is not known
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
+ // these are very small (e.g. 4x4)
+ quantize &= name.find("altup") == std::string::npos;
+ quantize &= name.find("laurel") == std::string::npos;
+
+ // these are not too big, so keep them as they are
+ quantize &= name.find("per_layer_model_proj") == std::string::npos;
+
+ // do not quantize positional embeddings and token types (BERT)
+ quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight");
+ quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
+
+ // do not quantize Mamba/Kimi's small conv1d weights
+ // NOTE: can't use LLM_TN here because the layer number is not known
+ quantize &= name.find("ssm_conv1d") == std::string::npos;
+ quantize &= name.find("shortconv.conv.weight") == std::string::npos;
+
+ // do not quantize RWKV's small yet 2D weights
+ quantize &= name.find("time_mix_first.weight") == std::string::npos;
+ quantize &= name.find("time_mix_w0.weight") == std::string::npos;
+ quantize &= name.find("time_mix_w1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_v0.weight") == std::string::npos;
+ quantize &= name.find("time_mix_v1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_v2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_a0.weight") == std::string::npos;
+ quantize &= name.find("time_mix_a1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_a2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_g1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_g2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+ quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
+
+ // do not quantize relative position bias (T5)
+ quantize &= name.find("attn_rel_b.weight") == std::string::npos;
+
+ // do not quantize specific multimodal tensors
+ quantize &= name.find(".position_embd.") == std::string::npos;
+
+ return quantize;
+}
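// Editor's illustration of the filter above (assuming 2D weights and default params;
// the t_* tensor handles are hypothetical placeholders):
//
//   tensor_allows_quantization(params, arch, t_attn_q)    // "blk.0.attn_q.weight"       -> true
//   tensor_allows_quantization(params, arch, t_attn_norm) // "blk.0.attn_norm.weight"    -> false (norm weights stay in full precision)
//   tensor_allows_quantization(params, arch, t_gate_inp)  // "blk.0.ffn_gate_inp.weight" -> false (expert router weights are excluded)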
+
+//
+// tensor type selection
+//
+
+// incompatible tensor shapes are handled here - fall back to a compatible type
+static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tensor * t, const ggml_type target_type) {
+ ggml_type return_type = target_type;
+
+ const int64_t ncols = t->ne[0];
+ const int64_t qk_k = ggml_blck_size(target_type);
+
+ if (ncols % qk_k != 0) { // this tensor's shape is incompatible with this quant
+ LLAMA_LOG_WARN("warning: %-36s - ncols %6" PRId64 " not divisible by %3" PRId64 " (required for type %7s) ",
+ t->name, ncols, qk_k, ggml_type_name(target_type));
+ ++qs.n_fallback;
+
+ switch (target_type) {
+ // types on the left: block size 256
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
+ case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ3_S: // types on the right: block size 32
+ case GGML_TYPE_IQ4_XS: return_type = GGML_TYPE_IQ4_NL; break;
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_TQ1_0:
+ case GGML_TYPE_TQ2_0: return_type = GGML_TYPE_Q4_0; break;
+ case GGML_TYPE_Q4_K: return_type = GGML_TYPE_Q5_0; break;
+ case GGML_TYPE_Q5_K: return_type = GGML_TYPE_Q5_1; break;
+ case GGML_TYPE_Q6_K: return_type = GGML_TYPE_Q8_0; break;
+ default:
+ throw std::runtime_error(format("no tensor type fallback is defined for type %s",
+ ggml_type_name(target_type)));
+ }
+ if (ncols % ggml_blck_size(return_type) != 0) {
+ //
+ // the fallback return type is still not compatible with this tensor!
+ //
+ // most likely, this tensor's first dimension is not divisible by 32.
+ // this is very rare. we can either abort the quantization, or
+ // fall back to F16 / F32.
+ //
+ LLAMA_LOG_WARN("(WARNING: must use F16 due to unusual shape) ");
+ return_type = GGML_TYPE_F16;
+ }
+ LLAMA_LOG_WARN("-> falling back to %7s\n", ggml_type_name(return_type));
+ }
+ return return_type;
+}
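// Worked example (editor's note): a tensor with ne[0] == 2880 cannot use Q4_K
// (block size 256, since 2880 % 256 == 64), so tensor_type_fallback() returns Q5_0
// (block size 32, 2880 % 32 == 0). Only if even the 32-wide fallback does not divide
// ne[0] is the tensor stored as F16.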
+
+// internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch
+static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
const llm_arch arch = qs.model.arch;
- const auto tn = LLM_TN(arch);
auto use_more_bits = [](int i_layer, int n_layers) -> bool {
return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
// with the quantization of the output tensor
- if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
+ if (category == tensor_category::OUTPUT || (qs.has_tied_embeddings && category == tensor_category::TOKEN_EMBD)) {
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
new_type = qs.params->output_tensor_type;
} else {
} else {
new_type = GGML_TYPE_Q8_0;
}
- } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
+ } else if (category == tensor_category::TOKEN_EMBD) {
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
new_type = qs.params->token_embedding_type;
} else {
}
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
- if (name.find("attn_v.weight") != std::string::npos) {
+ if (category_is_attn_v(category)) {
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
++qs.i_attention_wv;
}
- else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+ else if (qs.model.hparams.n_expert == 8 && category == tensor_category::ATTENTION_K) {
new_type = GGML_TYPE_Q4_K;
}
- else if (name.find("ffn_down") != std::string::npos) {
+ else if (category == tensor_category::FFN_DOWN) {
if (qs.i_ffn_down < qs.n_ffn_down/8) {
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
}
++qs.i_ffn_down;
}
- else if (name.find("attn_output.weight") != std::string::npos) {
+ else if (category == tensor_category::ATTENTION_OUTPUT) {
if (qs.model.hparams.n_expert == 8) {
new_type = GGML_TYPE_Q5_K;
} else {
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
}
}
- } else if (name.find("attn_v.weight") != std::string::npos) {
+ } else if (category_is_attn_v(category)) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
new_type = GGML_TYPE_Q8_0;
}
++qs.i_attention_wv;
- } else if (name.find("attn_k.weight") != std::string::npos) {
+ } else if (category == tensor_category::ATTENTION_K) {
if (qs.model.hparams.n_expert == 8) {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
// TODO: explore better strategies
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ2_S;
}
- } else if (name.find("attn_q.weight") != std::string::npos) {
+ } else if (category == tensor_category::ATTENTION_Q) {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
new_type = GGML_TYPE_IQ3_XXS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ2_S;
}
- } else if (name.find("ffn_down") != std::string::npos) {
+ } else if (category == tensor_category::FFN_DOWN) {
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
}
++qs.i_ffn_down;
- } else if (name.find("attn_output.weight") != std::string::npos) {
+ } else if (category == tensor_category::ATTENTION_OUTPUT) {
if (arch != LLM_ARCH_FALCON) {
if (qs.model.hparams.n_expert == 8) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
}
}
- else if (name.find("attn_qkv.weight") != std::string::npos) {
+ else if (category == tensor_category::ATTENTION_QKV) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}
- else if (name.find("ffn_gate") != std::string::npos) {
+ else if (category == tensor_category::FFN_GATE) {
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
}
++qs.i_ffn_gate;
}
- else if (name.find("ffn_up") != std::string::npos) {
+ else if (category == tensor_category::FFN_UP) {
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
return new_type;
}
+// outer wrapper: determine the ggml_type that this tensor should be quantized to
+static ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
+ if (!tensor_allows_quantization(params, qs.model.arch, tensor)) {
+ return tensor->type;
+ }
+ if (params->token_embedding_type < GGML_TYPE_COUNT && tm.category == tensor_category::TOKEN_EMBD) {
+ return params->token_embedding_type;
+ }
+ if (params->output_tensor_type < GGML_TYPE_COUNT && tm.category == tensor_category::OUTPUT) {
+ return params->output_tensor_type;
+ }
+
+ ggml_type new_type = default_type;
+
+ // get more optimal quantization type based on the tensor shape, layer, etc.
+ if (!params->pure && ggml_is_quantized(default_type)) {
+ // if the user provided tensor types - use those
+ bool manual = false;
+ if (!qs.tensor_type_patterns.empty()) {
+ const std::string tensor_name(tensor->name);
+ for (const auto & [pattern, qtype] : qs.tensor_type_patterns) {
+ if (std::regex_search(tensor_name, pattern)) {
+ if (qtype != new_type) {
+ LLAMA_LOG_WARN("%s: %-36s - applying manual override: %s -> %s\n",
+ __func__, tensor_name.c_str(), ggml_type_name(new_type), ggml_type_name(qtype));
+ new_type = qtype;
+ manual = true;
+ break;
+ }
+ }
+ }
+ }
+
+ // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
+ if (!manual) {
+ new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, tm.category);
+ }
+
+ // incompatible tensor shapes are handled here - fall back to a compatible type
+ new_type = tensor_type_fallback(qs, tensor, new_type);
+ }
+
+ return new_type;
+}
+
+//
+// quantization implementation
+//
+
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
if (nthread < 2) {
// single-thread
return new_size;
}
-static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type, const llama_ftype ftype) {
- return (
- dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS ||
- dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S ||
- dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M ||
- ( // Q2_K_S is the worst k-quant type - only allow it without imatrix for token embeddings
- dst_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(t->name, "token_embd.weight") != 0
- )
- );
+//
+// imatrix requirement check
+//
+
+static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type dst_type, const llama_ftype ftype) {
+ if (tensor_name_match_token_embd(tensor_name) || tensor_name_match_output_weight(tensor_name)) {
+ return false;
+ }
+ switch (dst_type) {
+ case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
+ case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_IQ1_M:
+ case GGML_TYPE_IQ1_S:
+ return true;
+ case GGML_TYPE_Q2_K:
+ // as a general rule, the k-type quantizations don't require imatrix data.
+ // the only exception is Q2_K tensors that are part of a Q2_K_S file.
+ return ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S;
+ default:
+ return false;
+ }
}
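// Editor's illustration of the rule above:
//
//   tensor_requires_imatrix("blk.0.ffn_down.weight", GGML_TYPE_IQ2_XS, LLAMA_FTYPE_MOSTLY_IQ2_XS) == true
//   tensor_requires_imatrix("blk.0.ffn_down.weight", GGML_TYPE_Q2_K, LLAMA_FTYPE_MOSTLY_Q2_K) == false
//   tensor_requires_imatrix("blk.0.ffn_down.weight", GGML_TYPE_Q2_K, LLAMA_FTYPE_MOSTLY_Q2_K_S) == true
//   tensor_requires_imatrix("token_embd.weight", GGML_TYPE_IQ2_XS, LLAMA_FTYPE_MOSTLY_IQ2_XS) == false  // embeddings are exempt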
-static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
- ggml_type default_type;
- llama_ftype ftype = params->ftype;
+//
+// given a file type, get the default tensor type
+//
- switch (params->ftype) {
- case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
- case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
- case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
- case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
- case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
- case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
- case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
- case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
+static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
+ switch (ftype) {
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return GGML_TYPE_Q5_0;
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return GGML_TYPE_Q5_1;
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return GGML_TYPE_Q8_0;
+ case LLAMA_FTYPE_MOSTLY_F16: return GGML_TYPE_F16;
+ case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16;
+ case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32;
- case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;
+ case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4;
// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
- case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;
- case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break;
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return GGML_TYPE_Q2_K;
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: return GGML_TYPE_IQ3_S;
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return GGML_TYPE_Q3_K;
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return GGML_TYPE_Q4_K;
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break;
- case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break;
- case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break;
- case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
- case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
- case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
- case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
- case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
- case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
- case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return GGML_TYPE_Q5_K;
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return GGML_TYPE_Q6_K;
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: return GGML_TYPE_TQ1_0;
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: return GGML_TYPE_TQ2_0;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return GGML_TYPE_IQ2_XXS;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: return GGML_TYPE_IQ2_XS;
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: return GGML_TYPE_IQ2_XS;
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: return GGML_TYPE_IQ2_S;
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return GGML_TYPE_IQ3_XXS;
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: return GGML_TYPE_IQ1_S;
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: return GGML_TYPE_IQ1_M;
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: return GGML_TYPE_IQ4_NL;
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: return GGML_TYPE_IQ4_XS;
+ case LLAMA_FTYPE_MOSTLY_IQ3_S:
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
+}
+
+//
+// main quantization driver
+//
+
+static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+ ggml_type default_type;
+ llama_ftype ftype = params->ftype;
int nthread = params->nthread;
nthread = std::thread::hardware_concurrency();
}
+ default_type = llama_ftype_get_default_type(ftype);
+
// mmap consistently increases speed on Linux, and also increases speed on Windows with
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
}
std::vector<std::string> splits = {};
- llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+ llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
+ fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params());
if (params->imatrix) {
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
if (imatrix_data) {
- LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+ LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n",
+ __func__, (int)imatrix_data->size());
qs.has_imatrix = true;
// check imatrix for nans or infs
for (const auto & kv : *imatrix_data) {
}
// copy the KV pairs from the input file
- gguf_set_kv (ctx_out.get(), ml.meta.get());
+ gguf_set_kv (ctx_out.get(), ml.metadata);
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
});
}
- for (const auto * it : tensors) {
- const struct ggml_tensor * tensor = it->tensor;
-
- const std::string name = ggml_get_name(tensor);
-
- // TODO: avoid hardcoded tensor names - use the TN_* constants
- if (name.find("attn_v.weight") != std::string::npos ||
- name.find("attn_qkv.weight") != std::string::npos ||
- name.find("attn_kv_b.weight")!= std::string::npos) {
- ++qs.n_attention_wv;
- } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
- qs.has_output = true;
- }
- }
-
- qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
-
- size_t total_size_org = 0;
- size_t total_size_new = 0;
-
- std::vector<std::thread> workers;
- workers.reserve(nthread);
-
int idx = 0;
-
- std::vector<no_init<uint8_t>> read_data;
- std::vector<no_init<uint8_t>> work;
- std::vector<no_init<float>> f32_conv_buf;
-
uint16_t n_split = 1;
// Assume split index is continuous
std::vector<gguf_context_ptr> ctx_outs(n_split);
ctx_outs[0] = std::move(ctx_out);
- // populate the original tensors so we get an initial meta data
- for (const auto * it : tensors) {
+ // compute tensor metadata once and cache it
+ std::vector<tensor_metadata> metadata(tensors.size());
+
+ // initialize quantization state before the preliminary loop (counters used by use_more_bits)
+ {
+ for (size_t i = 0; i < tensors.size(); ++i) {
+ const auto cat = tensor_get_category(tensors[i]->tensor->name);
+ if (category_is_attn_v(cat)) {
+ ++qs.n_attention_wv;
+ }
+ if (cat == tensor_category::OUTPUT) {
+ qs.has_tied_embeddings = false;
+ }
+ metadata[i].category = cat; // save and re-use the category while we're at it
+ }
+ // these also need to be set to n_layer by default
+ qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
+ }
+
+ // flag for --dry-run: report whether a real quantization would require an imatrix
+ bool will_require_imatrix = false;
+
+ //
+ // preliminary iteration over all weights
+ //
+
+ for (size_t i = 0; i < tensors.size(); ++i) {
+ const auto * it = tensors[i];
+ const struct ggml_tensor * tensor = it->tensor;
+ const std::string name = ggml_get_name(tensor);
+
uint16_t i_split = params->keep_split ? it->idx : 0;
- ggml_tensor * tensor = it->tensor;
if (!ctx_outs[i_split]) {
ctx_outs[i_split].reset(gguf_init_empty());
}
gguf_add_tensor(ctx_outs[i_split].get(), tensor);
+
+ metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor);
+
+ if (metadata[i].allows_quantization) {
+ metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]);
+ } else {
+ metadata[i].target_type = tensor->type;
+ }
+
+ metadata[i].requires_imatrix = tensor_requires_imatrix(tensor->name, metadata[i].target_type, ftype);
+
+ if (params->imatrix) {
+ metadata[i].remapped_imatrix_name = remap_imatrix(tensor->name, mapped);
+ } else if (metadata[i].allows_quantization && metadata[i].requires_imatrix) {
+ if (params->dry_run) {
+ will_require_imatrix = true;
+ } else {
+ LLAMA_LOG_ERROR("\n============================================================================\n"
+ " ERROR: this quantization requires an importance matrix!\n"
+ " - offending tensor: %s\n"
+ " - target type: %s\n"
+ "============================================================================\n\n",
+ name.c_str(), ggml_type_name(metadata[i].target_type));
+ throw std::runtime_error("this quantization requires an imatrix!");
+ }
+ }
}
// Set split info if needed
}
}
+ size_t total_size_org = 0;
+ size_t total_size_new = 0;
+
+ std::vector<std::thread> workers;
+ workers.reserve(nthread);
+
+ std::vector<no_init<uint8_t>> read_data;
+ std::vector<no_init<uint8_t>> work;
+ std::vector<no_init<float>> f32_conv_buf;
+
int cur_split = -1;
std::ofstream fout;
auto close_ofstream = [&]() {
::zeros(fout, meta_size);
};
- const auto tn = LLM_TN(model.arch);
-
// no output file for --dry-run
if (!params->dry_run) {
new_ofstream(0);
}
- // flag for `--dry-run`, to let the user know if imatrix will be required for a real
- // quantization, as a courtesy
- bool will_require_imatrix = false;
+ //
+ // main loop: iterate over all weights
+ //
- for (const auto * it : tensors) {
- const auto & weight = *it;
+ for (size_t i = 0; i < tensors.size(); ++i) {
+ const auto & weight = *tensors[i];
+ const auto & tm = metadata[i];
ggml_tensor * tensor = weight.tensor;
+
if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) {
close_ofstream();
new_ofstream(weight.idx);
ml.load_data_for(tensor);
}
- LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
+ LLAMA_LOG_INFO("[%4d/%4d] %-36s - [%s], type = %6s, ",
++idx, ml.n_tensors,
ggml_get_name(tensor),
llama_format_tensor_shape(tensor).c_str(),
ggml_type_name(tensor->type));
- // This used to be a regex, but <regex> has an extreme cost to compile times.
- bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
-
- // quantize only 2D and 3D tensors (experts)
- quantize &= (ggml_n_dims(tensor) >= 2);
-
- // do not quantize norm tensors
- quantize &= name.find("_norm.weight") == std::string::npos;
-
- quantize &= params->quantize_output_tensor || name != "output.weight";
- quantize &= !params->only_copy;
-
- // do not quantize expert gating tensors
- // NOTE: can't use LLM_TN here because the layer number is not known
- quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
-
- // these are very small (e.g. 4x4)
- quantize &= name.find("altup") == std::string::npos;
- quantize &= name.find("laurel") == std::string::npos;
-
- // these are not too big so keep them as it is
- quantize &= name.find("per_layer_model_proj") == std::string::npos;
-
- // do not quantize positional embeddings and token types (BERT)
- quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
- quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
-
- // do not quantize Mamba /Kimi's small conv1d weights
- // NOTE: can't use LLM_TN here because the layer number is not known
- quantize &= name.find("ssm_conv1d") == std::string::npos;
- quantize &= name.find("shortconv.conv.weight") == std::string::npos;
-
- // do not quantize RWKV's small yet 2D weights
- quantize &= name.find("time_mix_first.weight") == std::string::npos;
- quantize &= name.find("time_mix_w0.weight") == std::string::npos;
- quantize &= name.find("time_mix_w1.weight") == std::string::npos;
- quantize &= name.find("time_mix_w2.weight") == std::string::npos;
- quantize &= name.find("time_mix_v0.weight") == std::string::npos;
- quantize &= name.find("time_mix_v1.weight") == std::string::npos;
- quantize &= name.find("time_mix_v2.weight") == std::string::npos;
- quantize &= name.find("time_mix_a0.weight") == std::string::npos;
- quantize &= name.find("time_mix_a1.weight") == std::string::npos;
- quantize &= name.find("time_mix_a2.weight") == std::string::npos;
- quantize &= name.find("time_mix_g1.weight") == std::string::npos;
- quantize &= name.find("time_mix_g2.weight") == std::string::npos;
- quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
- quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
- quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
-
- // do not quantize relative position bias (T5)
- quantize &= name.find("attn_rel_b.weight") == std::string::npos;
-
- // do not quantize specific multimodal tensors
- quantize &= name.find(".position_embd.") == std::string::npos;
-
- ggml_type new_type;
- void * new_data;
- size_t new_size;
-
- if (quantize) {
- new_type = default_type;
-
- // get more optimal quantization type based on the tensor shape, layer, etc.
- if (!params->pure && ggml_is_quantized(default_type)) {
- // if the user provided tensor types - use those
- bool manual = false;
- if (params->tensor_types) {
- const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
- const std::string tensor_name(tensor->name);
- for (const auto & [tname, qtype] : tensor_types) {
- if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
- if (qtype != new_type) {
- LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
- new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
- manual = true;
- break;
- }
- }
- }
- }
-
- // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
- if (!manual) {
- new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
- }
-
- // incompatible tensor shapes are handled here - fallback to a compatible type
- {
- bool convert_incompatible_tensor = false;
+ const ggml_type cur_type = tensor->type;
+ const ggml_type new_type = tm.target_type;
- const int64_t nx = tensor->ne[0];
- const int64_t ny = tensor->ne[1];
- const int64_t qk_k = ggml_blck_size(new_type);
-
- if (nx % qk_k != 0) {
- LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
- convert_incompatible_tensor = true;
- } else {
- ++qs.n_k_quantized;
- }
-
- if (convert_incompatible_tensor) {
- switch (new_type) {
- case GGML_TYPE_TQ1_0:
- case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ3_S:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_Q2_K:
- case GGML_TYPE_Q3_K:
- case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
- case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
- case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
- case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
- default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
- }
- if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
- new_type = GGML_TYPE_F16;
- }
- LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
- ++qs.n_fallback;
- }
- }
- }
- if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
- new_type = params->token_embedding_type;
- }
- if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
- new_type = params->output_tensor_type;
- }
+ // If we've decided to quantize to the same type the tensor is already
+ // in then there's nothing to do.
+ bool quantize = cur_type != new_type;
- // If we've decided to quantize to the same type the tensor is already
- // in then there's nothing to do.
- quantize = tensor->type != new_type;
- }
+ void * new_data;
+ size_t new_size;
- // we have now decided on the target type for this tensor
if (params->dry_run) {
- // the --dry-run option calculates the final quantization size without quantizting
+ // the --dry-run option calculates the final quantization size without quantizing
if (quantize) {
new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]);
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n",
tensor_size/1024.0/1024.0,
new_size/1024.0/1024.0,
ggml_type_name(new_type));
- if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) {
+ if (!will_require_imatrix && tm.requires_imatrix) {
will_require_imatrix = true;
}
} else {
} else {
// no --dry-run, perform quantization
if (!quantize) {
- new_type = tensor->type;
new_data = tensor->data;
new_size = tensor_size;
LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0);
const float * imatrix = nullptr;
if (imatrix_data) {
- auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
+ auto it = imatrix_data->find(tm.remapped_imatrix_name);
if (it == imatrix_data->end()) {
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
} else {
// this is a significant error and it may be a good idea to abort the process if this happens,
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
// tok_embd should be ignored in this case, since it always causes this warning
- if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+ if (!tensor_name_match_token_embd(tensor->name)) {
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
}
}
}
}
- if (!imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) {
+ if (!imatrix && tm.requires_imatrix) {
LLAMA_LOG_ERROR("\n\n============================================================\n");
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
-
- // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
-#if 0
- if (new_type == GGML_TYPE_MXFP4) {
- auto * x = f32_data_03;
-
- //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
- std::vector<float> deq(nrows*n_per_row);
- const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
- qtype->to_float(new_data_03, deq.data(), deq.size());
-
- double err = 0.0f;
- for (int i = 0; i < (int) deq.size(); ++i) {
- err += fabsf(deq[i] - x[i]);
- //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
- if (deq[i] != x[i]) {
- LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
- }
- }
- //LLAMA_LOG_INFO("err = %f\n", err);
- GGML_ASSERT(err == 0.00000);
- }
-#endif
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0);
}
fout.write((const char *) new_data, new_size);
zeros(fout, GGML_PAD(new_size, align) - new_size);
} // no --dry-run
- } // iterate over tensors
+ } // main loop
if (!params->dry_run) {
close_ofstream();
if (qs.n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
- __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+ __func__, qs.n_fallback, ml.n_tensors);
}
}
};
void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
- struct gguf_context * ctx = ml.meta.get();
+ struct gguf_context * ctx = ml.metadata;
// determine vocab type
{
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- // correct endiannes of data in precompiled_charsmap binary blob
+ // correct endianness of data in precompiled_charsmap binary blob
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
#include "llama.h"
+#include "ggml-cpp.h"
#include "llama-impl.h"
#include "llama-chat.h"
#include "ggml.h"
#include "ggml-backend.h"
+#include "gguf.h"
#include <algorithm>
#include <cassert>
}
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
+ const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = 0;
model.t_start_us = tm.t_start_us;
try {
- llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+ llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
+ params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info();
}
static struct llama_model * llama_model_load_from_file_impl(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data,
+ void * set_tensor_data_ud,
const std::string & path_model,
std::vector<std::string> & splits,
struct llama_model_params params) {
+ GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one of metadata and path_model must be provided");
ggml_time_init();
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
props.memory_free/1024/1024);
}
- const int status = llama_model_load(path_model, splits, *model, params);
+ const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
return model;
}
+struct llama_model * llama_model_init_from_user(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data,
+ void * set_tensor_data_ud,
+ struct llama_model_params params) {
+ GGML_ASSERT(metadata != nullptr);
+ std::string path_model;
+ std::vector<std::string> splits = {};
+ params.use_mmap = false;
+ params.use_extra_bufts = false;
+ return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
+}
// deprecated
struct llama_model * llama_load_model_from_file(
const char * path_model,
const char * path_model,
struct llama_model_params params) {
std::vector<std::string> splits = {};
- return llama_model_load_from_file_impl(path_model, splits, params);
+ return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
}
struct llama_model * llama_model_load_from_splits(
for (size_t i = 0; i < n_paths; ++i) {
splits.push_back(paths[i]);
}
- return llama_model_load_from_file_impl(splits.front(), splits, params);
+ return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
}
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
- llama_model_saver ms(*model);
+ llama_model_saver ms(model);
ms.add_kv_from_model();
ms.add_tensors_from_model();
ms.save(path_model);
#include "ggml-cpu.h"
#include "ggml-backend.h"
#include "ggml-opt.h"
+#include "gguf.h"
#include <stddef.h>
#include <stdint.h>
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
+ typedef void (*llama_model_set_tensor_data_t)(struct ggml_tensor * tensor, void * userdata);
+
+ // Create a new model from GGUF metadata as well as a function to set the tensor data
+ // - tensors are created as GGML_TYPE_F32 by default,
+ // override by adding a tensor with the same name but a different type to the context
+ LLAMA_API struct llama_model * llama_model_init_from_user(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data, // function to initialize tensor data with
+ void * set_tensor_data_ud, // userdata for function
+ struct llama_model_params params);
+
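For review purposes, a minimal usage sketch of the new entry point, assuming the GGUF metadata is read with gguf_init_from_file in no_alloc mode and that the callback may upload data with ggml_backend_tensor_set; the zero-filling callback and the "model.gguf" path are illustrative placeholders, not part of this patch.

#include <cstdint>
#include <vector>
#include "llama.h"
#include "gguf.h"
#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical callback: fills every tensor with zeros. A real caller would look the
// tensor up by tensor->name and upload its own weights instead.
static void set_zeros(struct ggml_tensor * tensor, void * /*userdata*/) {
    std::vector<uint8_t> zeros(ggml_nbytes(tensor), 0);
    ggml_backend_tensor_set(tensor, zeros.data(), 0, zeros.size());
}

int main() {
    struct gguf_init_params gparams = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * meta = gguf_init_from_file("model.gguf", gparams); // placeholder path
    if (meta == nullptr) {
        return 1;
    }

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_model_init_from_user(meta, set_zeros, /*set_tensor_data_ud =*/ nullptr, mparams);

    const bool ok = model != nullptr;
    llama_model_free(model); // no-op when model is nullptr
    gguf_free(meta);
    return ok ? 0 : 1;
}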
DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_model_params params),
"use llama_model_load_from_file instead");
- // Load the model from a file
+ // Load a model from a file
// If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
// If the split file name does not follow this pattern, use llama_model_load_from_splits
LLAMA_API struct llama_model * llama_model_load_from_file(
const char * path_model,
struct llama_model_params params);
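As a companion to the split naming rule above, a short sketch of loading a split model through llama_model_load_from_splits; the base name "model", the split count, and the (paths, n_paths, params) parameter order are assumptions for illustration rather than something this diff shows in full.

#include <cstdio>
#include <string>
#include <vector>
#include "llama.h"

int main() {
    // Build the canonical split names: model-00001-of-00003.gguf, model-00002-of-00003.gguf, ...
    const int n_split = 3;
    std::vector<std::string> names;
    for (int i = 1; i <= n_split; ++i) {
        char buf[256];
        snprintf(buf, sizeof(buf), "model-%05d-of-%05d.gguf", i, n_split);
        names.emplace_back(buf);
    }

    std::vector<const char *> paths;
    for (const auto & name : names) {
        paths.push_back(name.c_str());
    }

    struct llama_model_params params = llama_model_default_params();
    struct llama_model * model = llama_model_load_from_splits(paths.data(), paths.size(), params);
    if (model == nullptr) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}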
- // Load the model from multiple splits (support custom naming scheme)
+ // Load a model from multiple splits (support custom naming scheme)
// The paths must be in the correct order
LLAMA_API struct llama_model * llama_model_load_from_splits(
const char ** paths,
// Logits for the ith token. For positive indices, equivalent to:
// llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
- // Negative indicies can be used to access logits in reverse order, -1 is the last logit.
+ // Negative indices can be used to access logits in reverse order, -1 is the last logit.
// returns NULL for invalid ids.
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
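A small sketch of the negative-index convention documented above: -1 addresses the logits of the last token for which logits were produced, -2 the one before it, and so on, with NULL returned for out-of-range indices. The surrounding decode setup is assumed to have already happened.

#include "llama.h"

// Returns the logits of the most recently produced token, or nullptr if unavailable.
static float * last_token_logits(struct llama_context * ctx) {
    return llama_get_logits_ith(ctx, -1);
}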
// Get the embeddings for the ith token. For positive indices, equivalent to:
// llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
- // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
+ // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
// shape: [n_embd] (1-dimensional)
// returns NULL for invalid ids.
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
// Returns LLAMA_TOKEN_NULL if no token was sampled.
LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
- // Get the backend sampled probabilites for the ith token
+ // Get the backend sampled probabilities for the ith token
// The index matches llama_get_sampled_token_ith().
- // Returns NULL if no probabilites were generated.
+ // Returns NULL if no probabilities were generated.
LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
float tau,
float eta);
- /// @details Intializes a GBNF grammar, see grammars/README.md for details.
+ /// @details Initializes a GBNF grammar, see grammars/README.md for details.
/// @param vocab The vocabulary that this grammar will be used with.
/// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
/// @param grammar_root The name of the start symbol for the grammar.
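To make grammar_str and grammar_root concrete, a tiny GBNF grammar as it could be passed in; the declaration this doc comment belongs to continues past this excerpt, so only the grammar text itself is sketched here.

// Minimal GBNF grammar: the start symbol is "root" and it accepts one of two literal answers.
// Passed as grammar_str, with grammar_root = "root".
static const char * grammar_str =
    "root   ::= answer\n"
    "answer ::= \"yes\" | \"no\"\n";
static const char * grammar_root = "root";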
#include "models.h"
llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
n_expert, n_expert_used,
LLM_FFN_SILU,
hparams.expert_weights_norm, // norm_w (route_norm=True)
- hparams.expert_weights_scale, // scale_w
hparams.expert_weights_scale, // w_scale (route_scale=2.826)
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
-
llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
);
break;
case LLM_TYPE_13B:
+ case LLM_TYPE_UNKNOWN:
break;
default:
GGML_ABORT("fatal error");
#include "models.h"
-
llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- false, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
// feed-forward network
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
// MoE branch
- cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, nullptr,
- model.layers[il].ffn_down_exps, nullptr, hparams.n_expert, hparams.n_expert_used,
- LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ nullptr,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ hparams.n_expert, hparams.n_expert_used,
+ LLM_FFN_GELU, false,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
cb(cur, "ffn_moe_out", il);
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
model.arch == LLM_ARCH_JINA_BERT_V3) {
llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
// self-attention
{
// compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].wq_scale) {
- Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
- }
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
}
// B1.K
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].wk_scale) {
- Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
- }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
}
// B1.V
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].wv_scale) {
- Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
- }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
LLM_NORM_RMS, il);
cb(cur, "attn_sub_norm", il);
- cur = build_lora_mm(model.layers[il].wo, cur);
- if (model.layers[il].wo_scale) {
- cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
- }
+ cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
if (model.layers[il].bo) {
cur = ggml_add(ctx0, cur, model.layers[il].bo);
}
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
- model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
+ model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
+ model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
NULL, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
LLM_NORM_RMS, il);
cb(cur, "ffn_sub_norm", il);
- cur = build_lora_mm(model.layers[il].ffn_down, cur);
- if (model.layers[il].ffn_down_scale) {
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
- }
+ cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s);
cb(cur, "ffn_down", il);
cur = ggml_add(ctx0, cur, ffn_inp);
#include "models.h"
llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include <float.h>
llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * inpL;
ggml_tensor * cur;
#include "models.h"
llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
const float f_logit_scale = hparams.f_logit_scale;
llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
const float f_logit_scale = hparams.f_logit_scale;
#include "models.h"
-
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
-
-
llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
cb(Qcur, "Qcur_attn_temp_scaled", il);
}
- // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
+ // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
cur = build_attn(inp_attn_k,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- hparams.expert_weights_scale, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il,
nullptr,
#include "models.h"
-#define CHUNK_SIZE 64
+#include "llama-impl.h"
// utility to get one slice from the third dimension
// input dim: [x, y, c, b]
g = ggml_permute(ctx0, g, 0, 2, 1, 3); // [g_0, n_tokens, H_v, n_seqs]
b = ggml_permute(ctx0, b, 0, 2, 1, 3); // [ 1, n_tokens, H_v, n_seqs]
- const int CS = CHUNK_SIZE;
+ const int CS = kda ? 16 : 64; // chunk size
const int pad = (CS - n_tokens % CS) % CS;
const int n_chunks = (n_tokens + pad) / CS;
ggml_tensor * kg_t = ggml_cont(ctx0, ggml_transpose(ctx0, kg));
cb(kg_t, "key_gdiff_t", il);
- ggml_tensor * s_t = ggml_transpose(ctx0, s);
- s_t = ggml_cont_4d(ctx0, s_t, S_v, S_v, 1, H_v * n_seqs);
- cb(s_t, "dnet_add_ch_state", il);
+ s = ggml_reshape_4d(ctx0, s, S_v, S_v, 1, H_v * n_seqs);
+ cb(s, "dnet_add_ch_state", il);
// [CS, S_v, n_chunks, H_v * n_seqs]
ggml_tensor * v_t = ggml_cont(ctx0, ggml_transpose(ctx0, v));
ggml_tensor * ch_kg_t = get_slice_2d(ctx0, kg_t, chunk); // [ CS, S_k, 1, H_v * n_seqs]
// [CS, S_v, 1, H_v * n_seqs]
- ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s_t);
+ ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s);
cb(v_t_p, "v_prime", il);
// [CS, S_v, 1, H_v * n_seqs]
cb(v_attn, "v_attn", il);
// [S_v, CS, 1, H_v * n_seqs]
- ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s_t, ch_q_g_exp);
+ ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s, ch_q_g_exp);
cb(attn_inter, "attn_inter", il);
// [S_v, CS, 1, H_v * n_seqs]
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
ggml_tensor * ch_g_last_exp_t = get_slice_2d(ctx0, g_last_exp_t, chunk);
- s_t = ggml_mul(ctx0, s_t, ch_g_last_exp_t);
- s_t = ggml_add(ctx0, s_t, kgv);
- cb(s_t, "dnet_add_ch_state", il);
+ s = ggml_mul(ctx0, s, ch_g_last_exp_t);
+ s = ggml_add(ctx0, s, kgv);
+ cb(s, "dnet_add_ch_state", il);
}
- s_t = ggml_reshape_4d(ctx0, s_t, S_v, S_v, H_v, n_seqs);
-
// truncate padded tokens
ggml_tensor * o = ggml_view_4d(ctx0, v,
S_v, n_tokens, H_v, n_seqs,
ggml_row_size(v->type, S_v * CS * n_chunks),
ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
o = ggml_permute (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
- s = ggml_transpose(ctx0, s_t);
+ s = ggml_reshape_4d(ctx0, s, S_v, S_v, H_v, n_seqs);
cb(s, "output_state", il);
return {o, s};
g = ggml_exp(ctx0, g);
s = ggml_mul(ctx0, s, g);
- ggml_tensor * s_t = ggml_cont(ctx0, ggml_transpose(ctx0, s));
-
// [1, S_v, H_v, n_seqs]
ggml_tensor * sk;
- sk = ggml_mul (ctx0, s_t, k);
+ sk = ggml_mul (ctx0, s, k);
sk = ggml_sum_rows(ctx0, sk);
// [S_v, 1, H_v, n_seqs]
k = ggml_repeat(ctx0, k, s);
kd = ggml_mul (ctx0, k, d_t);
- s_t = ggml_add(ctx0, s_t, kd);
+ s = ggml_add(ctx0, s, kd);
- cb(s_t, "dnet_add_ar_state", il);
+ cb(s, "dnet_add_ar_state", il);
- ggml_tensor * s_q = ggml_mul (ctx0, s_t, q);
+ ggml_tensor * s_q = ggml_mul (ctx0, s, q);
ggml_tensor * o = ggml_sum_rows(ctx0, s_q);
o = ggml_permute (ctx0, o, 2, 0, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
- s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
return {o, s};
}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net_fused(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * b,
+ ggml_tensor * s,
+ int il) {
+ const int64_t S_k = q->ne[0];
+ const int64_t H_k = q->ne[1];
+ const int64_t n_tokens = q->ne[2];
+ const int64_t n_seqs = q->ne[3];
+
+ const int64_t S_v = v->ne[0];
+ const int64_t H_v = v->ne[1];
+
+ GGML_ASSERT(S_k == S_v);
+ GGML_ASSERT(H_v % H_k == 0);
+
+ GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+ GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+ GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
+
+ GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
+ GGML_ASSERT( g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
+ GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
+ GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
+
+ ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
+ if (n_tokens == 1) {
+ cb(result, LLAMA_TENSOR_NAME_FGDN_AR, il);
+ } else {
+ cb(result, LLAMA_TENSOR_NAME_FGDN_CH, il);
+ }
+
+ ggml_tensor * output = ggml_view_4d(ctx0, result,
+ S_v, H_v, n_tokens, n_seqs,
+ ggml_row_size(result->type, S_v),
+ ggml_row_size(result->type, S_v * H_v),
+ ggml_row_size(result->type, S_v * H_v * n_tokens), 0);
+
+ ggml_tensor * new_state = ggml_view_4d(ctx0, result,
+ S_v, S_v, H_v, n_seqs,
+ ggml_row_size(result->type, S_v),
+ ggml_row_size(result->type, S_v * S_v),
+ ggml_row_size(result->type, S_v * S_v * H_v),
+ ggml_row_size(result->type, S_v * H_v * n_tokens * n_seqs));
+
+ return {output, new_state};
+}
+
+std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * b,
+ ggml_tensor * s,
+ int il) {
+ const int64_t n_seq_tokens = q->ne[2];
+
+ if (n_seq_tokens == 1) {
+ if (cparams.fused_gdn_ar) {
+ return build_delta_net_fused(q, k, v, g, b, s, il);
+ }
+ return build_delta_net_autoregressive(q, k, v, g, b, s, il);
+ }
+
+ if (cparams.fused_gdn_ch) {
+ return build_delta_net_fused(q, k, v, g, b, s, il);
+ }
+
+ return build_delta_net_chunking(q, k, v, g, b, s, il);
+}
#include "models.h"
-
-
llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
//copied from qwen2
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
-
-
llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
-
llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool iswa>
llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool iswa>
llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_k;
+ const int64_t n_embd_head = hparams.n_embd_head_k();
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params),
model(model),
- n_embd_head(model.hparams.n_embd_head_k),
+ n_embd_head(model.hparams.n_embd_head_k()),
n_embd_altup(model.hparams.n_embd_altup),
n_altup(model.hparams.n_altup),
i_altup_act(model.hparams.i_altup_act) {
#include "models.h"
llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(routed_out, "ffn_moe_out", il);
llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
#include "models.h"
llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * pos;
llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
-
llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
llm_build_granite::llm_build_granite(
const llama_model & model,
const llm_graph_params & params)
: llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_GELU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
-
-
llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il,
probs);
nullptr,
n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il,
probs);
#include "models.h"
llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
n_expert, n_expert_used,
LLM_FFN_SILU,
true, // norm_topk_prob
- false,
- 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur_moe, "ffn_moe_out", il);
#include "models.h"
llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
// JAIS-2 model graph builder
// Uses: LayerNorm (not RMSNorm), relu2 activation, separate Q/K/V, RoPE embeddings
llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
-#include "ggml.h"
#include "llama-memory-recurrent.h"
const int64_t kv_lora_rank = hparams.n_lora_kv;
// qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot
// Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim]
- const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim
+ const int64_t n_embd_head_qk_rope = hparams.n_rot(); // config.qk_rope_head_dim
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128
// Attention scale for MLA
const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla);
ggml_build_forward_expand(gf, cur);
- // Check layer type by checking which tensors exist
- // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor
- bool is_kda = (layer.ssm_a != nullptr);
- bool is_mla = (layer.wkv_a_mqa != nullptr);
-
- if (is_kda) {
+ if (hparams.is_recurrent(il)) {
// === KDA Layer (Kimi Delta Attention) with Recurrent State ===
// Reference: vLLM kda.py
const auto * mctx_cur = inp_rs->mctx;
Kcur = ggml_l2_norm(ctx0, Kcur, eps_norm);
- // Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens
+ // build_delta_net dispatches to the fused, autoregressive, or chunking implementation based on n_tokens
- std::pair<ggml_tensor *, ggml_tensor *> attn_out = n_seq_tokens == 1 ?
- build_delta_net_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) :
- build_delta_net_chunking(Qcur, Kcur, Vcur, g1, beta, state, il);
+ auto attn_out = build_delta_net(Qcur, Kcur, Vcur, g1, beta, state, il);
ggml_tensor * output = ggml_cont(ctx0, attn_out.first);
ggml_tensor * new_state = attn_out.second;
cur = ggml_mul_mat(ctx0, layer.wo, gated);
cb(cur, "kda_out", il);
- } else if (is_mla) {
+ } else {
// === MLA Layer (Multi-head Latent Attention) without KV Cache ===
// Reference: vLLM mla.py
// Step 1: Q projection and reshape
cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
cb(cur, "mla_out", il);
}
- } else {
- // Unknown layer type - this should not happen
- GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors");
}
// On last layer, select only the output tokens
hparams.n_expert,
hparams.n_expert_used,
LLM_FFN_SILU, true,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
};
auto build_moe_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * {
return build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
- static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+ il);
};
auto build_attn_block = [&model, this](ggml_tensor * cur,
ggml_tensor * inp_pos,
inp_attn_type * inp_attn,
int il) -> ggml_tensor * {
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
- const auto n_embd_head = hparams.n_embd_head_v;
+ const auto n_embd_head = hparams.n_embd_head_v();
const auto n_head_kv = hparams.n_head_kv(il);
auto * q = build_lora_mm(model.layers[il].wq, cur);
#include "models.h"
llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
// LLaDA is similar to LLaMA but uses non-causal attention for diffusion
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);
template <bool embed>
llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ if (model.layers[il].wo_s) {
+ cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
+ }
cb(cur, "attn_out", il);
}
if (il == n_layer - 1 && inp_out_ids) {
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, model.layers[il].ffn_gate_s,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
+ il,
+ nullptr, nullptr,
+ model.layers[il].ffn_up_exps_s,
+ model.layers[il].ffn_gate_exps_s,
+ model.layers[il].ffn_down_exps_s);
cb(cur, "ffn_moe_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
#include "models.h"
llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+ GGML_ASSERT(d_inner % n_head == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+ GGML_ASSERT(d_inner % n_head == 0);
+ GGML_ASSERT(d_inner % d_state == 0);
+ GGML_ASSERT(d_inner % n_group == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
-
#include "models.h"
llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
cb(cur, "ffn_out", il);
} else {
// MoE branch
- cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
- 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+ il);
cb(cur, "ffn_moe_out", il);
}
const int64_t n_embd_base = 256;
const float scale_embd = 12.0f;
const float scale_depth = 1.4f;
- const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
+ const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k()));
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot();
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const uint32_t kv_lora_rank = hparams.n_lora_kv;
LLM_NORM_RMS, il);
cb(q, "q", il);
- // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+ // {q_lora_rank, n_head * hparams.n_embd_head_k()} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k(), n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
cb(q, "q", il);
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, hparams.n_embd_head_k()),
+ ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
0);
cb(q_nope, "q_nope", il);
// and {n_head * n_embd_head_qk_rope, n_tokens}
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, hparams.n_embd_head_k()),
+ ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il);
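+ // each kv row packs one head as [ n_embd_head_qk_nope | n_embd_head_v ];
+ // the two views below select the no-PE key part (offset 0) and the value part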
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()),
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())),
0);
cb(k_nope, "k_nope", il);
// and {n_head * n_embd_head_v, n_tokens}
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens,
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head),
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
cb(v_states, "v_states", il);
-
#include "models.h"
llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ // GGML_ASSERT(n_embd_head == n_rot); this does not hold for minimax: head_dim = 128, n_rot = 64
ggml_tensor * cur;
ggml_tensor * inpL;
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
#include "llama-model.h"
#include "llama-graph.h"
-// note: almost all graphs require atleast sqrtf, so include cmath globally
+// note: almost all graphs require at least sqrtf, so include cmath globally
#include <cmath>
//
ggml_tensor * b,
ggml_tensor * s,
int il);
+
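+ // (reference; assumed formulation of the gated delta rule, transposition conventions aside)
+ //   S_t = a_t * S_{t-1} * (I - b_t * k_t k_t^T) + b_t * v_t k_t^T,   o_t = S_t q_t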
+ // use the ggml_gated_delta_net fused operator
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_fused(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * b,
+ ggml_tensor * s,
+ int il);
+
+ // choose one of two implementations above based on the number of tokens
+ std::pair<ggml_tensor *, ggml_tensor *> build_delta_net(
+ ggml_tensor * q,
+ ggml_tensor * k,
+ ggml_tensor * v,
+ ggml_tensor * g,
+ ggml_tensor * b,
+ ggml_tensor * s,
+ int il);
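+ //
+ // a minimal sketch of the dispatch (assumption - the actual conditions may differ):
+ //
+ //   if (n_seq_tokens == 1) {
+ //       return cparams.fused_gdn_ar ? build_delta_net_fused         (q, k, v, g, b, s, il)
+ //                                   : build_delta_net_autoregressive(q, k, v, g, b, s, il);
+ //   }
+ //   return cparams.fused_gdn_ch ? build_delta_net_fused   (q, k, v, g, b, s, il)
+ //                               : build_delta_net_chunking(q, k, v, g, b, s, il);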
};
struct llm_build_rwkv6_base : public llm_graph_context {
#include "models.h"
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * pos;
llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
- ggml_tensor * ffn_inp = cur;
+ ggml_tensor * inp_emb = cur;
+ ggml_tensor * inp_latent = cur;
+
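+ // latent MoE: when ffn_latent_down / ffn_latent_up are present, the expert FFNs operate
+ // in a projected latent space, while the router logits are still computed from the
+ // full-width hidden state and passed to build_moe_ffn explicitly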
+ if (model.layers[il].ffn_latent_down) {
+ inp_latent = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_down, cur);
+ }
+
+ ggml_tensor * router_logits = build_lora_mm(model.layers[il].ffn_gate_inp, cur);
+ cb(router_logits, "ffn_moe_logits", il);
+
ggml_tensor * moe_out =
- build_moe_ffn(ffn_inp,
+ build_moe_ffn(inp_latent,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
nullptr, // no gate
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
- il);
+ il,
+ router_logits);
cb(moe_out, "ffn_moe_out", il);
- ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+ if (model.layers[il].ffn_latent_up) {
+ moe_out = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_up, moe_out);
+ }
+
+ ggml_tensor * ffn_shexp = build_ffn(inp_emb,
model.layers[il].ffn_up_shexp, NULL, NULL,
NULL /* no gate */ , NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
#include "models.h"
llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- //GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ //GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool iswa>
llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SWIGLU_OAI_MOE, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
// NOTE: same with qwen2vl.cpp, but bias tensors are optional
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * attn_norm_output;
template<bool iswa>
llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
#include "models.h"
llm_build_plamo::llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
// check if this layer is Mamba or Attention
- bool is_mamba_layer = hparams.is_recurrent(il);
+ const bool is_mamba_layer = hparams.is_recurrent(il);
if (is_mamba_layer) {
// PLaMo-2 Mamba layer
cb(qkv, "wqkv", il);
// split QKV tensor into Q, K, V
- const int64_t n_embd_head_q = hparams.n_embd_head_k;
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
- const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ const int64_t n_embd_head_q = hparams.n_embd_head_k();
+ const int64_t n_embd_head_k = hparams.n_embd_head_k();
+ const int64_t n_embd_head_v = hparams.n_embd_head_v();
int32_t n_head = hparams.n_head(il);
int32_t n_head_kv = hparams.n_head_kv(il);
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+ GGML_ASSERT(d_inner % n_head == 0);
+ GGML_ASSERT(n_group == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
template <bool iswa>
llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
- const int64_t head_dim_q = hparams.n_embd_head_k;
- const int64_t head_dim_v = hparams.n_embd_head_v;
+ const int64_t head_dim_q = hparams.n_embd_head_k();
+ const int64_t head_dim_v = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL = build_inp_embd(model.tok_embd);
#include "models.h"
llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
+ const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k()));
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot();
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const uint32_t kv_lora_rank = hparams.n_lora_kv;
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, hparams.n_embd_head_k()),
+ ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
0);
cb(q_nope, "q_nope", il);
// and {n_head * n_embd_head_qk_rope, n_tokens}
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, hparams.n_embd_head_k()),
+ ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il);
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()),
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())),
0);
cb(k_nope, "k_nope", il);
// and {n_head * n_embd_head_v, n_tokens}
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens,
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head),
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
cb(v_states, "v_states", il);
v_states = ggml_cont(ctx0, v_states);
cb(v_states, "v_states", il);
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v() * n_head, n_tokens,
+ ggml_row_size(kv->type, hparams.n_embd_head_v() * n_head),
0);
cb(v_states, "v_states", il);
llm_build_qwen::llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
// self-attention
{
// compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ if (model.layers[il].wo_s) {
+ cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
+ }
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
+ model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
+ model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
+ model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_params & params) :
llm_build_delta_net_base(params), model(model) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
const int64_t n_seqs = ubatch.n_seqs;
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
- ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
+ ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input, model.layers[il].wqkv_s);
qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
cb(qkv_mixed, "linear_attn_qkv_mixed", il);
- ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
+ ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input, model.layers[il].wqkv_gate_s);
cb(z, "z", il);
return { qkv_mixed, z };
ggml_tensor * inp_pos,
int * sections,
int il) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
// Qwen3Next uses a single Q projection that outputs query + gate
- ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ]
+ ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); // [ (n_embd_head * 2) * n_head, n_tokens ]
cb(Qcur_full, "Qcur_full", il);
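+ // per head, the joint projection output is (presumably) laid out as
+ // [ n_embd_head query | n_embd_head gate ]; the view below selects the query half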
ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
// Apply K normalization
cur = ggml_mul(ctx0, cur, gate_sigmoid);
cb(cur, "attn_gated", il);
- cur = build_lora_mm(model.layers[il].wo, cur);
+ cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
cb(cur, "attn_output", il);
return cur;
ggml_tensor * qkv_mixed = qkvz.first;
ggml_tensor * z = qkvz.second;
- ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur);
+ ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur, model.layers[il].ssm_beta_s);
beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
cb(beta, "beta", il);
beta = ggml_sigmoid(ctx0, beta);
- ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur);
+ ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s);
alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
cb(alpha, "alpha", il);
//v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
// if head keys and value keys are different, repeat to force tensors into matching shapes
- if (num_k_heads != num_v_heads) {
+ // note: need explicit repeat only if we are not using the fused GDN
+ if (num_k_heads != num_v_heads && (!cparams.fused_gdn_ar || !cparams.fused_gdn_ch)) {
GGML_ASSERT(num_v_heads % num_k_heads == 0);
- // TODO: try to avoid these explicit repeats by utilizing op broadcast
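+ // e.g. (hypothetical sizes) num_k_heads = 8, num_v_heads = 32: the 8 Q/K heads are
+ // tiled 4x along dim 1 to match the 32 value heads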
q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
}
cb(k_conv, "k_conv_predelta", il);
cb(v_conv, "v_conv_predelta", il);
- // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens
- std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state)
- if (n_seq_tokens == 1) {
- attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
- } else {
- attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
- }
+ auto attn_out = build_delta_net(q_conv, k_conv, v_conv, gate, beta, state, il);
+
ggml_tensor * output = attn_out.first;
ggml_tensor * new_state = attn_out.second;
cb(output, "attn_output", il);
cb(final_output, "final_output", il);
// Output projection
- cur = build_lora_mm(model.layers[il].ssm_out, final_output);
+ cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s);
cb(cur, "linear_attn_out", il);
// Reshape back to original dimensions
GGML_ASSERT(model.layers[il].ffn_gate_inp == nullptr);
cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
+ model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
+ model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
+ model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params) :
llm_build_delta_net_base(params), model(model) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
const int64_t n_seqs = ubatch.n_seqs;
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
- ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
+ ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input, model.layers[il].wqkv_s);
qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
cb(qkv_mixed, "linear_attn_qkv_mixed", il);
- ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
+ ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input, model.layers[il].wqkv_gate_s);
cb(z, "z", il);
return { qkv_mixed, z };
ggml_tensor * inp_pos,
int * sections,
int il) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
// Qwen3Next uses a single Q projection that outputs query + gate
- ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ]
+ ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); // [ (n_embd_head * 2) * n_head, n_tokens ]
cb(Qcur_full, "Qcur_full", il);
ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
// Apply K normalization
cur = ggml_mul(ctx0, cur, gate_sigmoid);
cb(cur, "attn_gated", il);
- cur = build_lora_mm(model.layers[il].wo, cur);
+ cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
cb(cur, "attn_output", il);
return cur;
ggml_tensor * qkv_mixed = qkvz.first;
ggml_tensor * z = qkvz.second;
- ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur);
+ ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur, model.layers[il].ssm_beta_s);
beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
cb(beta, "beta", il);
beta = ggml_sigmoid(ctx0, beta);
- ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur);
+ ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s);
alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
cb(alpha, "alpha", il);
//v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
// if head keys and value keys are different, repeat to force tensors into matching shapes
- if (num_k_heads != num_v_heads) {
+ // note: need explicit repeat only if we are not using the fused GDN
+ if (num_k_heads != num_v_heads && (!cparams.fused_gdn_ar || !cparams.fused_gdn_ch)) {
GGML_ASSERT(num_v_heads % num_k_heads == 0);
- // TODO: try to avoid these explicit repeats by utilizing op broadcast
q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
k_conv = ggml_repeat_4d(ctx0, k_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs);
}
cb(k_conv, "k_conv_predelta", il);
cb(v_conv, "v_conv_predelta", il);
- // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens
- std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state)
- if (n_seq_tokens == 1) {
- attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
- } else {
- attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
- }
+ auto attn_out = build_delta_net(q_conv, k_conv, v_conv, gate, beta, state, il);
+
ggml_tensor * output = attn_out.first;
ggml_tensor * new_state = attn_out.second;
cb(output, "attn_output", il);
cb(final_output, "final_output", il);
// Output projection
- cur = build_lora_mm(model.layers[il].ssm_out, final_output);
+ cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s);
cb(cur, "linear_attn_out", il);
// Reshape back to original dimensions
ggml_tensor * moe_out =
build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
nullptr,
- n_expert, n_expert_used, LLM_FFN_SILU,
- true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
- nullptr, model.layers[il].ffn_gate_up_exps);
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
+ nullptr, model.layers[il].ffn_gate_up_exps,
+ model.layers[il].ffn_up_exps_s,
+ model.layers[il].ffn_gate_exps_s,
+ model.layers[il].ffn_down_exps_s);
cb(moe_out, "ffn_moe_out", il);
// Add shared experts if present - following Qwen3Next reference implementation
if (model.layers[il].ffn_up_shexp != nullptr) {
ggml_tensor * ffn_shexp =
build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
+ model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s,
+ model.layers[il].ffn_gate_shexp, NULL, model.layers[il].ffn_gate_shexp_s,
+ model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);
#include "models.h"
llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
// self_attention
{
// compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ if (model.layers[il].wo_s) {
+ cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
+ }
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
+ il,
+ nullptr, nullptr,
+ model.layers[il].ffn_up_exps_s,
+ model.layers[il].ffn_gate_exps_s,
+ model.layers[il].ffn_down_exps_s);
cb(moe_out, "ffn_moe_out", il);
cur = moe_out;
ggml_tensor * cur,
ggml_tensor * inp_pos,
int il) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const int64_t n_embd_head = hparams.n_embd_head_v();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
//v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
// if head keys and value keys are different, repeat to force tensors into matching shapes
+ // TODO: avoid repeats for fused GDN, needs broadcast configuration for GDN op [TAG_GGML_GDN_BCAST]
if (num_k_heads != num_v_heads) {
GGML_ASSERT(num_v_heads % num_k_heads == 0);
int64_t repeat_factor = num_v_heads / num_k_heads;
cb(k_conv, "k_conv_predelta", il);
cb(v_conv, "v_conv_predelta", il);
- // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens
- std::pair<ggml_tensor *, ggml_tensor *> attn_out; // pair of (output, new_state)
- if (n_seq_tokens == 1) {
- attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
- } else {
- attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
- }
+ auto attn_out = build_delta_net(q_conv, k_conv, v_conv, gate, beta, state, il);
+
ggml_tensor * output = attn_out.first;
ggml_tensor * new_state = attn_out.second;
cb(output, "attn_output", il);
// MoE branch
ggml_tensor * moe_out =
build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
nullptr,
- n_expert, n_expert_used, LLM_FFN_SILU,
- true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
nullptr, model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_refact::llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
// RND1 is a Qwen3Moe AR model converted to diffusion model.
llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
template <bool iswa>
llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
nullptr,
n_expert, n_expert_used,
LLM_FFN_RELU, true,
- false, 0.0,
+ hparams.expert_weights_scale,
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
il, probs);
#include "models.h"
llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_stablelm::llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_starcoder::llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_starcoder2::llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
// RoPE (partial rotary factors per layer)
const bool is_swa = hparams.is_swa(il);
ggml_tensor * rope_factors = is_swa ? nullptr : model.get_rope_factors(cparams, il);
- const int64_t n_rot_l = is_swa ? hparams.n_rot : (hparams.n_rot / 2);
+ const int64_t n_rot_l = hparams.n_rot(il);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
cb(cur, "ffn_out", il);
} else {
// MoE routed experts
- const bool norm_w = hparams.expert_weights_norm;
- const float w_scale = hparams.expert_weights_scale;
- const bool scale_w = w_scale != 0.0f;
ggml_tensor * moe_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
- LLM_FFN_SILU,
- norm_w, scale_w, w_scale,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
#include "models.h"
llm_build_t5_dec::llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
//const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_t5_enc::llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
#include "models.h"
llm_build_xverse::llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_head = hparams.n_embd_head_v();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+ GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
// tiny_aya digit grouping pattern from tokenizer.json:
// {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
// Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567)
- // TODO: Revisit this regex, incase there are any subtle tokenization differences with the original regex.
+ // TODO: Revisit this regex, in case there are any subtle tokenization differences with the original regex.
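+ // (e.g. 12345 -> 12, 345; the leftmost group may hold fewer than 3 digits)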
bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
}