git.djapps.eu Git - pkg/ggml/sources/whisper.cpp/commitdiff
talk-llama : sync llama.cpp
author Georgi Gerganov <redacted>
Fri, 12 Dec 2025 15:56:43 +0000 (17:56 +0200)
committer Georgi Gerganov <redacted>
Fri, 12 Dec 2025 16:15:27 +0000 (18:15 +0200)
29 files changed:
examples/talk-llama/llama-arch.cpp
examples/talk-llama/llama-arch.h
examples/talk-llama/llama-batch.cpp
examples/talk-llama/llama-batch.h
examples/talk-llama/llama-context.cpp
examples/talk-llama/llama-context.h
examples/talk-llama/llama-grammar.cpp
examples/talk-llama/llama-grammar.h
examples/talk-llama/llama-graph.cpp
examples/talk-llama/llama-hparams.h
examples/talk-llama/llama-impl.cpp
examples/talk-llama/llama-impl.h
examples/talk-llama/llama-kv-cache.cpp
examples/talk-llama/llama-mmap.cpp
examples/talk-llama/llama-model.cpp
examples/talk-llama/llama-model.h
examples/talk-llama/llama-quant.cpp
examples/talk-llama/llama-sampling.cpp
examples/talk-llama/llama-vocab.cpp
examples/talk-llama/llama.h
examples/talk-llama/models/deepseek2.cpp
examples/talk-llama/models/gemma3-iswa.cpp [deleted file]
examples/talk-llama/models/gemma3.cpp [new file with mode: 0644]
examples/talk-llama/models/lfm2.cpp
examples/talk-llama/models/mistral3.cpp [new file with mode: 0644]
examples/talk-llama/models/models.h
examples/talk-llama/models/qwen3next.cpp [new file with mode: 0644]
examples/talk-llama/models/rnd1.cpp [new file with mode: 0644]
examples/talk-llama/unicode.cpp

index b2eb2477f930d962823e3e79bf6763e0b2e98d36..64ad1b77690a55cf4ecd0e3de1cc863da7c3f322 100644 (file)
@@ -32,6 +32,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2VL,          "qwen2vl"          },
     { LLM_ARCH_QWEN3,            "qwen3"            },
     { LLM_ARCH_QWEN3MOE,         "qwen3moe"         },
+    { LLM_ARCH_QWEN3NEXT,        "qwen3next"        },
     { LLM_ARCH_QWEN3VL,          "qwen3vl"          },
     { LLM_ARCH_QWEN3VLMOE,       "qwen3vlmoe"       },
     { LLM_ARCH_PHI2,             "phi2"             },
@@ -108,24 +109,38 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_APERTUS,          "apertus"          },
     { LLM_ARCH_MINIMAX_M2,       "minimax-m2"       },
     { LLM_ARCH_COGVLM,           "cogvlm"           },
+    { LLM_ARCH_RND1,             "rnd1"             },
     { LLM_ARCH_PANGU_EMBED,      "pangu-embedded"   },
+    { LLM_ARCH_MISTRAL3,         "mistral3"         },
     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_TYPE,                 "general.type"                          },
-    { LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"                  },
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version"          },
-    { LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"                     },
-    { LLM_KV_GENERAL_FILE_TYPE,            "general.file_type"                     },
-    { LLM_KV_GENERAL_NAME,                 "general.name"                          },
-    { LLM_KV_GENERAL_AUTHOR,               "general.author"                        },
-    { LLM_KV_GENERAL_VERSION,              "general.version"                       },
-    { LLM_KV_GENERAL_URL,                  "general.url"                           },
-    { LLM_KV_GENERAL_DESCRIPTION,          "general.description"                   },
-    { LLM_KV_GENERAL_LICENSE,              "general.license"                       },
-    { LLM_KV_GENERAL_SOURCE_URL,           "general.source.url"                    },
-    { LLM_KV_GENERAL_SOURCE_HF_REPO,       "general.source.huggingface.repository" },
+    { LLM_KV_GENERAL_TYPE,                     "general.type"                          },
+    { LLM_KV_GENERAL_ARCHITECTURE,             "general.architecture"                  },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION,     "general.quantization_version"          },
+    { LLM_KV_GENERAL_ALIGNMENT,                "general.alignment"                     },
+    { LLM_KV_GENERAL_FILE_TYPE,                "general.file_type"                     },
+    { LLM_KV_GENERAL_SAMPLING_SEQUENCE,        "general.sampling.sequence"             },
+    { LLM_KV_GENERAL_SAMPLING_TOP_K,           "general.sampling.top_k"                },
+    { LLM_KV_GENERAL_SAMPLING_TOP_P,           "general.sampling.top_p"                },
+    { LLM_KV_GENERAL_SAMPLING_MIN_P,           "general.sampling.min_p"                },
+    { LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, "general.sampling.xtc_probability"      },
+    { LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,   "general.sampling.xtc_threshold"        },
+    { LLM_KV_GENERAL_SAMPLING_TEMP,            "general.sampling.temp"                 },
+    { LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,  "general.sampling.penalty_last_n"       },
+    { LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,  "general.sampling.penalty_repeat"       },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT,        "general.sampling.mirostat"             },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,    "general.sampling.mirostat_tau"         },
+    { LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,    "general.sampling.mirostat_eta"         },
+    { LLM_KV_GENERAL_NAME,                     "general.name"                          },
+    { LLM_KV_GENERAL_AUTHOR,                   "general.author"                        },
+    { LLM_KV_GENERAL_VERSION,                  "general.version"                       },
+    { LLM_KV_GENERAL_URL,                      "general.url"                           },
+    { LLM_KV_GENERAL_DESCRIPTION,              "general.description"                   },
+    { LLM_KV_GENERAL_LICENSE,                  "general.license"                       },
+    { LLM_KV_GENERAL_SOURCE_URL,               "general.source.url"                    },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO,           "general.source.huggingface.repository" },
 
     { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size"                        },
     { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length"                    },
@@ -190,6 +205,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
     { LLM_KV_ATTENTION_OUTPUT_SCALE,                 "%s.attention.output_scale"                 },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,           "%s.attention.temperature_length"           },
+    { LLM_KV_ATTENTION_TEMPERATURE_SCALE,            "%s.attention.temperature_scale"            },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA,               "%s.attention.key_length_mla"               },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,             "%s.attention.value_length_mla"             },
 
@@ -816,6 +832,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_QWEN3NEXT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM,     "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,        "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,        "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_SSM_A_NOSCAN,       "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_CONV1D,         "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT,             "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_BETA_ALPHA,     "blk.%d.ssm_ba" },
+            { LLM_TENSOR_SSM_IN,             "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_NORM,           "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT,            "blk.%d.ssm_out" },
+        },
+    },
     {
         LLM_ARCH_QWEN3VL,
         {
@@ -2224,7 +2272,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_SHORTCONV_INPROJ,  "blk.%d.shortconv.in_proj" },
             { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
             { LLM_TENSOR_TOKEN_EMBD,        "token_embd" },
-            { LLM_TENSOR_TOKEN_EMBD_NORM,   "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,       "token_embd_norm" }, // note: wrong tensor name
             { LLM_TENSOR_OUTPUT,            "output" },
         }
     },
@@ -2246,7 +2294,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_SHORTCONV_INPROJ,  "blk.%d.shortconv.in_proj" },
             { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
             { LLM_TENSOR_TOKEN_EMBD,        "token_embd" },
-            { LLM_TENSOR_TOKEN_EMBD_NORM,   "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,       "token_embd_norm" }, // note: wrong tensor name
             { LLM_TENSOR_FFN_GATE_INP,      "blk.%d.ffn_gate_inp" },
             { LLM_TENSOR_FFN_GATE_EXPS,     "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS,     "blk.%d.ffn_down_exps" },
@@ -2446,6 +2494,52 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_VISEXP_FFN_UP,   "blk.%d.vis_up" },
         },
     },
+    {
+        LLM_ARCH_RND1,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,        "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,        "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+        },
+    },
+    {
+        LLM_ARCH_MISTRAL3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2454,11 +2548,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     },
 };
 
+// declare information about the model weight tensors:
+// - the layer in which the tensor is going to be used. this is needed in order to assign the correct buffer type for the weight
+// - the operator which is going to use the weight. this is needed to determine if the respective backend supports the operator
+//
+// for example, input layers are usually assigned to CPU/host buffer types
+//
+// a mismatch between the declared information and the actual layer/op in which the tensor is used can lead to sub-optimal
+//   assignment of the buffer types and extra overhead during computation
+// example: https://github.com/ggml-org/llama.cpp/pull/17548
+//
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TOKEN_EMBD,                 {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
     {LLM_TENSOR_POS_EMBD,                   {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_TOKEN_EMBD_NORM,            {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
     {LLM_TENSOR_TOKEN_TYPES,                {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_TOKEN_EMBD_NORM,            {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL}},
     {LLM_TENSOR_OUTPUT,                     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS,                        {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_OUT,                    {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
@@ -2513,6 +2617,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_X,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_DT,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_OUT,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_BETA_ALPHA,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_A1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2534,6 +2639,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_ACT,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
     {LLM_TENSOR_SSM_CONV1D,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SSM_A,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
+    {LLM_TENSOR_SSM_A_NOSCAN,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // a version of SSM_A used for MUL instead of SSM_SCAN
     {LLM_TENSOR_SSM_DT_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_SSM_B_NORM,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_SSM_C_NORM,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
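
A small illustration of how the LLM_TENSOR_INFOS table above is consumed; this is a sketch only, and the member names layer and op are assumed from the {layer, op} initializer order:

    // hypothetical lookup against LLM_TENSOR_INFOS (member names assumed)
    const auto & info = LLM_TENSOR_INFOS.at(LLM_TENSOR_SSM_BETA_ALPHA);
    // info.layer == LLM_TENSOR_LAYER_REPEATING -> the weight belongs to a repeating layer,
    //                                             so it follows that layer's buffer type assignment
    // info.op    == GGML_OP_MUL_MAT            -> backends are checked for MUL_MAT support
    //                                             before the weight is placed on their buffers
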
@@ -2711,6 +2817,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_QWEN3NEXT:
             return true;
         default:
             return false;
@@ -2722,6 +2829,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
             return true;
         default:
             return false;
index ae7fa222acaa603b7b4650408b76de597ec3bf86..e113180024d4d14c2a3d46c6c8f900ef5b5ae395 100644 (file)
@@ -36,6 +36,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2VL,
     LLM_ARCH_QWEN3,
     LLM_ARCH_QWEN3MOE,
+    LLM_ARCH_QWEN3NEXT,
     LLM_ARCH_QWEN3VL,
     LLM_ARCH_QWEN3VLMOE,
     LLM_ARCH_PHI2,
@@ -112,7 +113,9 @@ enum llm_arch {
     LLM_ARCH_APERTUS,
     LLM_ARCH_MINIMAX_M2,
     LLM_ARCH_COGVLM,
+    LLM_ARCH_RND1,
     LLM_ARCH_PANGU_EMBED,
+    LLM_ARCH_MISTRAL3,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -122,6 +125,18 @@ enum llm_kv {
     LLM_KV_GENERAL_QUANTIZATION_VERSION,
     LLM_KV_GENERAL_ALIGNMENT,
     LLM_KV_GENERAL_FILE_TYPE,
+    LLM_KV_GENERAL_SAMPLING_SEQUENCE,
+    LLM_KV_GENERAL_SAMPLING_TOP_K,
+    LLM_KV_GENERAL_SAMPLING_TOP_P,
+    LLM_KV_GENERAL_SAMPLING_MIN_P,
+    LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY,
+    LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD,
+    LLM_KV_GENERAL_SAMPLING_TEMP,
+    LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N,
+    LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU,
+    LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
     LLM_KV_GENERAL_VERSION,
@@ -194,6 +209,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+    LLM_KV_ATTENTION_TEMPERATURE_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
@@ -363,11 +379,13 @@ enum llm_tensor {
     LLM_TENSOR_SSM_DT,
     LLM_TENSOR_SSM_DT_NORM,
     LLM_TENSOR_SSM_A,
+    LLM_TENSOR_SSM_A_NOSCAN,        // qwen3next special case with MUL instead of SSM_SCAN
     LLM_TENSOR_SSM_B_NORM,
     LLM_TENSOR_SSM_C_NORM,
     LLM_TENSOR_SSM_D,
     LLM_TENSOR_SSM_NORM,
     LLM_TENSOR_SSM_OUT,
+    LLM_TENSOR_SSM_BETA_ALPHA,      // qwen3next
     LLM_TENSOR_TIME_MIX_W0,
     LLM_TENSOR_TIME_MIX_W1,
     LLM_TENSOR_TIME_MIX_W2,
index 86a1a4ba187eef004e17f5a5185367b889470520..386fab04ac9c7a01dc77ef0bb7b4fcd35d81e548 100644 (file)
@@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
     udata->output    .resize(n_tokens);
 
+    udata->seq_id_data.reserve(n_tokens);
+
     seq_set_t seq_set_unq;
 
     for (size_t i = 0; i < idxs.size(); ++i) {
@@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
 
         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
-        udata->seq_id[i]   = batch.seq_id[idxs[i]];
         udata->output[i]   = batch.logits[idxs[i]];
 
         for (int s = 0; s < udata->n_seq_id[i]; ++s) {
-            seq_set_unq.set(udata->seq_id[i][s]);
+            const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
+
+            udata->seq_id_data.push_back(seq_id);
+            seq_set_unq.set(seq_id);
         }
 
         if (udata->output[i]) {
@@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
     }
 
+    llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        udata->seq_id[i] = seq_id_ptr;
+        seq_id_ptr += udata->n_seq_id[i];
+    }
+
     for (uint32_t s = 0; s < n_seq_max; ++s) {
         if (seq_set_unq.test(s)) {
             udata->seq_idx[s] = udata->seq_id_unq.size();
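
The per-token seq_id pointers above point into a single owning vector (seq_id_data), so they are assigned only after all ids have been pushed; an intermediate reallocation would otherwise invalidate them, and the reserve() call is just a sizing hint. A self-contained sketch of the same pattern, with illustrative names:

    #include <cstdint>
    #include <vector>

    using seq_id_t = int32_t;

    struct packed_seq_ids {
        std::vector<int32_t>    n_seq_id;    // number of ids per token
        std::vector<seq_id_t *> seq_id;      // per-token pointers into seq_id_data
        std::vector<seq_id_t>   seq_id_data; // contiguous owning storage
    };

    // fill the contiguous storage first, then fix up the per-token pointers
    static void pack(packed_seq_ids & out, const std::vector<std::vector<seq_id_t>> & per_token) {
        out.n_seq_id.clear();
        out.seq_id.resize(per_token.size());
        out.seq_id_data.clear();

        for (const auto & ids : per_token) {
            out.n_seq_id.push_back((int32_t) ids.size());
            out.seq_id_data.insert(out.seq_id_data.end(), ids.begin(), ids.end());
        }

        seq_id_t * ptr = out.seq_id_data.data();
        for (size_t i = 0; i < per_token.size(); ++i) {
            out.seq_id[i] = ptr;
            ptr += out.n_seq_id[i];
        }
    }
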
index 209cf3699de23b07b993b0971e4ca27aaa897b0b..8e6fac0efabba3d5f1b3dd528ebb9e3d2c9ec99a 100644 (file)
@@ -56,13 +56,15 @@ struct llama_ubatch {
         std::vector<float>          embd;
         std::vector<llama_pos>      pos;
         std::vector<int32_t>        n_seq_id;
-        std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id *> seq_id;      // these point into the seq_id_data below
         std::vector<llama_seq_id>   seq_id_unq;
         std::vector<int32_t>        seq_idx;
         std::vector<int8_t>         output;
+
+        std::vector<llama_seq_id> seq_id_data;
     };
 
-    // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
+    // the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
     std::shared_ptr<data_t> data;
 };
 
index 70a3ec62dfc635402e9e723960c3758ffb4cebfa..2692297dcaba20116e49a601d4135c238ce4f596 100644 (file)
@@ -1,5 +1,6 @@
 #include "llama-context.h"
 
+#include "llama-arch.h"
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-io.h"
@@ -92,14 +93,6 @@ llama_context::llama_context(
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
 
-    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
-    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
-    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
-    }
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
     cparams.op_offload = params.op_offload;
@@ -247,7 +240,10 @@ llama_context::llama_context(
 
         LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
 
-        const size_t max_nodes = this->graph_max_nodes();
+        const uint32_t n_seqs = cparams.n_seq_max;
+        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+        const size_t max_nodes = this->graph_max_nodes(n_tokens);
 
         LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
 
@@ -299,9 +295,6 @@ llama_context::llama_context(
 
         cross.v_embd.clear();
 
-        const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
-        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
         // avoid reserving graphs with zero outputs - assume one output per sequence
         n_outputs = n_seqs;
 
@@ -542,7 +535,7 @@ bool llama_context::memory_update(bool optimize) {
             throw std::runtime_error("failed to initialize memory context");
         }
 
-        const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
+        const uint32_t n_seqs = cparams.n_seq_max;
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
         auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
@@ -1248,7 +1241,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
         // make the outputs have the same order they had in the user-provided batch
         // note: this is mostly relevant for recurrent models atm
-        if (!sorted_output) {
+        if (!sorted_output && n_outputs > 1) {
             GGML_ASSERT((size_t) n_outputs == out_ids.size());
 
             // TODO: is there something more efficient which also minimizes swaps?
@@ -1385,7 +1378,10 @@ void llama_context::output_reorder() {
 // graph
 //
 
-uint32_t llama_context::graph_max_nodes() const {
+uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
+    if (model.arch == LLM_ARCH_QWEN3NEXT) {
+        return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
+    }
     return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }
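
A worked example of the graph sizing above, with a hypothetical tensor count:

    // Qwen3-Next: the node budget scales with the ubatch size
    //   n_tokens = 2048, model.n_tensors() = 4000 (illustrative)
    //   max(2048 * 40, 32 * 4000) = max(81920, 128000) = 128000 nodes
    // all other architectures keep the previous bound:
    //   max(1024, 8 * 4000) = 32000 nodes
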
 
index 20cbd78955412c3bdec9e462bd23e02d7347b99e..cd26eafe18942522b05bb31bf2ec5509b2cc312a 100644 (file)
@@ -197,7 +197,7 @@ private:
     //
 
 public:
-    uint32_t graph_max_nodes() const;
+    uint32_t graph_max_nodes(uint32_t n_tokens) const;
 
     // can reuse the llm_graph_result instance of the context (for example to update a memory module)
     llm_graph_result * get_gf_res_reserve() const;
index bed706bb248d139664d8024726948e9fb1ba4cb5..75d5d750c3998a74a86153955961440bf81f190a 100644 (file)
@@ -6,8 +6,10 @@
 
 #include <cmath>
 #include <algorithm>
+#include <cstdint>
 #include <stdexcept>
 
+#define MAX_REPETITION_THRESHOLD 2000
 //
 // helpers
 //
@@ -179,6 +181,52 @@ static std::pair<uint32_t, const char *> parse_char(const char * src) {
     throw std::runtime_error("unexpected end of input");
 }
 
+static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
+    const char * pos = src;
+    if (*pos != '<') {
+        throw std::runtime_error(std::string("expecting '<' at ") + pos);
+    }
+    pos++;
+
+    // Parse <[id]>
+    if (*pos == '[') {
+        pos++;
+        const char * int_end = parse_int(pos);
+        uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
+        pos = int_end;
+        if (*pos != ']') {
+            throw std::runtime_error(std::string("expecting ']' at ") + pos);
+        }
+        pos++;
+        if (*pos != '>') {
+            throw std::runtime_error(std::string("expecting '>' at ") + pos);
+        }
+        pos++;
+        return std::make_pair(token_id, pos);
+    }
+
+    if (vocab == nullptr) {
+        throw std::runtime_error(std::string("no vocab to parse token at ") + src);
+    }
+
+    // Parse <token> and tokenize to obtain the token id
+    while (*pos != 0 && *pos != '>') {
+        pos++;
+    }
+    if (*pos != '>') {
+        throw std::runtime_error(std::string("expecting '>' at ") + pos);
+    }
+    pos++;
+
+    llama_token tokens[2];
+    int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
+    if (n_tokens != 1) {
+        // must tokenize to exactly 1 token
+        throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
+    }
+    return std::make_pair(tokens[0], pos);
+}
+
 static void print_grammar_char(FILE * file, uint32_t c) {
     if (0x20 <= c && c <= 0x7f) {
         fprintf(file, "%c", static_cast<char>(c));
@@ -210,6 +258,8 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
             case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
             case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY");       break;
+            case LLAMA_GRETYPE_TOKEN:          fprintf(file, "TOKEN");          break;
+            case LLAMA_GRETYPE_TOKEN_NOT:      fprintf(file, "TOKEN_NOT");      break;
         }
         switch (elem.type) {
             case LLAMA_GRETYPE_END:
@@ -226,6 +276,17 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
                 print_grammar_char(file, elem.value);
                 fprintf(file, "\") ");
                 break;
+            case LLAMA_GRETYPE_TOKEN:
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
+            case LLAMA_GRETYPE_TOKEN_NOT:
+                fprintf(file, "!");
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
         }
     }
     fprintf(file, "\n");
@@ -282,6 +343,17 @@ static void print_rule(
             case LLAMA_GRETYPE_CHAR_ANY:
                 fprintf(file, ".");
                 break;
+            case LLAMA_GRETYPE_TOKEN:
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
+            case LLAMA_GRETYPE_TOKEN_NOT:
+                fprintf(file, "!");
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
         }
         if (is_char_element(elem)) {
             switch (rule[i + 1].type) {
@@ -345,8 +417,10 @@ const char * llama_grammar_parser::parse_sequence(
     size_t last_sym_start = rule.size();
     const char * pos = src;
 
-    auto handle_repetitions = [&](int min_times, int max_times) {
-
+    // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
+    // (though it's technically the same as -1 now)
+    auto handle_repetitions = [&](uint64_t min_times, uint64_t max_times) {
+        bool no_max = max_times == UINT64_MAX;
         if (last_sym_start == rule.size()) {
             throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
         }
@@ -373,20 +447,20 @@ const char * llama_grammar_parser::parse_sequence(
             rule.resize(last_sym_start);
         } else {
             // Repeat the previous elements (min_times - 1) times
-            for (int i = 1; i < min_times; i++) {
+            for (uint64_t i = 1; i < min_times; i++) {
                 rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
             }
         }
 
         uint32_t last_rec_rule_id = 0;
-        auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+        auto n_opt = no_max ? 1 : max_times - min_times;
 
         llama_grammar_rule rec_rule(prev_rule);
-        for (int i = 0; i < n_opt; i++) {
+        for (uint64_t i = 0; i < n_opt; i++) {
             rec_rule.resize(prev_rule.size());
             uint32_t rec_rule_id = generate_symbol_id( rule_name);
-            if (i > 0 || max_times < 0) {
-                rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+            if (i > 0 || no_max) {
+                rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, no_max ? rec_rule_id : last_rec_rule_id});
             }
             rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
             rec_rule.push_back({LLAMA_GRETYPE_END, 0});
@@ -440,6 +514,17 @@ const char * llama_grammar_parser::parse_sequence(
                 }
             }
             pos = parse_space(pos + 1, is_nested);
+        } else if (*pos == '<' || *pos == '!') { // token
+            auto type = LLAMA_GRETYPE_TOKEN;
+            if (*pos == '!') { // token inverse
+                type = LLAMA_GRETYPE_TOKEN_NOT;
+                pos++;
+            }
+            auto token_pair = parse_token(vocab, pos);
+            const char * token_end  = token_pair.second;
+            last_sym_start = rule.size();
+            rule.push_back({type, token_pair.first});
+            pos = parse_space(token_end, is_nested);
         } else if (is_word_char(*pos)) { // rule reference
             const char * name_end    = parse_name(pos);
             uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
@@ -478,10 +563,10 @@ const char * llama_grammar_parser::parse_sequence(
                 throw std::runtime_error(std::string("expecting an int at ") + pos);
             }
             const char * int_end = parse_int(pos);
-            int min_times = std::stoul(std::string(pos, int_end - pos));
+            uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
             pos = parse_space(int_end, is_nested);
 
-            int max_times = -1;
+            uint64_t max_times = UINT64_MAX; // default: no max limit
 
             if (*pos == '}') {
                 max_times = min_times;
@@ -502,6 +587,10 @@ const char * llama_grammar_parser::parse_sequence(
             } else {
                 throw std::runtime_error(std::string("expecting ',' at ") + pos);
             }
+            bool has_max = max_times != UINT64_MAX;
+            if (min_times > MAX_REPETITION_THRESHOLD || (has_max && max_times > MAX_REPETITION_THRESHOLD)) {
+                throw std::runtime_error(std::string("number of repetitions exceeds sane defaults, please reduce the number of repetitions"));
+            }
             handle_repetitions(min_times, max_times);
         } else {
             break;
@@ -683,6 +772,21 @@ static bool llama_grammar_match_partial_char(
     return !is_positive_char;
 }
 
+// returns true iff token matches the rule at pos (regular or inverse)
+// asserts that pos is pointing to a token element
+static bool llama_grammar_match_token(
+    const llama_grammar_element * pos,
+    const llama_token             token) {
+    GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
+    if (pos->type == LLAMA_GRETYPE_TOKEN) {
+        return pos->value == static_cast<uint32_t>(token);
+    }
+    if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        return pos->value != static_cast<uint32_t>(token);
+    }
+    return false;
+}
+
 // transforms a grammar pushdown stack into N possible stacks, all ending
 // at a character range (terminal element)
 static void llama_grammar_advance_stack(
@@ -730,6 +834,8 @@ static void llama_grammar_advance_stack(
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
         case LLAMA_GRETYPE_CHAR_ANY:
+        case LLAMA_GRETYPE_TOKEN:
+        case LLAMA_GRETYPE_TOKEN_NOT:
             if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
                 new_stacks.emplace_back(stack);
@@ -823,26 +929,38 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
     return grammar->stacks;
 }
 
+static void llama_grammar_accept_chr(
+        struct llama_grammar       & grammar,
+        const llama_grammar_stack  & stack,
+              uint32_t               chr,
+              llama_grammar_stacks & new_stacks) {
+    if (stack.empty()) {
+        return;
+    }
+
+    const llama_grammar_element * pos = stack.back();
+
+    // ignore if this turns into a token
+    if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        return;
+    }
+
+    auto match = llama_grammar_match_char(pos, chr);
+    if (match.first) {
+        llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+        if (!llama_grammar_is_end_of_sequence(match.second)) {
+            new_stack.push_back(match.second);
+        }
+        llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
+    }
+}
+
 void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
     llama_grammar_stacks stacks_new;
     stacks_new.reserve(grammar->stacks.size());
 
     for (const auto & stack : grammar->stacks) {
-        if (stack.empty()) {
-            continue;
-        }
-
-        auto match = llama_grammar_match_char(stack.back(), chr);
-        if (match.first) {
-            const llama_grammar_element * pos = match.second;
-
-            // update top of stack to next element, if any
-            llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
-            if (!llama_grammar_is_end_of_sequence(pos)) {
-                new_stack.push_back(pos);
-            }
-            llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
-        }
+        llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
     }
 
     grammar->stacks = std::move(stacks_new);
@@ -867,6 +985,22 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
 
     const llama_grammar_element * stack_pos = stack.back();
 
+    // if the top of the stack is a token rule, then we only need to check the token id
+    if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        for (const auto & tok : candidates) {
+            if (*tok.code_points == 0) {
+                // reached the end of a token consumed by char rules, reject iff it ended
+                // in a partial response
+                if (tok.partial_utf8.n_remain != 0) {
+                    rejects.push_back(tok);
+                }
+            } else if (!llama_grammar_match_token(stack_pos, tok.id)) {
+                rejects.push_back(tok);
+            }
+        }
+        return rejects;
+    }
+
     llama_grammar_candidates next_candidates;
     next_candidates.reserve(candidates.size());
 
@@ -879,7 +1013,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
                 rejects.push_back(tok);
             }
         } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
-            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
+            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
         } else {
             rejects.push_back(tok);
         }
@@ -897,7 +1031,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
 
     auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
     for (const auto & tok : next_rejects) {
-        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
+        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
     }
 
     return rejects;
@@ -964,12 +1098,13 @@ struct llama_grammar * llama_grammar_init_impl(
         vocab,
         std::move(vec_rules),
         std::move(stacks),
-        /* .partial_utf8 = */     {},
-        /* .lazy =*/              false,
-        /* .awaiting_trigger = */ false,
-        /* .trigger_buffer = */   "",
-        /* .trigger_tokens   = */ {},
-        /* .trigger_patterns    = */ {},
+        /* .partial_utf8 = */             {},
+        /* .lazy = */                     false,
+        /* .awaiting_trigger = */         false,
+        /* .trigger_buffer = */           "",
+        /* .trigger_buffer_positions = */ {},
+        /* .trigger_tokens = */           {},
+        /* .trigger_patterns = */         {},
     };
 }
 
@@ -982,7 +1117,7 @@ struct llama_grammar * llama_grammar_init_impl(
                             size_t num_trigger_patterns,
                const llama_token * trigger_tokens,
                             size_t num_trigger_tokens) {
-    llama_grammar_parser parser;
+    llama_grammar_parser parser(vocab);
 
     // if there is a grammar, parse it
     // rules will be empty (default) if there are parse errors
@@ -1069,10 +1204,11 @@ struct llama_grammar * llama_grammar_init_impl(
         vocab,
         std::move(vec_rules),
         std::move(stacks),
-        /* .partial_utf8 = */     {},
-        /* .lazy = */             lazy,
-        /* .awaiting_trigger = */ lazy,
-        /* .trigger_buffer = */   "",
+        /* .partial_utf8 = */             {},
+        /* .lazy = */                     lazy,
+        /* .awaiting_trigger = */         lazy,
+        /* .trigger_buffer = */           "",
+        /* .trigger_buffer_positions = */ {},
         std::move(vec_trigger_tokens),
         std::move(vec_trigger_patterns),
     };
@@ -1095,6 +1231,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
         grammar.lazy,
         grammar.awaiting_trigger,
         grammar.trigger_buffer,
+        grammar.trigger_buffer_positions,
         grammar.trigger_tokens,
         grammar.trigger_patterns,
     };
@@ -1148,7 +1285,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
             cur_p->data[i].logit = -INFINITY;
         } else {
             candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
-            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
+            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
         }
     }
 
@@ -1167,10 +1304,12 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
             grammar.awaiting_trigger = false;
             grammar.trigger_buffer.clear();
-            llama_grammar_accept_str(grammar, piece);
+            llama_grammar_accept_token(grammar, token, piece);
             LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
             return;
         } else {
+            auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
+            grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
             grammar.trigger_buffer += piece;
 
             std::smatch match;
@@ -1188,10 +1327,23 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
                     if (start == std::string::npos) {
                         start = match.position(0);
                     }
+
+                    // replay tokens that overlap with [start, end)
+                    for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
+                        auto [tok_start, tok_end] = tok_pos;
+                        if (tok_end <= start) {
+                            continue;
+                        }
+
+                        size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
+                        size_t piece_len = tok_end - piece_start;
+                        auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
+                        llama_grammar_accept_token(grammar, tok, tok_piece);
+                    }
+
                     auto constrained_str = grammar.trigger_buffer.substr(start);
-                    // std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
                     grammar.trigger_buffer.clear();
-                    llama_grammar_accept_str(grammar, constrained_str);
+                    grammar.trigger_buffer_positions.clear();
                     LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
                     return;
                 }
@@ -1210,7 +1362,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         GGML_ABORT("fatal error");
     }
 
-    llama_grammar_accept_str(grammar, piece);
+    llama_grammar_accept_token(grammar, token, piece);
 }
 
 void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
@@ -1227,3 +1379,59 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
         throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
     }
 }
+
+void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
+    // Note terminating 0 in decoded string
+    const auto   decoded     = decode_utf8(piece, grammar.partial_utf8);
+    const auto & code_points = decoded.first;
+
+    llama_grammar_stacks stacks_new;
+    stacks_new.reserve(grammar.stacks.size());
+
+    for (const auto & stack : grammar.stacks) {
+        if (stack.empty()) {
+            continue;
+        }
+
+        const llama_grammar_element * pos = stack.back();
+
+        if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+            if (llama_grammar_match_token(pos, token)) {
+                llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+                    new_stack.push_back(pos + 1);
+                }
+                llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
+            }
+        } else {
+            llama_grammar_stacks current_stacks = {stack};
+
+            for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+                llama_grammar_stacks next_stacks;
+
+                for (const auto & cur_stack : current_stacks) {
+                    llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
+                }
+
+                current_stacks = std::move(next_stacks);
+                if (current_stacks.empty()) {
+                    break;
+                }
+            }
+
+            for (auto & surviving_stack : current_stacks) {
+                if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
+                    stacks_new.emplace_back(surviving_stack);
+                }
+            }
+        }
+    }
+
+    grammar.stacks = std::move(stacks_new);
+    grammar.partial_utf8 = decoded.second;
+
+    if (grammar.stacks.empty()) {
+        throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
+    }
+}
+
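
For reference, the grammar terminals introduced above address tokens directly. A minimal sketch of the accepted surface syntax, embedded as a C++ string; the token ids are purely illustrative:

    // <[id]>  matches exactly the token with the given id
    // !<[id]> matches any single token except that id
    // a literal <text> form is also accepted when a vocab is available and the
    // text tokenizes to exactly one token
    static const char * k_token_grammar_example =
        "root ::= <[128000]> [a-zA-Z ]{1,64} !<[128001]>\n";
    // note: repetition bounds above 2000 (MAX_REPETITION_THRESHOLD) are rejected at parse time
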
index f8c291de999ac6f320936e7b1d6255a57ad65ac7..a4c978ac1154549ba4df8a4590d2ec214c94a044 100644 (file)
@@ -36,11 +36,17 @@ enum llama_gretype {
 
     // any character (.)
     LLAMA_GRETYPE_CHAR_ANY       = 7,
+
+    // terminal element: token (<[token-id]>)
+    LLAMA_GRETYPE_TOKEN          = 8,
+
+    // inverse token (!<[token-id]>)
+    LLAMA_GRETYPE_TOKEN_NOT      = 9,
 };
 
 typedef struct llama_grammar_element {
     enum llama_gretype type;
-    uint32_t           value; // Unicode code point or rule ID
+    uint32_t           value; // Unicode code point, rule ID, or token ID
 } llama_grammar_element;
 
 struct llama_partial_utf8 {
@@ -52,6 +58,7 @@ struct llama_grammar_candidate {
     size_t               index;
     const uint32_t     * code_points;
     llama_partial_utf8   partial_utf8;
+    llama_token          id;
 };
 
 using llama_grammar_rule  = std::vector<      llama_grammar_element>;
@@ -77,10 +84,13 @@ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
         const llama_grammar_candidates & candidates);
 
 struct llama_grammar_parser {
+    const llama_vocab * vocab;
     std::map<std::string, uint32_t> symbol_ids;
 
     llama_grammar_rules rules;
 
+    llama_grammar_parser(const struct llama_vocab * vocab = nullptr) : vocab(vocab) {}
+
     llama_grammar_stack c_rules() const;
 
     uint32_t get_symbol_id(const char * src, size_t len);
@@ -112,6 +122,9 @@ struct llama_grammar_trigger_pattern {
 };
 
 struct llama_grammar {
+    // maintain a list of llama_tokens and their positions in the trigger_buffer
+    using token_pos = std::pair<llama_token, std::pair<size_t, size_t>>;
+
     // note: allow null vocab for testing (not great)
     const llama_vocab * vocab;
 
@@ -127,6 +140,7 @@ struct llama_grammar {
     bool                     lazy             = false;
     bool                     awaiting_trigger = false; // Initialized to true for lazy grammars only
     std::string              trigger_buffer;           // Output buffered by lazy grammar. Will be cleared once trigger is found.
+    std::vector<token_pos>   trigger_buffer_positions; // Tokens buffered by lazy grammar. Used to replay when a trigger is found.
     std::vector<llama_token> trigger_tokens;           // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
     std::vector<llama_grammar_trigger_pattern>
                              trigger_patterns;         // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
@@ -171,3 +185,8 @@ void llama_grammar_accept_impl(
 void llama_grammar_accept_str(
               struct llama_grammar & grammar,
                  const std::string & piece);
+
+void llama_grammar_accept_token(
+              struct llama_grammar & grammar,
+                       llama_token   token,
+                 const std::string & piece);
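
A minimal usage sketch of the extended parser: passing a vocab is only needed for the literal <text> token form, while numeric <[id]> terminals parse without one. It reuses k_token_grammar_example from the earlier sketch and assumes the pre-existing parse() entry point is unchanged:

    llama_grammar_parser parser(vocab);    // vocab may be nullptr if only <[id]> terminals are used
    parser.parse(k_token_grammar_example); // rules stay empty if the grammar fails to parse
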
index 650e40ec6ffcebf4e422e46124ddb98924e203e1..6cf9a883a6ebb6c74d6936b629333ed42849f741 100644 (file)
@@ -71,6 +71,9 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && attn_scale) {
         const int64_t n_tokens = ubatch->n_tokens;
 
+        GGML_ASSERT(f_attn_temp_scale != 0.0f);
+        GGML_ASSERT(n_attn_temp_floor_scale != 0);
+
         std::vector<float> attn_scale_data(n_tokens, 0.0f);
         for (int i = 0; i < n_tokens; ++i) {
             const float pos = ubatch->pos[i];
@@ -382,7 +385,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
   //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
     res &= self_kq_mask->ne[0] == mctx->get_n_kv();
-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
 
     return res;
 }
@@ -413,10 +416,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
   //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
     res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
 
     res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
-    res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
 
     return res;
 }
@@ -449,7 +452,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
             }
         }
 
-        for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+        for (int i = n_tokens; i < n_tokens; ++i) {
             for (int j = 0; j < n_enc; ++j) {
                 data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
             }
@@ -810,9 +813,6 @@ ggml_tensor * llm_graph_context::build_ffn(
             GGML_ABORT("fatal error");
     }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
@@ -961,25 +961,25 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         // organize experts into n_expert_groups
         ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
 
-        ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
+        ggml_tensor * group_scores = ggml_argsort_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
         group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
 
         // get top n_group_used expert groups
         group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
         group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
 
-        ggml_tensor * expert_groups = ggml_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
+        ggml_tensor * expert_groups = ggml_argsort_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
         cb(expert_groups, "ffn_moe_group_topk", il);
 
         // mask out the other groups
         selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
-        selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
+        selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
         selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
         cb(selection_probs, "ffn_moe_probs_masked", il);
     }
 
     // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+    ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);
 
@@ -1093,9 +1093,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             GGML_ABORT("fatal error");
     }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
@@ -1473,13 +1470,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
 
     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
     ggml_set_input(inp->self_kq_mask);
 
     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
 
     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
         ggml_set_input(inp->self_kq_mask_swa);
 
         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@@ -1561,7 +1558,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
         inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
         inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
 
-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
         ggml_set_input(inp->self_kq_mask);
 
         inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1704,7 +1701,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
 
     const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
 
-    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
     ggml_set_input(inp->cross_kq_mask);
 
     inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
@@ -1770,7 +1767,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
         inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
         inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
 
-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
         ggml_set_input(inp->self_kq_mask);
 
         inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1784,7 +1781,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
         inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
         inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
 
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
         ggml_set_input(inp->self_kq_mask_swa);
 
         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
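
The hunks above stop rounding the token dimension of the KQ mask up with GGML_PAD, so the mask tensors are now allocated at exactly n_tokens (or n_tokens/n_stream) rows; the matching index computation in llama_kv_cache::set_input_kq_mask further down drops the padded stride (n_tps_pad) as well. As a reminder of what the removed rounding did, a minimal standalone sketch (GGML_PAD as defined in ggml.h; the padding granularity here is illustrative, not necessarily the real GGML_KQ_MASK_PAD value):

#include <cstdio>

#define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))   // round x up to a multiple of n (ggml.h)

int main() {
    const int n_tokens = 70;
    const int pad      = 64;  // illustrative padding granularity
    printf("old (padded) rows: %d\n", GGML_PAD(n_tokens, pad)); // 128
    printf("new (exact)  rows: %d\n", n_tokens);                // 70
    return 0;
}
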
index 9203af83b2e3239bd8833daf5dd2e5b5bb69cfe3..6eff334a5fd849a1387c13f7c1589700b24b30e0 100644 (file)
@@ -6,7 +6,7 @@
 
 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
-#define LLAMA_MAX_EXPERTS 384  // Kimi-K2
+#define LLAMA_MAX_EXPERTS 512 // Qwen3 Next
 
 enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_NONE           = 0,
@@ -162,8 +162,8 @@ struct llama_hparams {
     // llama4 smallthinker
     uint32_t n_moe_layer_step        = 0;
     uint32_t n_no_rope_layer_step    = 4;
-    uint32_t n_attn_temp_floor_scale = 8192;
-    float    f_attn_temp_scale       = 0.1;
+    uint32_t n_attn_temp_floor_scale = 0;
+    float    f_attn_temp_scale       = 0.0f;
 
     // gemma3n altup
     uint32_t n_altup      = 4; // altup_num_inputs
index 6ec709dd323a6d5319d67d64f607c0acc85f3267..c7a1880aad3db1eef1bf89334d4976a943a72b56 100644 (file)
@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state;
 time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
 
 time_meas::~time_meas() {
-        if (t_start_us >= 0) {
-            t_acc += ggml_time_us() - t_start_us;
-        }
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
     }
+}
 
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);
index c5163e9225a5e04ee38c426aad139bf15ddc9a90..c3391e79f51662cd9efd6805e70c2c7b0ca2ccd8 100644 (file)
@@ -37,7 +37,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 template <typename T>
 struct no_init {
     T value;
-    no_init() { /* do nothing */ }
+    no_init() = default;
 };
 
 struct time_meas {
index e26385a1feaf16460d0b13e0502b2f4d64d16a28..3e02bd62977067ca3e7515fae29649d567d10c4a 100644 (file)
@@ -1232,8 +1232,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
     GGML_ASSERT(n_tokens%n_stream == 0);
 
     // n_tps == n_tokens_per_stream
-    const int64_t n_tps     = n_tokens/n_stream;
-    const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);
+    const int64_t n_tps = n_tokens/n_stream;
 
     std::fill(data, data + ggml_nelements(dst), -INFINITY);
 
@@ -1266,7 +1265,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
                 const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
                 const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens]   : 0;
 
-                const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
+                const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
 
                 for (uint32_t j = 0; j < n_kv; ++j) {
                     if (cells.is_empty(j)) {
index 47497cf953fd3990d6e147e4837980bed6223b63..0641c2d22f67651c93045d38ad2d59207008fe42 100644 (file)
@@ -485,7 +485,7 @@ struct llama_mlock::impl {
         if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
             suggest = false;
         }
-        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+        if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
             suggest = false;
         }
 #endif
index e703181a19804d12ec615c88f6e2cae2ef8b38a9..fc337b045eb0a8e497e3c0319d2f3b171ad50469 100644 (file)
@@ -2,7 +2,6 @@
 
 #include "llama-impl.h"
 #include "llama-mmap.h"
-#include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
 
@@ -121,6 +120,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16B_A1B:       return "16B.A1B";
         case LLM_TYPE_21B_A3B:       return "21B.A3B";
         case LLM_TYPE_30B_A3B:       return "30B.A3B";
+        case LLM_TYPE_80B_A3B:       return "80B.A3B";
         case LLM_TYPE_100B_A6B:      return "100B.A6B";
         case LLM_TYPE_106B_A12B:     return "106B.A12B";
         case LLM_TYPE_230B_A10B:     return "230B.A10B";
@@ -424,8 +424,8 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
 }
 
 struct llama_model::impl {
-    impl() {}
-    ~impl() {}
+    impl() = default;
+    ~impl() = default;
 
     uint64_t n_elements = 0;
 
@@ -462,7 +462,7 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
     pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }
 
-llama_model::~llama_model() {}
+llama_model::~llama_model() = default;
 
 void llama_model::load_stats(llama_model_loader & ml) {
     pimpl->n_elements = ml.n_elements;
@@ -664,8 +664,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
                     hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
                 } else {
-                    hparams.swa_type      = LLAMA_SWA_TYPE_CHUNKED;
-                    hparams.n_swa         = 8192;
+                    hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
+                    hparams.n_swa                   = 8192;
+                    hparams.n_attn_temp_floor_scale = 8192;
+                    hparams.f_attn_temp_scale       = 0.1f;
                     hparams.set_swa_pattern(4);   // pattern: 3 chunked - 1 full
                 }
 
@@ -1036,6 +1038,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_RND1:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+                // Set non-causal attention for diffusion models
+                hparams.causal_attn = false;
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
@@ -1251,18 +1265,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GEMMA3:
             {
-                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                hparams.set_swa_pattern(6);
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    hparams.set_swa_pattern(6);
 
-                hparams.rope_freq_base_train_swa  = 10000.0f;
-                hparams.rope_freq_scale_train_swa = 1.0f;
+                    hparams.rope_freq_base_train_swa  = 10000.0f;
+                    hparams.rope_freq_scale_train_swa = 1.0f;
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
 
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+                hparams.f_final_logit_softcapping = 0.0f;
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
                     case 18: type = LLM_TYPE_270M; break;
                     case 26: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_8B; break; // Rnj-1
                     case 34: type = LLM_TYPE_4B; break;
                     case 48: type = LLM_TYPE_12B; break;
                     case 62: type = LLM_TYPE_27B; break;
@@ -1586,14 +1607,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
 
-                switch (hparams.n_layer) {
-                    case 28: type = LLM_TYPE_20B; break;
+                switch (hparams.n_ff_exp) {
+                    case 1408: type = LLM_TYPE_16B; break;
+                    case 1792: type = LLM_TYPE_20B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
         case LLM_ARCH_DEEPSEEK2:
             {
-                bool is_lite = (hparams.n_layer == 27);
+                // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
+                bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
                 if (!is_lite) {
@@ -1614,6 +1637,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
 
+                // (optional) temperature tuning - used by mistral-large
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE,  hparams.f_attn_temp_scale,       false);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
+
                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
                     case 60: type = LLM_TYPE_236B; break;
@@ -2212,6 +2239,65 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
+
+                // Load linear attention (gated delta net) parameters
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
+
+                // Mark recurrent layers (linear attention layers)
+                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                    hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
+                }
+
+                switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_80B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,     hparams.rope_yarn_log_mul, false);
+
+                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
+                if (hparams.f_attn_temp_scale != 0.0f) {
+                    hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
+                    if (hparams.n_attn_temp_floor_scale == 0) {
+                        throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
+                    }
+                }
+
+                // TODO: this seems to be correct for the case where mscale == mscale_all_dims == 1.0f
+                //       but may need further verification with other values
+                if (hparams.rope_yarn_log_mul != 0.0f) {
+                    float factor = 1.0f / hparams.rope_freq_scale_train;
+                    float mscale = 1.0f;
+                    float mscale_all_dims = hparams.rope_yarn_log_mul;
+                    static auto get_mscale = [](float scale, float mscale) {
+                        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+                    };
+                    hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+                }
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_3B; break;
+                    case 34: type = LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
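
Two notes on the new architecture branches above, restated only from the code shown here: the QWEN3NEXT branch marks every layer as recurrent except each fourth one ((i + 1) % 4 == 0, i.e. layers 3, 7, 11, ... keep full attention), and the MISTRAL3 branch derives yarn_attn_factor as a ratio of two mscale terms. A standalone sketch of that second computation (names are local to the sketch):

#include <cmath>

// restatement of the yarn_attn_factor arithmetic from the LLM_ARCH_MISTRAL3 branch above
static float get_mscale(float scale, float mscale) {
    return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
}

float yarn_attn_factor(float rope_freq_scale_train, float rope_yarn_log_mul) {
    const float factor          = 1.0f / rope_freq_scale_train; // context-extension factor
    const float mscale          = 1.0f;
    const float mscale_all_dims = rope_yarn_log_mul;
    return get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
}
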
 
@@ -2525,6 +2611,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_MINICPM:
             case LLM_ARCH_GRANITE:
             case LLM_ARCH_GRANITE_MOE:
+            case LLM_ARCH_MISTRAL3:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -3401,6 +3488,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_QWEN3MOE:
             case LLM_ARCH_QWEN3VLMOE:
+            case LLM_ARCH_RND1:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -4581,7 +4669,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_DEEPSEEK2:
                 {
-                    const bool is_lite = (hparams.n_layer == 27);
+                    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
+                    const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
 
                     const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
 
@@ -6118,9 +6207,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_LFM2:
             case LLM_ARCH_LFM2MOE:
                 {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
-                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
-                    output   = create_tensor(tn(LLM_TENSOR_OUTPUT,          "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
 
                     if (output == NULL) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6399,6 +6489,74 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                     }
                 } break;
+            case LLM_ARCH_QWEN3NEXT:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                    }
+
+                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                    // Calculate dimensions from hyperparameters
+                    const int64_t head_k_dim = hparams.ssm_d_state;
+                    const int64_t head_v_dim = hparams.ssm_d_state;
+                    const int64_t n_k_heads  = hparams.ssm_n_group;
+                    const int64_t n_v_heads  = hparams.ssm_dt_rank;
+                    const int64_t key_dim    = head_k_dim * n_k_heads;
+                    const int64_t value_dim  = head_v_dim * n_v_heads;
+                    const int64_t conv_dim   = key_dim * 2 + value_dim;
+
+                    // Calculate projection sizes
+                    const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;
+                    const int64_t ba_dim   = n_v_heads * 2;
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+                        if (!hparams.is_recurrent(i)) {
+                            // Attention layers
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_k_gqa }, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_v_gqa }, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+                            // Q/K normalization for attention layers
+                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+                        } else {
+                            // Linear attention (gated delta net) specific tensors
+                            // Create tensors with calculated dimensions
+                            layer.ssm_in         = create_tensor(tn(LLM_TENSOR_SSM_IN,         "weight", i), { n_embd, qkvz_dim }, 0);
+                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
+                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
+                            layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
+                            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
+                            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
+                        }
+
+                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+
+                        // Shared experts
+                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
+                        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP,     "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+                        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,       "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+                        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
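
All of the QWEN3NEXT tensor shapes above are derived from a handful of SSM hyperparameters. A small sketch of the same arithmetic with hypothetical values (the real numbers come from the model's GGUF metadata at load time):

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t head_k_dim = 128; // hparams.ssm_d_state (hypothetical value)
    const int64_t head_v_dim = 128; // hparams.ssm_d_state (hypothetical value)
    const int64_t n_k_heads  = 16;  // hparams.ssm_n_group (hypothetical value)
    const int64_t n_v_heads  = 32;  // hparams.ssm_dt_rank (hypothetical value)

    const int64_t key_dim   = head_k_dim * n_k_heads;      // 2048
    const int64_t value_dim = head_v_dim * n_v_heads;      // 4096
    const int64_t conv_dim  = key_dim * 2 + value_dim;     // 8192  -> rows of ssm_conv1d
    const int64_t qkvz_dim  = key_dim * 2 + value_dim * 2; // 12288 -> columns of ssm_in
    const int64_t ba_dim    = n_v_heads * 2;               // 64    -> columns of ssm_beta_alpha

    printf("conv_dim=%lld qkvz_dim=%lld ba_dim=%lld\n",
           (long long) conv_dim, (long long) qkvz_dim, (long long) ba_dim);
    return 0;
}
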
@@ -6669,6 +6827,7 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_FALCON_H1 ||
         arch == LLM_ARCH_PLAMO2 ||
         arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_QWEN3NEXT ||
         arch == LLM_ARCH_NEMOTRON_H) {
         LLAMA_LOG_INFO("%s: ssm_d_conv       = %u\n",     __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner      = %u\n",     __func__, hparams.ssm_d_inner);
@@ -6718,7 +6877,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_shexp       = %d\n",     __func__, hparams.n_ff_shexp);
     }
 
-    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE) {
+    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
         LLAMA_LOG_INFO("%s: n_ff_exp         = %d\n",     __func__, hparams.n_ff_exp);
     }
 
@@ -6880,6 +7039,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
             {
                 res = nullptr;
             } break;
@@ -7073,6 +7233,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                 llm = std::make_unique<llm_build_llada_moe>(*this, params);
             }
             break;
+        case LLM_ARCH_RND1:
+            {
+                llm = std::make_unique<llm_build_rnd1>(*this, params);
+            }
+            break;
         case LLM_ARCH_QWEN2VL:
             {
                 llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -7148,7 +7313,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_GEMMA3:
             {
-                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
+                }
             } break;
         case LLM_ARCH_GEMMA3N:
             {
@@ -7404,7 +7573,15 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
         case LLM_ARCH_PANGU_EMBED:
             {
                 llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
-            }break;
+            } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                llm = std::make_unique<llm_build_qwen3next>(*this, params);
+            } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                llm = std::make_unique<llm_build_mistral3>(*this, params);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -7573,6 +7750,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
+        case LLM_ARCH_MISTRAL3:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
@@ -7593,6 +7771,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
@@ -7630,6 +7809,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_COGVLM:
         case LLM_ARCH_PANGU_EMBED:
         case LLM_ARCH_AFMOE:
+        case LLM_ARCH_QWEN3NEXT:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
@@ -7665,6 +7845,24 @@ int32_t llama_model_meta_count(const llama_model * model) {
     return (int)model->gguf_kv.size();
 }
 
+const char * llama_model_meta_key_str(llama_model_meta_key key) {
+    switch (key) {
+        case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE:        return "general.sampling.sequence";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K:           return "general.sampling.top_k";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P:           return "general.sampling.top_p";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P:           return "general.sampling.min_p";
+        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
+        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD:   return "general.sampling.xtc_threshold";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TEMP:            return "general.sampling.temp";
+        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N:  return "general.sampling.penalty_last_n";
+        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT:  return "general.sampling.penalty_repeat";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT:        return "general.sampling.mirostat";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU:    return "general.sampling.mirostat_tau";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA:    return "general.sampling.mirostat_eta";
+        default:                                            return nullptr;
+    }
+}
+
 int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
         if (buf_size > 0) {
index f730c49540cfeec82b446c65a222e7b0590aa229..f8342cf2cb136ad68575b02db250402ad1b40504 100644 (file)
@@ -113,6 +113,7 @@ enum llm_type {
     LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_230B_A10B, // Minimax M2
@@ -309,6 +310,9 @@ struct llama_layer {
     struct ggml_tensor * ssm_conv1d_b = nullptr;
     struct ggml_tensor * ssm_dt_b     = nullptr;
 
+    // qwen3next
+    struct ggml_tensor * ssm_beta_alpha = nullptr;
+
     // rwkv
     struct ggml_tensor * time_mix_w1         = nullptr;
     struct ggml_tensor * time_mix_w2         = nullptr;
index a56b2626ae1c5f872ec8cc2a4462f98d63e91e26..351dcb7baaae16de68aff7bf27119d6eb8e9a2eb 100644 (file)
@@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     std::map<int, std::string> mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;
 
     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
@@ -674,14 +673,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                    pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
-        } else if (remapped_name != it.first) {
+        }
+
+        if (remapped_name != it.first) {
             ggml_set_name(it.second.tensor, remapped_name.c_str());
             LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
         }
@@ -701,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -715,26 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
-        // attention layers have a non-zero number of kv heads
-        int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
-        if (llama_model_has_encoder(&model)) {
-            // now n_attn_layer is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_attn_layer += 2 * model.hparams.dec_n_layer;
-        }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
index adb3f8810ed33bbbaeeaa890bb5f0629cfb8f35b..3f4a729bc36c7d81a4d33b661755ae1e7da587f4 100644 (file)
@@ -472,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
     for (auto * smpl : chain->samplers) {
         llama_sampler_reset(smpl);
     }
-
-    chain->t_sample_us = 0;
-    chain->n_sample    = 0;
 }
 
 static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
@@ -2670,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c
 void llama_perf_sampler_print(const struct llama_sampler * chain) {
     const auto data = llama_perf_sampler(chain);
 
-    LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+    LLAMA_LOG_INFO("%s:    samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
 }
 
 void llama_perf_sampler_reset(struct llama_sampler * chain) {
@@ -2681,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) {
 
     auto * ctx = (struct llama_sampler_chain *) chain->ctx;
 
-    ctx->t_sample_us = ctx->n_sample = 0;
+    ctx->t_sample_us = 0;
+    ctx->n_sample    = 0;
 }
index 29e31cecd1565c804972a79286e3c0bb207d9cfe..e2cca66e48fe7f75620f1528d030f222ba6946b6 100644 (file)
@@ -1281,6 +1281,7 @@ struct llm_tokenizer_plamo2 : llm_tokenizer {
 
         // Build suffix list in lexicographical order of reversed strings
         std::vector<std::string> suffixes;
+        suffixes.reserve(suffix_to_score.size() + 1);
         for (const auto & pair : suffix_to_score) {
             suffixes.push_back(pair.first);
         }
@@ -3252,8 +3253,7 @@ void llama_vocab::impl::print_info() const {
 llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
 }
 
-llama_vocab::~llama_vocab() {
-}
+llama_vocab::~llama_vocab() = default;
 
 void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     pimpl->load(ml, kv);
index 8547226ff210cb08ae931be7d152ab06ad8e3443..b52eaacfa7e839e2767b29f2670c4bc885d0675e 100644 (file)
@@ -246,6 +246,21 @@ extern "C" {
         LLAMA_KV_OVERRIDE_TYPE_STR,
     };
 
+    enum llama_model_meta_key {
+        LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
+        LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
+        LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
+        LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
+        LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
+        LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
+        LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
+        LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
+        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
+    };
+
     struct llama_model_kv_override {
         enum llama_model_kv_override_type tag;
 
@@ -518,6 +533,9 @@ extern "C" {
     // Get the number of metadata key/value pairs
     LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
 
+    // Get sampling metadata key name. Returns nullptr if the key is invalid
+    LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
+
     // Get metadata key name by index
     LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
 
index 68f72f72bb643157f7da1817a7493cc7469dada7..dbaa8297be926d7f398c0c2915f99d458613383d 100644 (file)
@@ -4,7 +4,8 @@
 
 llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
-    bool is_lite = (hparams.n_layer == 27);
+    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
+    bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
 
     const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
 
@@ -29,6 +30,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
     // {n_embd, n_tokens}
     inpL = build_inp_embd(model.tok_embd);
 
+    // (optional) temperature tuning - used by mistral-large
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
@@ -127,6 +134,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                 ggml_tensor * Vcur = kv_cmpr;
                 cb(Vcur, "Vcur", il);
 
+                if (inp_attn_scale) {
+                    // apply llama 4 temperature scaling
+                    Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                    cb(Qcur, "Qcur_attn_temp_scaled", il);
+                }
+
                 // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
@@ -159,6 +172,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                 ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
                 cb(Kcur, "Kcur", il);
 
+                if (inp_attn_scale) {
+                    // apply llama 4 temperature scaling
+                    Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                    cb(Qcur, "Qcur_attn_temp_scaled", il);
+                }
+
                 // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
                 cur = build_attn(inp_attn,
                             model.layers[il].wo, NULL,
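
The new inp_attn_scale input above is multiplied element-wise into Qcur before attention whenever f_attn_temp_scale is non-zero. The authoritative per-position formula lives in build_inp_attn_scale() in llama-graph.cpp, which is not part of this diff; as an assumption, the commonly used llama-4-style scale looks roughly like this:

#include <cmath>
#include <cstdint>

// assumed shape of the per-position attention temperature scale; treat this as a sketch,
// not a verified copy of build_inp_attn_scale()
float attn_temp_scale(int64_t pos, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) {
    return logf(floorf((pos + 1.0f) / (float) n_attn_temp_floor_scale) + 1.0f) * f_attn_temp_scale + 1.0f;
}
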
diff --git a/examples/talk-llama/models/gemma3-iswa.cpp b/examples/talk-llama/models/gemma3-iswa.cpp
deleted file mode 100644 (file)
index 839ff6d..0000000
+++ /dev/null
@@ -1,131 +0,0 @@
-#include "models.h"
-
-llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_k;
-
-    ggml_tensor * cur;
-    ggml_tensor * inpL;
-
-    inpL = build_inp_embd(model.tok_embd);
-
-    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    if (ubatch.token) {
-        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
-        cb(inpL, "inp_scaled", -1);
-    }
-    // inp_pos - contains the positions
-    ggml_tensor * inp_pos = build_inp_pos();
-
-    // TODO: is causal == true correct? might need some changes
-    auto * inp_attn = build_attn_inp_kv_iswa();
-
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
-        // norm
-        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "attn_norm", il);
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            cb(Qcur, "Qcur", il);
-
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            cb(Kcur, "Kcur", il);
-
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            cb(Vcur, "Vcur", il);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-            cb(Qcur, "Qcur_normed", il);
-
-            Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
-
-            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-            cb(Kcur, "Kcur_normed", il);
-
-            Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
-
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
-            // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
-            Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
-
-            cur = build_attn(inp_attn,
-                    model.layers[il].wo, NULL,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
-        }
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
-            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
-        }
-        cur = build_norm(cur,
-                model.layers[il].attn_post_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "attn_post_norm", il);
-
-        ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
-        cb(sa_out, "sa_out", il);
-
-        cur = build_norm(sa_out,
-                model.layers[il].ffn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "ffn_norm", il);
-
-        // feed-forward network
-        {
-            cur = build_ffn(cur,
-                    model.layers[il].ffn_up,   NULL, NULL,
-                    model.layers[il].ffn_gate, NULL, NULL,
-                    model.layers[il].ffn_down, NULL, NULL,
-                    NULL,
-                    LLM_FFN_GELU, LLM_FFN_PAR, il);
-            cb(cur, "ffn_out", il);
-        }
-        cur = build_norm(cur,
-                model.layers[il].ffn_post_norm, NULL,
-                LLM_NORM_RMS, -1);
-        cb(cur, "ffn_post_norm", -1);
-
-        cur = ggml_add(ctx0, cur, sa_out);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
-
-        // input for next layer
-        inpL = cur;
-    }
-    cur = inpL;
-
-    cur = build_norm(cur,
-            model.output_norm, NULL,
-            LLM_NORM_RMS, -1);
-
-    cb(cur, "result_norm", -1);
-    res->t_embd = cur;
-
-    // lm_head
-    cur = build_lora_mm(model.output, cur);
-
-    cb(cur, "result_output", -1);
-    res->t_logits = cur;
-
-    ggml_build_forward_expand(gf, cur);
-}
diff --git a/examples/talk-llama/models/gemma3.cpp b/examples/talk-llama/models/gemma3.cpp
new file mode 100644 (file)
index 0000000..ae60ef4
--- /dev/null
@@ -0,0 +1,156 @@
+#include "models.h"
+
+template <bool iswa>
+llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_k;
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+    }
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // TODO: is causal == true correct? might need some changes
+    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    inp_attn_type * inp_attn = nullptr;
+
+    if constexpr (iswa) {
+        inp_attn = build_attn_inp_kv_iswa();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        float freq_base_l  = 0.0f;
+        float freq_scale_l = 0.0f;
+
+        if constexpr (iswa) {
+            freq_base_l  = model.get_rope_freq_base (cparams, il);
+            freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        } else {
+            freq_base_l  = freq_base;
+            freq_scale_l = freq_scale;
+        }
+
+        // norm
+        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_normed", il);
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
+            Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+        }
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+        }
+        cur = build_norm(cur,
+                model.layers[il].attn_post_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_post_norm", il);
+
+        ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+        cb(sa_out, "sa_out", il);
+
+        cur = build_norm(sa_out,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        // feed-forward network
+        {
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_GELU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        }
+        cur = build_norm(cur,
+                model.layers[il].ffn_post_norm, NULL,
+                LLM_NORM_RMS, -1);
+        cb(cur, "ffn_post_norm", il);
+
+        cur = ggml_add(ctx0, cur, sa_out);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    if (hparams.f_final_logit_softcapping) {
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+    }
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+template struct llm_build_gemma3<false>;
+template struct llm_build_gemma3<true>;
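
When f_final_logit_softcapping is non-zero, the three ggml ops at the end of the new gemma3 graph implement a soft cap on the logits; the scalar equivalent, taken directly from the code above, is:

#include <cmath>

// scalar equivalent of the final-logit soft-capping above: bounds each logit to (-cap, cap)
float softcap(float x, float cap) {
    return cap * tanhf(x / cap);
}
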
index ca06bacd7bcb81a207879255b0a57a0aba6f90d2..7f805d78795a9848bf75460f38f4843eda6ae953 100644 (file)
@@ -9,6 +9,8 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params
     ggml_tensor * cur = build_inp_embd(model.tok_embd);
     cb(cur, "model.embed_tokens", -1);
 
+    ggml_build_forward_expand(gf, cur);
+
     ggml_tensor * inp_pos     = build_inp_pos();
     auto *        inp_hybrid  = build_inp_mem_hybrid();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -40,12 +42,12 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params
         cur = ggml_add(ctx0, cur, ffn_out);
     }
 
-    cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
-    cb(cur, "model.embedding_norm", -1);
+    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
     res->t_embd = cur;
 
     cur = build_lora_mm(model.output, cur);
-    cb(cur, "lm_head", -1);
+    cb(cur, "result_output", -1);
 
     res->t_logits = cur;
 
diff --git a/examples/talk-llama/models/mistral3.cpp b/examples/talk-llama/models/mistral3.cpp
new file mode 100644 (file)
index 0000000..0b67223
--- /dev/null
@@ -0,0 +1,160 @@
+#include "models.h"
+
+llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // (optional) temperature tuning
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network (non-MoE)
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
index 4d7aeb4f42caa3ea2d5744c2b851ea1c55bb74c2..6494f5450181873f9015be5e38a09e60fe66c0f1 100644 (file)
@@ -2,8 +2,9 @@
 
 #include "../llama-model.h"
 #include "../llama-graph.h"
-#include "../llama-memory-recurrent.h"
 
+// TODO: remove in follow-up PR - move to .cpp files
+#include "../llama-memory-recurrent.h"
 #include <cmath>
 
 struct llm_graph_context_mamba : public llm_graph_context {
@@ -178,8 +179,9 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
     llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
 };
 
-struct llm_build_gemma3_iswa : public llm_graph_context {
-    llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params);
+template <bool iswa>
+struct llm_build_gemma3 : public llm_graph_context {
+    llm_build_gemma3(const llama_model & model, const llm_graph_params & params);
 };
 
 struct llm_build_gemma3n_iswa : public llm_graph_context {
@@ -321,6 +323,10 @@ struct llm_build_minimax_m2 : public llm_graph_context {
     llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_mistral3 : public llm_graph_context {
+    llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_mpt : public llm_graph_context {
     llm_build_mpt(const llama_model & model, const llm_graph_params & params);
 };
@@ -421,7 +427,56 @@ struct llm_build_qwen3vl : public llm_graph_context {
 struct llm_build_qwen3vlmoe : public llm_graph_context {
     llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
 };
+struct llm_build_qwen3next : public llm_graph_context_mamba {
+    llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
+private:
+    ggml_tensor * build_layer_attn(
+    llm_graph_input_attn_kv * inp_attn,
+                ggml_tensor * cur,
+                ggml_tensor * inp_pos,
+                        int   il);
+
+    ggml_tensor * build_layer_attn_linear(
+         llm_graph_input_rs * inp,
+                ggml_tensor * cur,
+                ggml_tensor * causal_mask,
+                ggml_tensor * identity,
+                        int   il);
 
+    ggml_tensor * build_layer_ffn(
+                ggml_tensor * cur,
+                        int   il);
+
+    ggml_tensor * build_delta_net_recurrent(
+                ggml_tensor * q,
+                ggml_tensor * k,
+                ggml_tensor * v,
+                ggml_tensor * g,
+                ggml_tensor * beta,
+                ggml_tensor * state,
+                ggml_tensor * causal_mask,
+                ggml_tensor * identity,
+                        int   il);
+
+    ggml_tensor * build_delta_net_chunking(
+                ggml_tensor * q,
+                ggml_tensor * k,
+                ggml_tensor * v,
+                ggml_tensor * g,
+                ggml_tensor * beta,
+                ggml_tensor * state,
+                ggml_tensor * causal_mask,
+                ggml_tensor * identity,
+                        int   il);
+
+    ggml_tensor * build_norm_gated(
+                ggml_tensor * input,
+                ggml_tensor * weights,
+                ggml_tensor * gate,
+                        int   layer);
+
+    const llama_model & model;
+};
 
 struct llm_build_qwen : public llm_graph_context {
     llm_build_qwen(const llama_model & model, const llm_graph_params & params);
@@ -431,6 +486,10 @@ struct llm_build_refact : public llm_graph_context {
     llm_build_refact(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_rnd1 : public llm_graph_context {
+    llm_build_rnd1(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_rwkv6 : public llm_build_rwkv6_base {
     llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
 };
diff --git a/examples/talk-llama/models/qwen3next.cpp b/examples/talk-llama/models/qwen3next.cpp
new file mode 100644 (file)
index 0000000..c8f1b5e
--- /dev/null
@@ -0,0 +1,1042 @@
+#include "ggml.h"
+#include "models.h"
+
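+// chunk length for the chunked delta-net path; ubatches longer than this are processed chunk-by-chunk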
+#define CHUNK_SIZE 64
+
+llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context_mamba(params), model(model) {
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+    cb(inpL, "model.embed_tokens", -1);
+
+    auto * inp = build_inp_mem_hybrid();
+
+    ggml_tensor * inp_pos     = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
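+    // helper masks shared by all linear-attention (delta net) layers:
+    // a lower-triangular matrix of ones and an identity matrix, both n_seq_tokens x n_seq_tokens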
+    ggml_tensor * causal_mask =
+        ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f),
+                    GGML_TRI_TYPE_LOWER);
+
+    ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f));
+
+    ggml_build_forward_expand(gf, causal_mask);
+    ggml_build_forward_expand(gf, identity);
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // Determine layer type and build appropriate attention mechanism
+        if (hparams.is_recurrent(il)) {
+            // Linear attention layer (gated delta net)
+            cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, il);
+        } else {
+            // Full attention layer
+            cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);
+        }
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        // Residual connection
+        cur = ggml_add(ctx0, cur, inpSA);
+        cb(cur, "attn_residual", il);
+
+        // Save the tensor before post-attention norm for residual connection
+        ggml_tensor * ffn_residual = cur;
+
+        // Post-attention norm
+        ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il);
+        cb(attn_post_norm, "attn_post_norm", il);
+
+        // FFN layer (MoE or dense) - without residual connection
+        cur = build_layer_ffn(attn_post_norm, il);
+        cb(cur, "ffn_out", il);
+
+        // Residual connection for FFN - add to the tensor from before post_attention_layernorm
+        cur = ggml_add(ctx0, cur, ffn_residual);
+        cb(cur, "post_moe", il);
+
+        // Input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    // Final norm
+    cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // LM head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
+        ggml_tensor * q,
+        ggml_tensor * k,
+        ggml_tensor * v,
+        ggml_tensor * g,
+        ggml_tensor * beta,
+        ggml_tensor * state,
+        ggml_tensor * causal_mask,
+        ggml_tensor * identity,
+        int           il) {
+    GGML_ASSERT(ggml_is_contiguous(q));
+    GGML_ASSERT(ggml_is_contiguous(k));
+    GGML_ASSERT(ggml_is_contiguous(v));
+    GGML_ASSERT(ggml_is_contiguous(g));
+    GGML_ASSERT(ggml_is_contiguous(beta));
+    GGML_ASSERT(ggml_is_contiguous(state));
+
+    const int64_t S_k      = q->ne[0];
+    const int64_t H_k      = q->ne[1];
+    const int64_t n_tokens = q->ne[2];
+    const int64_t n_seqs   = q->ne[3];
+
+    const int64_t S_v = v->ne[0];
+    const int64_t H_v = v->ne[1];
+
+    GGML_ASSERT(v->ne[2] == n_tokens);
+    GGML_ASSERT(k->ne[2] == n_tokens);
+    GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
+    GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
+    GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
+
+    GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+    GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+
+    GGML_ASSERT(H_k == H_v);  // we did a repeat to make sure this is the case
+
+    // TODO: can this ever be false?
+    const bool use_qk_l2norm = true;
+
+    if (use_qk_l2norm) {
+        const float eps_norm = hparams.f_norm_rms_eps;
+
+        q = ggml_l2_norm(ctx0, q, eps_norm);
+        k = ggml_l2_norm(ctx0, k, eps_norm);
+    }
+
+    const float scale = 1.0f / sqrtf(S_v);
+
+    q = ggml_scale(ctx0, q, scale);
+
+    beta = ggml_sigmoid(ctx0, beta);
+
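+    // lower-triangular mask that also includes the diagonal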
+    ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);
+
+    cb(q, "q_in", il);
+    cb(k, "k_in", il);
+    cb(v, "v_in", il);
+    cb(beta, "beta_in", il);
+    cb(g, "g_in", il);
+
+    q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+    k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+    v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+    g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
+
+    beta  = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
+    state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
+
+    cb(q, "q_perm", il);
+    cb(k, "k_perm", il);
+    cb(v, "v_perm", il);
+    cb(beta, "beta_perm", il);
+    cb(g, "g_perm", il);
+    cb(state, "state_in", il);
+
+    GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
+    GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
+    GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
+    GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
+
+    // Pad the token dimension up to a multiple of the chunk size
+    const int64_t chunk_size = CHUNK_SIZE;
+
+    const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
+    const int64_t n_chunks = (n_tokens + pad) / chunk_size;
+
+    q = ggml_pad(ctx0, q, 0, pad, 0, 0);
+    k = ggml_pad(ctx0, k, 0, pad, 0, 0);
+    v = ggml_pad(ctx0, v, 0, pad, 0, 0);
+    g = ggml_pad(ctx0, g, pad, 0, 0, 0);
+    beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
+
+    cb(q, "q_pad", il);
+    cb(k, "k_pad", il);
+    cb(v, "v_pad", il);
+    cb(beta, "beta_pad", il);
+    cb(g, "g_pad", il);
+
+    ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
+    ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
+
+    cb(v_beta, "v_beta", il);
+    cb(k_beta, "k_beta", il);
+
+    ggml_tensor * chunked_mask =
+        ggml_view_4d(ctx0, causal_mask, chunk_size,
+                chunk_size,         causal_mask->ne[2], causal_mask->ne[3],
+                causal_mask->nb[1], causal_mask->nb[2], causal_mask->nb[3], 0);
+
+    ggml_tensor * chunked_diag_mask =
+        ggml_view_4d(ctx0, causal_diag_mask, chunk_size,
+                chunk_size,              causal_diag_mask->ne[2], causal_diag_mask->ne[3],
+                causal_diag_mask->nb[1], causal_diag_mask->nb[2], causal_diag_mask->nb[3], 0);
+
+    ggml_tensor * chunked_identity =
+        ggml_view_4d(ctx0, identity, chunk_size,
+            chunk_size,      identity->ne[2], identity->ne[3],
+            identity->nb[1], identity->nb[2], identity->nb[3], 0);
+
+    q      = ggml_cont_4d(ctx0, q,      S_k, chunk_size, n_chunks, H_k * n_seqs);
+    k      = ggml_cont_4d(ctx0, k,      S_k, chunk_size, n_chunks, H_k * n_seqs);
+    k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
+    v      = ggml_cont_4d(ctx0, v,      S_v, chunk_size, n_chunks, H_v * n_seqs);
+    v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
+
+    g    = ggml_cont_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
+    beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
+
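+    // per-chunk cumulative sum of the log-decay gates along the token dimension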
+    ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
+
+    cb(g_cumsum, "g_cumsum", il);
+
+    ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
+    ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
+
+    ggml_tensor * gcs_j_broadcast =
+        ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
+
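+    // pairwise decay factors exp(delta g_cum) between token positions, restricted to the causal (lower-triangular) region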
+    ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
+
+    cb(decay_mask, "decay_mask", il);
+
+    decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask);
+    decay_mask = ggml_exp(ctx0, decay_mask);
+    decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask);
+
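+    // attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)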
+    ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
+
+    ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
+    ggml_tensor * attn    = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, chunked_mask));
+
+    cb(attn, "attn_pre_solve", il);
+
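+    // as in the recurrent path, reduce the per-row update loop to a triangular solve:
+    // AX = B, with A = I - tril(attn) and B = attn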
+    ggml_tensor * attn_lower = ggml_mul(ctx0, attn, chunked_mask);
+    ggml_tensor * lhs        = ggml_sub(ctx0, ggml_repeat(ctx0, chunked_identity, attn_lower), attn_lower);
+
+    ggml_tensor * lin_solve  = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
+    attn                     = ggml_mul(ctx0, lin_solve, chunked_mask);
+    attn                     = ggml_add(ctx0, attn, chunked_identity);
+
+    cb(attn, "attn_solved", il);
+
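+    // value = attn @ v_beta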
+    v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
+
+    ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
+    ggml_tensor * gexp       = ggml_exp(ctx0, g_cumsum_t);
+
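+    // k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))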
+    ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
+
+    cb(kbeta_gexp, "kbeta_gexp", il);
+
+    ggml_tensor * k_cumdecay =
+        ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
+
+    cb(k_cumdecay, "k_cumdecay", il);
+
+    ggml_tensor * core_attn_out = nullptr;
+    ggml_tensor * new_state = ggml_dup(ctx0, state);
+
+    cb(new_state, "new_state", il);
+
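+    // process the sequence chunk-by-chunk, carrying the recurrent state from one chunk to the next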
+    for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
+        auto chunkify = [=](ggml_tensor * t) {
+            return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3],
+                t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
+        };
+
+        auto chunkify_g = [=](ggml_tensor * t) {
+            return ggml_cont(ctx0, ggml_view_4d(ctx0, t, chunk_size, t->ne[1], 1, t->ne[3],
+                t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk));
+        };
+
+        ggml_tensor * k_chunk = chunkify(k);
+        ggml_tensor * q_chunk = chunkify(q);
+        ggml_tensor * v_chunk = chunkify(v);
+
+        ggml_tensor * g_cs_chunk = chunkify_g(g_cumsum);
+        ggml_tensor * g_cs_chunk_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cs_chunk));
+
+        ggml_tensor * decay_mask_chunk = chunkify(decay_mask);
+        ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay);
+
+        ggml_tensor * gexp_chunk = ggml_exp(ctx0, g_cs_chunk_t);
+
+        // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
+        attn = ggml_mul_mat(ctx0, k_chunk, q_chunk);
+        attn = ggml_mul(ctx0, attn, decay_mask_chunk);
+        attn = ggml_mul(ctx0, attn, ggml_add(ctx0, chunked_identity, chunked_mask));
+
+        ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
+
+        // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
+        ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
+
+        // v_new = v_i - v_prime
+        ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
+        ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
+
+        // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
+        ggml_tensor * q_g_exp    = ggml_mul(ctx0, q_chunk, gexp_chunk);
+        ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
+
+        // core_attn_out[:, :, i] = attn_inter + attn @ v_new
+        ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn);
+
+        ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
+
+        core_attn_out = core_attn_out == nullptr ? core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1);
+
+        // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
+        // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
+        // key_gdiff = key * g_diff.unsqueeze(-1)
+        // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
+        // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
+
+        ggml_tensor * g_cum_last =
+            ggml_cont(ctx0, ggml_view_4d(ctx0, g_cs_chunk_t, g_cs_chunk_t->ne[0], 1, g_cs_chunk_t->ne[2], g_cs_chunk_t->ne[3],
+                                        g_cs_chunk_t->nb[1], g_cs_chunk_t->nb[2], g_cs_chunk_t->nb[3],
+                                        g_cs_chunk_t->nb[0] * (g_cs_chunk_t->ne[1] - 1)));
+
+        ggml_tensor * gexp_last =
+            ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]);
+
+        ggml_tensor * g_cum_last_3d =
+            ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]);
+
+        ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cs_chunk, g_cs_chunk->ne[0], g_cs_chunk->ne[2], g_cs_chunk->ne[3]);
+
+        ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d));
+
+        ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
+
+        ggml_tensor * key_gdiff = ggml_mul(ctx0, k_chunk,
+                                        ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1],
+                                                        g_diff_exp->ne[2] * g_diff_exp->ne[3]));
+
+        ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)));
+
+        new_state = ggml_add(ctx0,
+            ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last, gexp_last->ne[0], gexp_last->ne[1], H_v, n_seqs)),
+            ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
+    }
+
+    core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs);
+
+    ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, S_v, n_tokens, H_v, n_seqs, core_attn_out->nb[1], core_attn_out->nb[2], core_attn_out->nb[3], 0);
+    cb(output_tokens, "output_tokens", il);
+
+    // flatten output
+    ggml_tensor * flat_output =
+        ggml_cont_1d(ctx0, ggml_permute(ctx0, output_tokens, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs);
+
+    ggml_tensor * flat_state = ggml_cont_1d(ctx0, new_state, S_v * S_v * H_v * n_seqs);
+
+    return ggml_concat(ctx0, flat_output, flat_state, 0);
+}
+
+ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
+        ggml_tensor * q,
+        ggml_tensor * k,
+        ggml_tensor * v,
+        ggml_tensor * g,
+        ggml_tensor * beta,
+        ggml_tensor * state,
+        ggml_tensor * causal_mask,
+        ggml_tensor * identity,
+        int           il) {
+    GGML_ASSERT(ggml_is_contiguous(q));
+    GGML_ASSERT(ggml_is_contiguous(k));
+    GGML_ASSERT(ggml_is_contiguous(v));
+    GGML_ASSERT(ggml_is_contiguous(g));
+    GGML_ASSERT(ggml_is_contiguous(beta));
+    GGML_ASSERT(ggml_is_contiguous(state));
+
+    const int64_t S_k      = q->ne[0];
+    const int64_t H_k      = q->ne[1];
+    const int64_t n_tokens = q->ne[2];
+    const int64_t n_seqs   = q->ne[3];
+
+    const int64_t S_v = v->ne[0];
+    const int64_t H_v = v->ne[1];
+
+    GGML_ASSERT(v->ne[2] == n_tokens);
+    GGML_ASSERT(k->ne[2] == n_tokens);
+    GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
+    GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
+    GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
+
+    GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
+    GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+
+    GGML_ASSERT(H_k == H_v);  // we did a repeat to make sure this is the case
+
+    // TODO: can this ever be false?
+    const bool use_qk_l2norm = true;
+
+    if (use_qk_l2norm) {
+        const float eps_norm = hparams.f_norm_rms_eps;
+
+        q = ggml_l2_norm(ctx0, q, eps_norm);
+        k = ggml_l2_norm(ctx0, k, eps_norm);
+    }
+
+    const float scale = 1.0f / sqrtf(S_v);
+
+    q = ggml_scale(ctx0, q, scale);
+
+    beta = ggml_sigmoid(ctx0, beta);
+
+    ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);
+
+    cb(q, "q_in", il);
+    cb(k, "k_in", il);
+    cb(v, "v_in", il);
+    cb(beta, "beta_in", il);
+    cb(g, "g_in", il);
+
+    q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+    k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+    v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
+    g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
+
+    beta  = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
+    state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
+
+    cb(q, "q_perm", il);
+    cb(k, "k_perm", il);
+    cb(v, "v_perm", il);
+    cb(beta, "beta_perm", il);
+    cb(g, "g_perm", il);
+    cb(state, "state_in", il);
+
+    GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
+    GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
+    GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
+    GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
+
+    ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
+    ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
+
+    ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
+
+    cb(k_beta, "k_beta", il);
+    cb(v_beta, "v_beta", il);
+    cb(g_cumsum, "g_cumsum", il);
+
+    ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, n_tokens, 1, H_v, n_seqs);  // [n_tokens, 1, H_v, n_seqs]
+    ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, n_tokens, H_v, n_seqs);  // [1, n_tokens, H_v, n_seqs]
+
+    // Broadcast gcs_j to [n_tokens, n_tokens, H_v, n_seqs];
+    // gcs_i needs no explicit repeat because ggml_sub broadcasts it automatically
+    ggml_tensor * gcs_j_broadcast =
+        ggml_repeat_4d(ctx0, gcs_j, n_tokens, n_tokens, H_v, n_seqs);  // [1, n_tokens, H_v, n_seqs] -> [n_tokens, n_tokens, H_v, n_seqs]
+
+    ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
+
+    // Apply lower triangular mask to ensure attention is causal (only past tokens influence current)
+    decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
+    // Apply exponential to get the decay mask values
+    decay_mask = ggml_exp(ctx0, decay_mask);
+    // Apply lower triangular mask again to ensure only lower triangular values remain
+    decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
+
+    cb(decay_mask, "decay_mask", il);
+
+    // attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
+    ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
+
+    cb(kmulkbeta, "kmulkbeta", il);
+
+    ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
+    ggml_tensor * attn    = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
+
+    cb(attn, "attn_pre_rec", il);
+
+    // for i in range(1, chunk_size):
+    //          row = attn[..., i, :i].clone()
+    //          sub = attn[..., :i, :i].clone()
+    //          attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
+    // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
+    //
+    // We reduce this to a linear triangular solve: AX = B, where B = attn and A = I - tril(attn)
+    ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
+    ggml_tensor * lhs        = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
+
+    ggml_tensor * lin_solve  = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
+    attn                     = ggml_mul(ctx0, lin_solve, causal_mask);
+    attn                     = ggml_add(ctx0, attn, identity);
+
+    // value = attn @ v_beta
+    v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
+
+    cb(v, "value_beta", il);
+
+    // k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
+    ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
+    ggml_tensor * gexp       = ggml_exp(ctx0, g_cumsum_t);
+
+    cb(gexp, "g_cum_exp", il);
+
+    ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
+
+    cb(kbeta_gexp, "kbeta_gexp", il);
+
+    ggml_tensor * k_cumdecay =
+        ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
+
+    cb(k_cumdecay, "k_cumdecay", il);
+
+    // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
+    attn = ggml_mul_mat(ctx0, k, q);
+    attn = ggml_mul(ctx0, attn, decay_mask);
+    attn = ggml_mul(ctx0, attn, ggml_add(ctx0, identity, causal_mask));
+
+    cb(attn, "attn_decay_key", il);
+
+    ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));
+
+    // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
+    ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay);
+
+    cb(v_prime, "v_prime", il);
+
+    // v_new = v_i - v_prime
+    ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v, v_prime), v_prime);
+
+    ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
+
+    cb(v_new, "v_new", il);
+
+    // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
+    ggml_tensor * q_g_exp    = ggml_mul(ctx0, q, gexp);
+    ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
+
+    cb(attn_inter, "attn_inter", il);
+
+    // core_attn_out[:, :, i] = attn_inter + attn @ v_new
+    ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn);
+
+    cb(v_attn, "v_attn", il);
+
+    ggml_tensor * core_attn_out = ggml_add(ctx0, attn_inter, v_attn);
+
+    cb(core_attn_out, "core_attn_out", il);
+
+    // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
+    // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
+    // key_gdiff = key * g_diff.unsqueeze(-1)
+    // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
+    // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
+
+    ggml_tensor * g_cum_last =
+        ggml_cont(ctx0, ggml_view_4d(ctx0, g_cumsum_t, g_cumsum_t->ne[0], 1, g_cumsum_t->ne[2], g_cumsum_t->ne[3],
+                                    g_cumsum_t->nb[1], g_cumsum_t->nb[2], g_cumsum_t->nb[3],
+                                    g_cumsum_t->nb[0] * (g_cumsum_t->ne[1] - 1)));
+
+    cb(g_cum_last, "g_cum_last", il);
+
+    ggml_tensor * gexp_last =
+        ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]);
+
+    cb(gexp_last, "gexp_last", il);
+
+    ggml_tensor * g_cum_last_3d =
+        ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]);
+
+    cb(g_cum_last_3d, "g_cum_last_3d", il);
+
+    ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cumsum, g_cumsum->ne[0], g_cumsum->ne[2], g_cumsum->ne[3]);
+
+    cb(g_cumsum_3d, "g_cumsum_3d", il);
+
+    ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d));
+
+    cb(g_diff, "g_diff", il);
+
+    ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
+
+    cb(g_diff_exp, "g_diff_exp", il);
+
+    ggml_tensor * key_gdiff = ggml_mul(ctx0, k,
+                                    ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1],
+                                                    g_diff_exp->ne[2] * g_diff_exp->ne[3]));
+
+    cb(key_gdiff, "key_gdiff", il);
+
+    ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)));
+
+    cb(kgdmulvnew, "kgdmulvnew", il);
+
+    state = ggml_add(ctx0, ggml_mul(ctx0, state, gexp_last), kgdmulvnew);
+
+    cb(state, "new_state", il);
+
+    // flatten output
+    ggml_tensor * flat_output =
+        ggml_cont_1d(ctx0, ggml_permute(ctx0, core_attn_out, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs);
+
+    ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
+
+    return ggml_concat(ctx0, flat_output, flat_state, 0);
+}
+
+ggml_tensor * llm_build_qwen3next::build_norm_gated(
+        ggml_tensor * input,
+        ggml_tensor * weights,
+        ggml_tensor * gate,
+        int           layer) {
+    ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer);
+    ggml_tensor * gated_silu = ggml_silu(ctx0, gate);
+
+    return ggml_mul(ctx0, normalized, gated_silu);
+}
+
+ggml_tensor * llm_build_qwen3next::build_layer_attn(
+        llm_graph_input_attn_kv * inp,
+        ggml_tensor *             cur,
+        ggml_tensor *             inp_pos,
+        int                       il) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
+
+    // Qwen3Next uses a single Q projection that outputs query + gate
+    ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur);
+    cb(Qcur_full, "Qcur_full", il);
+
+    Qcur_full = ggml_reshape_4d(ctx0, Qcur_full, n_embd_head * 2, n_head, n_tokens, 1);
+
+    // Split Q projection into query and gate
+    // The split should be along dimension 0 (the feature dimension)
+    ggml_tensor * Qcur = ggml_view_4d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, 1,
+                                             Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], 0);
+    ggml_tensor * gate =
+        ggml_view_4d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, 1,
+                     Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], n_embd_head * ggml_element_size(Qcur_full));
+    cb(Qcur, "Qcur", il);
+    cb(gate, "gate", il);
+
+    // Now reshape Qcur to [n_embd_head, n_head, n_tokens] for multi-head attention
+    Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+    cb(Qcur, "Qcur_reshaped", il);
+
+    // Apply Q normalization
+    Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
+    cb(Qcur, "Qcur_normed", il);
+
+    ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+    cb(Kcur, "Kcur", il);
+
+    ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+    cb(Vcur, "Vcur", il);
+
+    // Apply K normalization
+    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+    Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
+    cb(Kcur, "Kcur_normed", il);
+
+    // Reshape gate to [n_embd, n_tokens] for the sigmoid gating (flatten the heads)
+    gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
+    cb(gate, "gate_reshaped", il);
+
+    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+    // Apply RoPE
+    Qcur = ggml_rope_ext(
+            ctx0, Qcur, inp_pos, nullptr,
+            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+            ext_factor, attn_factor, beta_fast, beta_slow);
+
+    Kcur = ggml_rope_ext(
+            ctx0, Kcur, inp_pos, nullptr,
+            n_rot, rope_type, n_ctx_orig, freq_base,
+            freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+
+    cb(Qcur, "Qcur", il);
+    cb(Kcur, "Kcur", il);
+    cb(Vcur, "Vcur", il);
+
+    // Attention computation
+    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    cur = build_attn(inp,
+                nullptr, nullptr,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+    cb(cur, "attn_pregate", il);
+
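+    // gated attention: the gate half of the Q projection is passed through a sigmoid and scales the attention output element-wise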
+    ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate);
+    cb(gate_sigmoid, "gate_sigmoid", il);
+
+    cur = ggml_mul(ctx0, cur, gate_sigmoid);
+    cb(cur, "attn_gated", il);
+
+    cur = build_lora_mm(model.layers[il].wo, cur);
+    cb(cur, "attn_output", il);
+
+    return cur;
+}
+
+ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
+        llm_graph_input_rs * inp,
+        ggml_tensor *        cur,
+        ggml_tensor *        causal_mask,
+        ggml_tensor *        identity,
+        int                  il) {
+    const auto * mctx_cur = inp->mctx;
+
+    const int64_t d_inner      = hparams.ssm_d_inner;
+    const int64_t n_seqs       = ubatch.n_seqs;
+    const int64_t head_k_dim   = hparams.ssm_d_state;
+    const int64_t num_k_heads  = hparams.ssm_n_group;
+    const int64_t num_v_heads  = hparams.ssm_dt_rank;
+    const int64_t head_v_dim   = d_inner / num_v_heads;
+    const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+    const auto kv_head = mctx_cur->get_head();
+
+    GGML_ASSERT(n_seqs != 0);
+    GGML_ASSERT(ubatch.equal_seqs());
+    GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+    // Input projections
+    ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, cur);
+    cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);
+
+    ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur);
+    cb(mixed_ba, "linear_attn_mixed_ba", il);
+
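+    // Reshape mixed_qkvz: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*head_k_dim + 2*head_v_dim*num_v_heads/num_k_heads]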
+    int64_t       qkvz_new_dim        = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
+    ggml_tensor * mixed_qkvz_reshaped = ggml_cont_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
+
+    // Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
+    int64_t       ba_new_dim        = 2 * num_v_heads / num_k_heads;
+    ggml_tensor * mixed_ba_reshaped = ggml_cont_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
+
+    // Split mixed_ba into b and a (beta and alpha parameters)
+    int64_t split_sizes_ba[2] = {
+        num_v_heads / num_k_heads,  // beta size
+        num_v_heads / num_k_heads   // alpha size
+    };
+
+    ggml_tensor * b = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[0], num_k_heads, n_seq_tokens, n_seqs,
+                                   mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], 0);
+    cb(b, "b", il);
+
+    ggml_tensor * a = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[1], num_k_heads, n_seq_tokens, n_seqs,
+                                   mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3],
+                                   split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped));
+    cb(a, "a", il);
+
+    // Reshape b and a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
+    ggml_tensor * beta  = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs);
+    ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
+
+    GGML_ASSERT(ggml_nelements(beta) + ggml_nelements(alpha) == ggml_nelements(mixed_ba));
+
+    ggml_tensor * alpha_biased   = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
+    ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
+    cb(alpha_softplus, "a_softplus", il);
+    ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a);  // -A_log.exp() * softplus
+    cb(gate, "gate", il);
+
+    // Split mixed_qkvz into query, key, value, z
+    int64_t split_sizes_qkvz[4] = {
+        head_k_dim,                              // query size
+        head_k_dim,                              // key size
+        head_v_dim * num_v_heads / num_k_heads,  // value size
+        head_v_dim * num_v_heads / num_k_heads   // z size
+    };
+
+    ggml_tensor * query =
+        ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs,
+                     mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0);
+    cb(query, "q", il);
+
+    ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs,
+                                     mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
+                                     split_sizes_qkvz[0] * sizeof(float));
+    cb(key, "k", il);
+
+    ggml_tensor * value =
+        ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs,
+                     mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
+                     (split_sizes_qkvz[0] + split_sizes_qkvz[1]) * sizeof(float));
+    cb(value, "v", il);
+
+    ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs,
+                                   mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3],
+                                   (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
+    cb(z, "z", il);
+
+    GGML_ASSERT(ggml_nelements(query) + ggml_nelements(key) + ggml_nelements(value) + ggml_nelements(z) ==
+                ggml_nelements(mixed_qkvz));
+
+    // After splitting out query, key, and value, reshape each to flatten the head dimensions
+    // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+    ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
+    cb(query_flat, "query_flat", il);
+
+    // key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+    ggml_tensor * key_flat = ggml_cont_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
+    cb(key_flat, "key_flat", il);
+
+    // value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
+    ggml_tensor * value_flat = ggml_cont_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
+    cb(value_flat, "value_flat", il);
+
+    // Get convolution states from cache
+    ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+    ggml_tensor * ssm_states_all  = mctx_cur->get_s_l(il);
+
+    // bool use_precomputed_states = n_seq_tokens == 1 && mctx_cur->has_previous_state();
+
+    // Build the convolution states tensor
+    ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+    cb(conv_states, "conv_states", il);
+
+    // Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
+    ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
+    qkv_mixed               = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
+    cb(qkv_mixed, "qkv_mixed", il);
+
+    qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
+    cb(qkv_mixed, "qkv_mixed_permuted", il);
+
+    // Calculate the total conv dimension
+    int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
+
+    // Calculate convolution kernel size
+    ggml_tensor * conv_kernel      = model.layers[il].ssm_conv1d;
+    const int64_t conv_kernel_size = conv_kernel->ne[0];
+    const int64_t conv_channels    = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
+    conv_states                    = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
+    cb(conv_states, "conv_states_reshaped", il);
+
+    ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
+    cb(conv_input, "conv_input", il);
+
+    // Update convolution state cache
+    // Extract the last (conv_kernel_size - 1) states from conv_input
+    ggml_tensor * last_conv_states =
+        ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1],
+                     conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input));
+    cb(last_conv_states, "last_conv_states", il);
+
+    ggml_tensor * state_update_target =
+        ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs,
+                     kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all));
+    cb(state_update_target, "state_update_target", il);
+
+    ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
+    cb(conv_states_all, "conv_states_updated", il);
+
+    // Apply SSM convolution
+    ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
+    cb(conv_output_proper, "conv_output_raw", il);
+
+    conv_output_proper = ggml_cont(ctx0, ggml_transpose(ctx0, conv_output_proper));
+    cb(conv_output_proper, "conv_output_pre_silu", il);
+
+    ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper);
+    cb(conv_output_silu, "conv_output_silu", il);
+
+    ggml_tensor * conv_qkv_mix =
+        ggml_cont_2d(ctx0, ggml_transpose(ctx0, conv_output_silu), qkv_dim, n_seq_tokens * n_seqs);
+    cb(conv_qkv_mix, "conv_qkv_mix", il);
+
+    // Extract the convolved Q, K, V from conv_output
+    ggml_tensor * q_conv =
+        ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1], 0);
+    cb(q_conv, "q_conv", il);
+    ggml_tensor * k_conv =
+        ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1],
+                     head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
+    cb(k_conv, "k_conv", il);
+    ggml_tensor * v_conv =
+        ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, conv_qkv_mix->nb[1],
+                     2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
+    cb(v_conv, "v_conv", il);
+
+    // Unsqueeze them
+    q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
+    k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
+    v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
+
+    beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs);
+
+    ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
+    state               = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
+    cb(state, "state_predelta", il);
+
+    // if the number of K heads and V heads differ, repeat Q and K so all tensors have matching head counts
+    if (num_k_heads != num_v_heads) {
+        GGML_ASSERT(num_v_heads % num_k_heads == 0);
+        int64_t repeat_factor = num_v_heads / num_k_heads;
+
+        // repeat interleave: reshape to (repeat part, 1, remaining part), do repeat, then reshape back
+        ggml_tensor * q_reshaped = ggml_reshape_3d(ctx0, q_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs);
+        ggml_tensor * k_reshaped = ggml_reshape_3d(ctx0, k_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs);
+
+        // Repeat along the second dimension (the newly inserted singleton dimension)
+        ggml_tensor * q_repeated =
+            ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1);
+        ggml_tensor * k_repeated =
+            ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1);
+
+        // Reshape back to merge the head and repeat dimensions
+        // From [head_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs]
+        // Back to [head_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs]
+        q_conv = ggml_reshape_4d(ctx0, q_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs);
+        k_conv = ggml_reshape_4d(ctx0, k_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs);
+    }
+
+    cb(q_conv, "q_conv_predelta", il);
+    cb(k_conv, "k_conv_predelta", il);
+    cb(v_conv, "v_conv_predelta", il);
+
+    // Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens
+    ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ?
+        build_delta_net_chunking (q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il) :
+        build_delta_net_recurrent(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il);
+    cb(attn_out, "attn_out", il);
+
+    // The tensors were concatenated 1d, so we need to extract them 1d as well
+    const int64_t output_flat_size = head_v_dim * num_v_heads * n_seq_tokens * n_seqs;
+    ggml_tensor * attn_out_1d      = ggml_view_1d(ctx0, attn_out, output_flat_size, 0);
+    cb(attn_out_1d, "attn_out_1d", il);
+
+    ggml_tensor * attn_out_final = ggml_cont_4d(ctx0, attn_out_1d, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
+    cb(attn_out_final, "attn_out_reshaped", il);
+
+    // Extract the state part (second part of the concatenated tensor)
+    // It starts right after the flattened attention output (output_flat_size elements into the concatenated 1D tensor)
+    const int64_t state_flat_size = head_v_dim * head_v_dim * num_v_heads * n_seqs;
+
+    ggml_tensor * state_1d =
+        ggml_view_1d(ctx0, attn_out, state_flat_size, output_flat_size * ggml_element_size(attn_out));
+    cb(state_1d, "state_1d", il);
+
+    // Update the recurrent states
+    ggml_build_forward_expand(gf,
+                              ggml_cpy(ctx0, state_1d,
+                                       ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
+                                                    kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
+
+    GGML_ASSERT(ggml_nelements(attn_out_1d) + ggml_nelements(state_1d) == ggml_nelements(attn_out));
+
+    // Reshape both attn_out_final and z to 2D tensors for normalization
+    // attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [head_dim, n_heads * n_tokens * n_seqs]
+    ggml_tensor * attn_out_2d_final =
+        ggml_cont_2d(ctx0, attn_out_final, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+
+    // z: [head_dim, n_heads, n_tokens, n_seqs] -> [head_dim, n_heads * n_tokens * n_seqs]
+    ggml_tensor * z_2d = ggml_cont_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
+
+    // Apply gated normalization: self.norm(core_attn_out, z)
+    ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
+
+    // Final reshape: [head_dim, n_heads * n_tokens * n_seqs] -> [n_heads * head_dim, n_tokens, n_seqs]
+    ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
+    cb(final_output, "final_output", il);
+
+    // Output projection
+    cur = build_lora_mm(model.layers[il].ssm_out, final_output);
+    cb(cur, "linear_attn_out", il);
+
+    // Reshape back to original dimensions
+    cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
+    return cur;
+}
+
+ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int il) {
+    // Check if this is an MoE layer
+    if (model.layers[il].ffn_gate_inp != nullptr) {
+        // MoE branch
+        ggml_tensor * moe_out =
+            build_moe_ffn(cur,
+                model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
+                model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+                nullptr,
+                n_expert, n_expert_used, LLM_FFN_SILU,
+                true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+        cb(moe_out, "ffn_moe_out", il);
+
+        // Add shared experts if present - following Qwen3Next reference implementation
+        if (model.layers[il].ffn_up_shexp != nullptr) {
+            ggml_tensor * ffn_shexp =
+                build_ffn(cur,
+                    model.layers[il].ffn_up_shexp, NULL, NULL,
+                    model.layers[il].ffn_gate_shexp, NULL, NULL,
+                    model.layers[il].ffn_down_shexp, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(ffn_shexp, "ffn_shexp", il);
+
+            // Apply shared expert gating as in the reference implementation
+            // The shared expert has its own gate that is sigmoided
+            // Note: ffn_gate_inp_shexp is the shared expert gate (outputs 1 value per token)
+            ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
+            cb(shared_gate, "shared_expert_gate", il);
+
+            // Apply sigmoid to the gate
+            shared_gate = ggml_sigmoid(ctx0, shared_gate);
+            cb(shared_gate, "shared_expert_gate_sigmoid", il);
+
+            // The gate needs to be broadcast to match the dimensions of ffn_shexp
+            // ffn_shexp is [n_embd, n_tokens, 1, 1] and shared_gate is [1, n_tokens, 1, 1]
+            // We need to repeat the gate along the feature dimension
+            shared_gate = ggml_repeat(ctx0, shared_gate, ffn_shexp);
+            cb(shared_gate, "shared_expert_gate_broadcast", il);
+
+            // Apply the gate to the shared expert output
+            ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
+            cb(ffn_shexp, "ffn_shexp_gated", il);
+
+            cur = ggml_add(ctx0, moe_out, ffn_shexp);
+            cb(cur, "ffn_out", il);
+        } else {
+            cur = moe_out;
+        }
+    } else {
+        // Dense FFN branch (does not appear to be used by current Qwen3Next models)
+        cur = build_ffn(cur,
+            model.layers[il].ffn_up, NULL, NULL,
+            model.layers[il].ffn_gate, NULL, NULL,
+            model.layers[il].ffn_down, NULL, NULL,
+            NULL,
+            LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+    }
+    return cur;
+}
diff --git a/examples/talk-llama/models/rnd1.cpp b/examples/talk-llama/models/rnd1.cpp
new file mode 100644 (file)
index 0000000..46b3dc3
--- /dev/null
@@ -0,0 +1,126 @@
+#include "models.h"
+
+// RND1 is a Qwen3Moe autoregressive model converted to a diffusion model.
+llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // Non-causal attention for diffusion
+    auto * inp_attn = build_attn_inp_no_cache();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self_attention
+        {
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_normed", il);
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+        }
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // MoE branch
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        ggml_tensor * moe_out =
+            build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+        cb(moe_out, "ffn_moe_out", il);
+        cur = moe_out;
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
index 77ba4fc46bc1170e52ce18370d3db3f46fc8d070..bb44edfaddffdbf159c26629a815fa8c7a4d8cd8 100644 (file)
@@ -499,7 +499,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
 // use std::wregex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    std::wregex expr(regex_expr);
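+    // nosubs: only whole-match boundaries are needed here, so skip sub-match bookkeeping; optimize: favor matching speed over construction time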
+    std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;
@@ -529,7 +529,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
 
 // use std::regex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::regex expr(regex_expr);
+    std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;