git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
model : wire up Nemotron-H tensors for NVFP4 support (#20561)
author    Sigbjørn Skjæret <redacted>
Mon, 16 Mar 2026 08:19:16 +0000 (09:19 +0100)
committer GitHub <redacted>
Mon, 16 Mar 2026 08:19:16 +0000 (09:19 +0100)
* wire up Nemotron-H tensors for NVFP4 support

* add ssm tensors

* alignment

src/llama-model.cpp
src/llama-model.h
src/models/mamba-base.cpp
src/models/nemotron-h.cpp

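For context: the new "scale" tensors are per-tensor scalars of shape {1} (see the load_tensors hunk below) that accompany weights quantized to NVFP4, NVIDIA's 4-bit floating-point format, and the graph builders now pass them as an extra argument to build_lora_mm / build_ffn / build_moe_ffn. A minimal sketch of how such a scale could be folded in after the matmul, assuming the optional argument simply broadcast-multiplies the result when non-null (the diff does not show build_lora_mm's implementation):

    // hypothetical illustration, not llama.cpp's actual build_lora_mm
    ggml_tensor * mm_scaled(ggml_context * ctx, ggml_tensor * w, ggml_tensor * x, ggml_tensor * s) {
        ggml_tensor * out = ggml_mul_mat(ctx, w, x);
        if (s != nullptr) {
            out = ggml_mul(ctx, out, s); // the {1} scale broadcasts over the whole result
        }
        return out;
    }
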
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index e8e1bbf1cd1387174d60f94727408c78f7b8a124..bae02e32b1730451b228e4a78c5b3884a01e6a8d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7501,6 +7501,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
 
             // recurrent / linear-attention weight scales (per-tensor, shape {1})
+            if (!layer.ssm_in_s && layer.ssm_in) {
+                layer.ssm_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
             if (!layer.ssm_out_s && layer.ssm_out) {
                 layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
             }
diff --git a/src/llama-model.h b/src/llama-model.h
index 25bf892e7e2c144285d6a938c738ec010b9e8d2d..aefcfe700f7450aad2ed2142a2258069222fa20b 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -409,7 +409,8 @@ struct llama_layer {
     struct ggml_tensor * ffn_gate_shexp_s = nullptr;
     struct ggml_tensor * ffn_up_shexp_s   = nullptr;
     struct ggml_tensor * ffn_down_shexp_s = nullptr;
-    struct ggml_tensor * ssm_out_s  = nullptr;
+    struct ggml_tensor * ssm_in_s    = nullptr;
+    struct ggml_tensor * ssm_out_s   = nullptr;
     struct ggml_tensor * ssm_alpha_s = nullptr;
     struct ggml_tensor * ssm_beta_s  = nullptr;
 
diff --git a/src/models/mamba-base.cpp b/src/models/mamba-base.cpp
index 9de587db55fbeb49fad55eb0a0de73d04149b537..c37f29c487ed7ae9ff0cbd51013a164b6187ae58 100644
--- a/src/models/mamba-base.cpp
+++ b/src/models/mamba-base.cpp
@@ -42,7 +42,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
     cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
 
     // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
-    ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
+    ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur, layer.ssm_in_s);
     // split the above in two
     // => {d_inner, n_seq_tokens, n_seqs}
     ggml_tensor * x  = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
@@ -137,7 +137,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
         y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
 
         // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
-        cur = build_lora_mm(layer.ssm_out, y);
+        cur = build_lora_mm(layer.ssm_out, y, layer.ssm_out_s);
     }
 
     // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
@@ -184,7 +184,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
     // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
 
     // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
-    ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
+    ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur, model.layers[il].ssm_in_s);
 
     // split the above in three
     ggml_tensor * z   = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * zxBCdt->nb[0],
@@ -278,7 +278,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
         y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
 
         // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
-        cur = build_lora_mm(model.layers[il].ssm_out, y);
+        cur = build_lora_mm(model.layers[il].ssm_out, y, model.layers[il].ssm_out_s);
     }
 
     // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
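
As a worked example of the in-projection width noted above (purely illustrative numbers, not Nemotron-H's actual configuration): with d_inner = 4096, ngroups = 8, d_state = 128 and nheads = 128, d_in_proj = 2*4096 + 2*8*128 + 128 = 10368, and the three views carve zxBCdt back into z (d_inner = 4096 wide), xBC (d_inner + 2*ngroups*d_state = 6144) and dt (nheads = 128).
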
diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp
index 7af99174d1678d21f07c3bcd41fd819a3146af50..d3fccfb70d48877e5f4a18521125758e168e0d6c 100644
--- a/src/models/nemotron-h.cpp
+++ b/src/models/nemotron-h.cpp
@@ -107,9 +107,9 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
 ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il) {
     if (model.layers[il].ffn_gate_inp == nullptr) {
         cur = build_ffn(cur,
-                model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   model.layers[il].ffn_up_s,
                 NULL,                      NULL,                        NULL,
-                model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
                 NULL,
                 LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
         cb(cur, "ffn_out", il);
@@ -136,7 +136,10 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
                     hparams.expert_weights_scale,
                     LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
                     il,
-                    router_logits);
+                    router_logits, nullptr,
+                    model.layers[il].ffn_up_exps_s,
+                    nullptr, // no gate
+                    model.layers[il].ffn_down_exps_s);
         cb(moe_out, "ffn_moe_out", il);
 
         if (model.layers[il].ffn_latent_up) {
@@ -144,9 +147,9 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
         }
 
         ggml_tensor * ffn_shexp = build_ffn(inp_emb,
-                    model.layers[il].ffn_up_shexp,  NULL, NULL,
-                    NULL /* no gate */           ,  NULL, NULL,
-                    model.layers[il].ffn_down_shexp, NULL, NULL,
+                    model.layers[il].ffn_up_shexp,   NULL, model.layers[il].ffn_up_shexp_s,
+                    NULL /* no gate */           ,   NULL, NULL,
+                    model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s,
                     NULL,
                     LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
         cb(ffn_shexp, "ffn_shexp", il);
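
For orientation, a minimal sketch (not llama.cpp's actual build_ffn) of the LLM_FFN_RELU_SQR parallel path with the newly wired per-tensor scales, assuming each optional {1} scale is broadcast-multiplied onto its projection's output:

    // illustrative only; how build_ffn consumes the scale arguments is not shown in this diff
    static ggml_tensor * relu_sqr_ffn(ggml_context * ctx,
            ggml_tensor * up_w,   ggml_tensor * up_s,
            ggml_tensor * down_w, ggml_tensor * down_s,
            ggml_tensor * cur) {
        ggml_tensor * t = ggml_mul_mat(ctx, up_w, cur);   // up projection
        if (up_s)   { t = ggml_mul(ctx, t, up_s); }       // per-tensor scale, broadcast from {1}
        t = ggml_sqr(ctx, ggml_relu(ctx, t));             // relu(x)^2 activation
        t = ggml_mul_mat(ctx, down_w, t);                 // down projection
        if (down_s) { t = ggml_mul(ctx, t, down_s); }
        return t;
    }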