model : fix llama4 graph (#13663)

author Georgi Gerganov <redacted>

Tue, 20 May 2025 16:21:04 +0000 (19:21 +0300)

committer GitHub <redacted>

Tue, 20 May 2025 16:21:04 +0000 (19:21 +0300)
author Georgi Gerganov <redacted>
Tue, 20 May 2025 16:21:04 +0000 (19:21 +0300)
committer GitHub <redacted>
Tue, 20 May 2025 16:21:04 +0000 (19:21 +0300)
diff --git a/src/llama-model.cpp b/src/llama-model.cpp

index 057f1fc1777fbccadaf5a8d4b07c02166a549c0d..383972f94153856f5c87fe6deb5783bda5d32d3e 100644 (file)
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -4803,8 +4803,21 @@ struct llm_build_llama_iswa : public llm_graph_context {
              ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
              cb(ffn_inp, "ffn_inp", il);
  
-            {
-                // llama4 MoE
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
                  ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
                          model.layers[il].ffn_norm, NULL,
                          LLM_NORM_RMS, il);
author	Georgi Gerganov <redacted>
	Tue, 20 May 2025 16:21:04 +0000 (19:21 +0300)
committer	GitHub <redacted>
	Tue, 20 May 2025 16:21:04 +0000 (19:21 +0300)