graph : reduce topology branching (#18548)

author Georgi Gerganov <redacted>

Fri, 2 Jan 2026 17:01:56 +0000 (19:01 +0200)

committer GitHub <redacted>

Fri, 2 Jan 2026 17:01:56 +0000 (19:01 +0200)
author Georgi Gerganov <redacted>
Fri, 2 Jan 2026 17:01:56 +0000 (19:01 +0200)
committer GitHub <redacted>
Fri, 2 Jan 2026 17:01:56 +0000 (19:01 +0200)
diff --git a/src/models/cogvlm.cpp b/src/models/cogvlm.cpp

index edf0d1424ceaea43606f59c18b3fe3aa28a33959..0ceae3aaeb550cc812fca7735166b6bbcb1ad7be 100644 (file)
--- a/src/models/cogvlm.cpp
+++ b/src/models/cogvlm.cpp
@@ -3,12 +3,14 @@
  llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
      llm_graph_context(params) {
      const int64_t n_embd_head = hparams.n_embd_head_v;
-    float         kq_scale    = 1.0f / sqrtf(float(n_embd_head));
+    const float   kq_scale    = 1.0f / sqrtf(float(n_embd_head));
  
      GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
      GGML_ASSERT(n_embd_head == hparams.n_rot);
  
-    ggml_tensor *inpL, *cur;
+    ggml_tensor * inpL;
+    ggml_tensor * cur;
+
      inpL = build_inp_embd(model.tok_embd);
  
      ggml_tensor * inp_pos = build_inp_pos();
@@ -44,7 +46,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
          }
  
          ggml_tensor * inpSA = inpL;
-        cur                 = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  
          // build self attention
          {
diff --git a/src/models/gemma-embedding.cpp b/src/models/gemma-embedding.cpp

index 90a98f7abf0fd3aaa8578884d86ea910e675bc71..944c198bf9502459101d966c58c0ef670a470474 100644 (file)
--- a/src/models/gemma-embedding.cpp
+++ b/src/models/gemma-embedding.cpp
@@ -1,7 +1,5 @@
  #include "models.h"
  
-
-
  llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
      llm_graph_context(params) {
      const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -12,10 +10,8 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
      inpL = build_inp_embd(model.tok_embd);
  
      // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
-    if (ubatch.token) {
-        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
-        cb(inpL, "inp_scaled", -1);
-    }
+    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+    cb(inpL, "inp_scaled", -1);
  
      // inp_pos - contains the positions
      ggml_tensor * inp_pos = build_inp_pos();
diff --git a/src/models/gemma3.cpp b/src/models/gemma3.cpp

index ae60ef4790c9796d05d80f7dbd31683997aee200..dec3fc4b8bc3962217d4f5ecb99c6cb3260034bb 100644 (file)
--- a/src/models/gemma3.cpp
+++ b/src/models/gemma3.cpp
@@ -10,10 +10,9 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
      inpL = build_inp_embd(model.tok_embd);
  
      // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
-    if (ubatch.token) {
-        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
-        cb(inpL, "inp_scaled", -1);
-    }
+    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+    cb(inpL, "inp_scaled", -1);
+
      // inp_pos - contains the positions
      ggml_tensor * inp_pos = build_inp_pos();
  
diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n-iswa.cpp

index a0bdd6a15a123efa44d6f9fbe0813fffd6ff1742..9c7b3ba0bbf55644aca6717e000b4bf989287fc5 100644 (file)
--- a/src/models/gemma3n-iswa.cpp
+++ b/src/models/gemma3n-iswa.cpp
@@ -1,7 +1,5 @@
  #include "models.h"
  
-
-
  llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
      llm_graph_context(params),
      model(model),
@@ -15,10 +13,9 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
      inpL = build_inp_embd(model.tok_embd);
  
      // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
-    if (ubatch.token) {
-        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
-        cb(inpL, "inp_scaled", -1);
-    }
+    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+    cb(inpL, "inp_scaled", -1);
+
      // inp_pos - contains the positions
      ggml_tensor * inp_pos = build_inp_pos();
  
@@ -248,7 +245,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
  // equivalent to get_per_layer_inputs() in python code
  // output shape: [n_embd_altup, n_layer, n_tokens]
  ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
-    auto          inp = std::make_unique<llm_graph_input_embd>();
+    auto inp = std::make_unique<llm_graph_input_embd>();
      ggml_tensor * inp_per_layer;
      if (ubatch.token) {
          inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
author	Georgi Gerganov <redacted>
	Fri, 2 Jan 2026 17:01:56 +0000 (19:01 +0200)
committer	GitHub <redacted>
	Fri, 2 Jan 2026 17:01:56 +0000 (19:01 +0200)
src/models/cogvlm.cpp		patch \| blob \| history
src/models/gemma-embedding.cpp		patch \| blob \| history
src/models/gemma3.cpp		patch \| blob \| history
src/models/gemma3n-iswa.cpp		patch \| blob \| history