llama : remove llm_graph_input_one (#14603)

author Xuan-Son Nguyen <redacted>

Wed, 9 Jul 2025 21:09:28 +0000 (23:09 +0200)

committer GitHub <redacted>

Wed, 9 Jul 2025 21:09:28 +0000 (23:09 +0200)
author Xuan-Son Nguyen <redacted>
Wed, 9 Jul 2025 21:09:28 +0000 (23:09 +0200)
committer GitHub <redacted>
Wed, 9 Jul 2025 21:09:28 +0000 (23:09 +0200)
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp

index 55a059d0975d25e44857966ac519608239764bc6..a248a7ec22350898ac2f31ba9d817e528651b925 100644 (file)
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -340,13 +340,6 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
      inp_rs->set_input(ubatch);
  }
  
-void llm_graph_input_one::set_input(const llama_ubatch * ubatch) {
-    GGML_UNUSED(ubatch);
-    GGML_ASSERT(one && ggml_nelements(one) == 1);
-    float f_one = 1.0f;
-    ggml_backend_tensor_set(one, &f_one, 0, sizeof(float));
-}
-
  //
  // llm_graph_context
  //
diff --git a/src/llama-graph.h b/src/llama-graph.h

index 54eaaac02b99e083d070cfbe4d72ade581cbd575..fbf8e2889564ddb591af4bad939857305737f36a 100644 (file)
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -341,17 +341,6 @@ public:
      const llama_memory_hybrid_context * mctx;
  };
  
-// TODO: remove this when ggml_scale_add is implemented
-class llm_graph_input_one : public llm_graph_input_i {
-public:
-    llm_graph_input_one() {}
-    virtual ~llm_graph_input_one() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
-    ggml_tensor * one = nullptr; // F32
-};
-
  //
  // llm_graph_result
  //
diff --git a/src/llama-model.cpp b/src/llama-model.cpp

index c21cc28806c75e5a1222e8ba1400b52ad96bb813..ca094e47b6cb531c25704fae2723bd736377ea73 100644 (file)
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -9485,8 +9485,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
      const int     n_layer_sparsity = 10; // number of layers using activation sparsity
      const float   f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
  
-    ggml_tensor * one; // containing single element 1.0f
-
      llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
              : llm_graph_context(params),
                model(model),
@@ -9498,14 +9496,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
          ggml_tensor * cur;
          ggml_tensor * inpL;
  
-        // TODO: remove this when ggml_scale_add is implemented
-        one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        {
-            auto inp = std::make_unique<llm_graph_input_one>();
-            inp->one = one;
-            res->add_input(std::move(inp));
-        }
-
          inpL = build_inp_embd(model.tok_embd);
  
          // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
@@ -9895,7 +9885,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
          cb(innovation, "innovation", il);
  
          ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
-        all_coefs = ggml_add(ctx0, all_coefs, one);
+        all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
          cb(all_coefs, "all_coefs", il);
          all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
          all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
author	Xuan-Son Nguyen <redacted>
	Wed, 9 Jul 2025 21:09:28 +0000 (23:09 +0200)
committer	GitHub <redacted>
	Wed, 9 Jul 2025 21:09:28 +0000 (23:09 +0200)
src/llama-graph.cpp		patch | blob | history
src/llama-graph.h		patch | blob | history
src/llama-model.cpp		patch | blob | history