git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
llama-graph : use ggml_repeat_4d (#13998)
author: Xuan-Son Nguyen <redacted>
Wed, 4 Jun 2025 08:11:26 +0000 (10:11 +0200)
committer: GitHub <redacted>
Wed, 4 Jun 2025 08:11:26 +0000 (10:11 +0200)
src/llama-graph.cpp

index 727e119e334f6093a00a26f8d1ea3fc056c3addd..c4bdd6603927754b9377b2691e852dffc7098235 100644 (file)
@@ -769,9 +769,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
 
     if (weight_before_ffn) {
-        // TODO: this is a workaround as we don't yet have a repeat op that takes custom dim (ggml_repeat_4d)
-        ggml_tensor * repeated = ggml_new_tensor_3d(ctx0, cur->type, n_embd, n_expert_used, n_tokens);
-        repeated = ggml_repeat(ctx0, cur, repeated); // [n_embd, n_expert_used, n_tokens]
+        // repeat cur to [n_embd, n_expert_used, n_tokens]
+        ggml_tensor * repeated = ggml_repeat_4d(ctx0, cur, n_embd, n_expert_used, n_tokens, 1);
         cur = ggml_mul(ctx0, repeated, weights);
         cb(cur, "ffn_moe_weighted", il);
     }