vulkan/cuda: fix topk_moe with exp_probs_b (llama/18071)

author Jeff Bolz <redacted>

Sun, 21 Dec 2025 09:27:34 +0000 (03:27 -0600)

committer Georgi Gerganov <redacted>

Wed, 31 Dec 2025 15:52:09 +0000 (17:52 +0200)
author Jeff Bolz <redacted>
Sun, 21 Dec 2025 09:27:34 +0000 (03:27 -0600)
committer Georgi Gerganov <redacted>
Wed, 31 Dec 2025 15:52:09 +0000 (17:52 +0200)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu

index ab0f6fe9ce92a3d59cf8142e29e2c0f57668d240..55fa2e6a7cc3473f2172ef2bae13ca9536c95af0 100644 (file)
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3076,8 +3076,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
          ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
          ggml_tensor * softmax = cgraph->nodes[node_idx];
          ggml_tensor * weights = cgraph->nodes[node_idx + 9];
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
  
-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
              return true;
          }
      }
@@ -3085,7 +3088,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
      if (is_equal(topk_moe_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
          ggml_tensor * softmax = cgraph->nodes[node_idx];
          ggml_tensor * weights = cgraph->nodes[node_idx + 4];
-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 4];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 2];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
+
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
              return true;
          }
      }
@@ -3094,8 +3101,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
          ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
          ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
          ggml_tensor * weights = cgraph->nodes[node_idx + 5];
+        ggml_tensor * get_rows = cgraph->nodes[node_idx + 2];
+        ggml_tensor * argsort = cgraph->nodes[node_idx + 0];
+        int n_expert = cgraph->nodes[node_idx]->src[0]->ne[0];
  
-        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+        if (ggml_cuda_should_use_topk_moe(softmax, weights, get_rows, argsort, nullptr, n_expert)) {
              return true;
          }
      }
diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu

index 572379fcbf0e81ba031a8c6fe92dba6555ba850b..48e569efa0d989fd5a2c0b6707d8b9054fa5b9fe 100644 (file)
--- a/ggml/src/ggml-cuda/topk-moe.cu
+++ b/ggml/src/ggml-cuda/topk-moe.cu
@@ -268,7 +268,23 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
      }
  }
  
-bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp) {
+bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax,
+                                   const ggml_tensor * weights,
+                                   const ggml_tensor * get_rows,
+                                   const ggml_tensor * argsort,
+                                   const ggml_tensor * clamp,
+                                   int n_expert) {
+    ggml_tensor * probs = get_rows->src[0];
+    if (probs->op != GGML_OP_RESHAPE) {
+        return false;
+    }
+    probs = probs->src[0];
+    ggml_tensor * selection_probs = argsort->src[0];
+
+    if (probs != selection_probs) {
+        return false;
+    }
+
      float scale    = 1.0f;
      float max_bias = 0.0f;
  
@@ -288,7 +304,6 @@ bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tenso
          return false;
      }
  
-    const int n_expert = softmax->ne[0];
      // n_expert must be a power of 2
      if ((n_expert & (n_expert - 1)) != 0 || n_expert > 512) {
          return false;
diff --git a/ggml/src/ggml-cuda/topk-moe.cuh b/ggml/src/ggml-cuda/topk-moe.cuh

index 2eff408b03058b82a2b27f8a95de98aad6168426..6b6c13c58706930409a4aa46fcc8589315eb56e1 100644 (file)
--- a/ggml/src/ggml-cuda/topk-moe.cuh
+++ b/ggml/src/ggml-cuda/topk-moe.cuh
@@ -11,6 +11,11 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
                             const bool                  delayed_softmax = false,
                             ggml_tensor *               weight_clamp    = nullptr);
  
-bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp = nullptr);
+bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax,
+                                   const ggml_tensor * weights,
+                                   const ggml_tensor * get_rows,
+                                   const ggml_tensor * argsort,
+                                   const ggml_tensor * clamp,
+                                   int n_expert);
  
  std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false);
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp

index 1c614536362d29dc9c24ed66e6f4911ea67c5c87..a5308fa54426641624d1935da015ca78aab6a189 100644 (file)
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12941,24 +12941,43 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
  
      const ggml_tensor * softmax;
      const ggml_tensor * weights;
+    const ggml_tensor * get_rows;
+    const ggml_tensor * argsort;
  
      switch (mode) {
      case TOPK_MOE_EARLY_SOFTMAX_NORM:
          softmax = cgraph->nodes[node_idx + 0];
          weights = cgraph->nodes[node_idx + 9];
+        get_rows = cgraph->nodes[node_idx + 4];
+        argsort = cgraph->nodes[node_idx + 2];
          break;
      case TOPK_MOE_EARLY_SOFTMAX:
          softmax = cgraph->nodes[node_idx + 0];
          weights = cgraph->nodes[node_idx + 4];
+        get_rows = cgraph->nodes[node_idx + 4];
+        argsort = cgraph->nodes[node_idx + 2];
          break;
      case TOPK_MOE_LATE_SOFTMAX:
          softmax = cgraph->nodes[node_idx + 4];
          weights = cgraph->nodes[node_idx + 5];
+        get_rows = cgraph->nodes[node_idx + 2];
+        argsort = cgraph->nodes[node_idx + 0];
          break;
      default:
          return false;
      }
  
+    ggml_tensor * probs = get_rows->src[0];
+    if (probs->op != GGML_OP_RESHAPE) {
+        return false;
+    }
+    probs = probs->src[0];
+    ggml_tensor * selection_probs = argsort->src[0];
+
+    if (probs != selection_probs) {
+        return false;
+    }
+
      const float * op_params = (const float *)softmax->op_params;
  
      float scale = op_params[0];
author	Jeff Bolz <redacted>
	Sun, 21 Dec 2025 09:27:34 +0000 (03:27 -0600)
committer	Georgi Gerganov <redacted>
	Wed, 31 Dec 2025 15:52:09 +0000 (17:52 +0200)
ggml/src/ggml-cuda/ggml-cuda.cu		patch \| blob \| history
ggml/src/ggml-cuda/topk-moe.cu		patch \| blob \| history
ggml/src/ggml-cuda/topk-moe.cuh		patch \| blob \| history
ggml/src/ggml-vulkan/ggml-vulkan.cpp		patch \| blob \| history