models : optimize qwen3next graph (llama/19375)

author Georgi Gerganov <redacted>

Sat, 14 Feb 2026 10:57:36 +0000 (12:57 +0200)

committer Georgi Gerganov <redacted>

Sat, 14 Feb 2026 22:20:18 +0000 (00:20 +0200)
author Georgi Gerganov <redacted>
Sat, 14 Feb 2026 10:57:36 +0000 (12:57 +0200)
committer Georgi Gerganov <redacted>
Sat, 14 Feb 2026 22:20:18 +0000 (00:20 +0200)
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu

index 85ce96958fa0c050e4a5349d7366bbf7bbf250f4..bed5c71a1bdaf2dd3510cacb87f49fc81ebdc48b 100644 (file)
--- a/src/ggml-cuda/ggml-cuda.cu
+++ b/src/ggml-cuda/ggml-cuda.cu
@@ -2872,6 +2872,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
      const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
      const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
      const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
+    const std::string delta_net_prefix = "dnet_add";
  
      for (int i = 0; i < cgraph->n_nodes; i++) {
          ggml_tensor * node = cgraph->nodes[i];
@@ -2902,7 +2903,8 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
              strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
              strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
              strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
-            strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
+            strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0 &&
+            strncmp(node->name, delta_net_prefix.c_str(), delta_net_prefix.size()) != 0) {
              // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
              // by means of matching node names. See
              // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
@@ -4544,6 +4546,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                  case GGML_UNARY_OP_CEIL:
                  case GGML_UNARY_OP_ROUND:
                  case GGML_UNARY_OP_TRUNC:
+                    // TODO: should become:
+                    //return ggml_is_contiguous_rows(op->src[0]);
                      return ggml_is_contiguous(op->src[0]);
                  default:
                      return false;
diff --git a/src/ggml-metal/ggml-metal-common.cpp b/src/ggml-metal/ggml-metal-common.cpp

index 87e13786849001c2e7c361d8292da9e6e53679f3..2eb9820bff91c7e753b7cc72f03a72dd88449199 100644 (file)
--- a/src/ggml-metal/ggml-metal-common.cpp
+++ b/src/ggml-metal/ggml-metal-common.cpp
@@ -273,6 +273,7 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
              case GGML_OP_DIAG:
              case GGML_OP_MUL:
              case GGML_OP_ADD:
+            case GGML_OP_SUB:
              case GGML_OP_DIV:
              case GGML_OP_GLU:
              case GGML_OP_SCALE:
author	Georgi Gerganov <redacted>
	Sat, 14 Feb 2026 10:57:36 +0000 (12:57 +0200)
committer	Georgi Gerganov <redacted>
	Sat, 14 Feb 2026 22:20:18 +0000 (00:20 +0200)
src/ggml-cuda/ggml-cuda.cu		patch | blob | history
src/ggml-metal/ggml-metal-common.cpp		patch | blob | history