} else
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
{
- CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
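+ // F32 copies go through the copy kernel so the CUDA-graph destination-pointer indirection is preserved;
+ // other contiguous same-type copies still use cudaMemcpyAsync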
+ if (src0->type == GGML_TYPE_F32) {
+ ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+ } else {
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+ }
}
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
- return nullptr;
+ // Prioritize CUDA graph compatibility over the direct memory copy optimization:
+ // the copy kernel keeps the destination-pointer indirection that CUDA graphs rely on,
+ // avoiding the performance regression that disabling CUDA graphs would cause.
+ if (src0->type == GGML_TYPE_F32) {
+ return (void*) cpy_flt<cpy_1_flt<float, float>>;
+ } else {
+ return nullptr;
+ }
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
return (void*) cpy_flt<cpy_1_flt<float, float>>;
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
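// Illustrative sketch (not part of this change): why ggml_cuda_cpy_fn returns a copy kernel
// instead of nullptr for contiguous F32->F32 copies. The copy kernels read their destination
// through an optional device-side pointer table indexed at graph-build time, so a captured
// CUDA graph can be replayed with a new destination by updating the table only. A
// cudaMemcpyAsync node has no such indirection: its destination pointer is baked into the
// captured graph. The parameter names below (cdst_direct, cdst_indirect, graph_cpynode_index)
// and the kernel shape are assumptions for illustration, not the exact code in
// ggml/src/ggml-cuda/cpy.cu.

#include <cstdint>

template <typename src_t, typename dst_t>
static __global__ void cpy_contiguous_sketch(const char * cx, char * cdst_direct,
                                             char ** cdst_indirect, const int graph_cpynode_index,
                                             const int64_t ne) {
    // pick the destination: through the indirection table when CUDA graphs are in use,
    // directly otherwise
    char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst_direct;

    const int64_t i = (int64_t) blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= ne) {
        return;
    }

    // contiguous element-wise copy with type conversion (identity for F32 -> F32)
    ((dst_t *) cdst)[i] = (dst_t) ((const src_t *) cx)[i];
}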
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
+ const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
+ const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
(node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
- strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0) {
+ strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
+ strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
+ strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
- // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
- // by means of matching node names. See
+ // disable CUDA graphs for batch size > 1 for now, excluding by node name the additions that are known to be safe:
+ // Gemma3n's `project_per_layer_input`, the MoE bias additions, Nemotron-H block outputs and Mamba2's y + x*D. See
// https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
// TODO: skip computing output earlier for unused tokens
y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
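+ // name this node so the CUDA backend's batch-size check can whitelist it by prefix (mamba2_y_add_d_prefix in ggml-cuda.cu)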
+ cb(y, "mamba2_y_add_d", il);
y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
// grouped RMS norm
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
+ ggml_build_forward_expand(gf, inpL);
auto * inp = build_inp_mem_hybrid();
// add residual
cur = ggml_add(ctx0, cur, inpSA);
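+ // this name is prefix-matched in ggml-cuda.cu (nemotron_h_block_out_prefix) so CUDA graphs stay enabled for these nodes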
- cb(cur, "block_out", il);
+ cb(cur, "nemotron_h_block_out", il);
// input for next layer
inpL = cur;
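// Illustrative sketch (not part of this change): what the cb(..., "nemotron_h_block_out", il)
// and cb(..., "mamba2_y_add_d", il) calls above produce, assuming the usual llama.cpp graph
// callback that appends the layer index to the tensor name. Because the final name carries a
// per-layer suffix, the CUDA backend matches it with strncmp against the prefix rather than
// with an exact comparison. The helper name below is hypothetical.

#include "ggml.h"

static void name_block_out_sketch(ggml_tensor * cur, int il) {
    // yields e.g. "nemotron_h_block_out-12" for layer 12, which matches
    // nemotron_h_block_out_prefix in the CUDA backend's batch-size check
    ggml_format_name(cur, "%s-%d", "nemotron_h_block_out", il);
}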