ggml: Re-enable CUDA graphs in presence of CONT and DUP nodes (llama/12970)

author Alan Gray <redacted>

Thu, 17 Apr 2025 13:19:42 +0000 (14:19 +0100)

committer Georgi Gerganov <redacted>

Thu, 24 Apr 2025 17:39:16 +0000 (20:39 +0300)
author Alan Gray <redacted>
Thu, 17 Apr 2025 13:19:42 +0000 (14:19 +0100)
committer Georgi Gerganov <redacted>
Thu, 24 Apr 2025 17:39:16 +0000 (20:39 +0300)
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu

index 4f4faa3e63ae7c1245eeeedd6f940d576ddc3f85..ed25646e8e2610378602ba7e5a615afea84ced8a 100644 (file)
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -551,7 +551,7 @@ static void ggml_cpy_f16_f16_cuda(
          (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
  }
  
-void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
+void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
      const int64_t ne = ggml_nelements(src0);
      GGML_ASSERT(ne == ggml_nelements(src1));
  
@@ -588,7 +588,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
      char ** dest_ptrs_d = nullptr;
      int graph_cpynode_index = -1;
  #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
-    if(ctx.cuda_graph->use_cpy_indirection) {
+    if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
          dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
          graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
      }
@@ -636,7 +636,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
                  ggml_type_name(src0->type), ggml_type_name(src1->type));
      }
  #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
-    if(ctx.cuda_graph->use_cpy_indirection) {
+    if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
          ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
      }
  #endif
@@ -645,7 +645,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
  
  void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
      const ggml_tensor * src0 = dst->src[0];
-    ggml_cuda_cpy(ctx, src0, dst);
+    bool disable_indirection = true;
+    ggml_cuda_cpy(ctx, src0, dst, disable_indirection);
  }
  
  void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh

index 6bed0564df27af508b16dea1f9b9816449e9c8db..0bd3c0c6f8c277e00e54b95377307d0ea43a9647 100644 (file)
--- a/ggml/src/ggml-cuda/cpy.cuh
+++ b/ggml/src/ggml-cuda/cpy.cuh
@@ -2,7 +2,7 @@
  
  #define CUDA_CPY_BLOCK_SIZE 64
  
-void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
+void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1,  bool disable_indirection = false);
  
  void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
  
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu

index 9ced4665127888bcf77b1cb62b90fd0054f8de63..bab85809ac06867d59c104dac1b7aa63b4b199da 100644 (file)
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2489,7 +2489,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
  #endif
          }
  
-        if (node->op == GGML_OP_MUL_MAT_ID || node->op == GGML_OP_CONT || node->op == GGML_OP_DUP) {
+        if (node->op == GGML_OP_MUL_MAT_ID) {
              use_cuda_graph = false; // This node type is not supported by CUDA graph capture
  #ifndef NDEBUG
              GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
author	Alan Gray <redacted>
	Thu, 17 Apr 2025 13:19:42 +0000 (14:19 +0100)
committer	Georgi Gerganov <redacted>
	Thu, 24 Apr 2025 17:39:16 +0000 (20:39 +0300)
ggml/src/ggml-cuda/cpy.cu		patch | blob | history
ggml/src/ggml-cuda/cpy.cuh		patch | blob | history
ggml/src/ggml-cuda/ggml-cuda.cu		patch | blob | history