Overlap cmdbuffer creation and cmdbuffer execution in Vulkan backend by submitting...

author Markus Tavenrath <redacted>

Sun, 8 Sep 2024 19:43:48 +0000 (21:43 +0200)

committer Georgi Gerganov <redacted>

Fri, 20 Sep 2024 19:03:57 +0000 (22:03 +0300)
author Markus Tavenrath <redacted>
Sun, 8 Sep 2024 19:43:48 +0000 (21:43 +0200)
committer Georgi Gerganov <redacted>
Fri, 20 Sep 2024 19:03:57 +0000 (22:03 +0300)
diff --git a/src/ggml-vulkan.cpp b/src/ggml-vulkan.cpp

index 611fe69acb473e6924e4fd3ad24307cff7b1dfd2..3845c04acf111998974758cf1b4d8135b1638287 100644 (file)
--- a/src/ggml-vulkan.cpp
+++ b/src/ggml-vulkan.cpp
@@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
  
  static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
      if (ctx->seqs.empty()) {
+        if (fence) {
+            ctx->q->queue.submit({}, fence);
+        }
          return;
      }
      VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
@@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
      }
  }
  
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+
+// Returns true if node has enqueued work into the queue, false otherwise
+// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
      ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
  
      if (ggml_is_empty(node) || extra == nullptr) {
-        return;
+        return false;
      }
  
      VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
@@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
      case GGML_OP_PERMUTE:
      case GGML_OP_TRANSPOSE:
      case GGML_OP_NONE:
-        return;
+        return false;
      case GGML_OP_UNARY:
          switch (ggml_get_unary_op(node)) {
          case GGML_UNARY_OP_SILU:
@@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
          case GGML_UNARY_OP_TANH:
              break;
          default:
-            return;
+            return false;
          }
          break;
      case GGML_OP_REPEAT:
@@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
      default:
          std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
          GGML_ABORT("fatal error");
-        return;
+        return false;
      }
  
      vk_context compute_ctx;
@@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
              ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
              break;
          default:
-            return;
+            return false;
          }
          break;
      case GGML_OP_DIAG_MASK_INF:
@@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
  
          break;
      default:
-        return;
+        return false;
      }
  
      if (dryrun) {
-        return;
+        return false;
      }
  
      ctx->tensor_ctxs[node_idx] = compute_ctx;
@@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
      last_node = true;
  #endif
  
-    if (last_node) {
+    if (submit || last_node) {
          ggml_vk_ctx_end(compute_ctx);
-        compute_ctx->exit_tensor_idx = node_idx;
+
+        // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward
+        if (last_node) {
+            compute_ctx->exit_tensor_idx = node_idx_begin;
+        }
+        else {
+            compute_ctx->exit_tensor_idx = -1;
+        }
+
          ctx->compute_ctx.reset();
+
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        if (!ok) {
+            if (node->op == GGML_OP_UNARY) {
+                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+            }
+            else {
+                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+            }
+        }
+
      }
+    return true;
  }
  
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
      ggml_tensor_extra_gpu * extra = nullptr;
  
      switch (tensor->op) {
@@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
  
      VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
  
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    ggml_vk_check_results_0(tensor);
-#endif
-
      vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
  
-#ifdef GGML_VULKAN_PERF
-    std::chrono::steady_clock::time_point start;
-#endif // GGML_VULKAN_PERF
+    // always wait for the GPU work to be done for the last submit
+    if (tensor_idx == subctx->exit_tensor_idx) {
+        use_fence = true;
+    }
  
      // Only run if ctx hasn't been submitted yet
      if (!subctx->seqs.empty()) {
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_0(tensor);
+        use_fence = true;
+#endif
+
          // Do staging buffer copies
          for (auto& cpy : subctx->in_memcpys) {
              memcpy(cpy.dst, cpy.src, cpy.n);
          }
  
-#ifdef GGML_VULKAN_PERF
-        start = std::chrono::steady_clock::now();
-#endif // GGML_VULKAN_PERF
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+
+        if (use_fence) {
+            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
  
-        ggml_vk_submit(subctx, ctx->fence);
+            ctx->device->device.resetFences({ ctx->fence });
+        }
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_1(tensor);
+#endif
      }
  
      if (tensor_idx == subctx->exit_tensor_idx) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-#ifdef GGML_VULKAN_PERF
-        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
-        ctx->device->perf_logger->log_timing(tensor, duration.count());
-#endif // GGML_VULKAN_PERF
-
-        ctx->device->device.resetFences({ ctx->fence });
-
          // Do staging buffer copies
          for (auto& cpy : subctx->out_memcpys) {
              memcpy(cpy.dst, cpy.src, cpy.n);
@@ -6483,7 +6508,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
      ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
  
      for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
      }
      ggml_vk_preallocate_buffers(ctx);
      ggml_pipeline_allocate_descriptor_sets(ctx->device);
@@ -6498,31 +6523,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
      // Reserve tensor context space for all nodes
      ctx->tensor_ctxs.resize(cgraph->n_nodes);
  
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
-    }
+    bool first_node_in_batch = true; // true if next node will be first node in a batch
+    int submit_node_idx = 0; // index to first node in a batch
  
+    // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
+    constexpr int submit_count = 100;
+    int submitted_nodes = 0;
      for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (ggml_vk_is_empty(node)) {
-            continue;
+        if (first_node_in_batch) {
+            submit_node_idx = i;
          }
  
-        bool ok = ggml_vk_compute_forward(ctx, node, i);
-        if (!ok) {
-            if (node->op == GGML_OP_UNARY) {
-                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
-            } else {
-                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
+
+
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+
+        if (enqueued) {
+            ++submitted_nodes;
+
+#ifndef GGML_VULKAN_CHECK_RESULTS
+            if (first_node_in_batch) {
+                first_node_in_batch = false;
              }
+#endif
          }
-#ifdef GGML_VULKAN_CHECK_RESULTS
-        else {
-            ggml_vk_check_results_1(node);
+
+        if (submit) {
+            first_node_in_batch = true;
+            submitted_nodes = 0;
          }
-#endif
-        GGML_ASSERT(ok);
      }
  
  #ifdef GGML_VULKAN_PERF
author	Markus Tavenrath <redacted>
	Sun, 8 Sep 2024 19:43:48 +0000 (21:43 +0200)
committer	Georgi Gerganov <redacted>
	Fri, 20 Sep 2024 19:03:57 +0000 (22:03 +0300)