#endif
#include "ggml-common.h"
-#include <cstdio>
#include <array>
#include <cassert>
#include <cfloat>
+#include <cstdio>
#include <string>
#include <vector>
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }
-    ~ggml_backend_cuda_context() {
-        if (copy_event != nullptr) {
-            CUDA_CHECK(cudaEventDestroy(copy_event));
-        }
-        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
-            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
-                if (streams[i][j] != nullptr) {
-                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
-                }
-            }
-            if (cublas_handles[i] != nullptr) {
-                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
-            }
-        }
-    }
+    ~ggml_backend_cuda_context();
    cudaStream_t stream(int device, int stream) {
        if (streams[device][stream] == nullptr) {
#include <atomic>
#include <charconv>
#include <cinttypes>
+#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <float.h>
#include <map>
#include <memory>
#include <mutex>
-#include <stdint.h>
-#include <stdio.h>
#include <stdarg.h>
+#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <vector>
    return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
}
+// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
+// the mutex, condition variable and counter below ensure that no handle is destroyed while a capture is in flight
+
+static std::mutex ggml_cuda_lock;
+static std::condition_variable ggml_cuda_lock_cv;
+static std::atomic<int> ggml_cuda_lock_counter;
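+// relaxed memory order is sufficient: the counter is only read and updated while ggml_cuda_lock is held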
+
+ggml_backend_cuda_context::~ggml_backend_cuda_context() {
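+    // block until no CUDA graph capture is in flight before destroying streams and cuBLAS handles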
+ std::unique_lock<std::mutex> lock(ggml_cuda_lock);
+ ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
+
+ if (copy_event != nullptr) {
+ CUDA_CHECK(cudaEventDestroy(copy_event));
+ }
+ for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
+ for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
+ if (streams[i][j] != nullptr) {
+ CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
+ }
+ }
+ if (cublas_handles[i] != nullptr) {
+ CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
+ }
+ }
+}
+
// cuda buffer
struct ggml_backend_cuda_buffer_context {
            CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
            graph_evaluated_or_captured = true; // CUDA graph has been captured
+
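+            // capture has ended: decrement the in-flight counter and wake any context destructor waiting on it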
+ std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+ if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
+ ggml_cuda_lock_cv.notify_all();
+ }
        } else {
            graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
        }
    }
}
-    if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+ if (use_cuda_graph && cuda_graph_update_required) {
+ // Start CUDA graph capture
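+        // count this capture as in flight so that no backend context is destroyed before it ends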
+        {
+            std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+            ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
+        }
+
        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
    }