rpc : fix segfault with nkvo (#9389)

author Radoslav Gerganov <redacted>

Mon, 9 Sep 2024 15:40:10 +0000 (18:40 +0300)

committer GitHub <redacted>

Mon, 9 Sep 2024 15:40:10 +0000 (18:40 +0300)
author Radoslav Gerganov <redacted>
Mon, 9 Sep 2024 15:40:10 +0000 (18:40 +0300)
committer GitHub <redacted>
Mon, 9 Sep 2024 15:40:10 +0000 (18:40 +0300)
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu

index 982316f565e9c168ab18da500756dbf25fc65af5..d53de4edd8098d1ff26f5d5b741e858d2350d022 100644 (file)
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2552,7 +2552,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
          for (int i = 0; i < cgraph->n_nodes; i++) {
              ggml_tensor * node = cgraph->nodes[i];
  
-            if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+            if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+                continue;
+            }
+
+            if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                  use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
  #ifndef NDEBUG
                  GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp

index 8f9d0a46019691a6d66cb8c33b13a4335ea1f09a..9c600c7cae4f9336a7bd92c9411040395756c5fc 100644 (file)
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -883,15 +883,17 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
      }
      result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
      if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
-        return nullptr;
+        result->buffer = nullptr;
      }
  
-    // require that the tensor data does not go beyond the buffer end
-    uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
-    uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
-    uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
-    GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
-    GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+    if (result->buffer) {
+        // require that the tensor data does not go beyond the buffer end
+        uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
+        uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+        uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+        GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
+        GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+    }
  
      result->op = (ggml_op) tensor->op;
      for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
@@ -1060,7 +1062,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
      const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
      GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
  
-    static size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
+    size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
      struct ggml_init_params params = {
          /*.mem_size   =*/ buf_size,
          /*.mem_buffer =*/ NULL,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c

index 28ee46e042bbce644505a77114ad3440cef08ad4..d7157ca6d4b838e8f2823b6850727be0ae325589 100644 (file)
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3847,7 +3847,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
  
      if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
          GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                __func__, cur_end + size_needed, ctx->mem_size);
+                __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
          assert(false);
          return NULL;
      }
author	Radoslav Gerganov <redacted>
	Mon, 9 Sep 2024 15:40:10 +0000 (18:40 +0300)
committer	GitHub <redacted>
	Mon, 9 Sep 2024 15:40:10 +0000 (18:40 +0300)
ggml/src/ggml-cuda.cu		patch | blob | history
ggml/src/ggml-rpc.cpp		patch | blob | history
ggml/src/ggml.c		patch | blob | history