git.djapps.eu Git - pkg/ggml/sources/whisper.cpp/commitdiff
metal : report OOM errors (llama/16274)
authorGeorgi Gerganov <redacted>
Fri, 26 Sep 2025 11:14:28 +0000 (14:14 +0300)
committerGeorgi Gerganov <redacted>
Mon, 29 Sep 2025 12:18:11 +0000 (15:18 +0300)
ggml/src/ggml-metal/ggml-metal-context.m
ggml/src/ggml-metal/ggml-metal-device.m

index af9ff214360794f2f2e062b7881f7b268f1afae2..02147a0ea4a52e5d06162730a63523a54775145e 100644 (file)
@@ -222,7 +222,28 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
         ctx->cmd_buf_last = nil;
     }
 
-    // release any completed command buffers
+    // check status of all command buffers
+    {
+        const int n_cb = ctx->n_cb;
+
+        for (int cb_idx = 0; cb_idx <= n_cb; ++cb_idx) {
+            id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[cb_idx].obj;
+            if (!cmd_buf) {
+                continue;
+            }
+
+            MTLCommandBufferStatus status = [cmd_buf status];
+            if (status != MTLCommandBufferStatusCompleted) {
+                GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, cb_idx, (int) status);
+                if (status == MTLCommandBufferStatusError) {
+                    GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
+                }
+                GGML_ABORT("fatal error");
+            }
+        }
+    }
+
+    // release any completed extra command buffers
     if (ctx->cmd_bufs_ext.count > 0) {
         for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) {
             id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs_ext[i];
@@ -260,6 +281,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
                                                          length:size
                                                         options:MTLResourceStorageModeShared];
 
+        GGML_ASSERT(buf_src);
+
         struct ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(tensor);
         if (bid_dst.metal == nil) {
             GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
@@ -299,6 +322,8 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
                                                               options:MTLResourceStorageModeShared
                                                           deallocator:nil];
 
+        GGML_ASSERT(buf_dst);
+
         struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(tensor);
         if (bid_src.metal == nil) {
             GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
index 5f744d1a0bd960719eeb359c5aa46d0923f89915..9c7e1f2c8fa47d963c4358f907f53a7295b69c9e 100644 (file)
@@ -1176,6 +1176,8 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
                                                               options:MTLResourceStorageModeShared
                                                           deallocator:nil];
 
+        GGML_ASSERT(buf_src);
+
         // dst
         struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
         bid_dst.offs += offset;
@@ -1232,6 +1234,8 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
                                                               options:MTLResourceStorageModeShared
                                                           deallocator:nil];
 
+        GGML_ASSERT(buf_dst);
+
         id<MTLCommandQueue>  queue   = buf->queue;
         id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];