metal: handle command buffer failures gracefully in synchronize (llama/20306)

author Julian Pscheid <redacted>

Tue, 10 Mar 2026 06:32:24 +0000 (23:32 -0700)

committer Georgi Gerganov <redacted>

Sun, 15 Mar 2026 19:50:13 +0000 (21:50 +0200)
author Julian Pscheid <redacted>
Tue, 10 Mar 2026 06:32:24 +0000 (23:32 -0700)
committer Georgi Gerganov <redacted>
Sun, 15 Mar 2026 19:50:13 +0000 (21:50 +0200)
diff --git a/src/ggml-metal/ggml-metal-context.m b/src/ggml-metal/ggml-metal-context.m

index 5d3a8ce412ac57aa82bcb2a64a82b9fd7538190a..1136ce99b095be45f03078681e6fd04f1fb866eb 100644 (file)
--- a/src/ggml-metal/ggml-metal-context.m
+++ b/src/ggml-metal/ggml-metal-context.m
@@ -75,6 +75,10 @@ struct ggml_metal {
      // abort ggml_metal_graph_compute if callback returns true
      ggml_abort_callback abort_callback;
      void *              abort_callback_data;
+
+    // error state - set when a command buffer fails during synchronize
+    // once set, graph_compute will return GGML_STATUS_FAILED until the backend is recreated
+    bool has_error;
  };
  
  ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
@@ -158,6 +162,8 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
      res->capture_started = false;
      res->capture_scope = nil;
  
+    res->has_error = false;
+
      res->gf = nil;
      res->encode_async = nil;
      for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
@@ -246,7 +252,8 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
                  if (status == MTLCommandBufferStatusError) {
                      GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
                  }
-                GGML_ABORT("fatal error");
+                ctx->has_error = true;
+                return;
              }
          }
      }
@@ -262,7 +269,15 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
                  if (status == MTLCommandBufferStatusError) {
                      GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
                  }
-                GGML_ABORT("fatal error");
+
+                // release this and all remaining command buffers before returning
+                for (size_t j = i; j < ctx->cmd_bufs_ext.count; ++j) {
+                    [ctx->cmd_bufs_ext[j] release];
+                }
+                [ctx->cmd_bufs_ext removeAllObjects];
+
+                ctx->has_error = true;
+                return;
              }
  
              [cmd_buf release];
@@ -414,6 +429,11 @@ bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, con
  }
  
  enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
+    if (ctx->has_error) {
+        GGML_LOG_ERROR("%s: backend is in error state from a previous command buffer failure - recreate the backend to recover\n", __func__);
+        return GGML_STATUS_FAILED;
+    }
+
      // number of nodes encoded by the main thread (empirically determined)
      const int n_main = MAX(64, 0.1*gf->n_nodes);
author	Julian Pscheid <redacted>
	Tue, 10 Mar 2026 06:32:24 +0000 (23:32 -0700)
committer	Georgi Gerganov <redacted>
	Sun, 15 Mar 2026 19:50:13 +0000 (21:50 +0200)