vulkan: use smaller combined allocations to avoid fragmentation (llama/11551)
author Jeff Bolz <redacted>
Thu, 6 Feb 2025 06:02:18 +0000 (00:02 -0600)
committer Georgi Gerganov <redacted>
Thu, 27 Feb 2025 06:55:36 +0000 (08:55 +0200)
ggml/src/ggml-alloc.c
ggml/src/ggml-vulkan/ggml-vulkan.cpp

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 9a3bf9f29235c60b137da9659f22d5b5f615c421..7244a9cbb0605f037c6cac292321013fc160bdca 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -989,19 +989,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
             this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
         }
 
-        if (this_size > max_size) {
-            GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
-                    __func__, t->name,
-                    ggml_backend_buft_name(buft),
-                    this_size, max_size);
-            for (size_t i = 0; i < n_buffers; i++) {
-                ggml_backend_buffer_free(buffers[i]);
-            }
-            free(buffers);
-            return NULL;
-        }
-
-        if ((cur_buf_size + this_size) > max_size) {
+        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
             if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
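
The ggml-alloc.c change above removes the hard failure for tensors larger than the buffer type's reported maximum size. Since the Vulkan backend (below) now reports a suballocation block size that can be smaller than the device's real maxMemoryAllocationSize, a tensor above that limit must still be allowed to attempt a dedicated allocation; a new buffer is therefore started only when the current one is non-empty. A minimal C++ sketch of the new splitting rule, with hypothetical helper names standing in for the real ggml-alloc internals:

    #include <cstddef>
    #include <vector>

    // Stand-in for alloc_tensor_range(): ask the backend for a buffer of
    // cur_buf_size bytes and place the pending tensors in it.
    static bool flush_range(std::size_t cur_buf_size) {
        return cur_buf_size > 0; // pretend the backend allocation succeeded
    }

    bool alloc_all(const std::vector<std::size_t> & tensor_sizes, std::size_t max_size) {
        std::size_t cur_buf_size = 0;
        for (std::size_t this_size : tensor_sizes) {
            // Start a new buffer only if the current one already holds
            // tensors; a lone tensor larger than max_size still gets a
            // dedicated buffer instead of failing up front.
            if (cur_buf_size > 0 && cur_buf_size + this_size > max_size) {
                if (!flush_range(cur_buf_size)) {
                    return false;
                }
                cur_buf_size = 0;
            }
            cur_buf_size += this_size;
        }
        return cur_buf_size == 0 || flush_range(cur_buf_size);
    }
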
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 48ac489a6554f867f6faa8eba1ced4ef3310c093..2e1bcf691b3b0ce259345e95aa6b956f8664371c 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -156,6 +156,7 @@ struct vk_device_struct {
     vk::PhysicalDeviceProperties properties;
     std::string name;
     uint64_t max_memory_allocation_size;
+    uint64_t suballocation_block_size;
     bool fp16;
     bool pipeline_robustness;
     vk::Device device;
@@ -2269,6 +2270,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
 
         device->physical_device.getProperties2(&props2);
         device->properties = props2.properties;
+        device->vendor_id = device->properties.vendorID;
 
         const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
 
@@ -2280,7 +2282,20 @@ static vk_device ggml_vk_get_device(size_t idx) {
             device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
         }
 
-        device->vendor_id = device->properties.vendorID;
+        const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE");
+
+        if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
+            device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
+#if defined(_WIN32)
+        } else if (device->vendor_id == VK_VENDOR_ID_NVIDIA) {
+            // Limit batching of allocations to 1GB by default to avoid fragmentation issues
+            device->suballocation_block_size = 1024*1024*1024;
+#endif
+        } else {
+            device->suballocation_block_size = device->max_memory_allocation_size;
+        }
+        device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size);
+
         device->subgroup_size = subgroup_props.subgroupSize;
         device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
         if (sm_builtins) {
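
The hunk above picks the suballocation block size with a simple precedence: an explicit GGML_VK_SUBALLOCATION_BLOCK_SIZE override wins, otherwise NVIDIA devices on Windows default to 1 GiB to curb address-space fragmentation, and all other devices keep using the full maxMemoryAllocationSize; the result is then clamped to the device limit. A sketch of that decision, factored into a standalone function (hypothetical signature; the real code runs inline in ggml_vk_get_device):

    #include <algorithm>
    #include <cstdint>
    #include <string>

    uint64_t pick_suballocation_block_size(const char * env, uint32_t vendor_id,
                                           bool is_windows, uint64_t max_alloc) {
        uint64_t block_size;
        if (env != nullptr) {
            block_size = std::stoul(env);              // explicit user override
        } else if (is_windows && vendor_id == 0x10DE /* VK_VENDOR_ID_NVIDIA */) {
            block_size = 1024ull * 1024 * 1024;        // 1 GiB default to curb fragmentation
        } else {
            block_size = max_alloc;                    // otherwise: no extra limit
        }
        return std::min(block_size, max_alloc);        // never exceed the device limit
    }

For example, setting GGML_VK_SUBALLOCATION_BLOCK_SIZE=536870912 forces 512 MiB blocks on any platform.
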
@@ -7561,7 +7576,7 @@ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type
 
 static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->device->max_memory_allocation_size;
+    return ctx->device->suballocation_block_size;
 }
 
 static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
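
With ggml_backend_vk_buffer_type_get_max_size now returning suballocation_block_size, the generic allocator shown in the first hunk packs tensors into buffers of at most that size, so a large model ends up as several smaller device allocations instead of a few maximally sized ones. A toy illustration with assumed numbers:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t GiB        = 1024ull * 1024 * 1024;
        const uint64_t model_size = 10 * GiB; // assumed total tensor size
        const uint64_t block_size = 1 * GiB;  // the NVIDIA-on-Windows default above
        // ceil division: how many backend buffers ggml-alloc will request
        const uint64_t n_buffers  = (model_size + block_size - 1) / block_size;
        std::printf("%llu buffers of at most %llu bytes\n",
                    (unsigned long long) n_buffers, (unsigned long long) block_size);
        return 0;
    }
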