this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
}
- if (this_size > max_size) {
- GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
- __func__, t->name,
- ggml_backend_buft_name(buft),
- this_size, max_size);
- for (size_t i = 0; i < n_buffers; i++) {
- ggml_backend_buffer_free(buffers[i]);
- }
- free(buffers);
- return NULL;
- }
-
- if ((cur_buf_size + this_size) > max_size) {
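+ // only flush when the current buffer already holds tensors: a tensor larger than
+ // max_size is allowed through so that it ends up alone in a dedicated buffer,
+ // which the backend may still be able to allocate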
+ if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
// allocate tensors in the current buffer
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
vk::PhysicalDeviceProperties properties;
std::string name;
uint64_t max_memory_allocation_size;
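+ // maximum size reported for a single buffer; larger contexts are split across multiple allocations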
+ uint64_t suballocation_block_size;
bool fp16;
bool pipeline_robustness;
vk::Device device;
device->physical_device.getProperties2(&props2);
device->properties = props2.properties;
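+ // read the vendor ID early, it is needed below to choose the default suballocation block size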
+ device->vendor_id = device->properties.vendorID;
const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
}
- device->vendor_id = device->properties.vendorID;
+ const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE");
+
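+ // note: the NVIDIA branch below is compiled only on Windows; elsewhere the else-if
+ // vanishes at preprocessing time and the final else provides the default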
+ if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
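+ // the environment variable specifies the block size in bytes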
+ device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
+#if defined(_WIN32)
+ } else if (device->vendor_id == VK_VENDOR_ID_NVIDIA) {
+ // Limit batching of allocations to 1GB by default to avoid fragmentation issues
+ device->suballocation_block_size = 1024*1024*1024;
+#endif
+ } else {
+ device->suballocation_block_size = device->max_memory_allocation_size;
+ }
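+ // whichever path set it, the block size can never exceed the largest single allocation the device supports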
+ device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size);
+
device->subgroup_size = subgroup_props.subgroupSize;
device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
if (sm_builtins) {
static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
- return ctx->device->max_memory_allocation_size;
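+ // report the suballocation block size rather than the hard device limit so that
+ // ggml-alloc splits large contexts into several smaller buffers (see the allocator change above)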
+ return ctx->device->suballocation_block_size;
}
static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {