ggml: add env var GGML_OP_OFFLOAD_MIN_BATCH (#18535)

author Doctor Shotgun <redacted>

Thu, 8 Jan 2026 09:03:21 +0000 (01:03 -0800)

committer GitHub <redacted>

Thu, 8 Jan 2026 09:03:21 +0000 (11:03 +0200)
author Doctor Shotgun <redacted>
Thu, 8 Jan 2026 09:03:21 +0000 (01:03 -0800)
committer GitHub <redacted>
Thu, 8 Jan 2026 09:03:21 +0000 (11:03 +0200)
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp

index 162d238ae441833b5c53b9dd683de20000ebaf8b..d7a93848df89d4c2061e03812f5df9b9a9c2dc2c 100644 (file)
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2541,27 +2541,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
      return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
  }
  
-/**
- * @brief Determines if a tensor operation should be offloaded to the CANN
- * backend.
- *
- * This function checks if a given tensor operation should be offloaded to the
- * CANN backend based on the operation type and the size of the tensor. It
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
- *
- * @param backend Pointer to the CANN backend.
- * @param op Pointer to the tensor operation to check.
- * @return bool Returns true if the operation should be offloaded, otherwise
- * false.
- */
-static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    GGML_UNUSED(dev);
-
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
-}
-
  /**
   * @brief Records an event on the CANN backend stream.
   *
@@ -2637,6 +2616,7 @@ struct ggml_backend_cann_device_context {
      int         device;
      std::string name;
      std::string description;
+    int op_offload_min_batch_size;
  };
  
  static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
@@ -2713,6 +2693,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
      return ggml_backend_cann_host_buffer_type();
  }
  
+/**
+ * @brief Determines if a tensor operation should be offloaded to the CANN
+ * backend.
+ *
+ * This function checks if a given tensor operation should be offloaded to the
+ * CANN backend based on the operation type and the size of the tensor. It
+ * returns true if the second dimension (ne[1]) of the tensor is at least the
+ * device's op_offload_min_batch_size and the operation is not GGML_OP_GET_ROWS.
+ *
+ * @param dev Pointer to the CANN backend device.
+ * @param op Pointer to the tensor operation to check.
+ * @return bool Returns true if the operation should be offloaded, otherwise
+ * false.
+ */
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+    return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
+}
+
  /**
   * @brief Creates a new event for the CANN backend device.
   *
@@ -2829,12 +2829,14 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
          if (!initialized) {
              aclInit(nullptr);
              ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
  
              for (int i = 0; i < ggml_cann_info().device_count; i++) {
                  ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
                  dev_ctx->description                       = aclrtGetSocName();
                  dev_ctx->device                            = i;
                  dev_ctx->name                              = GGML_CANN_NAME + std::to_string(i);
+                dev_ctx->op_offload_min_batch_size         = min_batch_size;
                  ggml_cann_set_device(i);
                  ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface   = */ ggml_backend_cann_device_interface,
                                                                    /* .reg     = */ &reg,
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu

index bac69cdd1c8a49d7c224857e5b607c86b1180e59..f021de1d7456a2243e162d8d046043217ae8cf0d 100644 (file)
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4122,6 +4122,7 @@ struct ggml_backend_cuda_device_context {
      std::string name;
      std::string description;
      std::string pci_bus_id;
+    int op_offload_min_batch_size;
  };
  
  static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4676,11 +4677,9 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
  }
  
  static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-
-    return get_op_batch_size(op) >= min_batch_size;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
  
-    GGML_UNUSED(dev);
+    return get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
  }
  
  static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
@@ -4848,6 +4847,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
          std::lock_guard<std::mutex> lock(mutex);
          if (!initialized) {
              ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
  
              for (int i = 0; i < ggml_cuda_info().device_count; i++) {
                  ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -4861,6 +4861,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                  char pci_bus_id[16] = {};
                  snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
                  dev_ctx->pci_bus_id = pci_bus_id;
+                dev_ctx->op_offload_min_batch_size = min_batch_size;
  
                  ggml_backend_dev_t dev = new ggml_backend_device {
                      /* .iface   = */ ggml_backend_cuda_device_interface,
diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h

index d983b666ca2ec52786e8b98fd9b9cd8d1d9a3e50..9c3b00148789e97d7b3ef0482ce47ff2d1f2faaf 100644 (file)
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -219,6 +219,8 @@ struct ggml_metal_device_props {
      bool use_shared_buffers;
  
      bool supports_gpu_family_apple7;
+
+    int op_offload_min_batch_size;
  };
  
  ggml_metal_device_t ggml_metal_device_init(void);
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m

index 59badd00431a0dbef3187155caebfd1977605163..ff899a81709a4870c1cb9806588d80ce8061e812 100644 (file)
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -782,6 +782,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
  
              dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
  
+            dev->props.op_offload_min_batch_size  = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
+
              dev->props.max_buffer_size            = dev->mtl_device.maxBufferLength;
              dev->props.max_working_set_size       = dev->mtl_device.recommendedMaxWorkingSetSize;
              dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp

index 70bf6f3d981f88f948355f00db6c836083e4712c..56b59f0afdf75e70bae3c771eee1d084f4eba319 100644 (file)
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -625,14 +625,11 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
  }
  
  static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
  
      return (op->op == GGML_OP_MUL_MAT ||
              op->op == GGML_OP_MUL_MAT_ID) &&
-            get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(op);
+            get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
  }
  
  static ggml_backend_device_i ggml_backend_metal_device_i = {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp

index e996d98be8c2de2509a7b306c847f92bea7f8354..8f8176b678a22acdc57e019c711ee2d9d221100c 100644 (file)
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4286,6 +4286,7 @@ struct ggml_backend_sycl_device_context {
      int device;
      std::string name;
      std::string description;
+    int op_offload_min_batch_size;
  };
  
  static const char * ggml_backend_sycl_device_get_name(ggml_backend_dev_t dev) {
@@ -4674,9 +4675,8 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
  }
  
  static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    return get_op_batch_size(op) >= min_batch_size;
-    GGML_UNUSED(dev);
+    ggml_backend_sycl_device_context * sycl_ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return get_op_batch_size(op) >= sycl_ctx->op_offload_min_batch_size;
  }
  
  static ggml_backend_event_t
@@ -4799,6 +4799,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
          std::lock_guard<std::mutex> lock(mutex);
          if (!initialized) {
              ggml_backend_sycl_reg_context * ctx = new ggml_backend_sycl_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
  
              for (int i = 0; i < ggml_sycl_info().device_count; i++) {
                  ggml_backend_sycl_device_context * dev_ctx = new ggml_backend_sycl_device_context;
@@ -4812,6 +4813,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
                      prop, dpct::dev_mgr::instance().get_device(i))));
  
                  dev_ctx->description = prop.get_name();
+                dev_ctx->op_offload_min_batch_size = min_batch_size;
  
                  ggml_backend_dev_t dev = new ggml_backend_device {
                      /* .iface       = */ ggml_backend_sycl_device_interface,
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp

index d68735a040adbc1d1dc83402bb304fc5b4af1951..4d3c085f67af687c4a894d6ef2bc26bf9cb58160 100644 (file)
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -14249,6 +14249,7 @@ struct ggml_backend_vk_device_context {
      std::string description;
      bool is_integrated_gpu;
      std::string pci_bus_id;
+    int op_offload_min_batch_size;
  };
  
  static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
@@ -14820,12 +14821,10 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
  }
  
  static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;
  
-    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
-
-    UNUSED(dev);
+    return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
  }
  
  static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
@@ -14951,6 +14950,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
          static std::mutex mutex;
          std::lock_guard<std::mutex> lock(mutex);
          if (!initialized) {
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
              for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
                  ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
                  char desc[256];
@@ -14960,6 +14960,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                  ctx->description = desc;
                  ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
                  ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
+                ctx->op_offload_min_batch_size = min_batch_size;
                  devices.push_back(new ggml_backend_device {
                      /* .iface   = */ ggml_backend_vk_device_i,
                      /* .reg     = */ reg,
author	Doctor Shotgun <redacted>
	Thu, 8 Jan 2026 09:03:21 +0000 (01:03 -0800)
committer	GitHub <redacted>
	Thu, 8 Jan 2026 09:03:21 +0000 (11:03 +0200)
ggml/src/ggml-cann/ggml-cann.cpp		patch \| blob \| history
ggml/src/ggml-cuda/ggml-cuda.cu		patch \| blob \| history
ggml/src/ggml-metal/ggml-metal-device.h		patch \| blob \| history
ggml/src/ggml-metal/ggml-metal-device.m		patch \| blob \| history
ggml/src/ggml-metal/ggml-metal.cpp		patch \| blob \| history
ggml/src/ggml-sycl/ggml-sycl.cpp		patch \| blob \| history
ggml/src/ggml-vulkan/ggml-vulkan.cpp		patch \| blob \| history