llama : greatly reduce output buffer memory usage (llama/6122)

author compilade <redacted>

Tue, 26 Mar 2024 14:46:41 +0000 (10:46 -0400)

committer Georgi Gerganov <redacted>

Wed, 27 Mar 2024 11:20:00 +0000 (13:20 +0200)
author compilade <redacted>
Tue, 26 Mar 2024 14:46:41 +0000 (10:46 -0400)
committer Georgi Gerganov <redacted>
Wed, 27 Mar 2024 11:20:00 +0000 (13:20 +0200)
diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h

index 425c9b6ab2d6df4924ade1895be332dacfe50ca0..5d4a4ceb65c7e106bf2008ba82089fe8d4d7a83f 100644 (file)
--- a/include/ggml/ggml.h
+++ b/include/ggml/ggml.h
@@ -750,6 +750,7 @@ extern "C" {
      GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
      GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
      GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_empty     (const struct ggml_tensor * tensor);
      GGML_API           bool ggml_is_scalar    (const struct ggml_tensor * tensor);
      GGML_API           bool ggml_is_vector    (const struct ggml_tensor * tensor);
      GGML_API           bool ggml_is_matrix    (const struct ggml_tensor * tensor);
diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu

index 48232b6e18d6ce98f6c57dabdafc44297df9ff7b..be8e33a56c40f45dac8807b419925379c72f3c0f 100644 (file)
--- a/src/ggml-cuda.cu
+++ b/src/ggml-cuda.cu
@@ -2505,7 +2505,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
      for (int i = 0; i < cgraph->n_nodes; i++) {
          ggml_tensor * node = cgraph->nodes[i];
  
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
              continue;
          }
  
diff --git a/src/ggml-kompute.cpp b/src/ggml-kompute.cpp

index 81dd5067864ce73033a003efdcf54cb44eab4eea..407062e6fd47625d6cb2f78e17649f64387d2866 100644 (file)
--- a/src/ggml-kompute.cpp
+++ b/src/ggml-kompute.cpp
@@ -1430,6 +1430,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
              struct ggml_tensor * dst = gf->nodes[i];
              GGML_ASSERT(dst->data != nullptr);
  
+            if (ggml_is_empty(dst)) {
+                continue;
+            }
+
              switch (dst->op) {
                  case GGML_OP_NONE:
                  case GGML_OP_RESHAPE:
diff --git a/src/ggml-metal.m b/src/ggml-metal.m

index cbe22aa3792b460d376a69f26fa8ecd8f1a4f007..a08abbc2918028cc178d997a67946b8030295fd6 100644 (file)
--- a/src/ggml-metal.m
+++ b/src/ggml-metal.m
@@ -847,6 +847,10 @@ static enum ggml_status ggml_metal_graph_compute(
              struct ggml_tensor * src2 = gf->nodes[i]->src[2];
              struct ggml_tensor * dst  = gf->nodes[i];
  
+            if (ggml_is_empty(dst)) {
+                continue;
+            }
+
              switch (dst->op) {
                  case GGML_OP_NONE:
                  case GGML_OP_RESHAPE:
diff --git a/src/ggml-opencl.cpp b/src/ggml-opencl.cpp

index aa73d67df84b04197700e727e2b02693edf84591..b3f8b7eaf0a3b8f7e7ebfa5389611e9b1a165cf8 100644 (file)
--- a/src/ggml-opencl.cpp
+++ b/src/ggml-opencl.cpp
@@ -2234,6 +2234,11 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg
  static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
      for (int i = 0; i < graph->n_nodes; ++i) {
          ggml_tensor * node = graph->nodes[i];
+
+        if (ggml_is_empty(node)) {
+            continue;
+        }
+
          switch (node->op) {
              case GGML_OP_MUL_MAT:
                  ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
diff --git a/src/ggml-sycl.cpp b/src/ggml-sycl.cpp

index fc4d2964ccac9621e28d142cbfea8f2aa0f15c74..789ba97bfba39ac8a7a9b0064bdf549da1177f76 100644 (file)
--- a/src/ggml-sycl.cpp
+++ b/src/ggml-sycl.cpp
@@ -16973,7 +16973,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
      params.ith = 0;
      for (int i = 0; i < cgraph->n_nodes; i++) {
          ggml_tensor * node = cgraph->nodes[i];
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
              continue;
          }
  #ifndef NDEBUG
diff --git a/src/ggml-vulkan.cpp b/src/ggml-vulkan.cpp

index cbceaa19fbacde29d6b950c504edeccf92c11989..521a1314b35655dc6fc1e3ef42e013137eae1347 100644 (file)
--- a/src/ggml-vulkan.cpp
+++ b/src/ggml-vulkan.cpp
@@ -5566,7 +5566,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
      for (int i = 0; i < cgraph->n_nodes; i++) {
          ggml_tensor * node = cgraph->nodes[i];
  
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
              continue;
          }
  
diff --git a/src/ggml.c b/src/ggml.c

index a86b41c158558820c5921e0ad2c827dea874016e..eb469d0f7953d514bdf14a99009fdda0c2b00453 100644 (file)
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -2607,6 +2607,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
          tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }
  
+GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) { // true if the tensor has a zero-sized dimension
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) { // scan every entry of tensor->ne
+        if (tensor->ne[i] == 0) {
+            // a single dimension with no elements is enough to report the tensor as empty
+            return true;
+        }
+    }
+    return false; // no entry of tensor->ne is zero
+}
+
  bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
      static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  
@@ -2621,7 +2631,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
  static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
      static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  
-    return
+    return ggml_is_empty(t0) ? ggml_is_empty(t1) :
          (t1->ne[0]%t0->ne[0] == 0) &&
          (t1->ne[1]%t0->ne[1] == 0) &&
          (t1->ne[2]%t0->ne[2] == 0) &&
@@ -16114,7 +16124,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
  static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
      GGML_ASSERT(params);
  
-    if (tensor->op == GGML_OP_NONE) {
+    if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
          return;
      }
  
@@ -17983,6 +17993,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
  static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
      int n_tasks = 0;
  
+    if (ggml_is_empty(node)) {
+        // no need to multi-thread a no-op
+        n_tasks = 1;
+        return n_tasks;
+    }
+
      switch (node->op) {
          case GGML_OP_CPY:
          case GGML_OP_DUP:
author	compilade <redacted>
	Tue, 26 Mar 2024 14:46:41 +0000 (10:46 -0400)
committer	Georgi Gerganov <redacted>
	Wed, 27 Mar 2024 11:20:00 +0000 (13:20 +0200)
include/ggml/ggml.h		patch | blob | history
src/ggml-cuda.cu		patch | blob | history
src/ggml-kompute.cpp		patch | blob | history
src/ggml-metal.m		patch | blob | history
src/ggml-opencl.cpp		patch | blob | history
src/ggml-sycl.cpp		patch | blob | history
src/ggml-vulkan.cpp		patch | blob | history
src/ggml.c		patch | blob | history