vulkan: Optimize contiguous copies (#10254)

author Jeff Bolz <redacted>

Wed, 13 Nov 2024 06:58:57 +0000 (00:58 -0600)

committer GitHub <redacted>

Wed, 13 Nov 2024 06:58:57 +0000 (07:58 +0100)
author Jeff Bolz <redacted>
Wed, 13 Nov 2024 06:58:57 +0000 (00:58 -0600)
committer GitHub <redacted>
Wed, 13 Nov 2024 06:58:57 +0000 (07:58 +0100)
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp

index 6c4c922625121239f4074a9bf74dddfbd30db2a3..ec31e726af8e0defb5906e9037d4168d10f59d2c 100644 (file)
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -196,6 +196,7 @@ struct vk_device_struct {
      vk_pipeline pipeline_pad_f32;
      vk_pipeline pipeline_repeat_f32;
      vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
+    vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
      vk_pipeline pipeline_norm_f32;
      vk_pipeline pipeline_group_norm_f32;
      vk_pipeline pipeline_rms_norm_f32;
@@ -722,6 +723,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
          std::lock_guard<std::mutex> guard(compile_count_mutex);
          assert(compile_count > 0);
          compile_count--;
+
+        // "Progress bar" for shader compiles
+        static uint32_t total_compile_count = 0;
+        if ((total_compile_count++ % 10) == 0) {
+            std::cerr << ".";
+        }
      }
      compile_count_cond.notify_all();
  }
@@ -1200,6 +1207,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
  static void ggml_vk_load_shaders(vk_device& device) {
      VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
  
+    std::cerr << "ggml_vulkan: Compiling shaders";
+
      // mulmat
      std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
      std::initializer_list<uint32_t> warptile_m = { 128,  64,  64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
@@ -1759,6 +1768,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
      ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
      ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
  
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
      ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
      ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
  
@@ -1817,6 +1830,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
      for (auto &c : compiles) {
          c.wait();
      }
+    std::cerr << "Done!" << std::endl;
  }
  
  static vk_device ggml_vk_get_device(size_t idx) {
@@ -3061,18 +3075,34 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
          tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }
  
-static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
-        return ctx->device->pipeline_cpy_f32_f32;
+static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
+
+    // Choose the "contiguous copy" shader only when src is contiguous and dst is either null (then only src is checked) or contiguous; otherwise fall back to the generic copy shader
+    bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst));
+
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f32;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f32;
+        }
      }
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
-        return ctx->device->pipeline_cpy_f32_f16;
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f16;
+        }
      }
-    if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
-        return ctx->device->pipeline_cpy_f16_f16;
+    if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f16_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f16_f16;
+        }
      }
  
-    std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
+    std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
      GGML_ABORT("fatal error");
  }
  
@@ -3082,6 +3112,15 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
      const int tensor_type_size = ggml_type_size(tensor->type);
  
      const uint32_t ne = ggml_nelements(tensor);
+    std::array<uint32_t, 3> elements;
+
+    if (ne > 262144) {
+        elements = { 512, 512, CEIL_DIV(ne, 262144) };
+    } else if (ne > 512) {
+        elements = { 512, CEIL_DIV(ne, 512), 1 };
+    } else {
+        elements = { ne, 1, 1 };
+    }
  
      const vk_op_unary_push_constants pc = {
          (uint32_t)ne,
@@ -3091,7 +3130,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
          0.0f, 0.0f,
      };
      ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
  }
  
  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -3176,12 +3215,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
      vk_pipeline to_fp16_vk_1 = nullptr;
  
      if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
      } else {
          to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
      }
      if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
      } else {
          to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
      }
@@ -3361,10 +3400,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
      vk_pipeline to_fp16_vk_0 = nullptr;
      vk_pipeline to_fp16_vk_1 = nullptr;
      if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
      }
      if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
      } else {
          to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
      }
@@ -3745,12 +3784,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
      vk_pipeline to_fp16_vk_1 = nullptr;
  
      if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
      } else {
          to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
      }
      if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
      } else {
          to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
      }
@@ -3938,10 +3977,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
      vk_pipeline to_fp16_vk_0 = nullptr;
      vk_pipeline to_fp16_vk_1 = nullptr;
      if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
      }
      if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
      } else {
          to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
      }
@@ -4148,7 +4187,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
      case GGML_OP_CPY:
      case GGML_OP_CONT:
      case GGML_OP_DUP:
-        return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
+        return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
      case GGML_OP_NORM:
          if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
              return ctx->device->pipeline_norm_f32;
@@ -4281,7 +4320,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
      case GGML_OP_DIV:
      case GGML_OP_CONCAT:
      case GGML_OP_UPSCALE:
-    case GGML_OP_SCALE:
      case GGML_OP_SQR:
      case GGML_OP_SIN:
      case GGML_OP_COS:
diff --git a/ggml/src/vulkan-shaders/clamp.comp b/ggml/src/vulkan-shaders/clamp.comp

index 7071302a4b65828088a23acbb6126732b0c227de..ae8fa8753daddd8cf57a74b5ef9679097c44a2cc 100644 (file)
--- a/ggml/src/vulkan-shaders/clamp.comp
+++ b/ggml/src/vulkan-shaders/clamp.comp
@@ -3,6 +3,8 @@
  #include "types.comp"
  #include "generic_unary_head.comp"
  
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
  void main() {
      const uint idx = get_idx();
  
diff --git a/ggml/src/vulkan-shaders/contig_copy.comp b/ggml/src/vulkan-shaders/contig_copy.comp

new file mode 100644 (file)

index 0000000..9acbdd3
--- /dev/null
+++ b/ggml/src/vulkan-shaders/contig_copy.comp
@@ -0,0 +1,42 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+#extension GL_EXT_control_flow_attributes : require
+
+const uint num_threads = 128;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    uint idx = get_idx();
+
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 4;
+
+    // fast path: idx only increases, so if the last iteration's index (idx + (num_iter-1)*num_threads) is in-bounds then all num_iter iterations are, and the per-iteration bounds check can be skipped
+    if (idx + (num_iter-1)*num_threads < p.ne) {
+        [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+            data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
+#else
+            data_d[p.d_offset + idx] = data_a[idx];
+#endif
+            idx += num_threads;
+        }
+    } else {
+        [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+            if (idx >= p.ne) {
+                continue; // idx is not advanced here, so every remaining iteration is skipped as well
+            }
+
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+            data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
+#else
+            data_d[p.d_offset + idx] = data_a[idx];
+#endif
+            idx += num_threads;
+        }
+    }
+}
diff --git a/ggml/src/vulkan-shaders/copy.comp b/ggml/src/vulkan-shaders/copy.comp

index c26917c0f9af5438fc48718413deed5a9be8d264..2775068f9ab86c131ad8ac1bfde7356b96f25c75 100644 (file)
--- a/ggml/src/vulkan-shaders/copy.comp
+++ b/ggml/src/vulkan-shaders/copy.comp
@@ -3,6 +3,8 @@
  #include "types.comp"
  #include "generic_unary_head.comp"
  
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
  void main() {
      const uint idx = get_idx();
  
diff --git a/ggml/src/vulkan-shaders/cos.comp b/ggml/src/vulkan-shaders/cos.comp

index f9a858cbf16ce2e4b837b018995286d1e735af22..fbd9d272c3336172c753db94d80f683359c413cd 100644 (file)
--- a/ggml/src/vulkan-shaders/cos.comp
+++ b/ggml/src/vulkan-shaders/cos.comp
@@ -3,6 +3,8 @@
  #include "types.comp"
  #include "generic_unary_head.comp"
  
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
  void main() {
      const uint idx = get_idx();
  
diff --git a/ggml/src/vulkan-shaders/generic_unary_head.comp b/ggml/src/vulkan-shaders/generic_unary_head.comp

index eacdefc7d8aa765e4c67b8fe6551b9eddd123b6f..4e1fa3af3ad62959e7bb1f8ea2446d81cae86cc3 100644 (file)
--- a/ggml/src/vulkan-shaders/generic_unary_head.comp
+++ b/ggml/src/vulkan-shaders/generic_unary_head.comp
@@ -1,4 +1,5 @@
  #extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_control_flow_attributes : require
  
  layout (push_constant) uniform parameter
  {
@@ -9,8 +10,6 @@ layout (push_constant) uniform parameter
      float param1; float param2;
  } p;
  
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
  layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
  layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
  
diff --git a/ggml/src/vulkan-shaders/pad.comp b/ggml/src/vulkan-shaders/pad.comp

index a465cd52bcfa81d69f2e4d3e4b347f8b020f1f8e..e87d8b18b1ee1283946dd35e40f7b3b0add55ca2 100644 (file)
--- a/ggml/src/vulkan-shaders/pad.comp
+++ b/ggml/src/vulkan-shaders/pad.comp
@@ -3,6 +3,8 @@
  #include "types.comp"
  #include "generic_unary_head.comp"
  
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
  void main() {
      const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
  
diff --git a/ggml/src/vulkan-shaders/repeat.comp b/ggml/src/vulkan-shaders/repeat.comp

index a86af87e7b7f9d5d886a749cc861266f98ff35b7..c03f737cc1d951bedc6f4c0c501cf6a152d22508 100644 (file)
--- a/ggml/src/vulkan-shaders/repeat.comp
+++ b/ggml/src/vulkan-shaders/repeat.comp
@@ -3,6 +3,8 @@
  #include "types.comp"
  #include "generic_unary_head.comp"
  
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
  uint src0_idx_mod(uint idx) {
      const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
      const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
diff --git a/ggml/src/vulkan-shaders/scale.comp b/ggml/src/vulkan-shaders/scale.comp

index 5cd2f668d01f3efe802c454ce225f046a31b0b7a..5cfee8c3bdbea46f6517f55af18be01e0d49f31b 100644 (file)
--- a/ggml/src/vulkan-shaders/scale.comp
+++ b/ggml/src/vulkan-shaders/scale.comp
@@ -3,12 +3,22 @@
  #include "types.comp"
  #include "generic_unary_head.comp"
  
+const uint num_threads = 128;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
  void main() {
-    const uint idx = get_idx();
+    uint idx = get_idx();
  
-    if (idx >= p.ne) {
-        return;
-    }
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation; each invocation handles up to num_iter elements spaced num_threads apart
+    const uint num_iter = 4;
  
-    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
+    [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+        if (idx >= p.ne) {
+            continue; // idx is not advanced here, so every remaining iteration is skipped as well
+        }
+
+        data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1)); // indexes both buffers directly by idx, without the src0_idx/dst_idx remapping
+        idx += num_threads;
+    }
  }
diff --git a/ggml/src/vulkan-shaders/sin.comp b/ggml/src/vulkan-shaders/sin.comp

index 7faf9be9362bfc62059b474fd6be5c03eb236ef6..67c48fb9aa01b43715c07132326755c750c5ff0b 100644 (file)
--- a/ggml/src/vulkan-shaders/sin.comp
+++ b/ggml/src/vulkan-shaders/sin.comp
@@ -3,6 +3,8 @@
  #include "types.comp"
  #include "generic_unary_head.comp"
  
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
  void main() {
      const uint idx = get_idx();
  
diff --git a/ggml/src/vulkan-shaders/square.comp b/ggml/src/vulkan-shaders/square.comp

index 1fa118c996e04e12f07a67f927f5791464028685..2ff48ddc53bf66e1ec3d69e41594d96793db5e6a 100644 (file)
--- a/ggml/src/vulkan-shaders/square.comp
+++ b/ggml/src/vulkan-shaders/square.comp
@@ -3,6 +3,8 @@
  #include "types.comp"
  #include "generic_unary_head.comp"
  
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
  void main() {
      const uint idx = get_idx();
  
diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp

index 477355c29304bf93e26ecd33a38bb8150ef29e31..5c84f473fc05ac685af04e92c85716894dbd4995 100644 (file)
--- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -350,6 +350,9 @@ void process_shaders() {
      string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
      string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
      string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+    string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
  
      string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
      string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp

index 65be4328148286805c3eaae25848a1ae57496858..6618d03d150a00316b6aad419c48cc4df1d2e587 100644 (file)
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -681,6 +681,7 @@ struct test_case {
  
          // run
          int64_t total_time_us = 0;
+        int64_t total_mem = 0;
          int total_runs = 0;
          do {
              int64_t start_time = ggml_time_us();
@@ -688,6 +689,7 @@ struct test_case {
              int64_t end_time = ggml_time_us();
  
              total_time_us += end_time - start_time;
+            total_mem += mem;
              total_runs += n_runs;
          } while (total_time_us < 1000*1000); // run for at least 1 second
  
@@ -717,7 +719,7 @@ struct test_case {
          } else {
              printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
                  op_size(out) / 1024,
-                mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
+                total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
          }
          printf("\n");
  
@@ -2740,6 +2742,13 @@ struct test_flash_attn_ext : public test_case {
          return 5e-4;
      }
  
+    uint64_t op_flops(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        // Just counting matmul costs:
+        // Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head (the leading 2 * 2 is presumably 2 matmuls x 2 FLOPs per multiply-add -- TODO confirm)
+        return 2 * 2 * nh * nb * hs * kv;
+    }
+
      test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
                          bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
          : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
@@ -3779,6 +3788,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
      test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1,   1, 1, 1}));
      test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
  
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
+
      for (int bs : {1, 512}) {
          for (ggml_type type_a : all_types) {
              for (ggml_type type_b : {GGML_TYPE_F32}) {
author	Jeff Bolz <redacted>
	Wed, 13 Nov 2024 06:58:57 +0000 (00:58 -0600)
committer	GitHub <redacted>
	Wed, 13 Nov 2024 06:58:57 +0000 (07:58 +0100)
ggml/src/ggml-vulkan.cpp		patch \| blob \| history
ggml/src/vulkan-shaders/clamp.comp		patch \| blob \| history
ggml/src/vulkan-shaders/contig_copy.comp	[new file with mode: 0644]	patch \| blob
ggml/src/vulkan-shaders/copy.comp		patch \| blob \| history
ggml/src/vulkan-shaders/cos.comp		patch \| blob \| history
ggml/src/vulkan-shaders/generic_unary_head.comp		patch \| blob \| history
ggml/src/vulkan-shaders/pad.comp		patch \| blob \| history
ggml/src/vulkan-shaders/repeat.comp		patch \| blob \| history
ggml/src/vulkan-shaders/scale.comp		patch \| blob \| history
ggml/src/vulkan-shaders/sin.comp		patch \| blob \| history
ggml/src/vulkan-shaders/square.comp		patch \| blob \| history
ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp		patch \| blob \| history
tests/test-backend-ops.cpp		patch \| blob \| history