llava: Add ACC OP for GPU acceleration to the Vulkan backend in the LLAVA CLIP model...

author Changyeon Kim <redacted>

Tue, 20 Aug 2024 19:00:00 +0000 (04:00 +0900)

committer GitHub <redacted>

Tue, 20 Aug 2024 19:00:00 +0000 (21:00 +0200)
author Changyeon Kim <redacted>
Tue, 20 Aug 2024 19:00:00 +0000 (04:00 +0900)
committer GitHub <redacted>
Tue, 20 Aug 2024 19:00:00 +0000 (21:00 +0200)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp

index 342042ffba63c3ec86dd3eb75f7295f31e0754fa..94d8294cd02cc3f8a7847fa487ab410798b57710 100644 (file)
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -20,6 +20,10 @@
  #include "ggml-cann.h"
  #endif
  
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
  #define STB_IMAGE_IMPLEMENTATION
  #include "stb_image.h"
  
@@ -1142,6 +1146,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
      LOG_TEE("%s: CLIP using CANN backend\n", __func__);
  #endif
  
+#ifdef GGML_USE_VULKAN
+    new_clip->backend = ggml_backend_vk_init(0);
+    LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
+#endif
  
      if (!new_clip->backend) {
          new_clip->backend = ggml_backend_cpu_init();
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp

index 7a0ec706f4c3105c4de9d09b870a71093bd7cc44..32fda32a879ba9d45eb29bbe7e7f8000634e6b2c 100644 (file)
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -180,6 +180,7 @@ struct vk_device_struct {
      vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
      vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
      vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_acc_f32;
      vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16;
      vk_pipeline pipeline_mul_f32;
      vk_pipeline pipeline_div_f32;
@@ -1687,6 +1688,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
      ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
      ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
  
+    ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+
      ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
      ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
  
@@ -3971,6 +3974,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
              return ctx->device->pipeline_get_rows_f32[src0->type];
          }
          return nullptr;
+    case GGML_OP_ACC:
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_acc_f32;
+        }
+        return nullptr;
      case GGML_OP_ADD:
          if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
              return ctx->device->pipeline_add_f32;
@@ -4463,6 +4471,28 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
      }, dryrun);
  }
  
+static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { // Dispatches GGML_OP_ACC through ggml_vk_op_f32 using binary-op push constants.
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; // remainder of dst's buffer offset modulo the storage-buffer alignment, expressed in dst elements
+
+    int nb1 = dst->op_params[0] / 4; // op_params[0] divided by 4 (bytes per float32); passed in the nb[1] position for src0 and dst below
+    int nb2 = dst->op_params[1] / 4; // op_params[1] divided by 4 (bytes per float32); passed in the nb[2] position for src0 and dst below
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // op_params[3] is the offset in bytes; dividing by 4 converts it to float32 elements
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ACC, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] /  dst_type_size,
+        d_offset,
+        0.0f, 0.0f, offset, // two float slots set to 0.0f; offset is presumably what acc.comp reads as p.param3 - confirm against vk_op_binary_push_constants
+    }, dryrun);
+}
+
  static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
      const uint32_t src0_type_size = ggml_type_size(src0->type);
      const uint32_t src1_type_size = ggml_type_size(src1->type);
@@ -5621,6 +5651,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
      case GGML_OP_REPEAT:
      case GGML_OP_GET_ROWS:
      case GGML_OP_ADD:
+    case GGML_OP_ACC:
      case GGML_OP_MUL:
      case GGML_OP_DIV:
      case GGML_OP_CONCAT:
@@ -5668,6 +5699,10 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
      case GGML_OP_REPEAT:
          ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun);
  
+        break;
+    case GGML_OP_ACC:
+        ggml_vk_acc(ctx, compute_ctx, src0, src1, node, dryrun);
+
          break;
      case GGML_OP_GET_ROWS:
          ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node, dryrun);
@@ -5808,6 +5843,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
  
      switch (tensor->op) {
      case GGML_OP_ADD:
+    case GGML_OP_ACC:
      case GGML_OP_GET_ROWS:
      case GGML_OP_MUL:
      case GGML_OP_DIV:
@@ -6539,6 +6575,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
          case GGML_OP_GROUP_NORM:
          case GGML_OP_RMS_NORM:
          case GGML_OP_ADD:
+        case GGML_OP_ACC:
          case GGML_OP_MUL:
          case GGML_OP_DIV:
          case GGML_OP_CONCAT:
@@ -6995,6 +7032,8 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
          tensor_clone = ggml_repeat(ggml_ctx, src0_clone, src1_clone);
      } else if (tensor->op == GGML_OP_ADD) {
          tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
+    } else if (tensor->op == GGML_OP_ACC) {
+        tensor_clone = ggml_acc(ggml_ctx, src0_clone, src1_clone, tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
      } else if (tensor->op == GGML_OP_NORM) {
          tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
      } else if (tensor->op == GGML_OP_GROUP_NORM) {
diff --git a/ggml/src/vulkan-shaders/acc.comp b/ggml/src/vulkan-shaders/acc.comp

new file mode 100644 (file)

index 0000000..4c8739e
--- /dev/null
+++ b/ggml/src/vulkan-shaders/acc.comp
@@ -0,0 +1,24 @@
+#version 450
+
+#include "types.comp"
+#include "generic_binary_head.comp"
+
+void main() { // ACC shader: writes src0 to dst, adding the src1 element wherever the shifted index falls inside src1's extents.
+    const uint idx = gl_GlobalInvocationID.x; // flat element index handled by this invocation
+    if (idx >= p.ne) { // invocations past the element count do nothing
+        return;
+    }
+
+    const uint offset = p.param3; // start offset of the accumulated region, in elements
+    const uint src1_i = idx - offset; // NOTE(review): unsigned subtraction wraps when idx < offset; the extent check below appears to be relied on to reject those - confirm
+    const uint oz = src1_i / p.nb02; // split the shifted index into (ox, oy, oz) using the nb01 / nb02 strides
+    const uint oy = (src1_i - (oz * p.nb02)) / p.nb01;
+    const uint ox = src1_i % p.nb01;
+
+    if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) { // inside src1: data_b is indexed as a densely packed ne10 x ne11 x ne12 array
+        data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
+    } else { // outside src1: pass the src0 value through unchanged
+        data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]));
+    }
+}
+
diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp

index 53ceb13d30fde9e6946a59ddfb360ae4bbe17972..89ac99f29696bad9f42e59a1df03de3c729c31d8 100644 (file)
--- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -368,6 +368,10 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
          string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
      }));
  
+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+    }));
+
      tasks.push_back(std::async(std::launch::async, [] {
          string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
      }));
author	Changyeon Kim <redacted>
	Tue, 20 Aug 2024 19:00:00 +0000 (04:00 +0900)
committer	GitHub <redacted>
	Tue, 20 Aug 2024 19:00:00 +0000 (21:00 +0200)
examples/llava/clip.cpp		patch \| blob \| history
ggml/src/ggml-vulkan.cpp		patch \| blob \| history
ggml/src/vulkan-shaders/acc.comp	[new file with mode: 0644]	patch \| blob
ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp		patch \| blob \| history