vk_pipeline pipeline_sum_rows_f32;
vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
vk_pipeline pipeline_timestep_embedding_f32;
+ vk_pipeline pipeline_pool2d_f32;
std::unordered_map<std::string, vk_pipeline_ref> pipelines;
std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
uint32_t max_period;
};
+// Push constants for the pool2d_f32 compute shader.
+// NOTE(review): field order, types and packing must mirror the
+// layout(push_constant) uniform block in pool2d.comp exactly — values are
+// supplied positionally from ggml_vk_pool_2d(), so any change here must be
+// made in the shader (and the host initializer) as well.
+struct vk_op_pool2d_push_constants {
+ uint32_t IW; uint32_t IH; // input spatial extent (src0->ne[0], src0->ne[1])
+ uint32_t OW; uint32_t OH; // output spatial extent (dst->ne[0], dst->ne[1])
+ uint32_t OC; // channel count (dst->ne[2])
+ uint32_t pelements; // total output elements N*OC*OH*OW; one invocation each
+ uint32_t op; // pooling mode, matches shader defines (0 = MAX, 1 = AVG)
+ int32_t k0; int32_t k1; // kernel size; shader applies k0 along H, k1 along W
+ int32_t s0; int32_t s1; // strides (same axis convention as k0/k1)
+ int32_t p0; int32_t p1; // paddings (same axis convention as k0/k1)
+};
+
// Allow pre-recording command buffers
struct vk_staging_memcpy {
vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
+
for (auto &c : compiles) {
c.wait();
}
return ctx->device->pipeline_timestep_embedding_f32;
}
return nullptr;
+ case GGML_OP_POOL_2D:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_pool2d_f32;
+ }
+ return nullptr;
case GGML_OP_LEAKY_RELU:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_leaky_relu_f32;
uint32_t half_ceil = (dim + 1) / 2;
elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
} break;
+ case GGML_OP_POOL_2D:
+ {
+ const uint32_t N = dst->ne[3];
+ const uint32_t OC = dst->ne[2];
+ const uint32_t OH = dst->ne[1];
+ const uint32_t OW = dst->ne[0];
+ elements = { N * OC * OH * OW, 1, 1};
+ } break;
case GGML_OP_ADD:
case GGML_OP_DIV:
case GGML_OP_MUL:
}, dryrun);
}
+// Record a GGML_OP_POOL_2D dispatch (F32 in / F32 out). One shader
+// invocation computes one output element; total work = N*OC*OH*OW.
+static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+ uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
+ // NOTE(review): ggml stores op_params as { op, k0, k1, s0, s1, p0, p1 }
+ // with "0" meaning the width axis (see the ggml_pool_2d clone in the
+ // check-results path). The assignments below deliberately cross-wire the
+ // names (k1 <- params[1], k0 <- params[2], ...) because the shader uses
+ // its k0/s0/p0 along the *height* axis and k1/s1/p1 along the *width*
+ // axis — keep host and shader in sync if either side changes.
+ const int32_t k1 = dst->op_params[1];
+ const int32_t k0 = dst->op_params[2];
+ const int32_t s1 = dst->op_params[3];
+ const int32_t s0 = dst->op_params[4];
+ const int32_t p1 = dst->op_params[5];
+ const int32_t p0 = dst->op_params[6];
+
+ // Input spatial extent.
+ const uint32_t IH = src0->ne[1];
+ const uint32_t IW = src0->ne[0];
+
+ const uint32_t N = dst->ne[3];
+
+ // Output extent: channels, height, width.
+ const uint32_t OC = dst->ne[2];
+ const uint32_t OH = dst->ne[1];
+ const uint32_t OW = dst->ne[0];
+
+ // One shader invocation per output element.
+ const uint32_t parallel_elements = N * OC * OH * OW;
+
+ // Initializer order must match vk_op_pool2d_push_constants exactly.
+ ggml_vk_op_f32<vk_op_pool2d_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_POOL_2D, {
+ IW, IH, OW, OH, OC,
+ parallel_elements,
+ op,
+ k0, k1, s0, s1, p0, p1,
+ }, dryrun);
+}
+
static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const float * op_params = (const float *)dst->op_params;
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
case GGML_OP_SUM_ROWS:
case GGML_OP_IM2COL:
case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_POOL_2D:
case GGML_OP_LEAKY_RELU:
break;
default:
case GGML_OP_TIMESTEP_EMBEDDING:
ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
+ break;
+ case GGML_OP_POOL_2D:
+ ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
+
break;
case GGML_OP_LEAKY_RELU:
ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);
case GGML_OP_SUM_ROWS:
case GGML_OP_IM2COL:
case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_POOL_2D:
case GGML_OP_LEAKY_RELU:
case GGML_OP_REPEAT:
buf = tensor->buffer;
case GGML_OP_SUM_ROWS:
case GGML_OP_IM2COL:
case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_POOL_2D:
case GGML_OP_LEAKY_RELU:
return true;
default:
const int32_t dim = tensor->op_params[0];
const int32_t max_period = tensor->op_params[1];
tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
+ } else if (tensor->op == GGML_OP_POOL_2D) {
+ // Rebuild the reference CPU op from this node's own parameters.
+ // Fix: read op from tensor->op_params, not dst->op_params — every
+ // other param read in this arm (and the sibling TIMESTEP_EMBEDDING
+ // arm) uses `tensor`, which is the node being cloned here.
+ enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
+ const int32_t k0 = tensor->op_params[1];
+ const int32_t k1 = tensor->op_params[2];
+ const int32_t s0 = tensor->op_params[3];
+ const int32_t s1 = tensor->op_params[4];
+ const int32_t p0 = tensor->op_params[5];
+ const int32_t p1 = tensor->op_params[6];
+
+ tensor_clone = ggml_pool_2d(ggml_ctx, src0_clone, op, k0, k1, s0, s1, p0, p1);
} else if (tensor->op == GGML_OP_LEAKY_RELU) {
const float * op_params = (const float *)tensor->op_params;
tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
--- /dev/null
+#version 450
+
+#include "types.comp"
+
+#extension GL_EXT_shader_16bit_storage : require
+
+// Push constants — must match vk_op_pool2d_push_constants on the host
+// (same field order and scalar types).
+layout(push_constant) uniform parameter {
+ uint IW; uint IH;
+ uint OW; uint OH;
+ uint OC;
+ uint pelements;
+ uint op;
+ int k0; int k1;
+ int s0; int s1;
+ int p0; int p1;
+} p;
+
+#define BLOCK_SIZE 512
+#define FLT_MAX 3.402823466e+38F
+#define OP_POOL_MAX 0u
+#define OP_POOL_AVG 1u
+
+layout (local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout(binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+// One invocation computes one output element of a 2D max/avg pool.
+// NOTE(review): k0/s0/p0 are applied along the height axis and k1/s1/p1
+// along the width axis here; the host swaps ggml's width-first parameters
+// accordingly before pushing them.
+void main() {
+ const uint idx = gl_GlobalInvocationID.x;
+ if (idx >= p.pelements) {
+ return;
+ }
+
+ const uint O_HW = p.OW * p.OH;
+
+ // Decompose the flat index into (batch*channel, output row, output col).
+ const uint nc = idx / O_HW;
+ const uint cur_oh = (idx % O_HW) / p.OW;
+ const uint cur_ow = (idx % O_HW) % p.OW;
+
+ // Pooling window [bh, eh) x [bw, ew), clamped to the input extent.
+ const int start_h = int(cur_oh) * p.s0 - p.p0;
+ const uint bh = max(start_h, 0);
+ const uint eh = min(start_h + p.k0, p.IH);
+
+ const int start_w = int(cur_ow) * p.s1 - p.p1;
+ const uint bw = max(start_w, 0);
+ const uint ew = min(start_w + p.k1, p.IW);
+
+ // AVG divides by the full kernel area k0*k1 even when the window is
+ // clipped by padding (padded positions count as zero in the average).
+ const float scale = 1.0 / float(p.k0 * p.k1);
+ float res;
+
+ if (p.op == OP_POOL_AVG) {
+ res = 0.0;
+ } else if (p.op == OP_POOL_MAX) {
+ res = -FLT_MAX;
+ } else {
+ return; // unknown op: write nothing
+ }
+
+ #pragma unroll
+ for (uint i = bh; i < eh; i++) {
+ #pragma unroll
+ for (uint j = bw; j < ew; j++) {
+ // Load and convert the input element (A_TYPE may be f16).
+ const float cur = D_TYPE(data_a[nc * p.IH * p.IW + i * p.IW + j]);
+
+ if (p.op == OP_POOL_AVG) {
+ res += cur * scale;
+ } else if (p.op == OP_POOL_MAX) {
+ res = max(res, cur);
+ }
+ }
+ }
+
+ data_d[nc * O_HW + cur_oh * p.OW + cur_ow] = res;
+}