GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
return NULL;
}
-
return aligned_memory;
}
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
*s = idx;
}
+//
+// data types
+//
+
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"NONE",
"ARGMAX",
"REPEAT",
"REPEAT_BACK",
+ "CONCAT",
"SILU_BACK",
"NORM",
"RMS_NORM",
"RMS_NORM_BACK",
+ "GROUP_NORM",
"MUL_MAT",
"OUT_PROD",
"CLAMP",
"CONV_1D",
"CONV_2D",
+ "CONV_TRANSPOSE_2D",
"POOL_1D",
"POOL_2D",
+ "UPSCALE",
"FLASH_ATTN",
"FLASH_FF",
"FLASH_ATTN_BACK",
"WIN_PART",
"WIN_UNPART",
+ "GET_REL_POS",
+ "ADD_REL_POS",
"UNARY",
"MAP_UNARY",
"MAP_BINARY",
+ "MAP_CUSTOM1_F32",
+ "MAP_CUSTOM2_F32",
+ "MAP_CUSTOM3_F32",
+
"MAP_CUSTOM1",
"MAP_CUSTOM2",
"MAP_CUSTOM3",
"CROSS_ENTROPY_LOSS_BACK",
};
-static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
"argmax(x)",
"repeat(x)",
"repeat_back(x)",
+ "concat(x, y)",
"silu_back(x)",
"norm(x)",
"rms_norm(x)",
"rms_norm_back(x)",
+ "group_norm(x)",
"X*Y",
"X*Y",
"clamp(x)",
"conv_1d(x)",
"conv_2d(x)",
+ "conv_transpose_2d(x)",
"pool_1d(x)",
"pool_2d(x)",
+ "upscale(x)",
"flash_attn(x)",
"flash_ff(x)",
"flash_attn_back(x)",
"win_part(x)",
"win_unpart(x)",
+ "get_rel_pos(x)",
+ "add_rel_pos(x)",
"unary(x)",
"f(x)",
"f(x,y)",
+ "custom_f32(x)",
+ "custom_f32(x,y)",
+ "custom_f32(x,y,z)",
+
"custom(x)",
"custom(x,y)",
"custom(x,y,z)",
"cross_entropy_loss_back(x,y)",
};
-static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
p[GGML_OP_DIAG_MASK_ZERO ] = true;
p[GGML_OP_CONV_1D ] = true;
p[GGML_OP_CONV_2D ] = true;
+ p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
p[GGML_OP_FLASH_ATTN_BACK ] = true;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
+ p[GGML_OP_ADD_REL_POS ] = true;
}
{ // FINALIZE
return result;
}
+// ggml_concat
+
+struct ggml_tensor * ggml_concat(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b) {
+ GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
+
+ bool is_node = false;
+
+ if (a->grad || b->grad) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
+
+ result->op = GGML_OP_CONCAT;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = b;
+
+ return result;
+}
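+
+// Usage sketch (illustrative only, not exercised by this change): concatenation is
+// along dim 2, so all other dims of `a` and `b` must match. Assumes an initialized
+// ggml_context `ctx` with enough memory:
+//
+//   struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 4, 1);
+//   struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 8, 1);
+//   struct ggml_tensor * c = ggml_concat(ctx, a, b); // c->ne = { 16, 16, 12, 1 }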
+
// ggml_abs
struct ggml_tensor * ggml_abs(
return ggml_norm_impl(ctx, a, true);
}
+// ggml_rms_norm
+
static struct ggml_tensor * ggml_rms_norm_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
return ggml_rms_norm_impl(ctx, a, eps, true);
}
+// ggml_rms_norm_back
+
struct ggml_tensor * ggml_rms_norm_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
return result;
}
+// ggml_group_norm
+
+static struct ggml_tensor * ggml_group_norm_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_groups,
+ bool inplace) {
+
+ bool is_node = false;
+ if (!inplace && (a->grad)) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ result->op = GGML_OP_GROUP_NORM;
+ result->op_params[0] = n_groups;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = NULL; // TODO: maybe store epsilon here?
+
+ return result;
+}
+
+struct ggml_tensor * ggml_group_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_groups) {
+ return ggml_group_norm_impl(ctx, a, n_groups, false);
+}
+
+struct ggml_tensor * ggml_group_norm_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_groups) {
+ return ggml_group_norm_impl(ctx, a, n_groups, true);
+}
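+
+// Usage sketch (illustrative): normalization is per group of channels, where the
+// ne[2] channels are split into `n_groups` groups. Assumes an initialized `ctx`:
+//
+//   // 64 channels of 32x32 activations, normalized in 8 groups of 8 channels each
+//   struct ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 32, 64, 1);
+//   struct ggml_tensor * y = ggml_group_norm(ctx, x, 8);
+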
// ggml_mul_mat
int n_ctx,
float freq_base,
float freq_scale,
+ float xpos_base,
+ bool xpos_down,
bool inplace) {
GGML_ASSERT(n_past >= 0);
bool is_node = false;
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
- int32_t params[6] = { n_past, n_dims, mode, n_ctx };
+ int32_t params[8] = { n_past, n_dims, mode, n_ctx };
memcpy(params + 4, &freq_base, sizeof(float));
memcpy(params + 5, &freq_scale, sizeof(float));
+ memcpy(params + 6, &xpos_base, sizeof(float));
+ memcpy(params + 7, &xpos_down, sizeof(bool));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE;
int n_dims,
int mode,
int n_ctx) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
}
struct ggml_tensor * ggml_rope_inplace(
int n_dims,
int mode,
int n_ctx) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
}
struct ggml_tensor * ggml_rope_custom(
int n_ctx,
float freq_base,
float freq_scale) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
}
struct ggml_tensor * ggml_rope_custom_inplace(
int n_ctx,
float freq_base,
float freq_scale) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
+}
+
+struct ggml_tensor * ggml_rope_xpos_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ float base,
+ bool down) {
+ return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
}
// ggml_rope_back
int n_past,
int n_dims,
int mode,
- int n_ctx) {
+ int n_ctx,
+ float freq_base,
+ float freq_scale,
+ float xpos_base,
+ bool xpos_down) {
GGML_ASSERT(n_past >= 0);
GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
- int32_t params[] = { n_past, n_dims, mode, n_ctx };
+ int32_t params[8] = { n_past, n_dims, mode, n_ctx };
+ memcpy(params + 4, &freq_base, sizeof(float));
+ memcpy(params + 5, &freq_scale, sizeof(float));
+ memcpy(params + 6, &xpos_base, sizeof(float));
+ memcpy(params + 7, &xpos_down, sizeof(bool));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE_BACK;
return result;
}
+// ggml_conv_1d_ph
+
+struct ggml_tensor * ggml_conv_1d_ph(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int s,
+ int d) {
+ return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
+}
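+// (in ggml_conv_1d_ph, "ph" means half padding: the padding defaults to half the kernel width, a->ne[0]/2)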
+
// ggml_conv_2d
struct ggml_tensor * ggml_conv_2d(
}
-// ggml_conv_1d_ph
+// ggml_conv_2d_sk_p0
-struct ggml_tensor * ggml_conv_1d_ph(
+struct ggml_tensor * ggml_conv_2d_sk_p0(
struct ggml_context * ctx,
struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s,
- int d) {
- return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
+ struct ggml_tensor * b) {
+ return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
+}
+
+// ggml_conv_2d_s1_ph
+
+struct ggml_tensor * ggml_conv_2d_s1_ph(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b) {
+ return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
}
+// ggml_conv_transpose_2d_p0
+
+static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
+ return (ins - 1) * s - 2 * p + ks;
+}
+
+struct ggml_tensor * ggml_conv_transpose_2d_p0(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int stride) {
+ GGML_ASSERT(a->ne[3] == b->ne[2]);
+
+ bool is_node = false;
+
+ if (a->grad || b->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ const int64_t ne[4] = {
+ ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
+ ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
+ a->ne[2], b->ne[3],
+ };
+
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+ result->op = GGML_OP_CONV_TRANSPOSE_2D;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = b;
+ result->src[2] = ggml_new_i32(ctx, stride);
+
+ return result;
+}
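+
+// Usage sketch (illustrative): the kernel `a` is (KW x KH x Cout x Cin) in F16 and
+// the input `b` is (W x H x Cin x N) in F32; with zero padding the output spatial
+// size is (in - 1)*stride + kernel. Assumes an initialized `ctx`:
+//
+//   struct ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 2, 2, 16, 32); // 2x2 kernel, 32 -> 16 channels
+//   struct ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 8, 8, 32, 1);
+//   struct ggml_tensor * y = ggml_conv_transpose_2d_p0(ctx, k, x, 2); // y: 16 x 16 x 16 x 1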
// ggml_pool_*
return result;
}
+// ggml_upscale
+
+static struct ggml_tensor * ggml_upscale_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int scale_factor) {
+ bool is_node = false;
+
+ if (a->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+ a->ne[0] * scale_factor,
+ a->ne[1] * scale_factor,
+ a->ne[2], a->ne[3]);
+
+ result->op = GGML_OP_UPSCALE;
+ result->op_params[0] = scale_factor;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = NULL;
+
+ return result;
+}
+
+struct ggml_tensor * ggml_upscale(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int scale_factor) {
+ return ggml_upscale_impl(ctx, a, scale_factor);
+}
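+
+// Usage sketch (illustrative): nearest-neighbour upscaling of the first two dims
+// by an integer factor, assuming an initialized `ctx`:
+//
+//   struct ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 32, 256, 1);
+//   struct ggml_tensor * y = ggml_upscale(ctx, x, 2); // y: 64 x 64 x 256 x 1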
+
// ggml_flash_attn
struct ggml_tensor * ggml_flash_attn(
return result;
}
+// ggml_get_rel_pos
+
+struct ggml_tensor * ggml_get_rel_pos(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int qh,
+ int kh) {
+ GGML_ASSERT(qh == kh);
+ GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
+
+ bool is_node = false;
+
+ if (a->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
+
+ result->op = GGML_OP_GET_REL_POS;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = NULL;
+
+ return result;
+}
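+
+// Usage sketch (illustrative, mirroring SAM's decomposed relative attention): `a`
+// (F16) holds 2*P-1 rows of learned relative-position embeddings of width C, and
+// the result is a (C, kh, qh) F16 table indexed by (query, key) position.
+// Placeholder names:
+//
+//   struct ggml_tensor * rw = ggml_get_rel_pos(ctx, rel_pos_w, W, W); // C x W x W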
+
+// ggml_add_rel_pos
+
+static struct ggml_tensor * ggml_add_rel_pos_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * pw,
+ struct ggml_tensor * ph,
+ bool inplace) {
+ GGML_ASSERT(ggml_are_same_shape(pw, ph));
+ GGML_ASSERT(ggml_is_contiguous(a));
+ GGML_ASSERT(ggml_is_contiguous(pw));
+ GGML_ASSERT(ggml_is_contiguous(ph));
+ GGML_ASSERT(ph->type == GGML_TYPE_F32);
+ GGML_ASSERT(pw->type == GGML_TYPE_F32);
+ GGML_ASSERT(pw->ne[3] == a->ne[2]);
+ GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
+ GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
+
+ bool is_node = false;
+
+ if (!inplace && (a->grad || pw->grad || ph->grad)) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+ ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
+
+ result->op = GGML_OP_ADD_REL_POS;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = pw;
+ result->src[2] = ph;
+
+ return result;
+}
+
+struct ggml_tensor * ggml_add_rel_pos(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * pw,
+ struct ggml_tensor * ph) {
+ return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
+}
+
+struct ggml_tensor * ggml_add_rel_pos_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * pw,
+ struct ggml_tensor * ph) {
+ return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
+}
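+
+// Usage sketch (illustrative): `pw` and `ph` are the per-axis relative-position
+// terms obtained by projecting the queries against the ggml_get_rel_pos() tables;
+// they are added into the raw attention scores before the softmax. Placeholder
+// names, shapes as enforced by the asserts in ggml_add_rel_pos_impl():
+//
+//   attn = ggml_add_rel_pos_inplace(ctx, attn, rel_w, rel_h);
+//   attn = ggml_soft_max_inplace(ctx, attn);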
+
// ggml_unary
static struct ggml_tensor * ggml_unary_impl(
}
}
+// ggml_compute_forward_concat
+
+static void ggml_compute_forward_concat_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ GGML_TENSOR_BINARY_OP_LOCALS;
+
+ // TODO: support for transposed / permuted tensors
+ GGML_ASSERT(nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
+ GGML_ASSERT(nb10 == sizeof(float));
+
+ for (int i3 = 0; i3 < ne3; i3++) {
+ for (int i2 = ith; i2 < ne2; i2 += nth) {
+ if (i2 < ne02) { // src0
+ for (int i1 = 0; i1 < ne1; i1++) {
+ for (int i0 = 0; i0 < ne0; i0++) {
+ const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
+
+ float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+ *y = *x;
+ }
+ }
+ } else { // src1
+ for (int i1 = 0; i1 < ne1; i1++) {
+ for (int i0 = 0; i0 < ne0; i0++) {
+ const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
+
+ float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+ *y = *x;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_concat(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_concat_f32(params, src0, src1, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
// ggml_compute_forward_abs
static void ggml_compute_forward_abs_f32(
}
}
+// ggml_compute_forward_rms_norm
+
static void ggml_compute_forward_rms_norm_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
}
}
-
static void ggml_compute_forward_rms_norm_back_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
}
}
-// ggml_compute_forward_mul_mat
+// ggml_compute_forward_group_norm
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- //const int64_t ne00 = src0->ne[0];
- //const int64_t ne01 = src0->ne[1];
+static void ggml_compute_forward_group_norm_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
- const int64_t ne10 = src1->ne[0];
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
- const int64_t ne0 = dst->ne[0];
- const int64_t ne1 = dst->ne[1];
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
- // TODO: find the optimal values for these
- if (ggml_is_contiguous(src0) &&
- ggml_is_contiguous(src1) &&
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
+ const int ith = params->ith;
+ const int nth = params->nth;
- /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
- return true;
- }
+ GGML_TENSOR_UNARY_OP_LOCALS;
- return false;
-}
-#endif
+ const float eps = 1e-6f; // TODO: make this a parameter
-static void ggml_compute_forward_mul_mat(
- const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- int64_t t0 = ggml_perf_time_us();
+ // TODO: optimize
+
+ int n_channels = src0->ne[2];
+ int n_groups = dst->op_params[0];
+ int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
+ for (int i = ith; i < n_groups; i+=nth) {
+ int start = i * n_channels_per_group;
+ int end = start + n_channels_per_group;
+ if (end > n_channels) {
+ end = n_channels;
+ }
+ int step = end - start;
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ ggml_float sum = 0.0;
+ for (int64_t i02 = start; i02 < end; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ sum += (ggml_float)x[i00];
+ }
+ }
+ }
+ float mean = sum / (ne00 * ne01 * step);
+ ggml_float sum2 = 0.0;
+
+ for (int64_t i02 = start; i02 < end; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+ float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ float v = x[i00] - mean;
+ y[i00] = v;
+ sum2 += (ggml_float)(v * v);
+ }
+ }
+ }
+ float variance = sum2 / (ne00 * ne01 * step);
+ const float scale = 1.0f / sqrtf(variance + eps);
+
+ for (int64_t i02 = start; i02 < end; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+ ggml_vec_scale_f32(ne00, y, scale);
+ }
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_group_norm(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_group_norm_f32(params, src0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
+// ggml_compute_forward_mul_mat
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+// helper function to determine if it is better to use BLAS or not
+// for large matrices, BLAS is faster
+static bool ggml_compute_forward_mul_mat_use_blas(
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ //const int64_t ne00 = src0->ne[0];
+ //const int64_t ne01 = src0->ne[1];
+
+ const int64_t ne10 = src1->ne[0];
+
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+
+ // TODO: find the optimal values for these
+ if (ggml_is_contiguous(src0) &&
+ ggml_is_contiguous(src1) &&
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
+
+ /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
+ return true;
+ }
+
+ return false;
+}
+#endif
+
+static void ggml_compute_forward_mul_mat(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS;
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
+ // broadcast factors
+ const int64_t r2 = ne12/ne02;
+ const int64_t r3 = ne13/ne03;
+
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
- // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
- // ref: https://github.com/ggerganov/ggml/pull/224
- GGML_ASSERT(ne02 == ne12);
- GGML_ASSERT(ne03 == ne13);
-
if (params->ith != 0) {
return;
}
return;
}
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ // broadcast src0 into src1 across 2nd,3rd dimension
+ const int64_t i03 = i13/r3;
+ const int64_t i02 = i12/r2;
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+ const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) {
float * const wdata = params->wdata;
size_t id = 0;
for (int64_t i01 = 0; i01 < ne01; ++i01) {
- to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
+ to_float((const char *) x + i01*nb01, wdata + id, ne00);
id += ne00;
}
assert(ne12 % ne02 == 0);
assert(ne13 % ne03 == 0);
- // broadcast factors
- const int64_t r2 = ne12/ne02;
- const int64_t r3 = ne13/ne03;
-
// block-tiling attempt
const int64_t blck_0 = 16;
const int64_t blck_1 = 16;
}
}
-
// ggml_compute_forward_clamp
static void ggml_compute_forward_clamp_f32(
float freq_base;
float freq_scale;
+ // these two only relevant for xPos RoPE:
+ float xpos_base;
+ bool xpos_down;
+
const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx = ((int32_t *) dst->op_params)[3];
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
assert(n_past >= 0);
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
+ // zeta scaling for xPos only:
+ float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
+ if (xpos_down) zeta = 1.0f / zeta;
theta *= theta_scale;
const float x0 = src[0];
const float x1 = src[1];
- dst_data[0] = x0*cos_theta - x1*sin_theta;
- dst_data[1] = x0*sin_theta + x1*cos_theta;
+ dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
+ dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
}
} else {
// TODO: this is probably wrong, but I can't figure it out ..
// dx = rope_back(dy, src1)
// src0 is dy, src1 contains options
+ float freq_base;
+ float freq_scale;
+
+ // these two only relevant for xPos RoPE:
+ float xpos_base;
+ bool xpos_down;
+
const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
+ const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
assert(n_past >= 0);
// row index used to determine which thread to use
int ir = 0;
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
const bool is_neox = mode & 2;
if (ir++ < ir0) continue;
if (ir > ir1) break;
- float theta = (float)p;
+ float theta = freq_scale * (float)p;
if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
+ // zeta scaling for xPos only:
+ float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
+ if (xpos_down) zeta = 1.0f / zeta;
theta *= theta_scale;
const float dy0 = dy[0];
const float dy1 = dy[1];
- dx[0] = dy0*cos_theta + dy1*sin_theta;
- dx[1] = - dy0*sin_theta + dy1*cos_theta;
+ dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
+ dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
}
} else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
}
}
+// ggml_compute_forward_conv_transpose_2d
+
+static void ggml_compute_forward_conv_transpose_2d(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ int64_t t0 = ggml_perf_time_us();
+ UNUSED(t0);
+
+ GGML_TENSOR_BINARY_OP_LOCALS;
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nk = ne00*ne01*ne02*ne03;
+
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+ GGML_ASSERT(nb10 == sizeof(float));
+
+ if (params->type == GGML_TASK_INIT) {
+ memset(params->wdata, 0, params->wsize);
+
+ // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
+ {
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
+ ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
+ }
+ }
+ }
+ }
+ }
+
+ // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
+ {
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+ for (int i12 = 0; i12 < ne12; i12++) {
+ for (int i11 = 0; i11 < ne11; i11++) {
+ const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
+ ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
+ for (int i10 = 0; i10 < ne10; i10++) {
+ dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
+ }
+ }
+ }
+ }
+
+ return;
+ }
+
+ if (params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int32_t stride = ((const int32_t*)(opt0->data))[0];
+
+ // total patches in dst
+ const int np = ne2;
+
+ // patches per thread
+ const int dp = (np + nth - 1)/nth;
+
+ // patch range for this thread
+ const int ip0 = dp*ith;
+ const int ip1 = MIN(ip0 + dp, np);
+
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+ ggml_fp16_t * const wdata_src = (ggml_fp16_t *) params->wdata + nk;
+
+ for (int i2 = ip0; i2 < ip1; i2++) { // Cout
+ float * dst_data = (float *)((char *) dst->data + i2*nb2);
+ ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
+ for (int i11 = 0; i11 < ne11; i11++) {
+ for (int i10 = 0; i10 < ne10; i10++) {
+ const int i1n = i11*ne10*ne12 + i10*ne12;
+ for (int i01 = 0; i01 < ne01; i01++) {
+ for (int i00 = 0; i00 < ne00; i00++) {
+ float v = 0;
+ ggml_vec_dot_f16(ne03, &v,
+ (ggml_fp16_t *) wdata_src + i1n,
+ (ggml_fp16_t *) wdata_kernel + i01*ne00*ne03 + i00*ne03);
+
+ dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
+ }
+ }
+ }
+ }
+ }
+}
+
// ggml_compute_forward_pool_1d_sk_p0
static void ggml_compute_forward_pool_1d_sk_p0(
ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
}
+// ggml_compute_forward_upscale
+
+static void ggml_compute_forward_upscale_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ GGML_TENSOR_UNARY_OP_LOCALS;
+
+ const int scale_factor = dst->op_params[0];
+
+ // TODO: optimize
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = ith; i02 < ne02; i02 += nth) {
+ for (int m = 0; m < dst->ne[1]; m++) {
+ int i01 = m / scale_factor;
+ for (int n = 0; n < dst->ne[0]; n++) {
+ int i00 = n / scale_factor;
+
+ const float * x = (float *)((char *) src0->data + i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+ float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
+
+ *y = *x;
+ }
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_upscale(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_upscale_f32(params, src0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
// ggml_compute_forward_flash_attn
}
}
+// ggml_compute_forward_get_rel_pos
+
+static void ggml_compute_forward_get_rel_pos_f16(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
+
+ GGML_TENSOR_UNARY_OP_LOCALS;
+
+ const int64_t w = ne1;
+
+ ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
+ ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data;
+
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
+ const int64_t pos = (w - i1 - 1) + i2;
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
+ dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_get_rel_pos(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
+// ggml_compute_forward_add_rel_pos
+
+static void ggml_compute_forward_add_rel_pos_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ const struct ggml_tensor * src2,
+ struct ggml_tensor * dst) {
+
+ const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
+ if (!inplace && params->type == GGML_TASK_INIT) {
+ memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
+ return;
+ }
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ int64_t t0 = ggml_perf_time_us();
+ UNUSED(t0);
+
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
+
+ float * src1_data = (float *) src1->data;
+ float * src2_data = (float *) src2->data;
+ float * dst_data = (float *) dst->data;
+
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ // total patches in dst
+ const int np = ne13;
+
+ // patches per thread
+ const int dp = (np + nth - 1)/nth;
+
+ // patch range for this thread
+ const int ip0 = dp*ith;
+ const int ip1 = MIN(ip0 + dp, np);
+
+ for (int64_t i13 = ip0; i13 < ip1; ++i13) {
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
+ const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
+ const int64_t jp0 = jp1 + i10;
+ const float src1_e = src1_data[jp0];
+ const float src2_e = src2_data[jp0];
+
+ const int64_t jdh = jp0 * ne10;
+ const int64_t jdw = jdh - (ne10 - 1) * i10;
+
+ for (int64_t j = 0; j < ne10; ++j) {
+ dst_data[jdh + j ] += src2_e;
+ dst_data[jdw + j*ne10] += src1_e;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_add_rel_pos(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ const struct ggml_tensor * src2,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+}
+
// ggml_compute_forward_map_unary
static void ggml_compute_forward_map_unary_f32(
{
ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
} break;
+ case GGML_OP_CONCAT:
+ {
+ ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
+ } break;
case GGML_OP_SILU_BACK:
{
ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
{
ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
} break;
+ case GGML_OP_GROUP_NORM:
+ {
+ ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
+ } break;
case GGML_OP_MUL_MAT:
{
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
{
ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
} break;
+ case GGML_OP_CONV_TRANSPOSE_2D:
+ {
+ ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+ } break;
case GGML_OP_POOL_1D:
{
ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
{
ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
} break;
+ case GGML_OP_UPSCALE:
+ {
+ ggml_compute_forward_upscale(params, tensor->src[0], tensor);
+ } break;
case GGML_OP_FLASH_ATTN:
{
const int32_t t = ggml_get_op_params_i32(tensor, 0);
{
ggml_compute_forward_unary(params, tensor->src[0], tensor);
} break;
+ case GGML_OP_GET_REL_POS:
+ {
+ ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
+ } break;
+ case GGML_OP_ADD_REL_POS:
+ {
+ ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+ } break;
case GGML_OP_MAP_UNARY:
{
ggml_unary_op_f32_t fun;
inplace);
}
} break;
+ case GGML_OP_CONCAT:
+ {
+ GGML_ASSERT(false); // TODO: implement
+ } break;
case GGML_OP_SILU_BACK:
{
GGML_ASSERT(false); // TODO: not implemented
{
GGML_ASSERT(false); // TODO: not implemented
} break;
+ case GGML_OP_GROUP_NORM:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
case GGML_OP_MUL_MAT:
{
// https://cs231n.github.io/optimization-2/#staged
const int n_dims = ((int32_t *) tensor->op_params)[1];
const int mode = ((int32_t *) tensor->op_params)[2];
const int n_ctx = ((int32_t *) tensor->op_params)[3];
+ float freq_base;
+ float freq_scale;
+ float xpos_base;
+ bool xpos_down;
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
+
src0->grad = ggml_add_impl(ctx,
src0->grad,
ggml_rope_back(ctx,
n_past,
n_dims,
mode,
- n_ctx),
+ n_ctx,
+ freq_base,
+ freq_scale,
+ xpos_base,
+ xpos_down),
inplace);
}
} break;
const int n_dims = ((int32_t *) tensor->op_params)[1];
const int mode = ((int32_t *) tensor->op_params)[2];
const int n_ctx = ((int32_t *) tensor->op_params)[3];
+ float freq_base;
+ float freq_scale;
+ float xpos_base;
+ bool xpos_down;
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
+
src0->grad = ggml_add_impl(ctx,
src0->grad,
- ggml_rope(ctx,
+ ggml_rope_impl(ctx,
tensor->grad,
n_past,
n_dims,
mode,
- n_ctx),
+ n_ctx,
+ freq_base,
+ freq_scale,
+ xpos_base,
+ xpos_down,
+ false),
inplace);
}
} break;
{
GGML_ASSERT(false); // TODO: not implemented
} break;
+ case GGML_OP_CONV_TRANSPOSE_2D:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
case GGML_OP_POOL_1D:
{
GGML_ASSERT(false); // TODO: not implemented
{
GGML_ASSERT(false); // TODO: not implemented
} break;
+ case GGML_OP_UPSCALE:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
case GGML_OP_FLASH_ATTN:
{
struct ggml_tensor * flash_grad = NULL;
GGML_ASSERT(false);
}
} break;
+ case GGML_OP_GET_REL_POS:
+ case GGML_OP_ADD_REL_POS:
case GGML_OP_MAP_UNARY:
case GGML_OP_MAP_BINARY:
case GGML_OP_MAP_CUSTOM1_F32:
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
+ case GGML_OP_GROUP_NORM:
{
n_tasks = n_threads;
} break;
+ case GGML_OP_CONCAT:
case GGML_OP_MUL_MAT:
case GGML_OP_OUT_PROD:
{
case GGML_OP_SOFT_MAX_BACK:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
+ case GGML_OP_ADD_REL_POS:
{
n_tasks = n_threads;
} break;
GGML_ASSERT(false);
}
+ work_size = MAX(work_size, cur);
+ } break;
+ case GGML_OP_CONV_TRANSPOSE_2D:
+ {
+ n_tasks = n_threads;
+
+ const int64_t ne00 = node->src[0]->ne[0]; // W
+ const int64_t ne01 = node->src[0]->ne[1]; // H
+ const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
+ const int64_t ne03 = node->src[0]->ne[3]; // Channels In
+
+ const int64_t ne10 = node->src[1]->ne[0]; // W
+ const int64_t ne11 = node->src[1]->ne[1]; // H
+ const int64_t ne12 = node->src[1]->ne[2]; // Channels In
+
+ size_t cur = 0;
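+ // staging buffers: one F16 copy of the permuted kernel and one F16 copy of the
+ // permuted input, matching the INIT phase of ggml_compute_forward_conv_transpose_2d()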
+ cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
+ cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
+
work_size = MAX(work_size, cur);
} break;
case GGML_OP_POOL_1D:
{
n_tasks = 1;
} break;
+ case GGML_OP_UPSCALE:
+ {
+ n_tasks = n_threads;
+ } break;
case GGML_OP_FLASH_ATTN:
{
n_tasks = n_threads;
} break;
case GGML_OP_WIN_PART:
case GGML_OP_WIN_UNPART:
+ case GGML_OP_GET_REL_POS:
case GGML_OP_MAP_UNARY:
case GGML_OP_MAP_BINARY:
case GGML_OP_MAP_CUSTOM1_F32:
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
GGML_ASSERT(rc == 0);
+ UNUSED(rc);
}
}
+
workers[0].ith = 0;
workers[0].shared = &state_shared;