#include "ggml-impl.h"
#include "ggml-cpu-quants.h"
#include "ggml-threading.h"
+#include "ggml-cpu/unary-ops.h"
+#include "ggml-cpu/binary-ops.h"
#include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
// ggml_compute_forward_add
-static void ggml_compute_forward_add_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int nr = ggml_nrows(src0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- GGML_ASSERT( nb0 == sizeof(float));
- GGML_ASSERT(nb00 == sizeof(float));
-
- // rows per thread
- const int dr = (nr + nth - 1)/nth;
-
- // row range for this thread
- const int ir0 = dr*ith;
- const int ir1 = MIN(ir0 + dr, nr);
-
- if (nb10 == sizeof(float)) {
- for (int ir = ir0; ir < ir1; ++ir) {
- // src1 is broadcastable across src0 and dst in i1, i2, i3
- const int64_t i03 = ir/(ne02*ne01);
- const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
- const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
- const int64_t i13 = i03 % ne13;
- const int64_t i12 = i02 % ne12;
- const int64_t i11 = i01 % ne11;
- const int64_t nr0 = ne00 / ne10;
-
- float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
- for (int64_t r = 0; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
- vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
-#else
- ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
- }
- }
- } else {
- // src1 is not contiguous
- for (int ir = ir0; ir < ir1; ++ir) {
- // src1 is broadcastable across src0 and dst in i1, i2, i3
- const int64_t i03 = ir/(ne02*ne01);
- const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
- const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
- const int64_t i13 = i03 % ne13;
- const int64_t i12 = i02 % ne12;
- const int64_t i11 = i01 % ne11;
-
- float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
- for (int64_t i0 = 0; i0 < ne0; ++i0) {
- const int64_t i10 = i0 % ne10;
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
- dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
- }
- }
- }
-}
-
-static void ggml_compute_forward_add_f16_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int nr = ggml_nrows(src0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
- if (dst->type == GGML_TYPE_F32) {
- GGML_ASSERT( nb0 == sizeof(float));
- }
- else {
- GGML_ASSERT(dst->type == GGML_TYPE_F16);
- GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
- }
-
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
- // rows per thread
- const int dr = (nr + nth - 1)/nth;
-
- // row range for this thread
- const int ir0 = dr*ith;
- const int ir1 = MIN(ir0 + dr, nr);
-
- if (nb10 == sizeof(float)) {
- if (dst->type == GGML_TYPE_F16) {
- for (int ir = ir0; ir < ir1; ++ir) {
- // src0, src1 and dst are same shape => same indices
- const int i3 = ir/(ne2*ne1);
- const int i2 = (ir - i3*ne2*ne1)/ne1;
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
- ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
- float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
- for (int i = 0; i < ne0; i++) {
- dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
- }
- }
- } else {
- for (int ir = ir0; ir < ir1; ++ir) {
- // src0, src1 and dst are same shape => same indices
- const int i3 = ir/(ne2*ne1);
- const int i2 = (ir - i3*ne2*ne1)/ne1;
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
- float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
- ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
- float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
- for (int i = 0; i < ne0; i++) {
- dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
- }
- }
- }
- }
- else {
- // src1 is not contiguous
- GGML_ABORT("fatal error");
- }
-}
-
-static void ggml_compute_forward_add_bf16_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int nr = ggml_nrows(src0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- GGML_ASSERT(src0->type == GGML_TYPE_BF16);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
- if (dst->type == GGML_TYPE_F32) {
- GGML_ASSERT( nb0 == sizeof(float));
- }
- else {
- GGML_ASSERT(dst->type == GGML_TYPE_BF16);
- GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
- }
-
- GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
-
- // rows per thread
- const int dr = (nr + nth - 1)/nth;
-
- // row range for this thread
- const int ir0 = dr*ith;
- const int ir1 = MIN(ir0 + dr, nr);
-
- if (nb10 == sizeof(float)) {
- if (dst->type == GGML_TYPE_BF16) {
- for (int ir = ir0; ir < ir1; ++ir) {
- // src0, src1 and dst are same shape => same indices
- const int i3 = ir/(ne2*ne1);
- const int i2 = (ir - i3*ne2*ne1)/ne1;
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
- ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
- ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
- float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
- for (int i = 0; i < ne0; i++) {
- dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
- }
- }
- } else {
- for (int ir = ir0; ir < ir1; ++ir) {
- // src0, src1 and dst are same shape => same indices
- const int i3 = ir/(ne2*ne1);
- const int i2 = (ir - i3*ne2*ne1)/ne1;
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
- float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
- ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
- float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
- for (int i = 0; i < ne0; i++) {
- dst_ptr[i] = GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
- }
- }
- }
- }
- else {
- // src1 is not contiguous
- GGML_ABORT("fatal error");
- }
-}
-
-static void ggml_compute_forward_add_f16_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int nr = ggml_nrows(src0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F16);
-
- GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
- // rows per thread
- const int dr = (nr + nth - 1)/nth;
-
- // row range for this thread
- const int ir0 = dr*ith;
- const int ir1 = MIN(ir0 + dr, nr);
-
- if (nb10 == sizeof(ggml_fp16_t)) {
- for (int ir = ir0; ir < ir1; ++ir) {
- // src1 is broadcastable across src0 and dst in i1, i2, i3
- const int64_t i03 = ir/(ne02*ne01);
- const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
- const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
- const int64_t i13 = i03 % ne13;
- const int64_t i12 = i02 % ne12;
- const int64_t i11 = i01 % ne11;
- const int64_t nr0 = ne00 / ne10;
-
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
- ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
- for (int64_t r = 0; r < nr0; ++r) {
- ggml_vec_add_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
- }
- }
- }
- else {
- // src1 is not contiguous
- GGML_ABORT("fatal error");
- }
-}
-
-static void ggml_compute_forward_add_bf16_bf16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int nr = ggml_nrows(src0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- GGML_ASSERT(src0->type == GGML_TYPE_BF16);
- GGML_ASSERT(src1->type == GGML_TYPE_BF16);
- GGML_ASSERT(dst->type == GGML_TYPE_BF16);
-
- GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
- GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
-
- // rows per thread
- const int dr = (nr + nth - 1)/nth;
-
- // row range for this thread
- const int ir0 = dr*ith;
- const int ir1 = MIN(ir0 + dr, nr);
-
- if (nb10 == sizeof(ggml_bf16_t)) {
- for (int ir = ir0; ir < ir1; ++ir) {
- // src0, src1 and dst are same shape => same indices
- const int i3 = ir/(ne2*ne1);
- const int i2 = (ir - i3*ne2*ne1)/ne1;
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
- ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
- ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
- ggml_bf16_t * src1_ptr = (ggml_bf16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
- for (int i = 0; i < ne0; i++) {
- dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + GGML_BF16_TO_FP32(src1_ptr[i]));
- }
- }
- }
- else {
- // src1 is not contiguous
- GGML_ABORT("fatal error");
- }
-}
-
static void ggml_compute_forward_add_q_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
- {
- if (src1->type == GGML_TYPE_F32) {
- ggml_compute_forward_add_f32(params, dst);
- }
- else {
- GGML_ABORT("fatal error");
- }
- } break;
case GGML_TYPE_F16:
- {
- if (src1->type == GGML_TYPE_F16) {
- ggml_compute_forward_add_f16_f16(params, dst);
- }
- else if (src1->type == GGML_TYPE_F32) {
- ggml_compute_forward_add_f16_f32(params, dst);
- }
- else {
- GGML_ABORT("fatal error");
- }
- } break;
case GGML_TYPE_BF16:
{
- if (src1->type == GGML_TYPE_BF16) {
- ggml_compute_forward_add_bf16_bf16(params, dst);
- }
- else if (src1->type == GGML_TYPE_F32) {
- ggml_compute_forward_add_bf16_f32(params, dst);
- }
- else {
- GGML_ABORT("fatal error");
- }
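+ // all f32/f16/bf16 src1 combinations are handled by the shared non-quantized add path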
+ ggml_compute_forward_add_non_quantized(params, dst);
} break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
}
}
-// ggml_compute_forward_sub
+// ggml_compute_forward_sum
-static void ggml_compute_forward_sub_f32(
+static void ggml_compute_forward_sum_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int nr = ggml_nrows(src0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
- GGML_ASSERT( nb0 == sizeof(float));
- GGML_ASSERT(nb00 == sizeof(float));
+ if (params->ith != 0) {
+ return;
+ }
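+ // single-threaded reduction of the whole tensor; accumulate in ggml_float (a wider accumulator type, double by default) to limit rounding error on large f32 inputs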
- // rows per thread
- const int dr = (nr + nth - 1)/nth;
+ assert(ggml_is_scalar(dst));
+ assert(src0->nb[0] == sizeof(float));
- // row range for this thread
- const int ir0 = dr*ith;
- const int ir1 = MIN(ir0 + dr, nr);
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
- if (nb10 == sizeof(float)) {
- for (int ir = ir0; ir < ir1; ++ir) {
- // src1 is broadcastable across src0 and dst in i1, i2, i3
- const int64_t i03 = ir/(ne02*ne01);
- const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
- const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+ ggml_float sum = 0;
+ ggml_float row_sum = 0;
- const int64_t i13 = i03 % ne13;
- const int64_t i12 = i02 % ne12;
- const int64_t i11 = i01 % ne11;
- const int64_t nr0 = ne00 / ne10;
-
- float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
- for (int64_t r = 0; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
- vDSP_vsub(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
-#else
- ggml_vec_sub_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
- }
- }
- } else {
- // src1 is not contiguous
- for (int ir = ir0; ir < ir1; ++ir) {
- // src1 is broadcastable across src0 and dst in i1, i2, i3
- const int64_t i03 = ir/(ne02*ne01);
- const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
- const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
- const int64_t i13 = i03 % ne13;
- const int64_t i12 = i02 % ne12;
- const int64_t i11 = i01 % ne11;
-
- float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
- for (int64_t i0 = 0; i0 < ne0; ++i0) {
- const int64_t i10 = i0 % ne10;
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
- dst_ptr[i0] = src0_ptr[i0] - *src1_ptr;
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ ggml_vec_sum_f32_ggf(ne00,
+ &row_sum,
+ (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
+ sum += row_sum;
}
}
}
+ ((float *) dst->data)[0] = sum;
}
-static void ggml_compute_forward_sub_f16(
+static void ggml_compute_forward_sum_f16(
const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
+ struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
- assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+ if (params->ith != 0) {
+ return;
+ }
- const int ith = params->ith;
- const int nth = params->nth;
+ assert(ggml_is_scalar(dst));
- const int nr = ggml_nrows(src0);
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
- GGML_TENSOR_BINARY_OP_LOCALS
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F16);
+ float sum = 0;
+ float row_sum = 0;
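+ // f16 rows are accumulated in f32; the final sum is converted back to f16 on store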
- GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ ggml_vec_sum_f16_ggf(ne00,
+ &row_sum,
+ (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
+ sum += row_sum;
+ }
+ }
+ }
+ ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
+}
- // rows per thread
- const int dr = (nr + nth - 1)/nth;
+static void ggml_compute_forward_sum_bf16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
- // row range for this thread
- const int ir0 = dr*ith;
- const int ir1 = MIN(ir0 + dr, nr);
+ const struct ggml_tensor * src0 = dst->src[0];
- if (nb10 == sizeof(ggml_fp16_t)) {
- for (int ir = ir0; ir < ir1; ++ir) {
- // src1 is broadcastable across src0 and dst in i1, i2, i3
- const int64_t i03 = ir/(ne02*ne01);
- const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
- const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+ if (params->ith != 0) {
+ return;
+ }
+
+ assert(ggml_is_scalar(dst));
+
+ assert(src0->nb[0] == sizeof(ggml_bf16_t));
- const int64_t i13 = i03 % ne13;
- const int64_t i12 = i02 % ne12;
- const int64_t i11 = i01 % ne11;
- const int64_t nr0 = ne00 / ne10;
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
- ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+ float sum = 0;
+ float row_sum = 0;
- for (int64_t r = 0; r < nr0; ++r) {
- ggml_vec_sub_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ ggml_vec_sum_bf16_ggf(ne00,
+ &row_sum,
+ (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
+ sum += row_sum;
}
}
- } else {
- // src1 is not contiguous
- GGML_ABORT("unimplemented error");
}
+ ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum);
}
-static void ggml_compute_forward_sub(
+static void ggml_compute_forward_sum(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
- ggml_compute_forward_sub_f32(params, dst);
+ ggml_compute_forward_sum_f32(params, dst);
} break;
case GGML_TYPE_F16:
{
- ggml_compute_forward_sub_f16(params, dst);
+ ggml_compute_forward_sum_f16(params, dst);
+ } break;
+ case GGML_TYPE_BF16:
+ {
+ ggml_compute_forward_sum_bf16(params, dst);
} break;
default:
{
}
}
-// ggml_compute_forward_mul
+// ggml_compute_forward_sum_rows
-static void ggml_compute_forward_mul_f32(
+static void ggml_compute_forward_sum_rows_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int64_t nr = ggml_nrows(src0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- GGML_ASSERT( nb0 == sizeof(float));
- GGML_ASSERT(nb00 == sizeof(float));
-
- if (nb10 == sizeof(float)) {
- for (int64_t ir = ith; ir < nr; ir += nth) {
- // src0 and dst are same shape => same indices
- const int64_t i03 = ir/(ne02*ne01);
- const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
- const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
- const int64_t i13 = i03 % ne13;
- const int64_t i12 = i02 % ne12;
- const int64_t i11 = i01 % ne11;
- const int64_t nr0 = ne00 / ne10;
-
- float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
- for (int64_t r = 0 ; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
- UNUSED(ggml_vec_mul_f32);
-
- vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
-#else
- ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
- }
- }
- } else {
- // src1 is not contiguous
- for (int64_t ir = ith; ir < nr; ir += nth) {
- // src0 and dst are same shape => same indices
- // src1 is broadcastable across src0 and dst in i1, i2, i3
- const int64_t i03 = ir/(ne02*ne01);
- const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
- const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
- const int64_t i13 = i03 % ne13;
- const int64_t i12 = i02 % ne12;
- const int64_t i11 = i01 % ne11;
-
- float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
- for (int64_t i0 = 0; i0 < ne00; ++i0) {
- const int64_t i10 = i0 % ne10;
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
- dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
- }
- }
+ if (params->ith != 0) {
+ return;
}
-}
-
-static void ggml_compute_forward_mul_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int64_t nr = ggml_nrows(src0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F16);
-
- GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
- if (nb10 == sizeof(ggml_fp16_t)) {
- for (int64_t ir = ith; ir < nr; ir += nth) {
- // src0 and dst are same shape => same indices
- const int64_t i03 = ir/(ne02*ne01);
- const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
- const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+ GGML_ASSERT(dst->nb[0] == sizeof(float));
- const int64_t i13 = i03 % ne13;
- const int64_t i12 = i02 % ne12;
- const int64_t i11 = i01 % ne11;
- const int64_t nr0 = ne00 / ne10;
+ GGML_TENSOR_UNARY_OP_LOCALS
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
- ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+ GGML_ASSERT(ne0 == 1);
+ GGML_ASSERT(ne1 == ne01);
+ GGML_ASSERT(ne2 == ne02);
+ GGML_ASSERT(ne3 == ne03);
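+ // dst matches src0 in the three outer dimensions and has ne0 == 1: one sum per src0 row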
- for (int64_t r = 0 ; r < nr0; ++r) {
- ggml_vec_mul_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+ for (int64_t i3 = 0; i3 < ne03; i3++) {
+ for (int64_t i2 = 0; i2 < ne02; i2++) {
+ for (int64_t i1 = 0; i1 < ne01; i1++) {
+ float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
+ float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
+ float row_sum = 0;
+ ggml_vec_sum_f32(ne00, &row_sum, src_row);
+ dst_row[0] = row_sum;
}
}
- } else {
- // src1 is not contiguous
- GGML_ABORT("unimplemented error");
}
}
-static void ggml_compute_forward_mul(
+static void ggml_compute_forward_sum_rows(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT((src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && "only f32/f16 src1 supported for now");
switch (src0->type) {
case GGML_TYPE_F32:
{
- ggml_compute_forward_mul_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_mul_f16(params, dst);
+ ggml_compute_forward_sum_rows_f32(params, dst);
} break;
default:
{
}
}
-// ggml_compute_forward_div
+// ggml_compute_forward_mean
-static void ggml_compute_forward_div_f32(
+static void ggml_compute_forward_mean_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int64_t nr = ggml_nrows(src0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- GGML_ASSERT( nb0 == sizeof(float));
- GGML_ASSERT(nb00 == sizeof(float));
-
- if (nb10 == sizeof(float)) {
- for (int64_t ir = ith; ir < nr; ir += nth) {
- // src0 and dst are same shape => same indices
- const int64_t i03 = ir/(ne02*ne01);
- const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
- const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
- const int64_t i13 = i03 % ne13;
- const int64_t i12 = i02 % ne12;
- const int64_t i11 = i01 % ne11;
- const int64_t nr0 = ne00 / ne10;
-
- float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
- for (int64_t r = 0; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
- UNUSED(ggml_vec_div_f32);
-
- vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
-#else
- ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
- }
- }
- } else {
- // src1 is not contiguous
- for (int64_t ir = ith; ir < nr; ir += nth) {
- // src0 and dst are same shape => same indices
- // src1 is broadcastable across src0 and dst in i1, i2, i3
- const int64_t i03 = ir/(ne02*ne01);
- const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
- const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
- const int64_t i13 = i03 % ne13;
- const int64_t i12 = i02 % ne12;
- const int64_t i11 = i01 % ne11;
-
- float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
- for (int64_t i0 = 0; i0 < ne00; ++i0) {
- const int64_t i10 = i0 % ne10;
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
- dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
- }
- }
+ if (params->ith != 0) {
+ return;
}
-}
-
-static void ggml_compute_forward_div_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
- const int ith = params->ith;
- const int nth = params->nth;
-
- const int64_t nr = ggml_nrows(src0);
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
- GGML_ASSERT(dst->type == GGML_TYPE_F16);
+ assert(src0->nb[0] == sizeof(float));
- GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+ GGML_TENSOR_UNARY_OP_LOCALS
- if (nb10 == sizeof(ggml_fp16_t)) {
- for (int64_t ir = ith; ir < nr; ir += nth) {
- // src0 and dst are same shape => same indices
- const int64_t i03 = ir/(ne02*ne01);
- const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
- const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+ assert(ne0 == 1);
+ assert(ne1 == ne01);
+ assert(ne2 == ne02);
+ assert(ne3 == ne03);
- const int64_t i13 = i03 % ne13;
- const int64_t i12 = i02 % ne12;
- const int64_t i11 = i01 % ne11;
- const int64_t nr0 = ne00 / ne10;
+ UNUSED(ne0);
+ UNUSED(ne1);
+ UNUSED(ne2);
+ UNUSED(ne3);
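+ // write the row sum into dst, then divide in place by the row length to get the mean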
- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
- ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ ggml_vec_sum_f32(ne00,
+ (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
+ (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
- for (int64_t r = 0; r < nr0; ++r) {
- ggml_vec_div_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+ *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00;
}
}
- } else {
- // src1 is not contiguous
- GGML_ABORT("unimplemented error");
}
}
-static void ggml_compute_forward_div(
+static void ggml_compute_forward_mean(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
- ggml_compute_forward_div_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_div_f16(params, dst);
+ ggml_compute_forward_mean_f32(params, dst);
} break;
default:
{
}
}
-// ggml_compute_forward_sqr
+// ggml_compute_forward_argmax
-static void ggml_compute_forward_sqr_f32(
+static void ggml_compute_forward_argmax_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
return;
}
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- assert( dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
+ assert(dst->nb[0] == sizeof(float));
- for (int i = 0; i < n; i++) {
- ggml_vec_sqr_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
-static void ggml_compute_forward_sqr_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- assert( dst->nb[0] == sizeof(ggml_fp16_t));
- assert(src0->nb[0] == sizeof(ggml_fp16_t));
+ const size_t nb01 = src0->nb[1];
+ const size_t nb0 = dst->nb[0];
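+ // one argmax per row: row i1 of src0 produces the i32 element i1 of dst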
- for (int i = 0; i < n; i++) {
- ggml_vec_sqr_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+ for (int64_t i1 = 0; i1 < ne01; i1++) {
+ float * src = (float *) ((char *) src0->data + i1*nb01);
+ int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0);
+ int v = 0;
+ ggml_vec_argmax_f32(ne00, &v, src);
+ dst_[0] = v;
}
}
-static void ggml_compute_forward_sqr(
+static void ggml_compute_forward_argmax(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
- ggml_compute_forward_sqr_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_sqr_f16(params, dst);
+ ggml_compute_forward_argmax_f32(params, dst);
} break;
default:
{
}
}
-// ggml_compute_forward_sqrt
+// ggml_compute_forward_count_equal
-static void ggml_compute_forward_sqrt_f32(
+static void ggml_compute_forward_count_equal_i32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
- if (params->ith != 0) {
- return;
- }
+ GGML_TENSOR_BINARY_OP_LOCALS;
- assert(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(src0->type == GGML_TYPE_I32);
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+ GGML_ASSERT(ggml_are_same_shape(src0, src1));
+ GGML_ASSERT(ggml_is_scalar(dst));
+ GGML_ASSERT(dst->type == GGML_TYPE_I64);
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
+ const int64_t nr = ggml_nrows(src0);
- assert( dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
+ const int ith = params->ith;
+ const int nth = params->nth;
- for (int i = 0; i < n; i++) {
- ggml_vec_sqrt_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
+ int64_t * sums = (int64_t *) params->wdata;
+ int64_t sum_thread = 0;
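+ // each thread counts matches over its own row range; non-zero threads publish their partial
+ // count through params->wdata and thread 0 reduces the partials after the barrier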
-static void ggml_compute_forward_sqrt_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
+ // rows per thread
+ const int64_t dr = (nr + nth - 1)/nth;
- const struct ggml_tensor * src0 = dst->src[0];
+ // row range for this thread
+ const int64_t ir0 = dr*ith;
+ const int64_t ir1 = MIN(ir0 + dr, nr);
- if (params->ith != 0) {
- return;
- }
+ for (int64_t ir = ir0; ir < ir1; ++ir) {
+ const int64_t i03 = ir / (ne02*ne01);
+ const int64_t i02 = (ir - i03*ne02*ne01) / ne01;
+ const int64_t i01 = ir - i03*ne02*ne01 - i02*ne01;
- assert(ggml_are_same_shape(src0, dst));
+ const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01;
+ const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11;
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
+ for (int64_t i00 = 0; i00 < ne00; ++i00) {
+ const int32_t val0 = *((const int32_t *) (data0 + i00*nb00));
+ const int32_t val1 = *((const int32_t *) (data1 + i00*nb10));
- assert( dst->nb[0] == sizeof(ggml_fp16_t));
- assert(src0->nb[0] == sizeof(ggml_fp16_t));
+ sum_thread += val0 == val1;
+ }
+ }
+ if (ith != 0) {
+ sums[ith] = sum_thread;
+ }
+ ggml_barrier(params->threadpool);
- for (int i = 0; i < n; i++) {
- ggml_vec_sqrt_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+ if (ith != 0) {
+ return;
+ }
+
+ for (int ith_other = 1; ith_other < nth; ++ith_other) {
+ sum_thread += sums[ith_other];
}
+ *((int64_t *) dst->data) = sum_thread;
}
-static void ggml_compute_forward_sqrt(
+static void ggml_compute_forward_count_equal(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_sqrt_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
+ case GGML_TYPE_I32:
{
- ggml_compute_forward_sqrt_f16(params, dst);
+ ggml_compute_forward_count_equal_i32(params, dst);
} break;
default:
{
}
}
-// ggml_compute_forward_log
+// ggml_compute_forward_repeat
-static void ggml_compute_forward_log_f32(
+static void ggml_compute_forward_repeat_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
return;
}
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_can_repeat(src0, dst));
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
+ GGML_TENSOR_UNARY_OP_LOCALS
- GGML_ASSERT( dst->nb[0] == sizeof(float));
- GGML_ASSERT(src0->nb[0] == sizeof(float));
+ // guaranteed to be an integer due to the check in ggml_can_repeat
+ const int nr0 = (int)(ne0/ne00);
+ const int nr1 = (int)(ne1/ne01);
+ const int nr2 = (int)(ne2/ne02);
+ const int nr3 = (int)(ne3/ne03);
- for (int i = 0; i < n; i++) {
- ggml_vec_log_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
+ // TODO: support for transposed / permuted tensors
+ GGML_ASSERT(nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
+
+ // TODO: maybe this is not optimal?
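+ // the i* loops iterate over the repeat counts, the k* loops over the source extents,
+ // so each source row is copied nr0*nr1*nr2*nr3 times into its tiled positions in dst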
+ for (int i3 = 0; i3 < nr3; i3++) {
+ for (int k3 = 0; k3 < ne03; k3++) {
+ for (int i2 = 0; i2 < nr2; i2++) {
+ for (int k2 = 0; k2 < ne02; k2++) {
+ for (int i1 = 0; i1 < nr1; i1++) {
+ for (int k1 = 0; k1 < ne01; k1++) {
+ for (int i0 = 0; i0 < nr0; i0++) {
+ ggml_vec_cpy_f32(ne00,
+ (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0),
+ (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01));
+ }
+ }
+ }
+ }
+ }
+ }
}
}
-static void ggml_compute_forward_log_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
+static void ggml_compute_forward_repeat_f16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
return;
}
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_can_repeat(src0, dst));
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
+ GGML_TENSOR_UNARY_OP_LOCALS
- GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
- GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+ // guaranteed to be an integer due to the check in ggml_can_repeat
+ const int nr0 = (int)(ne0/ne00);
+ const int nr1 = (int)(ne1/ne01);
+ const int nr2 = (int)(ne2/ne02);
+ const int nr3 = (int)(ne3/ne03);
- for (int i = 0; i < n; i++) {
- ggml_vec_log_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+ // TODO: support for transposed / permuted tensors
+ GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+ // TODO: maybe this is not optimal?
+ for (int i3 = 0; i3 < nr3; i3++) {
+ for (int k3 = 0; k3 < ne03; k3++) {
+ for (int i2 = 0; i2 < nr2; i2++) {
+ for (int k2 = 0; k2 < ne02; k2++) {
+ for (int i1 = 0; i1 < nr1; i1++) {
+ for (int k1 = 0; k1 < ne01; k1++) {
+ for (int i0 = 0; i0 < nr0; i0++) {
+ ggml_fp16_t * y = (ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0);
+ ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01);
+ // ggml_vec_cpy_f16(ne00, y, x)
+ for (int i = 0; i < ne00; ++i) {
+ y[i] = x[i];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
}
}
-static void ggml_compute_forward_log(
+static void ggml_compute_forward_repeat(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
- case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ case GGML_TYPE_BF16:
+ case GGML_TYPE_I16:
{
- ggml_compute_forward_log_f32(params, dst);
+ ggml_compute_forward_repeat_f16(params, dst);
} break;
- case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
+ case GGML_TYPE_I32:
{
- ggml_compute_forward_log_f16(params, dst);
+ ggml_compute_forward_repeat_f32(params, dst);
} break;
default:
{
}
}
-// ggml_compute_forward_sin
+// ggml_compute_forward_repeat_back
-static void ggml_compute_forward_sin_f32(
+static void ggml_compute_forward_repeat_back_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
return;
}
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_can_repeat(dst, src0));
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
+ GGML_TENSOR_UNARY_OP_LOCALS
- GGML_ASSERT( dst->nb[0] == sizeof(float));
- GGML_ASSERT(src0->nb[0] == sizeof(float));
+ // guaranteed to be an integer due to the check in ggml_can_repeat
+ const int nr0 = (int)(ne00/ne0);
+ const int nr1 = (int)(ne01/ne1);
+ const int nr2 = (int)(ne02/ne2);
+ const int nr3 = (int)(ne03/ne3);
- for (int i = 0; i < n; i++) {
- ggml_vec_sin_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
+ // TODO: support for transposed / permuted tensors
+ GGML_ASSERT(nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
+
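+ // dst accumulates contributions from every repeated block, so zero it first
+ // (one flat pass when contiguous, row by row otherwise)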
+ if (ggml_is_contiguous(dst)) {
+ ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+ } else {
+ for (int k3 = 0; k3 < ne3; k3++) {
+ for (int k2 = 0; k2 < ne2; k2++) {
+ for (int k1 = 0; k1 < ne1; k1++) {
+ ggml_vec_set_f32(ne0,
+ (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3),
+ 0);
+ }
+ }
+ }
+ }
+
+ // TODO: maybe this is not optimal?
+ for (int i3 = 0; i3 < nr3; i3++) {
+ for (int k3 = 0; k3 < ne3; k3++) {
+ for (int i2 = 0; i2 < nr2; i2++) {
+ for (int k2 = 0; k2 < ne2; k2++) {
+ for (int i1 = 0; i1 < nr1; i1++) {
+ for (int k1 = 0; k1 < ne1; k1++) {
+ for (int i0 = 0; i0 < nr0; i0++) {
+ ggml_vec_acc_f32(ne0,
+ (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1),
+ (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00));
+ }
+ }
+ }
+ }
+ }
+ }
}
}
-static void ggml_compute_forward_sin_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
- GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
-
- for (int i = 0; i < n; i++) {
- ggml_vec_sin_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_sin(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
+static void ggml_compute_forward_repeat_back(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
- ggml_compute_forward_sin_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_sin_f16(params, dst);
+ ggml_compute_forward_repeat_back_f32(params, dst);
} break;
default:
{
}
}
-// ggml_compute_forward_cos
-
-static void ggml_compute_forward_cos_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- GGML_ASSERT( dst->nb[0] == sizeof(float));
- GGML_ASSERT(src0->nb[0] == sizeof(float));
-
- for (int i = 0; i < n; i++) {
- ggml_vec_cos_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
+// ggml_compute_forward_concat
-static void ggml_compute_forward_cos_f16(
+static void ggml_compute_forward_concat_any(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
- if (params->ith != 0) {
- return;
- }
-
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
- GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
-
- for (int i = 0; i < n; i++) {
- ggml_vec_cos_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_cos(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
+ const size_t len = ggml_type_size(src0->type);
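+ // generic element-wise path: every element is copied as a raw block of ggml_type_size(src0->type) bytes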
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_cos_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_cos_f16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
+ const int ith = params->ith;
+ const int nth = params->nth;
-// ggml_compute_forward_sum
+ GGML_TENSOR_BINARY_OP_LOCALS
-static void ggml_compute_forward_sum_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
+ const int32_t dim = ggml_get_op_params_i32(dst, 0);
- const struct ggml_tensor * src0 = dst->src[0];
+ GGML_ASSERT(dim >= 0 && dim < 4);
- if (params->ith != 0) {
- return;
- }
+ int64_t o[4] = {0, 0, 0, 0};
+ o[dim] = src0->ne[dim];
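+ // dst indices past src0's extent along the concat dimension are shifted back by this offset and read from src1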
- assert(ggml_is_scalar(dst));
- assert(src0->nb[0] == sizeof(float));
+ const char * x;
- GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
- GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
+ // TODO: smarter multi-threading
+ for (int i3 = 0; i3 < ne3; i3++) {
+ for (int i2 = ith; i2 < ne2; i2 += nth) {
+ for (int i1 = 0; i1 < ne1; i1++) {
+ for (int i0 = 0; i0 < ne0; i0++) {
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+ x = (const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03;
+ } else {
+ x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13;
+ }
- ggml_float sum = 0;
- ggml_float row_sum = 0;
+ char * y = (char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3;
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- ggml_vec_sum_f32_ggf(ne00,
- &row_sum,
- (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
- sum += row_sum;
+ memcpy(y, x, len);
+ }
}
}
}
- ((float *) dst->data)[0] = sum;
}
-static void ggml_compute_forward_sum_f16(
+static void ggml_compute_forward_concat_i8(
const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
+ struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_scalar(dst));
-
- assert(src0->nb[0] == sizeof(ggml_fp16_t));
-
- GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
- GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
-
- float sum = 0;
- float row_sum = 0;
+ GGML_ASSERT(ggml_type_size(src0->type) == sizeof(int8_t));
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- ggml_vec_sum_f16_ggf(ne00,
- &row_sum,
- (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
- sum += row_sum;
- }
- }
- }
- ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
-}
+ const int ith = params->ith;
+ const int nth = params->nth;
-static void ggml_compute_forward_sum_bf16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
+ GGML_TENSOR_BINARY_OP_LOCALS
- const struct ggml_tensor * src0 = dst->src[0];
+ const int32_t dim = ggml_get_op_params_i32(dst, 0);
- if (params->ith != 0) {
- return;
- }
+ GGML_ASSERT(dim >= 0 && dim < 4);
- assert(ggml_is_scalar(dst));
+ int64_t o[4] = {0, 0, 0, 0};
+ o[dim] = src0->ne[dim];
- assert(src0->nb[0] == sizeof(ggml_bf16_t));
+ const int8_t * x;
- GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
- GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
+ // TODO: smarter multi-threading
+ for (int i3 = 0; i3 < ne3; i3++) {
+ for (int i2 = ith; i2 < ne2; i2 += nth) {
+ for (int i1 = 0; i1 < ne1; i1++) {
+ for (int i0 = 0; i0 < ne0; i0++) {
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+ x = (const int8_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
+ } else {
+ x = (const int8_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
+ }
- float sum = 0;
- float row_sum = 0;
+ int8_t * y = (int8_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- ggml_vec_sum_bf16_ggf(ne00,
- &row_sum,
- (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
- sum += row_sum;
+ *y = *x;
+ }
}
}
}
- ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum);
}
-static void ggml_compute_forward_sum(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
+static void ggml_compute_forward_concat_f16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_sum_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_sum_f16(params, dst);
- } break;
- case GGML_TYPE_BF16:
- {
- ggml_compute_forward_sum_bf16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
+ GGML_ASSERT(ggml_type_size(src0->type) == sizeof(ggml_fp16_t));
-// ggml_compute_forward_sum_rows
+ const int ith = params->ith;
+ const int nth = params->nth;
-static void ggml_compute_forward_sum_rows_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
+ GGML_TENSOR_BINARY_OP_LOCALS
- const struct ggml_tensor * src0 = dst->src[0];
+ const int32_t dim = ggml_get_op_params_i32(dst, 0);
- if (params->ith != 0) {
- return;
- }
+ GGML_ASSERT(dim >= 0 && dim < 4);
- GGML_ASSERT(src0->nb[0] == sizeof(float));
- GGML_ASSERT(dst->nb[0] == sizeof(float));
+ int64_t o[4] = {0, 0, 0, 0};
+ o[dim] = src0->ne[dim];
- GGML_TENSOR_UNARY_OP_LOCALS
+ const ggml_fp16_t * x;
- GGML_ASSERT(ne0 == 1);
- GGML_ASSERT(ne1 == ne01);
- GGML_ASSERT(ne2 == ne02);
- GGML_ASSERT(ne3 == ne03);
+ // TODO: smarter multi-threading
+ for (int i3 = 0; i3 < ne3; i3++) {
+ for (int i2 = ith; i2 < ne2; i2 += nth) {
+ for (int i1 = 0; i1 < ne1; i1++) {
+ for (int i0 = 0; i0 < ne0; i0++) {
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+ x = (const ggml_fp16_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
+ } else {
+ x = (const ggml_fp16_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
+ }
- for (int64_t i3 = 0; i3 < ne03; i3++) {
- for (int64_t i2 = 0; i2 < ne02; i2++) {
- for (int64_t i1 = 0; i1 < ne01; i1++) {
- float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
- float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
- float row_sum = 0;
- ggml_vec_sum_f32(ne00, &row_sum, src_row);
- dst_row[0] = row_sum;
- }
- }
- }
-}
-
-static void ggml_compute_forward_sum_rows(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_sum_rows_f32(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_mean
-
-static void ggml_compute_forward_mean_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(src0->nb[0] == sizeof(float));
-
- GGML_TENSOR_UNARY_OP_LOCALS
-
- assert(ne0 == 1);
- assert(ne1 == ne01);
- assert(ne2 == ne02);
- assert(ne3 == ne03);
-
- UNUSED(ne0);
- UNUSED(ne1);
- UNUSED(ne2);
- UNUSED(ne3);
-
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- ggml_vec_sum_f32(ne00,
- (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
- (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
-
- *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00;
- }
- }
- }
-}
-
-static void ggml_compute_forward_mean(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_mean_f32(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_argmax
-
-static void ggml_compute_forward_argmax_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(src0->nb[0] == sizeof(float));
- assert(dst->nb[0] == sizeof(float));
-
- const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
-
- const size_t nb01 = src0->nb[1];
- const size_t nb0 = dst->nb[0];
-
- for (int64_t i1 = 0; i1 < ne01; i1++) {
- float * src = (float *) ((char *) src0->data + i1*nb01);
- int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0);
- int v = 0;
- ggml_vec_argmax_f32(ne00, &v, src);
- dst_[0] = v;
- }
-}
-
-static void ggml_compute_forward_argmax(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_argmax_f32(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_count_equal
-
-static void ggml_compute_forward_count_equal_i32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_TENSOR_BINARY_OP_LOCALS;
-
- GGML_ASSERT(src0->type == GGML_TYPE_I32);
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
- GGML_ASSERT(ggml_are_same_shape(src0, src1));
- GGML_ASSERT(ggml_is_scalar(dst));
- GGML_ASSERT(dst->type == GGML_TYPE_I64);
-
- const int64_t nr = ggml_nrows(src0);
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- int64_t * sums = (int64_t *) params->wdata;
- int64_t sum_thread = 0;
-
- // rows per thread
- const int64_t dr = (nr + nth - 1)/nth;
-
- // row range for this thread
- const int64_t ir0 = dr*ith;
- const int64_t ir1 = MIN(ir0 + dr, nr);
-
- for (int64_t ir = ir0; ir < ir1; ++ir) {
- const int64_t i03 = ir / (ne02*ne01);
- const int64_t i02 = (ir - i03*ne03) / ne01;
- const int64_t i01 = ir - i03*ne03 - i02*ne02;
-
- const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01;
- const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11;
-
- for (int64_t i00 = 0; i00 < ne00; ++i00) {
- const int32_t val0 = *((const int32_t *) (data0 + i00*nb00));
- const int32_t val1 = *((const int32_t *) (data1 + i00*nb10));
-
- sum_thread += val0 == val1;
- }
- }
- if (ith != 0) {
- sums[ith] = sum_thread;
- }
- ggml_barrier(params->threadpool);
-
- if (ith != 0) {
- return;
- }
-
- for (int ith_other = 1; ith_other < nth; ++ith_other) {
- sum_thread += sums[ith_other];
- }
- *((int64_t *) dst->data) = sum_thread;
-}
-
-static void ggml_compute_forward_count_equal(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_I32:
- {
- ggml_compute_forward_count_equal_i32(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_repeat
-
-static void ggml_compute_forward_repeat_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- GGML_ASSERT(ggml_can_repeat(src0, dst));
-
- GGML_TENSOR_UNARY_OP_LOCALS
-
- // guaranteed to be an integer due to the check in ggml_can_repeat
- const int nr0 = (int)(ne0/ne00);
- const int nr1 = (int)(ne1/ne01);
- const int nr2 = (int)(ne2/ne02);
- const int nr3 = (int)(ne3/ne03);
-
- // TODO: support for transposed / permuted tensors
- GGML_ASSERT(nb0 == sizeof(float));
- GGML_ASSERT(nb00 == sizeof(float));
-
- // TODO: maybe this is not optimal?
- for (int i3 = 0; i3 < nr3; i3++) {
- for (int k3 = 0; k3 < ne03; k3++) {
- for (int i2 = 0; i2 < nr2; i2++) {
- for (int k2 = 0; k2 < ne02; k2++) {
- for (int i1 = 0; i1 < nr1; i1++) {
- for (int k1 = 0; k1 < ne01; k1++) {
- for (int i0 = 0; i0 < nr0; i0++) {
- ggml_vec_cpy_f32(ne00,
- (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0),
- (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01));
- }
- }
- }
- }
- }
- }
- }
-}
-
-static void ggml_compute_forward_repeat_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- GGML_ASSERT(ggml_can_repeat(src0, dst));
-
- GGML_TENSOR_UNARY_OP_LOCALS
-
- // guaranteed to be an integer due to the check in ggml_can_repeat
- const int nr0 = (int)(ne0/ne00);
- const int nr1 = (int)(ne1/ne01);
- const int nr2 = (int)(ne2/ne02);
- const int nr3 = (int)(ne3/ne03);
-
- // TODO: support for transposed / permuted tensors
- GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
- // TODO: maybe this is not optimal?
- for (int i3 = 0; i3 < nr3; i3++) {
- for (int k3 = 0; k3 < ne03; k3++) {
- for (int i2 = 0; i2 < nr2; i2++) {
- for (int k2 = 0; k2 < ne02; k2++) {
- for (int i1 = 0; i1 < nr1; i1++) {
- for (int k1 = 0; k1 < ne01; k1++) {
- for (int i0 = 0; i0 < nr0; i0++) {
- ggml_fp16_t * y = (ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0);
- ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01);
- // ggml_vec_cpy_f16(ne00, y, x)
- for (int i = 0; i < ne00; ++i) {
- y[i] = x[i];
- }
- }
- }
- }
- }
- }
- }
- }
-}
-
-static void ggml_compute_forward_repeat(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F16:
- case GGML_TYPE_BF16:
- case GGML_TYPE_I16:
- {
- ggml_compute_forward_repeat_f16(params, dst);
- } break;
- case GGML_TYPE_F32:
- case GGML_TYPE_I32:
- {
- ggml_compute_forward_repeat_f32(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_repeat_back
-
-static void ggml_compute_forward_repeat_back_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- GGML_ASSERT(ggml_can_repeat(dst, src0));
-
- GGML_TENSOR_UNARY_OP_LOCALS
-
- // guaranteed to be an integer due to the check in ggml_can_repeat
- const int nr0 = (int)(ne00/ne0);
- const int nr1 = (int)(ne01/ne1);
- const int nr2 = (int)(ne02/ne2);
- const int nr3 = (int)(ne03/ne3);
-
- // TODO: support for transposed / permuted tensors
- GGML_ASSERT(nb0 == sizeof(float));
- GGML_ASSERT(nb00 == sizeof(float));
-
- if (ggml_is_contiguous(dst)) {
- ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
- } else {
- for (int k3 = 0; k3 < ne3; k3++) {
- for (int k2 = 0; k2 < ne2; k2++) {
- for (int k1 = 0; k1 < ne1; k1++) {
- ggml_vec_set_f32(ne0,
- (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3),
- 0);
- }
- }
- }
- }
-
- // TODO: maybe this is not optimal?
- for (int i3 = 0; i3 < nr3; i3++) {
- for (int k3 = 0; k3 < ne3; k3++) {
- for (int i2 = 0; i2 < nr2; i2++) {
- for (int k2 = 0; k2 < ne2; k2++) {
- for (int i1 = 0; i1 < nr1; i1++) {
- for (int k1 = 0; k1 < ne1; k1++) {
- for (int i0 = 0; i0 < nr0; i0++) {
- ggml_vec_acc_f32(ne0,
- (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1),
- (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00));
- }
- }
- }
- }
- }
- }
- }
-}
-
-static void ggml_compute_forward_repeat_back(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_repeat_back_f32(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_concat
-
-static void ggml_compute_forward_concat_any(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- const size_t len = ggml_type_size(src0->type);
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- const int32_t dim = ggml_get_op_params_i32(dst, 0);
-
- GGML_ASSERT(dim >= 0 && dim < 4);
-
- int64_t o[4] = {0, 0, 0, 0};
- o[dim] = src0->ne[dim];
-
- const char * x;
-
- // TODO: smarter multi-threading
- for (int i3 = 0; i3 < ne3; i3++) {
- for (int i2 = ith; i2 < ne2; i2 += nth) {
- for (int i1 = 0; i1 < ne1; i1++) {
- for (int i0 = 0; i0 < ne0; i0++) {
- if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
- x = (const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03;
- } else {
- x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13;
- }
-
- char * y = (char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3;
-
- memcpy(y, x, len);
- }
- }
- }
- }
-}
-
-static void ggml_compute_forward_concat_i8(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(ggml_type_size(src0->type) == sizeof(int8_t));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- const int32_t dim = ggml_get_op_params_i32(dst, 0);
-
- GGML_ASSERT(dim >= 0 && dim < 4);
-
- int64_t o[4] = {0, 0, 0, 0};
- o[dim] = src0->ne[dim];
-
- const int8_t * x;
-
- // TODO: smarter multi-threading
- for (int i3 = 0; i3 < ne3; i3++) {
- for (int i2 = ith; i2 < ne2; i2 += nth) {
- for (int i1 = 0; i1 < ne1; i1++) {
- for (int i0 = 0; i0 < ne0; i0++) {
- if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
- x = (const int8_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
- } else {
- x = (const int8_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
- }
-
- int8_t * y = (int8_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
-
- *y = *x;
- }
- }
- }
- }
-}
-
-static void ggml_compute_forward_concat_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(ggml_type_size(src0->type) == sizeof(ggml_fp16_t));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- const int32_t dim = ggml_get_op_params_i32(dst, 0);
-
- GGML_ASSERT(dim >= 0 && dim < 4);
-
- int64_t o[4] = {0, 0, 0, 0};
- o[dim] = src0->ne[dim];
-
- const ggml_fp16_t * x;
-
- // TODO: smarter multi-threading
- for (int i3 = 0; i3 < ne3; i3++) {
- for (int i2 = ith; i2 < ne2; i2 += nth) {
- for (int i1 = 0; i1 < ne1; i1++) {
- for (int i0 = 0; i0 < ne0; i0++) {
- if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
- x = (const ggml_fp16_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
- } else {
- x = (const ggml_fp16_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
- }
-
- ggml_fp16_t * y = (ggml_fp16_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
-
- *y = *x;
- }
- }
- }
- }
-}
-
-static void ggml_compute_forward_concat_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(ggml_type_size(src0->type) == sizeof(float));
-
- const int ith = params->ith;
- const int nth = params->nth;
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- const int32_t dim = ggml_get_op_params_i32(dst, 0);
-
- GGML_ASSERT(dim >= 0 && dim < 4);
-
- int64_t o[4] = {0, 0, 0, 0};
- o[dim] = src0->ne[dim];
-
- const float * x;
-
- // TODO: smarter multi-threading
- for (int i3 = 0; i3 < ne3; i3++) {
- for (int i2 = ith; i2 < ne2; i2 += nth) {
- for (int i1 = 0; i1 < ne1; i1++) {
- for (int i0 = 0; i0 < ne0; i0++) {
- if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
- x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
- } else {
- x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
- }
-
- float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
-
- *y = *x;
- }
- }
- }
- }
-}
-
-static void ggml_compute_forward_concat(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F16:
- case GGML_TYPE_BF16:
- case GGML_TYPE_I16:
- {
- ggml_compute_forward_concat_f16(params, dst);
- } break;
- case GGML_TYPE_I8:
- {
- ggml_compute_forward_concat_i8(params, dst);
- } break;
- case GGML_TYPE_F32:
- case GGML_TYPE_I32:
- {
- ggml_compute_forward_concat_f32(params, dst);
- } break;
- default:
- {
- ggml_compute_forward_concat_any(params, dst);
- }
- }
-}
-
-// ggml_compute_forward_abs
-
-static void ggml_compute_forward_abs_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_abs_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_abs_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_abs_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_abs(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_abs_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_abs_f16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_sgn
-
-static void ggml_compute_forward_sgn_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_sgn_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_sgn_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_sgn_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_sgn(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_sgn_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_sgn_f16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_neg
-
-static void ggml_compute_forward_neg_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_neg_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_neg_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_neg_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_neg(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_neg_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_neg_f16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_step
-
-static void ggml_compute_forward_step_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_step_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_step_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_step_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_step(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_step_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_step_f16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_tanh
-
-static void ggml_compute_forward_tanh_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_tanh_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_tanh_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_tanh_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_tanh(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_tanh_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_tanh_f16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_elu
-
-static void ggml_compute_forward_elu_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_elu_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_elu_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_elu_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_elu(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_elu_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_elu_f16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_relu
-
-static void ggml_compute_forward_relu_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_relu_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_relu_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_relu_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_relu(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_relu_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_relu_f16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-// ggml_compute_forward_sigmoid
-
-static void ggml_compute_forward_sigmoid_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_sigmoid_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_sigmoid_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_sigmoid_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_sigmoid(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_sigmoid_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_sigmoid_f16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-static void ggml_compute_forward_hardswish_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_hardswish_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_hardswish_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_hardswish_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_hardswish(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_hardswish_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_hardswish_f16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-static void ggml_compute_forward_hardsigmoid_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_hardsigmoid_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_hardsigmoid_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_hardsigmoid_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_hardsigmoid(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_hardsigmoid_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_hardsigmoid_f16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-static void ggml_compute_forward_exp_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_exp_f32(nc,
- (float *) ((char *) dst->data + i*( dst->nb[1])),
- (float *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_exp_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- if (params->ith != 0) {
- return;
- }
-
- assert(ggml_is_contiguous_1(src0));
- assert(ggml_is_contiguous_1(dst));
- assert(ggml_are_same_shape(src0, dst));
-
- const int n = ggml_nrows(src0);
- const int nc = src0->ne[0];
-
- for (int i = 0; i < n; i++) {
- ggml_vec_exp_f16(nc,
- (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
- (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
- }
-}
-
-static void ggml_compute_forward_exp(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_exp_f32(params, dst);
- } break;
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_exp_f16(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
-}
-
-
// ggml_compute_forward_norm
static void ggml_compute_forward_norm_f32(