}
// TODO: implement non F32 return
- //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
- struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
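+    // keep I32 row data as I32; all other source types still produce an F32 result for now (see TODO above)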
+ enum ggml_type type = GGML_TYPE_F32;
+ if (a->type == GGML_TYPE_I32) {
+ type = a->type;
+ }
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
result->op = GGML_OP_GET_ROWS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
}
}
@@ ... @@
-static void ggml_compute_forward_dup(
+// A simplified version of ggml_compute_forward_dup that skips the float upcasting and just does a plain memcpy.
+static void ggml_compute_forward_dup_bytes(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
- if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
+ GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+ GGML_ASSERT(src0->type == dst->type);
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
ggml_compute_forward_dup_same_cont(params, src0, dst);
return;
}
+
+ GGML_TENSOR_UNARY_OP_LOCALS;
+
+ const size_t type_size = ggml_type_size(src0->type);
+ const int ith = params->ith; // thread index
+ const int nth = params->nth; // number of threads
+
+ // parallelize by rows
+ const int nr = ne01;
+ // number of rows per thread
+ const int dr = (nr + nth - 1) / nth;
+ // row range for this thread
+ const int ir0 = dr * ith;
+ const int ir1 = MIN(ir0 + dr, nr);
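+    // e.g. nr = 10, nth = 4 -> dr = 3: threads get rows [0,3), [3,6), [6,9), [9,10)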
+
+ if (src0->type == dst->type &&
+ ne00 == ne0 &&
+ nb00 == type_size && nb0 == type_size) {
+ // copy by rows
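+        // rows are the same length and contiguous in both tensors, so one memcpy per row suffices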
+ const size_t rs = ne00 * type_size;
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ for (int64_t i01 = ir0; i01 < ir1; i01++) {
+ memcpy(
+ ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
+ ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+ rs);
+ }
+ }
+ }
+ return;
+ }
+
+ if (ggml_is_contiguous(dst)) {
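+        // dst is contiguous, so a single running byte offset (id) can address it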
+ size_t id = 0;
+ char * dst_ptr = (char *) dst->data;
+ const size_t rs = ne00 * type_size;
+
+ if (nb00 == type_size) {
+            // src0 is contiguous on the first dimension, copy by rows
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
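+                    // skip the rows [0, ir0) handled by lower-numbered threads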
+ id += rs * ir0;
+ for (int64_t i01 = ir0; i01 < ir1; i01++) {
+ const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+ memcpy(dst_ptr + id, src0_ptr, rs);
+ id += rs;
+ }
+ id += rs * (ne01 - ir1);
+ }
+ }
+ } else {
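+            // src0 rows are not contiguous: gather each row element by element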
+ //printf("%s: this is not optimal - fix me\n", __func__);
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ id += rs * ir0;
+ for (int64_t i01 = ir0; i01 < ir1; i01++) {
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
+ memcpy(dst_ptr + id, src0_ptr, type_size);
+
+ id += type_size;
+ }
+ }
+ id += rs * (ne01 - ir1);
+ }
+ }
+ }
+
+ return;
+ }
+
+    // dst counters: element indices into dst, advanced in step with the elements read from src0
+
+ int64_t i10 = 0;
+ int64_t i11 = 0;
+ int64_t i12 = 0;
+ int64_t i13 = 0;
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
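+            // advance the dst indices past the ne00*ir0 elements copied by other threads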
+ i10 += ne00 * ir0;
+ while (i10 >= ne0) {
+ i10 -= ne0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ for (int64_t i01 = ir0; i01 < ir1; i01++) {
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+ memcpy(dst_ptr, src0_ptr, type_size);
+
+ if (++i10 == ne0) {
+ i10 = 0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ }
+ }
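+            // likewise skip past the elements of rows [ir1, ne01) handled by other threads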
+ i10 += ne00 * (ne01 - ir1);
+ while (i10 >= ne0) {
+ i10 -= ne0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_dup(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
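+    // same-type copies need no type conversion, so take the byte-wise fast path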
+ if (src0->type == dst->type) {
+ ggml_compute_forward_dup_bytes(params, src0, dst);
+ return;
+ }
+
switch (src0->type) {
case GGML_TYPE_F16:
{
@@ ... @@ static void ggml_compute_forward_repeat(
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
+ case GGML_TYPE_I16:
{
ggml_compute_forward_repeat_f16(params, src0, dst);
} break;
case GGML_TYPE_F32:
+ case GGML_TYPE_I32:
{
ggml_compute_forward_repeat_f32(params, src0, dst);
} break;
@@ ... @@ static void ggml_compute_forward_concat(
struct ggml_tensor* dst) {
switch (src0->type) {
case GGML_TYPE_F32:
+ case GGML_TYPE_I32:
{
ggml_compute_forward_concat_f32(params, src0, src1, dst);
} break;
@@ ... @@ static void ggml_compute_forward_get_rows(
ggml_compute_forward_get_rows_f16(params, src0, src1, dst);
} break;
case GGML_TYPE_F32:
+ case GGML_TYPE_I32:
{
ggml_compute_forward_get_rows_f32(params, src0, src1, dst);
} break;