return (coord + size) % size;
}
-static __global__ void pad_f32(const float * src, float * dst,
+static __global__ void pad_f32(const float * src, size_t s00, size_t s01, size_t s02, size_t s03, float * dst,
const int lp0, const int rp0, const int lp1, const int rp1,
const int lp2, const int rp2, const int lp3, const int rp3,
const int ne0, const int ne1, const int ne2, const int ne3,
const int64_t i01 = i1 - lp1;
const int64_t i02 = i2 - lp2;
const int64_t i03 = i3 - lp3;
- const int64_t ne02 = ne2 - lp2 - rp2;
- const int64_t ne01 = ne1 - lp1 - rp1;
- const int64_t ne00 = ne0 - lp0 - rp0;
- const int64_t src_idx = i03 * (ne00 * ne01 * ne02) + i02 * (ne00 * ne01) + i01 * ne00 + i00;
+ const int64_t src_idx = i03 * s03 + i02 * s02 + i01 * s01 + i00 * s00;
dst[dst_idx] = src[src_idx];
} else {
const int64_t i02 = wrap_around(i2 - lp2, ne02);
const int64_t i03 = wrap_around(i3 - lp3, ne03);
- const int64_t src_idx = i03 * (ne00 * ne01 * ne02) + i02 * (ne00 * ne01) + i01 * ne00 + i00;
+ const int64_t src_idx = i03 * s03 + i02 * s02 + i01 * s01 + i00 * s00;
dst[dst_idx] = src[src_idx];
}
}
-static void pad_f32_cuda(const float * src, float * dst,
+static void pad_f32_cuda(const float * src, size_t s00, size_t s01, size_t s02, size_t s03, float * dst,
const int lp0, const int rp0, const int lp1, const int rp1,
const int lp2, const int rp2, const int lp3, const int rp3,
const int ne0, const int ne1, const int ne2, const int ne3,
const bool circular, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne1, ne2 * ne3);
- pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(src, dst,
+ pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(src, s00, s01, s02, s03, dst,
lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
ne0, ne1, ne2, ne3, circular);
}
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
+ GGML_TENSOR_UNARY_OP_LOCALS;
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
- GGML_ASSERT(ggml_is_contiguous(src0));
const int32_t lp0 = ((const int32_t *) (dst->op_params))[0];
const int32_t rp0 = ((const int32_t *) (dst->op_params))[1];
const int32_t rp3 = ((const int32_t *) (dst->op_params))[7];
const int32_t circular = ((const int32_t *) (dst->op_params))[8];
- pad_f32_cuda(src0_d, dst_d,
+ const size_t s00 = nb00 / ggml_type_size(src0->type);
+ const size_t s01 = nb01 / ggml_type_size(src0->type);
+ const size_t s02 = nb02 / ggml_type_size(src0->type);
+ const size_t s03 = nb03 / ggml_type_size(src0->type);
+
+ pad_f32_cuda(src0_d, s00, s01, s02, s03, dst_d,
lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
(bool) circular, stream);