void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-contiguous support is implemented
const int mode = ((int32_t *) dst->op_params)[2];
const bool is_glm = mode & 4;
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flattening is not implemented for glm
}
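// Note (a rough sketch for context, not part of the patch): ggml_is_contiguous() only accepts
// tensors whose strides describe a densely packed layout, approximately:
//
//   nb[0] == ggml_type_size(type)
//   nb[1] == nb[0]*ne[0] / ggml_blck_size(type)
//   nb[2] == nb[1]*ne[1]
//   nb[3] == nb[2]*ne[2]
//
// The Q/K views below keep the fused QKV stride (wsize * n_embd_head * (n_head + 2 * n_head_kv))
// as their nb[2], so nb[2] != nb[1]*ne[1] and the new assert would fire when rope runs on CUDA;
// wrapping the views in ggml_cont() copies them into packed tensors before the rope operator sees them.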
const size_t wsize = ggml_type_size(cur->type);
- struct ggml_tensor * tmpq = ggml_view_3d(
+ // TODO: these two ggml_cont calls are technically not needed, but they are kept until the CUDA
+ // backend supports non-contiguous views in the rope operator
+ struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
ctx0, cur, n_embd_head, n_head, N,
wsize * n_embd_head,
wsize * n_embd_head * (n_head + 2 * n_head_kv),
- 0);
+ 0));
offload_func_kq(tmpq);
- struct ggml_tensor * tmpk = ggml_view_3d(
+ struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
ctx0, cur, n_embd_head, n_head_kv, N,
wsize * n_embd_head,
wsize * n_embd_head * (n_head + 2 * n_head_kv),
- wsize * n_embd_head * n_head);
+ wsize * n_embd_head * n_head));
offload_func_kq(tmpk);
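+ // tmpv is not passed to the rope operator, so it can remain a non-contiguous view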
struct ggml_tensor * tmpv = ggml_view_3d(