ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
- float freq_scale,
- ggml_backend_buffer * bbuf) const {
+ float freq_scale) const {
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
// dequantize to f32 -> RoPE -> quantize back
tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
- if (bbuf) {
- for (const auto & backend : backends) {
- // Figure out which backend KV cache belongs to
- if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
- ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
- break;
- }
- }
- }
-
- tmp = ggml_rope_ext_inplace(ctx0, tmp,
+ tmp = ggml_rope_ext(ctx0, tmp,
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
0);
- ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+ ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
ggml_build_forward_expand(gf, cur);
}
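
For context, here is a rough sketch (not the upstream implementation) of what the simplified shift path looks like after this change, assuming a quantized K cache: the cache view is dequantized to F32, rotated out-of-place with ggml_rope_ext, and copied back into the view, with no explicit ggml_backend_sched_set_tensor_backend call; tensor placement is left to the scheduler. The function name rope_shift_sketch and the parameter names are illustrative only; just the ggml calls correspond to the patch.

#include "ggml.h"

// Illustrative sketch of the post-change shift path; all names are hypothetical.
static ggml_tensor * rope_shift_sketch(
        ggml_context * ctx0,
        ggml_tensor  * k_view,   // view into the K cache for one layer
        ggml_tensor  * shift,    // per-cell position shift
        ggml_tensor  * factors,  // optional RoPE frequency factors (may be NULL)
        int n_rot, int rope_type, int n_ctx_orig,
        float freq_base, float freq_scale,
        float ext_factor, float attn_factor, float beta_fast, float beta_slow) {
    ggml_tensor * tmp;

    if (ggml_is_quantized(k_view->type)) {
        // dequantize to f32 -> RoPE -> quantize back into the same cache view
        tmp = ggml_cast(ctx0, k_view, GGML_TYPE_F32);
        tmp = ggml_rope_ext(ctx0, tmp, shift, factors, n_rot, rope_type,
                n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow);
        tmp = ggml_cpy(ctx0, tmp, k_view);
    } else {
        // f16/f32 caches can be rotated in place
        tmp = ggml_rope_ext_inplace(ctx0, k_view, shift, factors, n_rot, rope_type,
                n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow);
    }

    return tmp;
}

With the backend-buffer parameter gone, the call site only passes the graph inputs and the per-layer frequency parameters, which is what the second hunk above reflects.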