* stream, and persistent buffers for rope init/cache.
* @param dst The destination ggml_tensor whose computation
* depends on the RoPE values (usually Qcur/Kcur).
- * @param sin_tensor_buffer Pre-allocated buffer for storing repeated sin values.
- * @param cos_tensor_buffer Pre-allocated buffer for storing repeated cos values.
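+ * @param corr_dims YaRN correction dimension range used to build the ramp.
+ * @param ext_factor YaRN extrapolation mix factor (0 disables the ramp correction).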
* @param theta_scale Scalar exponent base for computing theta scale values.
* @param freq_scale Frequency scaling factor, applied to theta scale.
* @param attn_factor Attention scaling factor, applied to sin/cos.
* @param is_neox Whether the NeoX-style cache layout is used
* (dim expansion vs repeat_interleave).
*/
static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
- void* sin_tensor_buffer, void* cos_tensor_buffer,
float* corr_dims, float ext_factor,
float theta_scale, float freq_scale,
float attn_factor, bool is_neox) {
- // int sin/cos cache, cache has different repeat method depond on
- // @param.is_neox
-
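+ // Init the sin/cos cache. The repeat layout depends on is_neox
+ // (dim expansion vs repeat_interleave). Results persist in ctx.rope_cache
+ // and are reused across layers while the RoPE parameters are unchanged.
+ // Fields of ctx.rope_cache as used below (a sketch inferred from this
+ // function, not the authoritative declaration):
+ //   bool    cached;                                // cache-valid flag
+ //   int64_t theta_scale_length, position_length;   // cached buffer extents
+ //   float   ext_factor, theta_scale, freq_scale, attn_factor;
+ //   bool    is_neox;
+ //   void   *theta_scale_cache, *sin_cache, *cos_cache; // device buffers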
ggml_tensor* src0 = dst->src[0]; // input
ggml_tensor* src1 = dst->src[1]; // position
ggml_tensor* src2 = dst->src[2]; // freq_factors
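+ // Reuse the cached sin/cos only when there are no per-tensor freq_factors
+ // (src2) and every RoPE parameter matches the cached ones.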
+ if (src2 == nullptr && ctx.rope_cache.cached
+ && ctx.rope_cache.ext_factor == ext_factor
+ && ctx.rope_cache.theta_scale == theta_scale
+ && ctx.rope_cache.freq_scale == freq_scale
+ && ctx.rope_cache.attn_factor == attn_factor
+ && ctx.rope_cache.is_neox == is_neox) {
+ // Cache hit: sin/cos for these parameters are already on the device.
+ return;
+ }
+
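+ // RoPE rotates elements in pairs, so there is one theta per pair: ne0 / 2 entries.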
int64_t theta_scale_length = src0->ne[0] / 2;
int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1};
size_t theta_scale_nb[] = {sizeof(float), sizeof(float), sizeof(float),
ctx.rope_cache.freq_scale != freq_scale) {
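+ // Rebuild the theta_scale power series when its length or the scaling parameters change.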
ctx.rope_cache.theta_scale_length = theta_scale_length;
- ctx.rope_cache.theta_scale = theta_scale;
- ctx.rope_cache.freq_scale = freq_scale;
if (ctx.rope_cache.theta_scale_cache != nullptr) {
ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
// return MIN(1, MAX(0, y)) - 1;
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
void* yarn_ramp_buffer = yarn_ramp_allocator.get();
- acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float_t),
+ acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float),
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
float zero_value = 0, one_value = 1;
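+ // Clamp the ramp denominator so a collapsed correction range
+ // (corr_dims[1] == corr_dims[0]) cannot divide by zero.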
float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
ggml_cann_release_resources(ctx, acl_freq_factors_tensor, acl_freq_fac_res_tensor);
}
+ // Grow the sin/cos caches if needed; they are filled by the first layer on each device and reused by the rest.
+ if (position_length > ctx.rope_cache.position_length) {
+ ctx.rope_cache.position_length = position_length;
+ if (ctx.rope_cache.sin_cache != nullptr) {
+ ACL_CHECK(aclrtFree(ctx.rope_cache.sin_cache));
+ }
+ if (ctx.rope_cache.cos_cache != nullptr) {
+ ACL_CHECK(aclrtFree(ctx.rope_cache.cos_cache));
+ }
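+ // Each position needs ne0 sin and ne0 cos values:
+ // theta_scale_length (= ne0 / 2) thetas repeated x2.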
+ int64_t repeat_theta_length = theta_scale_length * position_length * 2;
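+ // HUGE_FIRST asks the runtime to try huge pages first for these large, long-lived buffers.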
+ ACL_CHECK(aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
+ ACL_CHECK(aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
+ }
+
// position
aclTensor* acl_position_tensor = ggml_cann_create_tensor(
src1->data, ggml_cann_type_mapping(src1->type),
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
}
aclTensor* acl_sin_repeat_tensor =
- ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float),
+ ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclTensor* acl_cos_repeat_tensor =
- ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float),
+ ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
// repeat
num_repeats, output_size);
}
+ // Mark the cache valid and record its parameters so subsequent layers reuse it.
+ ctx.rope_cache.cached = true;
+ ctx.rope_cache.ext_factor = ext_factor;
+ ctx.rope_cache.theta_scale = theta_scale;
+ ctx.rope_cache.freq_scale = freq_scale;
+ ctx.rope_cache.attn_factor = attn_factor;
+ ctx.rope_cache.is_neox = is_neox;
+
ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor,
acl_theta_tensor, acl_sin_tensor, acl_sin_repeat_tensor, acl_cos_tensor,
acl_cos_repeat_tensor);
#endif
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- // TODO: use ascendc
- // Only test with LLAMA model.
ggml_tensor* src0 = dst->src[0]; // input
- ggml_tensor* src1 = dst->src[1];
// param
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
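+ // is_neox selects the sin/cos repeat layout (dim expansion vs repeat_interleave).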
- // sin/cos tensor length.
- int64_t repeat_theta_length = src0->ne[0] * src1->ne[0];
- ggml_cann_pool_alloc sin_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float));
- ggml_cann_pool_alloc cos_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float));
- void *sin_tensor_buffer = sin_tensor_allocator.get();
- void *cos_tensor_buffer = cos_tensor_allocator.get();
-
- // init ctx.rope_cos/rope_sin cache
+ // Init ctx.rope_cache sin/cos (no-op when the cached values are still valid).
- aclnn_cache_init(ctx, dst, sin_tensor_buffer, cos_tensor_buffer, corr_dims, ext_factor,
+ aclnn_cache_init(ctx, dst, corr_dims, ext_factor,
theta_scale, freq_scale, attn_factor, is_neox);
int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
}
aclTensor* acl_sin_reshape_tensor =
- ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float),
+ ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclTensor* acl_cos_reshape_tensor =
- ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float),
+ ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclTensor* acl_src = ggml_cann_create_tensor(src0);