aclTensor* acl_cos_repeat_tensor,
aclTensor* acl_sin_repeat_tensor,
float theta_scale, float freq_scale,
- bool is_neox) {
+ float attn_factor, bool is_neox) {
// int sin/cos cache, cache has different repeat method depond on
// @param.is_neox
ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
nullptr, true);
+ ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor));
}
// position
aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
acl_theta_tensor);
- // // power[] * position[] * freq_scale / freq_factors[]
- // ggml_cann_pool_alloc theta_final_allocator(ctx.pool(),
- // theta_length *
- // sizeof(float_t));
- // aclTensor* acl_theat_final_tensor = aclnn_zero(
- // ctx, theta_final_allocator.get(), sizeof(float_t) * theta_length,
- // theta_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t));
- // aclnn_inplace_addcdiv(ctx, acl_theat_final_tensor, acl_theta_tensor,
- // acl_freq_factors_tensor, freq_scale);
-
// permute: [0,1,2,3]->[0,2,1,3]
int64_t permute_ne[] = {arange_length, 1, position_length, 1};
size_t permute_nb[GGML_MAX_DIMS];
GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor);
+ // attn_factor
+ if (attn_factor != 1) {
+ aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
+ aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
+ }
+
// repeat
if (is_neox) {
int64_t repeatsArray[] = {1, 1, 1, 2};
memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
- // TODO: attn_factor != 1
- GGML_ASSERT(attn_factor == 1);
// TODO: n_dims <= ne0
GGML_ASSERT(n_dims == ne0);
GGML_ASSERT(n_dims % 2 == 0);
// TODO: ext_factor != 0
GGML_ASSERT(ext_factor == 0);
- // TODO: type == GGML_TYPE_F16
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
const float theta_scale = powf(freq_base, -2.0f / n_dims);
ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
- theta_scale, freq_scale, is_neox);
+ theta_scale, freq_scale, attn_factor, is_neox);
+
+ aclTensor* acl_src = ggml_cann_create_tensor(src0);
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+#ifdef ASCEND_310P
+ // Special ROPE operation for 310P
+
+ // roll input
+ void* input_roll_buffer;
+ aclTensor* acl_minus_one_tensor;
+ void* minus_one_scale_buffer = nullptr;
+ ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
+ ggml_cann_pool_alloc minus_one_scale_allocator(
+ ctx.pool(), sizeof(float_t) * src0->ne[0]);
+ if (!is_neox) {
+ // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
+ input_roll_buffer = roll_allocator.get();
+ int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
+ src0->ne[2], src0->ne[3]};
+ size_t input_roll_nb[GGML_MAX_DIMS];
+ input_roll_nb[0] = ggml_type_size(src0->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
+ }
+ aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
+ input_roll_buffer, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
+ GGML_MAX_DIMS);
+ aclTensor* acl_input_tensor = ggml_cann_create_tensor(
+ src0->data, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
+ GGML_MAX_DIMS);
+
+ int64_t shifts[] = {1};
+ int64_t dims[] = {3};
+ aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
+ ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
+ ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+
+ // init [-1, 1, -1, 1, ...]
+ minus_one_scale_buffer = minus_one_scale_allocator.get();
+
+ int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
+ size_t minus_one_nb[GGML_MAX_DIMS];
+ minus_one_nb[0] = sizeof(float_t);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
+ }
+ acl_minus_one_tensor = aclnn_values(
+ ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
+ minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
+ int64_t dim = 3;
+ int64_t* index = new int64_t[src0->ne[0]];
+ for (int i = 0; i < src0->ne[0]; i++) {
+ index[i] = i / 2 * 2;
+ }
+ int64_t index_num = src0->ne[0];
+ float value = -1;
+ aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
+ index_num, value);
+ } else {
+ // roll input: [q0,q1,q2,...] ->
+ // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
+ input_roll_buffer = roll_allocator.get();
+ aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
+ input_roll_buffer, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
+ aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
+
+ int64_t shifts[] = {src0->ne[0] / 2};
+ int64_t dims[] = {3};
+ aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
+
+ ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
+ ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+ // init [-1, -1, -1, 1, 1,1,...]
+ minus_one_scale_buffer = minus_one_scale_allocator.get();
+ int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
+ size_t minus_one_nb[GGML_MAX_DIMS];
+ minus_one_nb[0] = sizeof(float_t);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
+ }
+ acl_minus_one_tensor = aclnn_values(
+ ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
+ minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
+ // -1 * first half
+ int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
+ size_t first_half_nb[GGML_MAX_DIMS];
+ first_half_nb[0] = sizeof(float_t);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
+ }
+ aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
+ minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
+ first_half_nb, GGML_MAX_DIMS);
+ bool inplace = true;
+ float scale = -1;
+ aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
+ ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
+ }
+
+ // TODO: n_dims < ne0
+ GGML_ASSERT(n_dims == src0->ne[0]);
+
+ // input * scale
+ ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
+ ggml_nbytes(src0));
+ void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
+ size_t input_nb[GGML_MAX_DIMS];
+ input_nb[0] = ggml_type_size(src0->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
+ }
+ aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
+ input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
+ aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
+ input_roll_buffer, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
+
+ aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
+ acl_input_roll_mul_scale_tensor);
+
+ // output
+ void* output_fp32_buffer;
+ if (src0->type == GGML_TYPE_F32) {
+ aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor);
+ aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
+ acl_sin_reshape_tensor);
+ aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
+ // TODO: ne0 != n_dims in mode2
+ } else if (src0->type == GGML_TYPE_F16) {
+ size_t input_fp32_nb[GGML_MAX_DIMS];
+ input_fp32_nb[0] = sizeof(float_t);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
+ }
+ ggml_cann_pool_alloc fp32_allocator1(
+ ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
+ void* input_fp32_buffer1 = fp32_allocator1.get();
+ aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
+ input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
+ input_fp32_nb, GGML_MAX_DIMS);
+ ggml_cann_pool_alloc fp32_allocator2(
+ ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
+ void* input_fp32_buffer2 = fp32_allocator2.get();
+ aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
+ input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
+ input_fp32_nb, GGML_MAX_DIMS);
+
+ ggml_cann_pool_alloc fp32_allocator(
+ ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
+ output_fp32_buffer = fp32_allocator.get();
+ aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
+ output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
+ input_fp32_nb, GGML_MAX_DIMS);
+ aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
+ aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
+ input_fp32_tensor2);
+ aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
+ output_fp32_tensor);
+ aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
+
+ ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
+ ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
+ ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
+ ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+ ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
+ ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
+ ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
+ ACL_CHECK(aclDestroyTensor(acl_src));
+ }
+ return;
+#endif
+
+ // src0 == GGML_TYPE_F16
+ // TODO: optimization this `if` code
+ if (src0->type == GGML_TYPE_F16) {
+ ggml_cann_pool_alloc sin_final_allocator(
+ ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
+ ggml_cann_pool_alloc cos_final_allocator(
+ ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
+ void* sin_final_buffer = sin_final_allocator.get();
+ void* cos_final_buffer = cos_final_allocator.get();
+
+ int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
+ size_t sin_final_nb[GGML_MAX_DIMS];
+ sin_final_nb[0] = ggml_type_size(src0->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1];
+ }
+ aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor(
+ sin_final_buffer, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
+ GGML_MAX_DIMS);
+ aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor(
+ cos_final_buffer, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
+ GGML_MAX_DIMS);
+
+ aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor,
+ ggml_cann_type_mapping(src0->type));
+ aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor,
+ ggml_cann_type_mapping(src0->type));
+ ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
+ ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+ acl_sin_reshape_tensor = acl_sin_final_tensor;
+ acl_cos_reshape_tensor = acl_cos_final_tensor;
+ }
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
acl_mode = 1;
}
- aclTensor* acl_x = ggml_cann_create_tensor(src0);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
- acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
+ acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
acl_dst, &workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
executor, ctx.stream()));
- ACL_CHECK(aclDestroyTensor(acl_x));
+ ACL_CHECK(aclDestroyTensor(acl_src));
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_dst));