}
/**
- * @brief Get or expand a cached float32 tensor filled with a scalar value.
+ * @brief Get or expand a cached tensor filled with a scalar value.
*
- * This function manages cached device memory for float32 tensors. If the current
+ * This function manages cached device memory for tensors. If the current
* cache size is insufficient for the requested tensor shape, the old memory will
- * be released and new memory will be allocated. The allocated buffer is then
- * initialized either with zeros (when @p value == 0.0f) or with the given scalar
- * value using CANN operations. Finally, an aclTensor object is created from the
- * cached memory and returned.
+ * be released and new memory will be allocated. The allocated buffer is
+ * initialized with the given scalar value using CANN operations.
+ * Finally, an aclTensor object is created from the cached memory and returned.
*
* @param ctx The CANN backend context that manages device memory.
* @param buffer A pointer to the cached device buffer (will be allocated or
* updated when the cache is expanded).
+ * @param cache_element The number of elements currently cached; updated when
+ * the cache is expanded.
* @param ne The tensor shape array (number of elements in each dimension).
* @param nb The stride size for each dimension.
+ * @param dtype The data type of the cached tensor.
* @param dims The number of tensor dimensions.
- * @param value The scalar value used to fill the tensor (supports zero
- * initialization via memset or arbitrary values via fill_scalar).
+ * @param value The scalar value used to fill the tensor (applied via
+ * fill_scalar).
* @return An aclTensor pointer created from the cached buffer.
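+ *
+ * Example usage (illustrative sketch; it reuses the rms_norm cache fields that
+ * appear in ggml_cann_rms_norm below, and the shape of 128 elements is arbitrary):
+ * @code
+ * int64_t ne[1] = { 128 };
+ * size_t  nb[1] = { ggml_type_size(GGML_TYPE_F16) };
+ * // 1-D cached tensor of 128 F16 elements, all filled with 1.0f.
+ * aclTensor* ones = get_cache_acl_tensor(ctx,
+ *     &ctx.rms_norm_one_tensor_cache.cache,
+ *     ctx.rms_norm_one_tensor_cache.size,
+ *     ne, nb, GGML_TYPE_F16, 1, 1.0f);
+ * @endcode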
*/
-static aclTensor* get_f32_cache_acl_tensor(
+static aclTensor* get_cache_acl_tensor(
ggml_backend_cann_context& ctx,
void** buffer,
int64_t &cache_element,
int64_t* ne,
size_t* nb,
+ ggml_type dtype,
int64_t dims,
float value) {
// Calculate total number of elements
int64_t n_element = 1;
for (int i = 0; i < dims; i++) {
n_element *= ne[i];
}
- size_t size = n_element * sizeof(float);
+ size_t size = n_element * ggml_type_size(dtype);
// Allocate or expand cache if needed
if (cache_element < n_element) {
    if (*buffer != nullptr) {
        ACL_CHECK(aclrtFree(*buffer));
        *buffer = nullptr;
    }
    ACL_CHECK(aclrtMalloc(buffer, size, ACL_MEM_MALLOC_HUGE_FIRST));
    cache_element = n_element;
    // Initialize cache
- if (value == 0.0f) {
- ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream()));
- } else {
- int64_t pool_ne[1] = { n_element };
- size_t pool_nb[1] = { sizeof(float) };
- aclTensor* acl_value = ggml_cann_create_tensor(
- *buffer, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, 1);
- aclnn_fill_scalar(ctx, 1, acl_value);
- ggml_cann_release_resources(ctx, acl_value);
- }
+ int64_t pool_ne[1] = { n_element };
+ size_t pool_nb[1] = { ggml_type_size(dtype) };
+ aclTensor* acl_value = ggml_cann_create_tensor(
+ *buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype),
+ pool_ne, pool_nb, 1);
+ aclnn_fill_scalar(ctx, value, acl_value);
+ ggml_cann_release_resources(ctx, acl_value);
}
- return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims);
+ return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype),
+ ggml_type_size(dtype), ne, nb, dims);
}
void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
- // build gamma, one...
+ // build gamma.
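+ // gamma is a cached tensor of ones: ggml's RMS_NORM op carries no fused weight,
+ // so a gamma of 1.0 leaves the normalized values unscaled.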
size_t acl_gamma_nb[GGML_MAX_DIMS];
- acl_gamma_nb[0] = sizeof(float);
+ // gamma's type is the same as dst's.
+ acl_gamma_nb[0] = ggml_type_size(dst->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
}
- aclTensor* acl_gamma = get_f32_cache_acl_tensor(
+ aclTensor* acl_gamma = get_cache_acl_tensor(
ctx,
&ctx.rms_norm_one_tensor_cache.cache,
ctx.rms_norm_one_tensor_cache.size,
src->ne,
acl_gamma_nb,
+ dst->type,
1, // dims
1.0f // value
);
- // build rstd, zero...
+ // build rstd.
int64_t acl_rstd_ne[] = {src->ne[1], src->ne[2], src->ne[3]};
size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
+ // rstd will always be F32.
acl_rstd_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
}
- aclTensor* acl_rstd = get_f32_cache_acl_tensor(
+ aclTensor* acl_rstd = get_cache_acl_tensor(
ctx,
&ctx.rms_norm_zero_tensor_cache.cache,
ctx.rms_norm_zero_tensor_cache.size,
acl_rstd_ne,
acl_rstd_nb,
+ GGML_TYPE_F32,
GGML_MAX_DIMS - 1,
0.0f // value
);
ggml_tensor* src0 = dst->src[0]; // src
ggml_tensor* src1 = dst->src[1]; // index
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
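+    // dst is F32 or F16. If src0 already matches dst->type it is gathered
+    // directly; otherwise it is cast (F16/F32) or dequantized (Q8_0) into
+    // dst->type before the index select.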
switch (src0->type) {
- case GGML_TYPE_F32: {
- aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
- dst->data, dst->ne, dst->nb,
- src1, dst->type);
- break;
- }
- case GGML_TYPE_F16: {
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
- ggml_cann_pool_alloc src_buffer_allocator(
- ctx.pool(), ggml_nelements(src0) * sizeof(float));
- void* src_trans_buffer = src_buffer_allocator.get();
- size_t src_trans_nb[GGML_MAX_DIMS];
- src_trans_nb[0] = sizeof(float);
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
- src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+ case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
+ if (src0->type == dst->type) {
+ aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
+ dst->data, dst->ne, dst->nb,
+ src1, dst->type);
+ } else {
+ aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+ ggml_cann_pool_alloc src_buffer_allocator(
+ ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
+ void* src_trans_buffer = src_buffer_allocator.get();
+ size_t src_trans_nb[GGML_MAX_DIMS];
+ src_trans_nb[0] = dst->nb[0];
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+ }
+ aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+ src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ src0->ne, src_trans_nb, GGML_MAX_DIMS);
+ aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
+ aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
+ dst->data, dst->ne, dst->nb,
+ src1, dst->type);
+ ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
}
- aclTensor* src_trans_tensor = ggml_cann_create_tensor(
- src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
- src0->ne, src_trans_nb, GGML_MAX_DIMS);
- aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
- aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
- dst->data, dst->ne, dst->nb,
- src1, dst->type);
- ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
break;
- }
case GGML_TYPE_Q8_0: {
// add 1 dim for bcast mul.
size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
    dequant_nb[GGML_MAX_DIMS + 1];
int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
    *dequant_ne;
int64_t scale_offset = 0;
-
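+ // In this buffer layout the int8 quants of all blocks are stored first and the
+ // f16 scales follow them (see scale_offset below). Viewing the quants as
+ // [..., blocks, QK8_0] and the scales as [..., blocks, 1] lets a broadcast mul
+ // produce the dequantized tensor in dst->type.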
// [3,4,5,64] -> [3,4,5,2,32]
weight_ne[0] = QK8_0;
weight_ne[1] = src0->ne[0] / QK8_0;
weight_nb[0] = sizeof(int8_t);
weight_nb[1] = weight_nb[0] * weight_ne[0];
for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
    weight_ne[i] = src0->ne[i - 1];
    weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
}
-
// [3,4,5,64] -> [3,4,5,2,1]
scale_ne[0] = 1;
scale_ne[1] = src0->ne[0] / QK8_0;
scale_nb[0] = sizeof(uint16_t);
scale_nb[1] = scale_nb[0] * scale_ne[0];
for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
    scale_ne[i] = src0->ne[i - 1];
    scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
}
-
// [3,4,5,64] -> [3,4,5,2,32]
dequant_ne = weight_ne;
- dequant_nb[0] = sizeof(float);
+ dequant_nb[0] = ggml_type_size(dst->type);
for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
}
-
scale_offset = ggml_nelements(src0) * sizeof(int8_t);
ggml_cann_pool_alloc dequant_buffer_allocator(
- ctx.pool(), ggml_nelements(src0) * sizeof(float));
-
+ ctx.pool(), ggml_nelements(src0) * ggml_type_size(dst->type));
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
GGML_MAX_DIMS + 1);
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
    src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
aclTensor* dequant_tensor = ggml_cann_create_tensor(
- dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float),
+ dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
-
aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
- dequant_nb[0] = sizeof(float);
+ dequant_nb[0] = ggml_type_size(dst->type);
dequant_ne = src0->ne;
for (int i = 1; i < GGML_MAX_DIMS; i++) {
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
}
-
aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(),
dequant_ne, dequant_nb,
dst->data, dst->ne, dst->nb,
// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
if (weight_to_nz && is_matmul_weight(weight)) {
- int64_t acl_stride[2] = {1, transpose_ne[1]};
-
- // Reverse ne.
- std::reverse(transpose_ne, transpose_ne + n_dims);
-
- std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
-
- acl_weight_tensor = aclCreateTensor(
- transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
- 0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
+ acl_weight_tensor =
+ ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
} else {
acl_weight_tensor =
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
aclTensor* acl_src0_f16_tensor = nullptr;
aclTensor* acl_src1_f16_tensor = nullptr;
aclTensor* acl_src2_f16_tensor = nullptr;
- aclTensor* acl_dst_f16_tensor = nullptr;
// Step 1: cast the src0 (Query) to fp16 if needed
ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
acl_src2_f16_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne,
src2_bsnd_nb, GGML_MAX_DIMS);
- ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
- void* out_f16_buffer = out_f16_allocator.alloc(
- ggml_nelements(dst) * faElemSize);
-
- int64_t* out_f16_ne = src0_bsnd_ne;
- size_t out_f16_nb[GGML_MAX_DIMS];
- out_f16_nb[0] = faElemSize;
- for(int i = 1; i < GGML_MAX_DIMS; ++i){
- out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
- }
-
- acl_dst_f16_tensor = ggml_cann_create_tensor(
- out_f16_buffer, faDataType, faElemSize,
- out_f16_ne, out_f16_nb, GGML_MAX_DIMS
- );
-
// Step 3: create the PSEShift tensor if needed
// this tensor is considered as mask (f16) in the llama.cpp
aclTensor* bcast_pse_tensor = nullptr;
int64_t keyAntiquantMode = 0;
int64_t valueAntiquantMode = 0;
- // Step 5: launch the FusedInferAttentionScoreV2 kernel.
- // Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+ aclTensor * fa_dst_tensor = nullptr;
+ aclTensor * acl_dst_tensor = nullptr;
+ ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
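+ // The attention kernel writes its output in its own element type (faDataType).
+ // When dst is F32 the result is staged in a temporary pool buffer and cast
+ // afterwards; when dst is F16 the kernel writes directly into dst.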
+ if (dst->type == GGML_TYPE_F32) {
+ void* out_f16_buffer = out_f16_allocator.alloc(
+ ggml_nelements(dst) * faElemSize);
+
+ int64_t* out_f16_ne = src0_bsnd_ne;
+ size_t out_f16_nb[GGML_MAX_DIMS];
+ out_f16_nb[0] = faElemSize;
+ for(int i = 1; i < GGML_MAX_DIMS; ++i){
+ out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
+ }
+
+ fa_dst_tensor = ggml_cann_create_tensor(
+ out_f16_buffer, faDataType, faElemSize,
+ out_f16_ne, out_f16_nb, GGML_MAX_DIMS
+ );
+ } else {
+ fa_dst_tensor = ggml_cann_create_tensor(dst);
+ }
+ // Step 5: launch the FusedInferAttentionScoreV2 kernel.
+ // Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md
GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
blockSize, antiquantMode, // blockSize, antiquantMode
softmaxLseFlag, // softmaxLseFlag
keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
- acl_dst_f16_tensor, // attentionOut
+ fa_dst_tensor, // attentionOut
nullptr // softmaxLse
);
- // Step 6: post-processing, permute and cast to f32
- aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
- // TODO: when dst is fp16, don't need cast
- aclnn_cast(ctx, acl_dst_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
- ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
- acl_src1_f16_tensor,
- acl_src2_f16_tensor,
- acl_dst_f16_tensor,
- acl_dst_tensor);
- if(src3 != nullptr){
- ggml_cann_release_resources(ctx, bcast_pse_tensor);
+ if (dst->type == GGML_TYPE_F32) {
+     // Step 6: post-processing, cast the output back to f32
+     acl_dst_tensor = ggml_cann_create_tensor(dst);
+     aclnn_cast(ctx, fa_dst_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
}
- }else{
+
+ ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
+ acl_src1_f16_tensor,
+ acl_src2_f16_tensor,
+ fa_dst_tensor,
+ acl_dst_tensor,
+ bcast_pse_tensor);
+
+ } else {
GGML_ABORT("Function is not implemented.");
}
}